In [None]:
# Bootstrap step: findspark.init() must run before the pyspark imports in the next cell
# (presumably it locates the local Spark installation and makes pyspark importable — confirm for this environment).
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import  types
from pyspark.sql import functions as F
import pandas as pd

# Create (or reuse an already running) Spark session in local mode.
# "local[*]" runs Spark inside this process with one worker thread per available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [None]:
# Load the green-taxi trip data; the `*/*` glob reads every file two directory levels below pq/green
# (presumably year/month partitions — confirm against the on-disk layout).
df_green = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/pq/green/*/*')

In [None]:
# List the column names of the green-taxi data to see what the SQL query below can reference.
df_green.columns

In [None]:
# Expose the DataFrame to Spark SQL as the temporary view `green`.
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0) and is
# safe to re-run: an existing view with the same name is simply replaced.
df_green.createOrReplaceTempView('green')

In [None]:
# Aggregate green-taxi trips picked up on or after 2020-01-01 into one row per
# (pickup hour, pickup zone), carrying the summed total_amount and the number of trips.
# `group by 1,2` refers to the first two select expressions (hour, zone) by position.
df_green_revenue = spark.sql("""
      select 
    -- Reveneue grouping 
    date_trunc('hour', lpep_pickup_datetime) as hour,
    PULocationID as zone,

    -- Revenue calculation 
    sum(total_amount) as amount,
    count(1) as number_records

    from green
    where lpep_pickup_datetime >= '2020-01-01'
    group by 1,2
""")

In [None]:
# Number of (hour, zone) rows produced by the green aggregation; count() is an action, so this runs the query.
df_green_revenue.count()

In [None]:
# Preview the green revenue rows sorted by hour and then by zone (show() prints the first 20 rows).
df_green_revenue.orderBy('hour', 'zone').show()

In [None]:
# Materialize the green revenue report as 4 parquet files, replacing any previous output.
(
    df_green_revenue
    .repartition(4)
    .write
    .mode('overwrite')
    .parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/green/')
)

In [None]:
# Load the yellow-taxi trip data; the `*/*` glob reads every file two directory levels below pq/yellow
# (presumably year/month partitions — confirm against the on-disk layout).
df_yellow = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/pq/yellow/*/*')

In [None]:
# Expose the DataFrame to Spark SQL as the temporary view `yellow`.
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0) and is
# safe to re-run: an existing view with the same name is simply replaced.
df_yellow.createOrReplaceTempView('yellow')

In [None]:
# Aggregate yellow-taxi trips picked up on or after 2020-01-01 into one row per
# (pickup hour, pickup zone), carrying the summed total_amount and the number of trips.
# Same shape as the green query, but yellow data uses the tpep_ pickup column.
# `group by 1,2` refers to the first two select expressions (hour, zone) by position.
df_yellow_revenue = spark.sql("""
      select 
    -- Reveneue grouping 
    date_trunc('hour', tpep_pickup_datetime) as hour,
    PULocationID as zone,

    -- Revenue calculation 
    sum(total_amount) as amount,
    count(1) as number_records

    from yellow
    where tpep_pickup_datetime >= '2020-01-01'
    group by 1,2
""")

In [None]:
# Number of (hour, zone) rows produced by the yellow aggregation; count() is an action, so this runs the query.
df_yellow_revenue.count()

In [None]:
# Preview the yellow revenue rows sorted by hour and then by zone (show() prints the first 20 rows).
df_yellow_revenue.orderBy('hour', 'zone').show()

In [None]:
# Materialize the yellow revenue report as 4 parquet files, replacing any previous output.
(
    df_yellow_revenue
    .repartition(4)
    .write
    .mode('overwrite')
    .parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/yellow/')
)

In [None]:
# Rebind both revenue frames to the parquet reports written above, so the following cells
# work from the saved output rather than from the SQL query plans.
df_green_revenue = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/green/')
df_yellow_revenue = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/yellow/')

In [None]:
# Prefix the metric columns with the taxi colour so the green and yellow values
# stay distinguishable once both frames are joined on (hour, zone).
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_amount_records')
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_amount_records')
)

In [None]:
# Full outer join on (hour, zone): a group that exists for only one taxi colour is kept,
# with the other colour's amount/record columns left null.
df_join = df_green_revenue_tmp.join(df_yellow_revenue_tmp, on=['hour', 'zone'], how='outer')

In [None]:
# Preview the first 20 rows of the combined green/yellow revenue table.
df_join.show(20)

In [None]:
# Save the combined revenue table, replacing any previous output at this path.
(
    df_join
    .write
    .mode('overwrite')
    .parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/total/')
)

In [None]:
# Rebind df_join to the parquet report written in the previous cell.
df_join = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/report/revenue/total/') 

In [None]:
# Load the zone lookup table (LocationID, Borough, Zone, service_zone) used to enrich the revenue rows.
df_zone = spark.read.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/zones')

In [15]:
# Display the schema of the combined revenue table (note that `zone` is an int column).
df_join

DataFrame[hour: timestamp, zone: int, green_amount: double, green_amount_records: bigint, yellow_amount: double, yellow_amount_records: bigint]

In [14]:
# Display the schema of the zone lookup table (note that `LocationID` is a string column).
df_zone

DataFrame[LocationID: string, Borough: string, Zone: string, service_zone: string]

In [16]:
# Attach the zone metadata (Borough, Zone, service_zone) to every revenue row.
# df_zone's key column is `LocationID`; referencing a column name that does not exist
# on the DataFrame raises AttributeError.
# No `how` is given, so this is an inner join: revenue rows without a matching zone are dropped.
# `zone` is an int and `LocationID` a string (see the schemas displayed above), so the
# equality relies on Spark's implicit cast.
df_results = df_join.join(df_zone, df_join.zone == df_zone.LocationID)

In [18]:
# Persist the enriched report without the redundant `LocationID` join key.
# mode='overwrite' matches the other report writes in this notebook and keeps the cell
# re-runnable; without it the write fails once the target directory already exists.
# NOTE(review): the frame still carries both `zone` (id) and `Zone` (name). With Spark's default
# case-insensitive analysis the writer may reject these as duplicate columns — confirm, and
# rename or drop one of them if so.
df_results.drop('LocationID').write.parquet('/home/Дмитрий/datacamp/dataeng-zoomcamp/week_5_batch_processing/data/tmp/revenue-zones', mode='overwrite')

+-------------------+----+------------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|               hour|zone|      green_amount|green_amount_records|     yellow_amount|yellow_amount_records|  Borough|                Zone|service_zone|
+-------------------+----+------------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|2020-01-01 00:00:00|  10|              null|                null|             42.41|                    2|   Queens|        Baisley Park|   Boro Zone|
|2020-01-01 00:00:00|  14|              null|                null|               8.8|                    1| Brooklyn|           Bay Ridge|   Boro Zone|
|2020-01-01 00:00:00|  15|              null|                null|             34.09|                    1|   Queens|Bay Terrace/Fort ...|   Boro Zone|
|2020-01-01 00:00:00|  17|195.03000000000003|                   9|220.20999999999998|   