In [19]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [20]:
# Start (or reuse) a Spark session named 'test' against the 'local[*]' master,
# requesting 2g of driver memory.
# NOTE(review): Spark's configuration docs describe spark.driver.cores as a
# cluster-mode setting -- confirm it has any effect with a local master.
session_builder = SparkSession.builder.master('local[*]').appName('test')
session_builder = session_builder.config('spark.driver.cores', 3)
session_builder = session_builder.config('spark.driver.memory', '2g')
spark = session_builder.getOrCreate()

In [31]:
# Load the fact table: every parquet path matching two wildcard levels below
# buckets/data/fact, then print the inferred column names and types.
fact_glob = 'buckets/data/fact/*/*'
df_fact = spark.read.parquet(fact_glob)
df_fact.printSchema()

root
 |-- tripid: integer (nullable = true)
 |-- vendorid: integer (nullable = true)
 |-- service_type: string (nullable = true)
 |-- ratecodeid: integer (nullable = true)
 |-- pickup_locationid: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- pickup_zone: string (nullable = true)
 |-- dropoff_locationid: integer (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- dropoff_zone: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvemen

In [32]:
# Count rows per service_type and print the resulting table.
trips_by_service = df_fact.groupBy('service_type').count()
trips_by_service.show()



+------------+---------+
|service_type|    count|
+------------+---------+
|       Green|  7975169|
|      Yellow|107741783|
+------------+---------+



                                                                                

In [33]:
# Register the fact DataFrame under the name 'trips_data' so that SQL passed to
# spark.sql() can select from it.
df_fact.createOrReplaceTempView('trips_data')

In [39]:
# Aggregate trips_data into one row per (pickup zone, month of pickup,
# service type): the sum of each fare component, the number of trips, and the
# average passenger count and trip distance.
# `group by 1,2,3` refers to the first three select-list columns by position.
# NOTE(review): the aliases avg_montly_* are spelled "montly"; they become the
# output column names, so renaming them would change the written schema.
df_result = spark.sql('''
    select 
    -- Reveneue grouping 
    pickup_zone as revenue_zone,
    date_trunc('month', pickup_datetime) as revenue_month, 
    --Note: For BQ use instead: date_trunc(pickup_datetime, month) as revenue_month, 

    service_type, 

    -- Revenue calculation 
    sum(fare_amount) as revenue_monthly_fare,
    sum(extra) as revenue_monthly_extra,
    sum(mta_tax) as revenue_monthly_mta_tax,
    sum(tip_amount) as revenue_monthly_tip_amount,
    sum(tolls_amount) as revenue_monthly_tolls_amount,
    sum(ehail_fee) as revenue_monthly_ehail_fee,
    sum(improvement_surcharge) as revenue_monthly_improvement_surcharge,
    sum(total_amount) as revenue_monthly_total_amount,
    sum(congestion_surcharge) as revenue_monthly_congestion_surcharge,

    -- Additional calculations
    count(tripid) as total_monthly_trips,
    avg(passenger_count) as avg_montly_passenger_count,
    avg(trip_distance) as avg_montly_trip_distance

    from trips_data
    group by 1,2,3
''')

In [41]:
# Persist the monthly revenue report as parquet, replacing whatever is already
# stored at the target directory.
df_result.write.mode('overwrite').parquet('buckets/report/revenue/')

                                                                                

In [42]:
# Read the written report back from disk.
revenue_glob = 'buckets/report/revenue/*'
df_revenue = spark.read.parquet(revenue_glob)

In [43]:
# Number of rows in the revenue report as read back from disk.
df_revenue.count()

12626

In [None]:
# Shut down through the session rather than its SparkContext:
# SparkSession.stop() stops the underlying SparkContext, and going through the
# session lets it release its own state too, so a later
# SparkSession.builder.getOrCreate() in this kernel starts from a clean slate.
spark.stop()