In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import types as T 
from pyspark.sql import functions as F 

In [2]:
# Create (or reuse) a Spark session that runs locally; 'local[*]' asks Spark
# to use all available cores on this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/01 17:14:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/01 17:14:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Loading green taxi data

In [48]:
# Load every green-taxi parquet file matched by the two-level glob below the
# unified data directory.
df_green = spark.read.parquet('./data/unified/green/*/*')

In [49]:
# Register the DataFrame under a view name so spark.sql queries can refer to it.
df_green.createOrReplaceTempView('green_trips_data')

# Group By on green taxi data

In [50]:
# Hourly revenue per pickup zone for green taxis:
#   - the pickup timestamp is truncated to the hour,
#   - total_amount is summed (rounded to 2 decimals) and rows are counted
#     for each (hour, zone) pair,
#   - pickups before 2020-01-01 are filtered out.
# "GROUP BY 1, 2" / "ORDER BY 1, 2" refer to the first two SELECT columns
# (hour, zone) by position.
df_green_result = spark.sql("""
SELECT 
    date_trunc('hour', lpep_pickup_datetime) AS hour, 
    PULocationID as zone,    
    
    ROUND(SUM(total_amount), 2)  AS amount,
    COUNT(1) AS number_of_records
FROM 
    green_trips_data
WHERE 
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
ORDER BY
    1, 2; 
""")

In [51]:
# Preview the first rows of the green hourly revenue aggregation.
df_green_result.show()



+-------------------+----+-------+-----------------+
|               hour|zone| amount|number_of_records|
+-------------------+----+-------+-----------------+
|2020-01-01 00:00:00|   7| 769.73|               45|
|2020-01-01 00:00:00|  17| 195.03|                9|
|2020-01-01 00:00:00|  18|    7.8|                1|
|2020-01-01 00:00:00|  22|   15.8|                1|
|2020-01-01 00:00:00|  24|   87.6|                3|
|2020-01-01 00:00:00|  25|  531.0|               26|
|2020-01-01 00:00:00|  29|   61.3|                1|
|2020-01-01 00:00:00|  32|  68.95|                2|
|2020-01-01 00:00:00|  33| 317.27|               11|
|2020-01-01 00:00:00|  35| 129.96|                5|
|2020-01-01 00:00:00|  36| 295.34|               11|
|2020-01-01 00:00:00|  37| 175.67|                6|
|2020-01-01 00:00:00|  38|  98.79|                2|
|2020-01-01 00:00:00|  40| 168.98|                8|
|2020-01-01 00:00:00|  41|1363.96|               84|
|2020-01-01 00:00:00|  42| 799.76|            

                                                                                

In [52]:
# Persist the green revenue report, replacing any previous output at this path.
df_green_result.write.mode('overwrite').parquet('data/reports/revenue/green')

                                                                                

# Loading yellow taxi data

In [53]:
# Load every yellow-taxi parquet file matched by the two-level glob below the
# unified data directory.
df_yellow = spark.read.parquet('./data/unified/yellow/*/*')

In [54]:
# Register the DataFrame under a view name so spark.sql queries can refer to it.
df_yellow.createOrReplaceTempView('yellow_trips_data')

# Group By on yellow taxi data

In [55]:
# Hourly revenue per pickup zone for yellow taxis:
#   - the pickup timestamp (tpep_pickup_datetime) is truncated to the hour,
#   - total_amount is summed (rounded to 2 decimals) and rows are counted
#     for each (hour, zone) pair,
#   - pickups before 2020-01-01 are filtered out.
# "GROUP BY 1, 2" / "ORDER BY 1, 2" refer to the first two SELECT columns
# (hour, zone) by position.
df_yellow_result = spark.sql("""
SELECT 
    date_trunc('hour', tpep_pickup_datetime) AS hour, 
    PULocationID as zone,    
    
    ROUND(SUM(total_amount), 2)  AS amount,
    COUNT(1) AS number_of_records
FROM 
    yellow_trips_data
WHERE 
    tpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
ORDER BY
    1, 2; 
""")

In [56]:
# Preview the first rows of the yellow hourly revenue aggregation.
df_yellow_result.show()

[Stage 78:>                                                         (0 + 4) / 4]

+-------------------+----+-------+-----------------+
|               hour|zone| amount|number_of_records|
+-------------------+----+-------+-----------------+
|2020-01-01 00:00:00|   3|   25.0|                1|
|2020-01-01 00:00:00|   4| 1004.3|               57|
|2020-01-01 00:00:00|   7| 455.17|               38|
|2020-01-01 00:00:00|  10|  42.41|                2|
|2020-01-01 00:00:00|  12|  107.0|                6|
|2020-01-01 00:00:00|  13| 1214.8|               56|
|2020-01-01 00:00:00|  14|    8.8|                1|
|2020-01-01 00:00:00|  15|  34.09|                1|
|2020-01-01 00:00:00|  17| 220.21|                8|
|2020-01-01 00:00:00|  18|    5.8|                1|
|2020-01-01 00:00:00|  24| 754.95|               45|
|2020-01-01 00:00:00|  25| 324.35|               16|
|2020-01-01 00:00:00|  32|   18.0|                1|
|2020-01-01 00:00:00|  33| 255.56|                8|
|2020-01-01 00:00:00|  34|   19.3|                1|
|2020-01-01 00:00:00|  36| 109.17|            



In [57]:
# Persist the yellow revenue report, replacing any previous output at this path.
df_yellow_result.write.mode('overwrite').parquet('data/reports/revenue/yellow')

                                                                                

# Joining green and yellow revenue results

In [63]:
# Give the green metric columns colour-specific names so they do not collide
# with the identically named yellow columns when the two frames are joined.
df_green_result_tmp = (
    df_green_result
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_of_records', 'green_records')
)

In [64]:
# Give the yellow metric columns colour-specific names so they do not collide
# with the identically named green columns when the two frames are joined.
df_yellow_result_tmp = (
    df_yellow_result
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_of_records', 'yellow_records')
)

In [65]:
# Outer join on (hour, zone): pairs present in only one of the two frames are
# kept, with the other side's columns left null.
df_join = df_green_result_tmp.join(
    df_yellow_result_tmp,
    on=['hour', 'zone'],
    how='outer',
)

In [66]:
# Preview five rows of the joined result.
df_join.show(5)

[Stage 101:>                                                        (0 + 1) / 1]

+-------------------+----+------------+-------------+-------------+--------------+
|               hour|zone|green_amount|green_records|yellow_amount|yellow_records|
+-------------------+----+------------+-------------+-------------+--------------+
|2020-01-01 00:00:00|  10|        null|         null|        42.41|             2|
|2020-01-01 00:00:00|  17|      195.03|            9|       220.21|             8|
|2020-01-01 00:00:00|  35|      129.96|            5|         null|          null|
|2020-01-01 00:00:00|  36|      295.34|           11|       109.17|             3|
|2020-01-01 00:00:00|  42|      799.76|           52|       635.35|            46|
+-------------------+----+------------+-------------+-------------+--------------+
only showing top 5 rows



                                                                                

In [67]:
# Persist the combined report, replacing any previous output at this path.
df_join.write.mode('overwrite').parquet('data/reports/revenue/total')

                                                                                

# Loading and joining previously prepared data

In [68]:
# Reload the per-colour revenue reports from parquet.
# Note: this rebinds df_green_result / df_yellow_result to the stored reports.
df_green_result = spark.read.parquet('data/reports/revenue/green')
df_yellow_result = spark.read.parquet('data/reports/revenue/yellow')

In [69]:
# Rename the green metric columns so they stay distinguishable after joining.
df_green_result_tmp = (
    df_green_result
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_of_records', 'green_records')
)

In [70]:
# Rename the yellow metric columns so they stay distinguishable after joining.
df_yellow_result_tmp = (
    df_yellow_result
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_of_records', 'yellow_records')
)

In [71]:
# Outer join on (hour, zone) so that rows found in only one report are kept;
# the absent side's columns are null.
df_join = df_green_result_tmp.join(
    df_yellow_result_tmp,
    on=['hour', 'zone'],
    how='outer',
)

In [72]:
# Preview the first rows of the joined result.
df_join.show()



+-------------------+----+------------+-------------+-------------+--------------+
|               hour|zone|green_amount|green_records|yellow_amount|yellow_records|
+-------------------+----+------------+-------------+-------------+--------------+
|2020-01-01 00:00:00|  10|        null|         null|        42.41|             2|
|2020-01-01 00:00:00|  17|      195.03|            9|       220.21|             8|
|2020-01-01 00:00:00|  35|      129.96|            5|         null|          null|
|2020-01-01 00:00:00|  36|      295.34|           11|       109.17|             3|
|2020-01-01 00:00:00|  42|      799.76|           52|       635.35|            46|
|2020-01-01 00:00:00|  45|        null|         null|       732.48|            42|
|2020-01-01 00:00:00|  50|        null|         null|      4177.48|           183|
|2020-01-01 00:00:00|  68|        null|         null|      7825.07|           396|
|2020-01-01 00:00:00|  70|        54.9|            3|          9.3|             1|
|202

                                                                                

In [73]:
# Persist the combined report, replacing any previous output at this path.
df_join.write.mode('overwrite').parquet('data/reports/revenue/total')

                                                                                

# Joining DataFrames of different sizes

In [74]:
# Reload the combined green/yellow revenue report from parquet.
df_join = spark.read.parquet('data/reports/revenue/total')

In [81]:
# Load the zone lookup table from parquet.
df_zones = spark.read.parquet('./zones')

In [82]:
# Show both schemas' column names, one list per line, to pick the join keys.
print(df_join.columns, df_zones.columns, sep='\n')

['hour', 'zone', 'green_amount', 'green_records', 'yellow_amount', 'yellow_records']
['LocationID', 'Borough', 'Zone', 'service_zone']


In [83]:
# Preview the first rows of the zone lookup table.
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [92]:
# Attach zone metadata by matching each revenue row's zone id to the lookup
# table's LocationID. No `how` is given, so Spark's default join type applies.
df_joint_zones = df_join.join(
    df_zones,
    on=df_join.zone == df_zones.LocationID,
)

In [94]:
# Drop the join-key columns and persist the enriched report.
# mode='overwrite' matches every other write in this notebook; without it the
# writer's default mode raises an error when the path already exists, so the
# cell could not be re-run.
# NOTE(review): Spark resolves column names case-insensitively by default, so
# drop('zone') presumably also removes the `Zone` name column coming from
# df_zones. Confirm whether losing the zone name is intended.
df_joint_zones.drop('zone', 'LocationID').write.parquet('data/tmp/revenue-zones', mode='overwrite')

                                                                                