In [16]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Create (or reuse) the Spark session used by every cell below.
# Driver/executor memory are supplied as plain configuration strings.
spark = (
    SparkSession.builder
    .appName("Green_Analysis")
    .config("spark.sql.pyspark.jvm", "false")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "8gb")
    .getOrCreate()
)

# Explicit read schema for the trip files: every column is nullable, and the
# column order below is the order of the resulting DataFrame.
schema = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in (
        ("VendorID",              IntegerType()),
        ("tpep_pickup_datetime",  TimestampNTZType()),
        ("tpep_dropoff_datetime", TimestampNTZType()),
        ("store_and_fwd_flag",    BooleanType()),
        ("RatecodeID",            IntegerType()),
        ("PULocationID",          IntegerType()),
        ("DOLocationID",          IntegerType()),
        ("passenger_count",       IntegerType()),
        ("trip_distance",         FloatType()),
        ("fare_amount",           FloatType()),
        ("extra",                 FloatType()),
        ("mta_tax",               FloatType()),
        ("tip_amount",            FloatType()),
        ("tolls_amount",          FloatType()),
        ("improvement_surcharge", FloatType()),
        ("total_amount",          FloatType()),
        ("payment_type",          IntegerType()),
        ("congestion_surcharge",  FloatType()),
    )
])



In [19]:
# Load the trip records with the explicit schema defined above.
#
# A later cell reads taxi+_zone_lookup.csv from this same directory, so a bare
# "*" glob would also hand that CSV to the Parquet reader and fail on a fresh
# "Restart & Run All". pathGlobFilter restricts the leaf files that are read
# without changing how the directory itself is listed.
# NOTE(review): assumes the trip files carry a ".parquet" extension — confirm.
df = (spark.read
        .schema(schema)
        .format("parquet")
        .option("pathGlobFilter", "*.parquet")
        .load("data/taxi/green_taxi/*"))


# Let a bare DataFrame expression at the end of a cell render as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [35]:
# Bare expression: renders a preview of the loaded frame as the cell output
# (relies on spark.sql.repl.eagerEval.enabled having been set).
df

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge
2,2014-01-01 00:17:26,2014-01-01 00:37:11,True,1,17,225,1,2.28,13.5,0.5,0.5,0.0,0.0,,14.5,2,
1,2014-01-01 00:29:12,2014-01-01 00:37:43,True,1,127,241,1,2.1,9.0,0.0,0.5,0.0,0.0,,9.5,2,
2,2014-01-01 00:31:35,2014-01-01 00:44:09,True,1,166,243,1,4.72,15.5,0.5,0.5,4.0,0.0,,20.5,1,
2,2014-01-01 00:07:01,2014-01-01 00:21:54,True,1,7,157,1,2.88,13.0,0.5,0.5,2.88,0.0,,16.88,1,
2,2014-01-01 00:26:43,2014-01-01 00:37:17,True,1,83,197,2,3.8,13.0,0.5,0.5,0.0,0.0,,14.0,2,
2,2014-01-01 00:23:34,2014-01-01 00:32:50,True,1,226,7,6,1.85,9.0,0.5,0.5,2.85,0.0,,12.85,1,
2,2014-01-01 00:45:12,2014-01-01 00:54:03,True,1,7,95,6,7.01,20.0,0.5,0.5,4.1,0.0,,25.1,1,
2,2014-01-01 00:10:17,2014-01-01 00:48:42,True,1,61,74,2,13.91,42.5,0.5,0.5,8.6,0.0,,52.1,1,
2,2014-01-01 00:53:39,2014-01-01 01:05:19,True,1,74,116,2,2.0,10.0,0.5,0.5,2.1,0.0,,13.1,1,
2,2014-01-01 00:15:50,2014-01-01 00:20:59,True,1,61,83,2,1.28,6.5,0.5,0.5,0.0,0.0,,7.5,2,


In [25]:
# Print each column's name, type and nullability for the loaded frame.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- congestion_surcharge: float (nullable = true)



In [26]:
# Number of trips per passenger_count value, smallest passenger count first.
(
    df.groupBy("passenger_count")
    .count()
    .sort(asc("passenger_count"))
    .show(20)
)

+---------------+--------+
|passenger_count|   count|
+---------------+--------+
|           NULL| 1885146|
|              0|   60805|
|              1|68698946|
|              2| 6221946|
|              3| 1668535|
|              4|  531213|
|              5| 3068841|
|              6| 1344714|
|              7|    1854|
|              8|    2071|
|              9|     615|
|             32|       1|
|             48|       1|
+---------------+--------+



In [64]:
# The 20 pickup location IDs with the most trips.
(
    df.groupBy("PULocationID")
    .count()
    .sort(desc("count"))
    .show(20)
)

+------------+-------+
|PULocationID|  count|
+------------+-------+
|          74|5031919|
|          41|4346450|
|          75|4307229|
|           7|3827626|
|          82|3312398|
|         166|3234262|
|         255|3182752|
|          42|2865224|
|         181|2444305|
|         129|2309000|
|          97|2287692|
|          95|2046028|
|         244|1977116|
|          25|1758336|
|          33|1751404|
|         260|1647871|
|         256|1456156|
|          65|1446187|
|          66|1426963|
|         223|1378993|
+------------+-------+
only showing top 20 rows



In [37]:
# The 20 drop-off location IDs with the most trips.
(
    df.groupBy("DOLocationID")
    .count()
    .sort(desc("count"))
    .show(20)
)

+------------+-------+
|DOLocationID|  count|
+------------+-------+
|          74|2667561|
|          42|2621076|
|           7|2331674|
|          41|2315777|
|         129|2162944|
|         181|1782844|
|          75|1705656|
|          82|1508333|
|         166|1502475|
|          61|1445321|
|         223|1339323|
|         112|1250600|
|         244|1246487|
|          97|1224041|
|          49|1165438|
|          17|1158269|
|         116|1153821|
|         226|1125267|
|         255|1118343|
|         236|1091821|
+------------+-------+
only showing top 20 rows



In [76]:
# Label every trip with a coarse trip_distance bucket.
#
# `when` branches are evaluated in order, so each branch only needs its upper
# bound: a row reaches it only after failing all the previous ones.
#
# The open-ended ">10" bucket is an explicit condition instead of an
# `otherwise` fallback. A NULL trip_distance makes every comparison NULL, and
# the fallback used to count those rows as ">10" silently. With no `otherwise`,
# rows with a missing distance now get a NULL bucket and show up as their own
# group in the counts.
df_with_distance = df.withColumn(
    "trip_distance_bucket",
    when(col("trip_distance") <= 1.0, lit("0-1"))
    .when(col("trip_distance") <= 2.0, lit("1-2"))
    .when(col("trip_distance") <= 3.0, lit("2-3"))
    .when(col("trip_distance") <= 4.0, lit("3-4"))
    .when(col("trip_distance") <= 5.0, lit("4-5"))
    .when(col("trip_distance") <= 10.0, lit("5-10"))
    .when(col("trip_distance") > 10.0, lit(">10")),
)

In [77]:
# Trips per distance bucket, most populated bucket first.
(
    df_with_distance.groupBy("trip_distance_bucket")
    .count()
    .sort(desc("count"))
    .show()
)

+--------------------+--------+
|trip_distance_bucket|   count|
+--------------------+--------+
|                 1-2|24037382|
|                 0-1|19459248|
|                 2-3|13093190|
|                5-10|10303205|
|                 3-4| 8201022|
|                 4-5| 5075134|
|                 >10| 3315507|
+--------------------+--------+



In [51]:
# Add the pickup hour, formatted with the "HH" pattern (a two-digit string
# such as "09"), taken from the pickup timestamp.
df_with_hours = df.withColumn(
    "pickup_hour",
    date_format("tpep_pickup_datetime", "HH"),
)

In [56]:
# Trips per pickup hour, busiest hour first (24 rows covers every hour).
(
    df_with_hours.groupBy("pickup_hour")
    .count()
    .sort(desc("count"))
    .show(24)
)

+-----------+-------+
|pickup_hour|  count|
+-----------+-------+
|         18|5633574|
|         19|5429511|
|         17|5230272|
|         20|4835939|
|         16|4802373|
|         21|4545547|
|         15|4453940|
|         22|4294381|
|         14|4036387|
|         23|3949591|
|         09|3688884|
|         13|3596132|
|         12|3554521|
|         10|3529261|
|         11|3505560|
|         08|3481038|
|         00|3284775|
|         01|2555032|
|         07|2362746|
|         02|1884677|
|         03|1474864|
|         04|1247824|
|         06|1233301|
|         05| 874558|
+-----------+-------+



In [200]:
# Add the pickup year as a string ("yyyy" pattern). Despite its name, the
# trip_date column holds only the year, not a full date.
df_date = df.withColumn(
    "trip_date",
    date_format("tpep_pickup_datetime", "yyyy"),
)

In [205]:
# Trips per pickup year, rarest year first, so out-of-range years surface at
# the top of the listing.
(
    df_date.groupBy("trip_date")
    .count()
    .sort(asc("count"))
    .show()
)

+---------+--------+
|trip_date|   count|
+---------+--------+
|     2081|       1|
|     2062|       1|
|     2041|       1|
|     2035|       1|
|     2030|       2|
|     2012|       3|
|     2025|      11|
|     2008|     114|
|     2009|     315|
|     2010|     348|
|     2024|  660198|
|     2023|  787055|
|     2022|  840394|
|     2021| 1068729|
|     2020| 1734166|
|     2019| 6300814|
|     2018| 8899314|
|     2017|11736906|
|     2014|15837009|
|     2016|16385541|
+---------+--------+
only showing top 20 rows



In [204]:
# Trips per (payment_type, pickup year) pair, listed by year ascending.
(
    df_date.groupBy("payment_type", "trip_date")
    .count()
    .sort(asc("trip_date"))
    .show(40)
)

+------------+---------+--------+
|payment_type|trip_date|   count|
+------------+---------+--------+
|           2|     2008|      84|
|           1|     2008|      28|
|           3|     2008|       2|
|           2|     2009|     239|
|           1|     2009|      73|
|           3|     2009|       3|
|           2|     2010|     216|
|           1|     2010|     131|
|           3|     2010|       1|
|           2|     2012|       1|
|           1|     2012|       2|
|           4|     2014|   38599|
|           2|     2014| 9206882|
|           3|     2014|   38463|
|           5|     2014|     439|
|           1|     2014| 6552626|
|           5|     2015|     959|
|           3|     2015|   64508|
|           4|     2015|   61267|
|           1|     2015| 8879597|
|           2|     2015|10227434|
|           2|     2016| 8149171|
|           1|     2016| 8117840|
|           4|     2016|   46113|
|           3|     2016|   71687|
|           5|     2016|     730|
|           5|

In [114]:
# Read the zone lookup table from CSV, taking column names from its header row.
# No schema is supplied and schema inference is not enabled.
zones = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("data/taxi/green_taxi/taxi+_zone_lookup.csv")
)


# Let a bare DataFrame expression at the end of a cell render as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [116]:
# Bare expression: renders a preview of the zone lookup table as the cell
# output (relies on spark.sql.repl.eagerEval.enabled having been set).
zones

LocationID,Borough,Zone,service_zone
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham G...,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone
6,Staten Island,Arrochar/Fort Wad...,Boro Zone
7,Queens,Astoria,Boro Zone
8,Queens,Astoria Park,Boro Zone
9,Queens,Auburndale,Boro Zone
10,Queens,Baisley Park,Boro Zone


In [185]:
# Two renamed views of the zone lookup, one keyed for the pickup side and one
# for the drop-off side, so each can be joined on the matching trip column
# without name clashes. Only the ID, borough and zone columns are kept.
zones_pickup = (
    zones
    .select("LocationID", "Borough", "Zone")
    .toDF("PULocationID", "pickup_borough", "pickup_zone")
)

zones_dropoff = (
    zones
    .select("LocationID", "Borough", "Zone")
    .toDF("DOLocationID", "dropoff_borough", "dropoff_zone")
)

In [186]:
# Attach pickup and drop-off borough/zone names to every trip.
# Left joins keep trips whose location ID has no match in the lookup; the
# lookup frames are marked for broadcast.
df_enriched = (
    df
    .join(broadcast(zones_pickup), "PULocationID", "left")
    .join(broadcast(zones_dropoff), "DOLocationID", "left")
)

In [187]:
# Collapse the four borough/zone columns into one "route" label of the form
# "<pickup borough>, <pickup zone> → <drop-off borough>, <drop-off zone>",
# then drop the helper columns.
# NOTE: this rebinds df_enriched and removes its own inputs, so the join cell
# must be re-run before this cell can be executed again.
df_enriched = (
    df_enriched
    .withColumn(
        "route",
        concat(
            col("pickup_borough"), lit(", "), col("pickup_zone"),
            lit(" → "),
            col("dropoff_borough"), lit(", "), col("dropoff_zone"),
        ),
    )
    .drop("pickup_borough", "pickup_zone", "dropoff_borough", "dropoff_zone")
)

In [191]:
# The 30 most frequent routes, printed without truncating the route label.
(
    df_enriched.groupBy("route")
    .count()
    .sort(desc("count"))
    .show(n=30, truncate=False)
)

+-------------------------------------------------------------------------+------+
|route                                                                    |count |
+-------------------------------------------------------------------------+------+
|Queens, Astoria → Queens, Astoria                                        |999875|
|Manhattan, East Harlem South → Manhattan, East Harlem North              |750756|
|Manhattan, Central Harlem → Manhattan, Central Harlem North              |707359|
|Queens, Jackson Heights → Queens, Jackson Heights                        |640927|
|Manhattan, East Harlem North → Manhattan, East Harlem South              |607977|
|Queens, Forest Hills → Queens, Forest Hills                              |600325|
|Queens, Elmhurst → Queens, Jackson Heights                               |576950|
|Manhattan, Central Harlem North → Manhattan, Central Harlem North        |575201|
|Brooklyn, Park Slope → Brooklyn, Park Slope                              |530874|
|Que

In [189]:
# Bare expression: renders a preview of the enriched frame (trip columns plus
# the route label) as the cell output.
df_enriched

DOLocationID,PULocationID,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge,route
225,17,2,2014-01-01 00:17:26,2014-01-01 00:37:11,True,1,1,2.28,13.5,0.5,0.5,0.0,0.0,,14.5,2,,"Brooklyn, Bedford..."
241,127,1,2014-01-01 00:29:12,2014-01-01 00:37:43,True,1,1,2.1,9.0,0.0,0.5,0.0,0.0,,9.5,2,,"Manhattan, Inwood..."
243,166,2,2014-01-01 00:31:35,2014-01-01 00:44:09,True,1,1,4.72,15.5,0.5,0.5,4.0,0.0,,20.5,1,,"Manhattan, Mornin..."
157,7,2,2014-01-01 00:07:01,2014-01-01 00:21:54,True,1,1,2.88,13.0,0.5,0.5,2.88,0.0,,16.88,1,,"Queens, Astoria →..."
197,83,2,2014-01-01 00:26:43,2014-01-01 00:37:17,True,1,2,3.8,13.0,0.5,0.5,0.0,0.0,,14.0,2,,"Queens, Elmhurst/..."
7,226,2,2014-01-01 00:23:34,2014-01-01 00:32:50,True,1,6,1.85,9.0,0.5,0.5,2.85,0.0,,12.85,1,,"Queens, Sunnyside..."
95,7,2,2014-01-01 00:45:12,2014-01-01 00:54:03,True,1,6,7.01,20.0,0.5,0.5,4.1,0.0,,25.1,1,,"Queens, Astoria →..."
74,61,2,2014-01-01 00:10:17,2014-01-01 00:48:42,True,1,2,13.91,42.5,0.5,0.5,8.6,0.0,,52.1,1,,"Brooklyn, Crown H..."
116,74,2,2014-01-01 00:53:39,2014-01-01 01:05:19,True,1,2,2.0,10.0,0.5,0.5,2.1,0.0,,13.1,1,,"Manhattan, East H..."
83,61,2,2014-01-01 00:15:50,2014-01-01 00:20:59,True,1,2,1.28,6.5,0.5,0.5,0.0,0.0,,7.5,2,,"Brooklyn, Crown H..."
