### Creating spark session for analytics

Now that all files have the same format, they can be loaded for analytics. By default, the code is set up for yellow taxi data, but it can easily be adjusted for the other types - just change 'yellow_taxi' to one of: 'green_taxi', 'for_hire_vehicle' or 'high_volume_for_hire_vehicle'

In [3]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

# The 16g memory settings used below are sufficient for yellow_taxi and
# high_volume_vehicle. Suggested values for the other datasets:
#
#   green_taxi:
#     .config("spark.driver.memory", "4g")
#     .config("spark.executor.memory", "4g")
#
#   for_hire_vehicle:
#     .config("spark.driver.memory", "8g")
#     .config("spark.executor.memory", "4g")
#
# NOTE(review): with master "local[*]" everything presumably runs inside the
# driver JVM, so spark.executor.memory may have no effect - confirm.

spark = (
    SparkSession.builder
    .appName("Taxi_Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)


# Yellow and green taxi:
# Explicit read schema, built from (column name, Spark type) pairs in column
# order. Every field is declared nullable.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (
        ("VendorID",              IntegerType()),
        ("tpep_pickup_datetime",  TimestampNTZType()),
        ("tpep_dropoff_datetime", TimestampNTZType()),
        ("store_and_fwd_flag",    BooleanType()),
        ("RatecodeID",            IntegerType()),
        ("PULocationID",          IntegerType()),
        ("DOLocationID",          IntegerType()),
        ("passenger_count",       IntegerType()),
        ("trip_distance",         FloatType()),
        ("fare_amount",           FloatType()),
        ("extra",                 FloatType()),
        ("mta_tax",               FloatType()),
        ("tip_amount",            FloatType()),
        ("tolls_amount",          FloatType()),
        ("improvement_surcharge", FloatType()),
        ("total_amount",          FloatType()),
        ("payment_type",          IntegerType()),
        ("congestion_surcharge",  FloatType()),
    )
])

#for_hire_vehicle:
####################
# schema = StructType([
#     StructField("dispatching_base_num",   StringType(),  nullable=True),
#     StructField("pickup_datetime",   TimestampNTZType(), nullable=True),
#     StructField("dropOff_datetime",  TimestampNTZType(), nullable=True),
#     StructField("PUlocationID",           IntegerType(),  nullable=True),
#     StructField("DOlocationID",           IntegerType(),  nullable=True)
# ])


#high_volume_vehicle:
####################
# schema = StructType([
# StructField("hvfhs_license_num", StringType(), True),
#     StructField("dispatching_base_num", StringType(), True),
#     StructField("originating_base_num", StringType(), True),
#     StructField("request_datetime", TimestampNTZType(), True),
#     StructField("on_scene_datetime", TimestampNTZType(), True),
#     StructField("pickup_datetime", TimestampNTZType(), True),
#     StructField("dropoff_datetime", TimestampNTZType(), True),
#     StructField("PULocationID", IntegerType(), True),
#     StructField("DOLocationID", IntegerType(), True),
#     StructField("trip_miles", DoubleType(), True),
#     StructField("trip_time", LongType(), True),
#     StructField("base_passenger_fare", DoubleType(), True),
#     StructField("tolls", DoubleType(), True),
#     StructField("bcf", DoubleType(), True),
#     StructField("sales_tax", DoubleType(), True),
#     StructField("congestion_surcharge", DoubleType(), True),
#     StructField("airport_fee", FloatType(), True),
#     StructField("tips", DoubleType(), True),
#     StructField("driver_pay", DoubleType(), True)
# ])



Loading all years together to DataFrame

In [4]:
# Read every parquet file under the yellow_taxi/p1 directory, applying the
# explicit `schema` instead of relying on the schema stored in the files.
df = (
    spark.read
    .format("parquet")
    .schema(schema)
    .load("data/taxi/yellow_taxi/p1/*")
)

# Turn on eager evaluation so that a bare DataFrame expression at the end of a
# cell is displayed in a more readable way.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [10]:
# Preview the loaded trips (displayed via spark.sql.repl.eagerEval.enabled).
df

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge
2,2014-01-01 00:17:26,2014-01-01 00:37:11,True,1,17,225,1,2.28,13.5,0.5,0.5,0.0,0.0,,14.5,2,
1,2014-01-01 00:29:12,2014-01-01 00:37:43,True,1,127,241,1,2.1,9.0,0.0,0.5,0.0,0.0,,9.5,2,
2,2014-01-01 00:31:35,2014-01-01 00:44:09,True,1,166,243,1,4.72,15.5,0.5,0.5,4.0,0.0,,20.5,1,
2,2014-01-01 00:07:01,2014-01-01 00:21:54,True,1,7,157,1,2.88,13.0,0.5,0.5,2.88,0.0,,16.88,1,
2,2014-01-01 00:26:43,2014-01-01 00:37:17,True,1,83,197,2,3.8,13.0,0.5,0.5,0.0,0.0,,14.0,2,
2,2014-01-01 00:23:34,2014-01-01 00:32:50,True,1,226,7,6,1.85,9.0,0.5,0.5,2.85,0.0,,12.85,1,
2,2014-01-01 00:45:12,2014-01-01 00:54:03,True,1,7,95,6,7.01,20.0,0.5,0.5,4.1,0.0,,25.1,1,
2,2014-01-01 00:10:17,2014-01-01 00:48:42,True,1,61,74,2,13.91,42.5,0.5,0.5,8.6,0.0,,52.1,1,
2,2014-01-01 00:53:39,2014-01-01 01:05:19,True,1,74,116,2,2.0,10.0,0.5,0.5,2.1,0.0,,13.1,1,
2,2014-01-01 00:15:50,2014-01-01 00:20:59,True,1,61,83,2,1.28,6.5,0.5,0.5,0.0,0.0,,7.5,2,


### Analytics

Aggregations for passenger count, pick up/drop off locations

In [None]:
# passenger_count data is available only for yellow and green taxi.
# Number of trips per passenger count, ordered by passenger count (ascending).
passenger_count = (
    df.groupBy("passenger_count")
    .count()
    .orderBy("passenger_count", ascending=True)
)

# Number of trips per pickup location, busiest locations first.
pul = (
    df.groupBy("PULocationID")
    .count()
    .orderBy("count", ascending=False)
)

# Number of trips per drop-off location, busiest locations first.
dol = (
    df.groupBy("DOLocationID")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the trip counts grouped by passenger_count.
passenger_count

In [None]:
# Display the trip counts grouped by pickup location (PULocationID).
pul

In [None]:
# Display the trip counts grouped by drop-off location (DOLocationID).
dol

Most popular trip distances based on ranges

In [1]:
# Data available for yellow, green and high volume.
# Adds a "trip_distance_bucket" label column to every trip.
#
# Rows with a NULL trip_distance get their own "unknown" bucket. Without the
# explicit NULL check every comparison below evaluates to NULL, so those rows
# would fall through to the final otherwise() and be miscounted as ">10".
#
# when() branches are evaluated in order, so each branch only needs its upper
# bound: a value reaching `<= 2.0` has already failed `<= 1.0`.
# NOTE(review): negative distances, if present, still land in "0-1".
df_with_distance = df.withColumn(
    "trip_distance_bucket",
    when(col("trip_distance").isNull(), lit("unknown"))
    .when(col("trip_distance") <= 1.0, lit("0-1"))
    .when(col("trip_distance") <= 2.0, lit("1-2"))
    .when(col("trip_distance") <= 3.0, lit("2-3"))
    .when(col("trip_distance") <= 4.0, lit("3-4"))
    .when(col("trip_distance") <= 5.0, lit("4-5"))
    .when(col("trip_distance") <= 10.0, lit("5-10"))
    .otherwise(lit(">10")),
)

NameError: name 'df' is not defined

In [None]:
# Number of trips per distance bucket, most common bucket first.
# Note: this rebinds df_with_distance to the aggregated result, so the
# per-trip frame is no longer reachable under this name afterwards.
df_with_distance = (
    df_with_distance
    .groupBy("trip_distance_bucket")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the trip counts per trip_distance_bucket.
df_with_distance 

Most popular pickup hours - extracting hour from pickup time and grouping based on it

In [None]:
# Add "pickup_hour": the pickup timestamp formatted with the "HH" pattern
# (the result is a string column).
df_with_hours = df.withColumn(
    "pickup_hour",
    date_format("tpep_pickup_datetime", "HH"),
)

In [None]:
# Number of trips per pickup hour, busiest hour first.
# Note: this rebinds df_with_hours to the aggregated result.
df_with_hours = (
    df_with_hours
    .groupBy("pickup_hour")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the trip counts per pickup hour.
df_with_hours

Yearly count of all trips

In [None]:
# Add "trip_date": the pickup timestamp formatted with the "yyyy" pattern,
# i.e. the pickup year as a string column.
df_date = df.withColumn(
    "trip_date",
    date_format("tpep_pickup_datetime", "yyyy"),
)

In [None]:
# Number of trips per year, ordered from the fewest trips to the most.
years = (
    df_date
    .groupBy("trip_date")
    .count()
    .orderBy("count", ascending=True)
)

In [None]:
# Display the yearly trip counts.
years

Most popular payment types (based on the previously created 'trip_date' with years)

In [None]:
#data available for yellow and green taxi
payment = df_date.groupBy("payment_type", "trip_date") \
    .count() \
    .filter(col("trip_date").between(2011, 2024))

In [None]:
# Display the trip counts per payment type and year.
payment

Loading csv with NYC zone names, to see most popular pickup/drop off zones

In [None]:
# Load the zone lookup CSV; the first line of the file is used as the header.
# No schema is supplied, so the columns are read with Spark's default CSV types.
zones = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("data/taxi/taxi+_zone_lookup.csv")
)

In [None]:
# Display the zone lookup table.
zones

In [None]:
# Two renamed views of the zone lookup table, so it can be joined once on the
# pickup location ID and once on the drop-off location ID without column name
# clashes.
zones_pickup = zones.select([
    col(source_name).alias(target_name)
    for source_name, target_name in (
        ("LocationID", "PULocationID"),
        ("Borough", "pickup_borough"),
        ("Zone", "pickup_zone"),
    )
])

zones_dropoff = zones.select([
    col(source_name).alias(target_name)
    for source_name, target_name in (
        ("LocationID", "DOLocationID"),
        ("Borough", "dropoff_borough"),
        ("Zone", "dropoff_zone"),
    )
])

Broadcast join with a small 'zones' table

In [None]:
# Attach pickup and drop-off borough/zone names to every trip.
# Both lookups are wrapped in broadcast(), and left joins keep trips whose
# location ID has no match in the lookup table.
df_locations = (
    df.join(broadcast(zones_pickup), "PULocationID", "left")
      .join(broadcast(zones_dropoff), "DOLocationID", "left")
)

Adding a column with pickup and drop off destinations combined together

In [None]:
# Build a human-readable "route" label of the form
#   "<pickup borough>, <pickup zone> → <drop-off borough>, <drop-off zone>"
# and then drop the four helper columns that were only needed to build it.
#
# The separator was previously the mojibake sequence produced when a UTF-8
# "→" is decoded as cp1252; it is restored to the intended arrow here.
# NOTE(review): concat() yields NULL when any of its inputs is NULL, so trips
# whose location ID found no match in the left joins get a NULL route - confirm
# that this is the desired behaviour.
# Note: this rebinds df_locations to the frame with the added column.
df_locations = df_locations.withColumn(
    "route",
    concat(
        col("pickup_borough"),      lit(", "),
        col("pickup_zone"),         lit(" → "),
        col("dropoff_borough"),     lit(", "),
        col("dropoff_zone")
    )
).drop(
    "pickup_borough",
    "pickup_zone",
    "dropoff_borough",
    "dropoff_zone"
)

Showing most popular routes based on created column

In [None]:
# The 15 most frequent routes, most popular first.
# Note: this rebinds df_locations to the aggregated result.
df_locations = (
    df_locations
    .groupBy("route")
    .count()
    .orderBy("count", ascending=False)
    .limit(15)
)

In [None]:
# Display the most popular routes.
df_locations

### Sorting and Window Function

Here we can check the difference between sorting within partitions and a global sort, followed by an example of a window function. These two scenarios make a difference at scale, which is shown here, with just 10 GB of yellow taxi data (2011-2015).

In [5]:
from pyspark.sql.window import Window

Adding 'trip_month' column that can be partitioned by in window function

In [6]:
# Add "trip_month": the pickup timestamp formatted as "yyyy-MM" (a string
# column), used later as the window partition key.
df_month = df.withColumn(
    "trip_month",
    date_format("tpep_pickup_datetime", "yyyy-MM"),
)

In [5]:
# Preview the trips with the added trip_month column.
df_month

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge,trip_month
2,2014-01-01 00:17:26,2014-01-01 00:37:11,True,1,17,225,1,2.28,13.5,0.5,0.5,0.0,0.0,,14.5,2,,2014-01
1,2014-01-01 00:29:12,2014-01-01 00:37:43,True,1,127,241,1,2.1,9.0,0.0,0.5,0.0,0.0,,9.5,2,,2014-01
2,2014-01-01 00:31:35,2014-01-01 00:44:09,True,1,166,243,1,4.72,15.5,0.5,0.5,4.0,0.0,,20.5,1,,2014-01
2,2014-01-01 00:07:01,2014-01-01 00:21:54,True,1,7,157,1,2.88,13.0,0.5,0.5,2.88,0.0,,16.88,1,,2014-01
2,2014-01-01 00:26:43,2014-01-01 00:37:17,True,1,83,197,2,3.8,13.0,0.5,0.5,0.0,0.0,,14.0,2,,2014-01
2,2014-01-01 00:23:34,2014-01-01 00:32:50,True,1,226,7,6,1.85,9.0,0.5,0.5,2.85,0.0,,12.85,1,,2014-01
2,2014-01-01 00:45:12,2014-01-01 00:54:03,True,1,7,95,6,7.01,20.0,0.5,0.5,4.1,0.0,,25.1,1,,2014-01
2,2014-01-01 00:10:17,2014-01-01 00:48:42,True,1,61,74,2,13.91,42.5,0.5,0.5,8.6,0.0,,52.1,1,,2014-01
2,2014-01-01 00:53:39,2014-01-01 01:05:19,True,1,74,116,2,2.0,10.0,0.5,0.5,2.1,0.0,,13.1,1,,2014-01
2,2014-01-01 00:15:50,2014-01-01 00:20:59,True,1,61,83,2,1.28,6.5,0.5,0.5,0.0,0.0,,7.5,2,,2014-01


#### 2 Sorting Scenarios 

In the first scenario, the data is repartitioned by the monthly trip date and sorted within each partition

In [7]:
# Scenario 1: hash-repartition into 120 partitions by trip_month, then sort
# each partition locally by month (ascending) and trip distance (descending).
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for the
# rest of the notebook; it is kept because later cells reference it.
sorted = (
    df_month
    .repartition(120, "trip_month")
    .sortWithinPartitions("trip_month", col("trip_distance").desc())
)

The details in .explain() indicate that there is no global sort at this point ('+-Sort ..., <span style="color: #ff0000">false</span>, 0') and no range partitioning, just the hash repartitioning based on 'trip_month'

In [8]:
# Print the physical plan of the repartition + sortWithinPartitions scenario.
sorted.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [trip_month#72 ASC NULLS FIRST, trip_distance#44 DESC NULLS LAST], false, 0
   +- Exchange hashpartitioning(trip_month#72, 120), REPARTITION_BY_NUM, [plan_id=11]
      +- Project [VendorID#36, tpep_pickup_datetime#37, tpep_dropoff_datetime#38, store_and_fwd_flag#39, RatecodeID#40, PULocationID#41, DOLocationID#42, passenger_count#43, trip_distance#44, fare_amount#45, extra#46, mta_tax#47, tip_amount#48, tolls_amount#49, improvement_surcharge#50, total_amount#51, payment_type#52, congestion_surcharge#53, date_format(cast(tpep_pickup_datetime#37 as timestamp), yyyy-MM, Some(Etc/UTC)) AS trip_month#72]
         +- FileScan parquet [VendorID#36,tpep_pickup_datetime#37,tpep_dropoff_datetime#38,store_and_fwd_flag#39,RatecodeID#40,PULocationID#41,DOLocationID#42,passenger_count#43,trip_distance#44,fare_amount#45,extra#46,mta_tax#47,tip_amount#48,tolls_amount#49,improvement_surcharge#50,total_amount#51,payment_type#52,congestion_s

In the second scenario, we apply a global sort, which Spark implements with a 'rangepartitioning' exchange; this triggers an extra shuffle at this point and can be very expensive at scale

In [13]:
# Scenario 2: global sort by month (ascending) and trip distance (descending).
sortedTwo = df_month.orderBy(
    "trip_month",
    col("trip_distance").desc(),
)

In [14]:
# Print the physical plan of the global sort scenario.
sortedTwo.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [trip_month#72 ASC NULLS FIRST, trip_distance#44 DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(trip_month#72 ASC NULLS FIRST, trip_distance#44 DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=176]
      +- Project [VendorID#36, tpep_pickup_datetime#37, tpep_dropoff_datetime#38, store_and_fwd_flag#39, RatecodeID#40, PULocationID#41, DOLocationID#42, passenger_count#43, trip_distance#44, fare_amount#45, extra#46, mta_tax#47, tip_amount#48, tolls_amount#49, improvement_surcharge#50, total_amount#51, payment_type#52, congestion_surcharge#53, date_format(cast(tpep_pickup_datetime#37 as timestamp), yyyy-MM, Some(Etc/UTC)) AS trip_month#72]
         +- FileScan parquet [VendorID#36,tpep_pickup_datetime#37,tpep_dropoff_datetime#38,store_and_fwd_flag#39,RatecodeID#40,PULocationID#41,DOLocationID#42,passenger_count#43,trip_distance#44,fare_amount#45,extra#46,mta_tax#47,tip_amount#48,tolls_amount#49,improvement_surch

Finally, we use a window function to show the longest trips of each month in both sorting scenarios

In [15]:
# Window over each month, longest trips first.
window_spec = (
    Window
    .partitionBy("trip_month")
    .orderBy(desc("trip_distance"))
)

# Rank the trips inside each month by distance (1 = longest trip), starting
# from the pre-partitioned frame of the first scenario.
df_ranked = sorted.withColumn(
    "distance_rank",
    row_number().over(window_spec),
)

# To rank on top of the globally sorted frame (second scenario) use instead:
# df_ranked = sortedTwo.withColumn("distance_rank", row_number().over(window_spec))

In [16]:
# Display the trips with their per-month distance rank.
df_ranked

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge,trip_month,distance_rank
1,2013-05-03 23:57:33,2013-05-04 00:15:06,False,1,113,48,1,100.0,13.0,0.5,0.5,2.0,0.0,0.0,16.0,1,,2013-05,1
1,2013-05-06 17:54:51,2013-05-06 18:05:07,False,1,246,211,1,100.0,9.5,1.0,0.5,2.75,0.0,0.0,13.75,1,,2013-05,2
1,2013-05-16 23:56:19,2013-05-17 00:26:23,False,1,234,61,1,100.0,25.0,0.5,0.5,5.2,0.0,0.0,31.2,1,,2013-05,3
1,2013-05-19 22:09:59,2013-05-19 22:19:09,False,1,79,50,1,100.0,10.5,0.5,0.5,2.87,0.0,0.0,14.37,1,,2013-05,4
1,2013-05-20 18:26:50,2013-05-20 18:52:42,False,1,107,50,1,100.0,17.5,1.0,0.5,0.0,0.0,0.0,19.0,2,,2013-05,5
1,2013-05-05 01:15:53,2013-05-05 01:28:45,False,1,74,230,1,98.0,14.0,0.5,0.5,0.0,0.0,0.0,15.0,2,,2013-05,6
1,2013-05-07 11:50:48,2013-05-07 11:51:20,False,1,226,226,1,97.4,2.5,0.0,0.5,0.0,0.0,0.0,3.0,3,,2013-05,7
1,2013-05-19 13:40:23,2013-05-19 16:05:16,False,4,10,265,1,94.1,432.0,0.0,0.5,0.0,5.33,0.0,437.83,2,,2013-05,8
1,2013-05-20 23:10:15,2013-05-20 23:22:44,False,5,208,265,1,92.8,93.5,0.0,0.0,10.0,5.33,0.0,108.83,1,,2013-05,9
1,2013-05-08 16:15:28,2013-05-08 16:29:22,False,1,43,43,1,90.0,9.5,1.0,0.5,1.0,0.0,0.0,12.0,1,,2013-05,10


The pre-partitioning approach (repartition by trip_month + sortWithinPartitions) completed in ~20 minutes, making it 3x faster than the global sort approach (.sort(), which took over an hour).

Manually repartitioning by the window partition key (trip_month) and sorting within partitions allowed Spark to reuse the sort order in the window function, avoiding an expensive additional sort.
The global sort (range partitioning) generated a significantly higher shuffle cost.