### Creating a Spark session for analytics

Now that all files share the same format, they can be loaded for analytics. By default, the code targets the yellow taxi data, but it can easily be adjusted for another type: just change 'yellow_taxi' to one of 'green_taxi', 'for_hire_vehicle' or 'high_volume_for_hire_vehicle' (and use the matching schema and memory settings from the cell below).

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

# The memory settings used below are sufficient for yellow_taxi and
# high_volume_vehicle. Settings for the other dataset types:
#
#   green_taxi:
#     .config("spark.driver.memory", "4g")
#     .config("spark.executor.memory", "4g")
#
#   for_hire_vehicle:
#     .config("spark.driver.memory", "8g")
#     .config("spark.executor.memory", "4g")

# Create (or reuse) the local Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("Taxi_Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Explicit schema for the yellow and green taxi files.
# Every column is declared nullable.
schema = (
    StructType()
    .add("VendorID", IntegerType(), nullable=True)
    .add("tpep_pickup_datetime", TimestampNTZType(), nullable=True)
    .add("tpep_dropoff_datetime", TimestampNTZType(), nullable=True)
    .add("store_and_fwd_flag", BooleanType(), nullable=True)
    .add("RatecodeID", IntegerType(), nullable=True)
    .add("PULocationID", IntegerType(), nullable=True)
    .add("DOLocationID", IntegerType(), nullable=True)
    .add("passenger_count", IntegerType(), nullable=True)
    .add("trip_distance", FloatType(), nullable=True)
    .add("fare_amount", FloatType(), nullable=True)
    .add("extra", FloatType(), nullable=True)
    .add("mta_tax", FloatType(), nullable=True)
    .add("tip_amount", FloatType(), nullable=True)
    .add("tolls_amount", FloatType(), nullable=True)
    .add("improvement_surcharge", FloatType(), nullable=True)
    .add("total_amount", FloatType(), nullable=True)
    .add("payment_type", IntegerType(), nullable=True)
    .add("congestion_surcharge", FloatType(), nullable=True)
)

# Alternative schemas for the other dataset types. To analyse one of them,
# use the matching definition below in place of the yellow/green schema above.

# for_hire_vehicle:
####################
# schema = StructType([
#     StructField("dispatching_base_num",   StringType(),  nullable=True),
#     StructField("pickup_datetime",   TimestampNTZType(), nullable=True),
#     StructField("dropOff_datetime",  TimestampNTZType(), nullable=True),
#     StructField("PUlocationID",           IntegerType(),  nullable=True),
#     StructField("DOlocationID",           IntegerType(),  nullable=True)
# ])


# high_volume_vehicle:
####################
# schema = StructType([
#     StructField("hvfhs_license_num", StringType(), True),
#     StructField("dispatching_base_num", StringType(), True),
#     StructField("originating_base_num", StringType(), True),
#     StructField("request_datetime", TimestampNTZType(), True),
#     StructField("on_scene_datetime", TimestampNTZType(), True),
#     StructField("pickup_datetime", TimestampNTZType(), True),
#     StructField("dropoff_datetime", TimestampNTZType(), True),
#     StructField("PULocationID", IntegerType(), True),
#     StructField("DOLocationID", IntegerType(), True),
#     StructField("trip_miles", DoubleType(), True),
#     StructField("trip_time", LongType(), True),
#     StructField("base_passenger_fare", DoubleType(), True),
#     StructField("tolls", DoubleType(), True),
#     StructField("bcf", DoubleType(), True),
#     StructField("sales_tax", DoubleType(), True),
#     StructField("congestion_surcharge", DoubleType(), True),
#     StructField("airport_fee", FloatType(), True),
#     StructField("tips", DoubleType(), True),
#     StructField("driver_pay", DoubleType(), True)
# ])



Loading all years together into a single DataFrame

In [None]:
# Dataset to analyse. One of: 'yellow_taxi', 'green_taxi', 'for_hire_vehicle',
# 'high_volume_for_hire_vehicle'. The `schema` defined earlier must match it.
TAXI_TYPE = "yellow_taxi"

# Read every parquet file of the selected dataset into one DataFrame,
# applying the explicit schema instead of inferring it from the files.
df = (
    spark.read
    .schema(schema)
    .format("parquet")
    .load(f"data/taxi/{TAXI_TYPE}/*")
)

# With eager evaluation enabled, a bare DataFrame expression at the end of a
# cell is rendered in a more readable, tabular way.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
# Bare expression: shows the loaded trips (rendered as a table because eager
# evaluation was enabled in the previous cell).
df

### Analytics - 8/10 examples

Aggregations for passenger count, pick up/drop off locations

In [None]:
# passenger_count is only available in the yellow and green taxi data.
# Number of trips per passenger count, sorted by passenger count (ascending).
passenger_count = (
    df.groupBy("passenger_count")
    .count()
    .orderBy("passenger_count")
)

# Number of trips per pickup location id, busiest location first.
pul = (
    df.groupBy("PULocationID")
    .count()
    .orderBy(desc("count"))
)

# Number of trips per drop-off location id, busiest location first.
dol = (
    df.groupBy("DOLocationID")
    .count()
    .orderBy(desc("count"))
)

In [None]:
# Trip counts per passenger_count value, ordered by passenger count.
passenger_count

In [None]:
# Trip counts per pickup location id, highest count first.
pul

In [None]:
# Trip counts per drop-off location id, highest count first.
dol

Most popular trip distances based on ranges

In [1]:
# Data available for yellow, green and high volume.
# NOTE(review): the high-volume schema has no trip_distance column; it has
# trip_miles instead (presumably the equivalent) - adjust the name there.
trip_distance = col("trip_distance")

# Label every trip with a distance range.
# `when` branches are evaluated in order and the first match wins, so each
# range only needs its upper bound. Missing and negative distances get their
# own labels; previously NULL fell through to ">10" and negative values were
# reported as "0-1".
df_with_distance = df.withColumn(
    "trip_distance_bucket",
    when(trip_distance.isNull(), lit("unknown"))
    .when(trip_distance < 0.0, lit("<0"))
    .when(trip_distance <= 1.0, lit("0-1"))
    .when(trip_distance <= 2.0, lit("1-2"))
    .when(trip_distance <= 3.0, lit("2-3"))
    .when(trip_distance <= 4.0, lit("3-4"))
    .when(trip_distance <= 5.0, lit("4-5"))
    .when(trip_distance <= 10.0, lit("5-10"))
    .otherwise(lit(">10")),
)

NameError: name 'df' is not defined

In [None]:
# Count the trips in each distance bucket, most frequent bucket first.
# Note: this rebinds df_with_distance to the aggregated result, so re-running
# this cell alone would aggregate the already aggregated frame; re-run the
# bucketing cell first.
df_with_distance = (
    df_with_distance
    .groupBy("trip_distance_bucket")
    .count()
    .orderBy(desc("count"))
)

In [None]:
# Trip counts per distance bucket, highest count first.
df_with_distance 

Most popular pickup hours - extracting hour from pickup time and grouping based on it

In [None]:
# Add the pickup hour, formatted with the "HH" pattern, taken from the
# pickup timestamp (tpep_pickup_datetime is the yellow/green column name).
df_with_hours = df.withColumn(
    "pickup_hour",
    date_format("tpep_pickup_datetime", "HH"),
)

In [None]:
# Count the trips per pickup hour, busiest hour first.
# Note: this rebinds df_with_hours to the aggregated result; re-run the
# previous cell before re-running this one.
df_with_hours = (
    df_with_hours
    .groupBy("pickup_hour")
    .count()
    .orderBy(desc("count"))
)

In [None]:
# Trip counts per pickup hour, highest count first.
df_with_hours

Yearly count of all trips

In [None]:
# Despite its name, trip_date holds only the pickup year: the pickup
# timestamp formatted with the "yyyy" pattern (a string column).
df_date = df.withColumn(
    "trip_date",
    date_format("tpep_pickup_datetime", "yyyy"),
)

In [None]:
# Number of trips per year, year with the fewest trips first.
years = (
    df_date
    .groupBy("trip_date")
    .count()
    .orderBy("count")
)

In [None]:
# Trip counts per year, lowest count first.
years

Most popular payment types (based on the previously created 'trip_date' with years)

In [None]:
# Data available for yellow and green taxi.
# Trips per payment type and year, restricted to the years 2011-2024
# (inclusive).
payment = (
    df_date
    # trip_date is a string column, so cast it explicitly for the numeric
    # range check instead of relying on an implicit string-to-int cast.
    # Filtering before grouping keeps the same groups, because trip_date is
    # one of the grouping keys.
    .filter(col("trip_date").cast("int").between(2011, 2024))
    .groupBy("payment_type", "trip_date")
    .count()
    # Sort like the other aggregations in this notebook: year by year, most
    # used payment type first. Without it the displayed rows are arbitrary.
    .orderBy(col("trip_date").asc(), col("count").desc())
)

In [None]:
# Trip counts per (payment_type, year) for the years 2011-2024.
payment

Loading the CSV with NYC zone names to see the most popular pickup/drop-off zones

In [None]:
# Zone lookup table. The first line of the CSV is used as the header;
# no schema is supplied for this file.
zones = (
    spark.read
    .option("header", "true")
    .csv("data/taxi/taxi+_zone_lookup.csv")
)

In [None]:
# Zone lookup table as loaded from the CSV.
zones

In [None]:
# Two renamed views of the zone lookup, one per join side, so that the pickup
# and drop-off columns do not clash after both joins.
#
# The lookup CSV is read without a schema, so LocationID is a string there,
# while the trip schema declares PULocationID / DOLocationID as integers.
# Cast the key here so both sides of the join have the same type instead of
# relying on an implicit cast inside the join condition.
zones_pickup = zones.select(
    col("LocationID").cast("int").alias("PULocationID"),
    col("Borough").alias("pickup_borough"),
    col("Zone").alias("pickup_zone"),
)

zones_dropoff = zones.select(
    col("LocationID").cast("int").alias("DOLocationID"),
    col("Borough").alias("dropoff_borough"),
    col("Zone").alias("dropoff_zone"),
)

Broadcast join with a small 'zones' table

In [None]:
# Attach the pickup and drop-off zone names to every trip.
# Left joins keep trips whose location id has no match in the lookup;
# both lookup frames are marked for broadcast.
df_locations = (
    df
    .join(broadcast(zones_pickup), "PULocationID", "left")
    .join(broadcast(zones_dropoff), "DOLocationID", "left")
)

Adding a column with pickup and drop off destinations combined together

In [None]:
# Build a human-readable route label: "<borough>, <zone> -> <borough>, <zone>",
# then drop the helper columns it was built from.
#
# The separator is the rightwards arrow (U+2192), written as an escape so it
# cannot be garbled by an encoding round-trip (it previously appeared as the
# mojibake sequence for that character).
#
# NOTE: concat yields NULL when any of its inputs is NULL, so trips whose
# pickup or drop-off location has no match in the lookup get a NULL route.
# This rebinds df_locations, so re-run the join cell before re-running this
# one (the helper columns no longer exist afterwards).
df_locations = df_locations.withColumn(
    "route",
    concat(
        col("pickup_borough"),      lit(", "),
        col("pickup_zone"),         lit(" \u2192 "),
        col("dropoff_borough"),     lit(", "),
        col("dropoff_zone"),
    ),
).drop(
    "pickup_borough",
    "pickup_zone",
    "dropoff_borough",
    "dropoff_zone",
)

Showing most popular routes based on created column

In [None]:
# Keep the 15 most frequent routes, most frequent first.
# Note: this rebinds df_locations to the aggregated result; re-run the two
# previous cells before re-running this one.
df_locations = (
    df_locations
    .groupBy("route")
    .count()
    .orderBy(desc("count"))
    .limit(15)
)

In [None]:
# The 15 most frequent routes with their trip counts.
df_locations