### Creating spark session

At this stage the session is only used to reformat and save files, so it does not need much memory.

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Build (or reuse) the local Spark session used for the formatting steps.
# The chain is wrapped in parentheses instead of backslash continuations:
# whitespace after a trailing "\" is a SyntaxError in Python.
spark = (
    SparkSession.builder
    .appName("taxi")
    .master("local[*]")  # local mode, one worker thread per available core
    # NOTE(review): the purpose of this key is not evident from this notebook —
    # confirm it is a valid Spark option before relying on it.
    .config("spark.sql.pyspark.jvm", "false")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

#.withColumn("event_date", expr ("DATE_TRUNC('day', event_time)"))


### Data formatting for yellow taxi files

Load one year of yellow taxi data at a time and check its column names and types against the sample file.

In [7]:
# Reference file: the schema every yearly dataset is compared against.
df_sample = spark.read.parquet("data/taxi/sample/sample_yellow.parquet*")

# Raw yellow-taxi files for a single year (the year is part of the path).
df = spark.read.parquet("data/taxi/yellow_taxi/2011/*")


This function does not modify the data; it only prints the differences between the two schemas — so it was generated with a chat assistant :)

In [None]:
def compare_schemas(df_sample, df):
    """Print how the schema of ``df`` differs from the schema of ``df_sample``.

    Nothing is returned and neither DataFrame is modified; the report is
    written to stdout.

    Args:
        df_sample: Reference DataFrame (any object exposing ``.schema.fields``
            whose items have ``name``, ``dataType`` and ``nullable``).
        df: DataFrame to check against the reference.

    The report lists:
        * columns present in only one of the two schemas,
        * columns whose data type or nullability differs,
        * a different relative order of the shared columns (the two schemas
          compare as unequal in that case even if every column matches).
    """
    if df_sample.schema == df.schema:
        print("Schematy są IDENTYCZNE")
        return

    print("RÓŻNICE W SCHEMATACH:")
    print("=" * 60)

    # Column name -> (type rendered as text, nullable) for each side.
    sample_cols = {f.name: (str(f.dataType), f.nullable) for f in df_sample.schema.fields}
    df_cols = {f.name: (str(f.dataType), f.nullable) for f in df.schema.fields}

    # The loop variable is `name`, not `col`, so it cannot be confused with
    # pyspark.sql.functions.col, which this notebook star-imports.
    for name in sorted(sample_cols.keys() | df_cols.keys()):
        if name not in sample_cols:
            dtype, nullable = df_cols[name]
            print(f" tylko w df → {name:25} {dtype:20} (nullable={nullable})")
        elif name not in df_cols:
            dtype, nullable = sample_cols[name]
            print(f" tylko w df_sample → {name:25} {dtype:20} (nullable={nullable})")
        elif sample_cols[name] != df_cols[name]:
            sample_type, sample_null = sample_cols[name]
            df_type, df_null = df_cols[name]
            print(f" RÓŻNICA → {name:25} df_sample: {sample_type:15} (null={sample_null}) | "
                  f"df: {df_type:15} (null={df_null})")

    # Schema equality is order-sensitive: without this check, two schemas that
    # differ only in column order would print the header and nothing else.
    shared = sample_cols.keys() & df_cols.keys()
    sample_order = [f.name for f in df_sample.schema.fields if f.name in shared]
    df_order = [f.name for f in df.schema.fields if f.name in shared]
    if sample_order != df_order:
        print(f" KOLEJNOŚĆ → df_sample: {', '.join(sample_order)} | "
              f"df: {', '.join(df_order)}")
                
# Print how the schema of the loaded yearly data differs from the sample file's schema.
compare_schemas(df_sample, df)

Renaming columns (all the other columns already have matching names)

In [None]:
# Align the pickup/drop-off timestamp names with the tpep_* names and drop
# trip_type, which is not part of the target schema. Both operations are
# no-ops when the column is absent, so this is safe for files that already
# use the target names.
df = (
    df
    .withColumnRenamed("lpep_pickup_datetime", "tpep_pickup_datetime")
    .withColumnRenamed("lpep_dropoff_datetime", "tpep_dropoff_datetime")
    .drop("trip_type")
)

Cast each column to the type used in the sample file, so that all the data can later be read at once.

In [None]:
# Target Spark SQL type for each column that gets cast.
yellow_column_types = {
    "VendorID": "integer",
    "tpep_pickup_datetime": "timestamp_ntz",
    "tpep_dropoff_datetime": "timestamp_ntz",
    "passenger_count": "integer",
    "trip_distance": "float",
    "RatecodeID": "integer",
    "store_and_fwd_flag": "boolean",
    "PULocationID": "integer",
    "DOLocationID": "integer",
    "payment_type": "integer",
    "fare_amount": "float",
    "extra": "float",
    "mta_tax": "float",
    "tip_amount": "float",
    "tolls_amount": "float",
    "improvement_surcharge": "float",
    "total_amount": "float",
    "congestion_surcharge": "float",
    # "airport_fee": "float",  # disabled — NOTE(review): confirm whether this column should be cast too
}

# Replace every listed column with its cast version in a single call;
# columns that are not listed are left as they are.
df = df.withColumns(
    {name: col(name).cast(dtype) for name, dtype in yellow_column_types.items()}
)

To be sure, the schema can be checked after the transformation.

In [11]:
# Print the column names, types and nullability after the renames and casts.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- airport_fee: float (nullable = true)



`describe()` can also be used to inspect what the data contains.

In [6]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df.describe()

summary,VendorID,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,803860.0,803860,803860.0,803860.0,803860.0,803860.0,803860.0,803860.0,803860.0,803860.0,803860.0,803860.0,0.0,14.0,803860.0,803860.0,355.0,0.0
mean,1.75689796730774,,1.121381832657428,123.55262981116115,136.5182904983455,1.4742915433035604,2.8166176324234486,11.838116450625694,0.3580091558231532,0.4890422834821984,0.9025804742118398,0.1039797103973568,,0.1714285714285714,13.692027032069824,1.6540056726295624,1.0056338028169014,
stddev,0.428956481817546,,0.7220787829855728,78.8331687542218,77.36119121329234,1.1952107641060672,2.836507226382903,9.72550303764364,0.7796521137299455,0.0736839485607721,2.0161152769662336,2.29668050527423,,0.1540657773039286,11.095465318837974,0.4974213844319095,0.0749525724687186,
min,1.0,N,1.0,1.0,1.0,0.0,0.0,-3.5,0.0,-0.5,0.0,0.0,,0.0,-4.0,1.0,1.0,
max,2.0,Y,99.0,265.0,265.0,9.0,115.61,2924.5,565.84,5.13,380.0,1481.87,,0.3,2925.0,4.0,2.0,


Saving formatted data to parquet

In [None]:
# Persist the formatted data as gzip-compressed parquet.
# NOTE(review): "overwrite" replaces whatever is already in formatted/, so
# output saved from a previously processed year is lost unless it was moved
# elsewhere first — confirm this is the intended workflow.
(
    df.write
    .format("parquet")
    .option("parquet.compression", "gzip")
    .mode("overwrite")
    .save("data/taxi/yellow_taxi/formatted/")
)

### Creating spark session for yellow/green taxi analysis 

Now that all files have the same format, they can be loaded together for analysis. Since the yellow and green taxi files share the same structure, the code below can be used for both.

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

# 8 GB of memory was not enough for some of the analyses (the job failed),
# so the memory settings — the executor's in particular — are raised here.
# NOTE(review): getOrCreate() returns the session that is already running in
# this kernel, if there is one, and driver memory cannot be changed once the
# JVM has started. Restart the kernel before this cell if the formatting
# section was run first — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .appName("Yellow_Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Explicit read schema for the formatted files; every column is nullable.
schema = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in [
        ("VendorID", IntegerType()),
        ("tpep_pickup_datetime", TimestampNTZType()),
        ("tpep_dropoff_datetime", TimestampNTZType()),
        ("store_and_fwd_flag", BooleanType()),
        ("RatecodeID", IntegerType()),
        ("PULocationID", IntegerType()),
        ("DOLocationID", IntegerType()),
        ("passenger_count", IntegerType()),
        ("trip_distance", FloatType()),
        ("fare_amount", FloatType()),
        ("extra", FloatType()),
        ("mta_tax", FloatType()),
        ("tip_amount", FloatType()),
        ("tolls_amount", FloatType()),
        ("improvement_surcharge", FloatType()),
        ("total_amount", FloatType()),
        ("payment_type", IntegerType()),
        ("congestion_surcharge", FloatType()),
    ]
])



In [None]:
# Load everything under the yellow-taxi folder with the explicit schema.
# NOTE(review): the glob matches every entry in yellow_taxi/, so raw year
# folders (e.g. 2011/) would be read together with formatted/ if both are
# present — confirm the folder only holds formatted files.
df = spark.read.schema(schema).parquet("data/taxi/yellow_taxi/*")

# Make a DataFrame left as the last expression of a cell render as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
# Preview the loaded data (shown as a table because eager evaluation is enabled).
df

Aggregations for passenger count, pick up/drop off locations

In [None]:
# Number of trips per passenger count, lowest passenger count first.
passenger_count = (
    df.groupBy("passenger_count")
    .count()
    .orderBy("passenger_count")
)

# Number of trips per pickup location id, busiest first.
pul = (
    df.groupBy("PULocationID")
    .count()
    .orderBy("count", ascending=False)
)

# Number of trips per drop-off location id, busiest first.
dol = (
    df.groupBy("DOLocationID")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the number of trips per passenger count.
passenger_count

Most popular trip distances based on ranges

In [None]:
# Label each trip with a distance range. The branches are evaluated in order,
# so each one only needs an upper bound: anything at or below the previous
# bound has already been labelled. Rows matching no branch (including a
# missing trip_distance) fall through to ">10".
df_with_distance = df.withColumn(
    "trip_distance_bucket",
    when(col("trip_distance") <= 1.0, "0-1")
    .when(col("trip_distance") <= 2.0, "1-2")
    .when(col("trip_distance") <= 3.0, "2-3")
    .when(col("trip_distance") <= 4.0, "3-4")
    .when(col("trip_distance") <= 5.0, "4-5")
    .when(col("trip_distance") <= 10.0, "5-10")
    .otherwise(">10"),
)

In [None]:
# Count trips per distance bucket, most common bucket first.
# NOTE: this rebinds df_with_distance to the aggregated result, so re-running
# only this cell would aggregate the already aggregated frame; re-run the
# bucketing cell first.
df_with_distance = (
    df_with_distance
    .groupBy("trip_distance_bucket")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the trip counts per distance bucket.
df_with_distance 

Most popular pickup hours

In [None]:
# Add the pickup hour as a two-digit, 24-hour string ("HH" pattern).
df_with_hours = df.withColumn(
    "pickup_hour",
    date_format("tpep_pickup_datetime", "HH"),
)

In [None]:
# Count trips per pickup hour, busiest hour first.
# NOTE: df_with_hours is rebound to the aggregated result, so re-run the
# previous cell before running this one again.
df_with_hours = (
    df_with_hours
    .groupBy("pickup_hour")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Display the trip counts per pickup hour.
df_with_hours

Yearly count of all trips

In [None]:
# Add the pickup year as a four-digit string; despite its name, trip_date
# holds only the year ("yyyy" pattern).
df_date = df.withColumn(
    "trip_date",
    date_format("tpep_pickup_datetime", "yyyy"),
)

In [None]:
# Count trips per pickup year, least busy year first.
years = (
    df_date
    .groupBy("trip_date")
    .count()
    .orderBy("count")
)

In [None]:
# Display the trip counts per year.
years

Most popular payment types (based on the previously created 'trip_date' with years)

In [None]:
# Count trips per payment type and year, keeping only the years 2011-2024.
# NOTE(review): trip_date is a string produced by date_format, so this range
# check relies on Spark implicitly casting it for the numeric comparison —
# confirm this works under the configured SQL mode.
payment = (
    df_date
    .groupBy("payment_type", "trip_date")
    .count()
    .where(col("trip_date").between(2011, 2024))
)

In [None]:
# Display the trip counts per payment type and year.
payment

Loading csv with NYC zone names, to see most popular pickup/drop off zones

In [None]:
# Zone lookup table; the first CSV row supplies the column names.
# No schema is given or inferred, so every column is read as a string.
zones = spark.read.option("header", "true").csv("data/taxi/taxi+_zone_lookup.csv")

In [None]:
# Display the zone lookup table.
zones

In [None]:
# Two renamed views of the lookup table, so that each can be joined on its
# own key (pickup id / drop-off id) without column-name clashes.
zones_pickup = zones.select(
    zones["LocationID"].alias("PULocationID"),
    zones["Borough"].alias("pickup_borough"),
    zones["Zone"].alias("pickup_zone"),
)

zones_dropoff = zones.select(
    zones["LocationID"].alias("DOLocationID"),
    zones["Borough"].alias("dropoff_borough"),
    zones["Zone"].alias("dropoff_zone"),
)

Simple broadcast join with small 'zones' table

In [None]:
# Attach borough/zone names for both ends of each trip. These are left joins,
# so trips whose location id has no match in the lookup table are kept; the
# small lookup tables are broadcast.
# NOTE(review): the lookup ids come from a CSV read without a schema while the
# trip ids are integers, so the join relies on implicit type coercion — confirm.
df_locations = (
    df
    .join(broadcast(zones_pickup), "PULocationID", "left")
    .join(broadcast(zones_dropoff), "DOLocationID", "left")
)

Adding a column with pickup and drop off destinations combined together

In [None]:
# Build a "pickup borough, zone → drop-off borough, zone" label, then drop the
# four helper columns it was built from.
# NOTE(review): concat yields null when any of its inputs is null, so trips
# with an unmatched pickup or drop-off id would get a null route — confirm
# this is acceptable.
df_locations = (
    df_locations
    .withColumn(
        "route",
        concat(
            "pickup_borough", lit(", "), "pickup_zone",
            lit(" → "),
            "dropoff_borough", lit(", "), "dropoff_zone",
        ),
    )
    .drop("pickup_borough", "pickup_zone", "dropoff_borough", "dropoff_zone")
)

Showing most popular routes based on created column

In [None]:
# Keep the 15 most frequent routes.
# NOTE: df_locations is rebound to the aggregated result, so re-run the cells
# that build it before running this one again.
df_locations = (
    df_locations
    .groupBy("route")
    .count()
    .orderBy("count", ascending=False)
    .limit(15)
)

In [None]:
# Display the 15 most popular routes.
df_locations