In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, unix_timestamp, max, count, min

In [2]:
# Build a SparkSession in local mode, or reuse the one that is already running
spark = (
    SparkSession.builder
    .master("local")
    .appName("SolutionApp")
    .getOrCreate()
)

# Question 1: Install Spark and PySpark

In [3]:
# Print the version string reported by the active SparkSession
print(spark.version)

3.5.0


# Question 2: Yellow October 2024

In [4]:
# Load the yellow-taxi trip records for October 2024 from the local Parquet file
df = (
    spark.read
    .parquet("Data/yellow_tripdata_2024-10.parquet")
)

In [5]:
# Number of partitions the DataFrame has right after loading,
# read from its underlying RDD before any repartitioning is applied
initial_partitions = df.rdd.getNumPartitions()
print(f"Initial Partitions: {initial_partitions}")

Initial Partitions: 1


In [6]:
# Redistribute the trip data into 4 partitions; `df` itself is left unchanged
repartitioned_df = df.repartition(numPartitions=4)

# Confirm the partition count of the new DataFrame
new_partitions = repartitioned_df.rdd.getNumPartitions()
print(f"New Partitions after Repartition: {new_partitions}")

New Partitions after Repartition: 4


In [7]:
# Write the repartitioned trip data as Parquet, replacing any existing output in that directory
(
    repartitioned_df.write
    .mode("overwrite")
    .parquet("yellow_tripdata/")
)

# Question 3: Count records

In [8]:
# Count the taxi trips whose pickup date is 15 October 2024.
# The pickup timestamp is reduced to a date before it is compared.
(
    df
    .filter(to_date(col("tpep_pickup_datetime")) == "2024-10-15")
    .count()
)

128893

# Question 4: Longest trip

In [9]:
# Trip duration = dropoff time minus pickup time, in seconds, divided by 3600 to get hours.
# Only the largest duration is brought back to the driver.
# NOTE: `max` here is pyspark.sql.functions.max (see the notebook's imports), not the Python builtin.
longest_trip_hours = (
    df
    .select(
        (
            (
                unix_timestamp(col("tpep_dropoff_datetime"))
                - unix_timestamp(col("tpep_pickup_datetime"))
            )
            / 3600
        ).alias("trip_duration_hours")
    )
    .agg(max("trip_duration_hours"))
    .first()[0]
)

print(f"Longest trip duration: {longest_trip_hours} hours")

Longest trip duration: 162.61777777777777 hours


# Question 6: Least frequent pickup location zone

In [10]:
# Load the taxi zone lookup table, treating the first CSV row as column names.
# No schema inference is requested, so the columns are read with Spark's default CSV types.
zones_df = spark.read.csv("Data/taxi_zone_lookup.csv", header=True)

In [11]:
# Step 1: Attach zone information to every trip by matching the trip's
# pickup location ID against the lookup table's LocationID (inner join)
joined_df = df.join(
    zones_df,
    on=df.PULocationID == zones_df.LocationID,
    how="inner",
)

In [12]:
# Step 2: Count trips per pickup Zone.
# Grouping is by zone *name*, so lookup rows that share a name are counted together.
zone_counts = joined_df.groupBy("Zone").agg(count("*").alias("trip_count"))

# Step 3: Find the minimum trip count.
# NOTE: `min` here is pyspark.sql.functions.min (see the notebook's imports), not the Python builtin.
min_trip_count = zone_counts.agg(min("trip_count")).collect()[0][0]

# Step 4: Keep every Zone whose count equals the minimum, so ties are all reported
least_frequent_zone = zone_counts.filter(col("trip_count") == min_trip_count).select("Zone")

# Display the result. truncate=False prints the full zone name; the default
# show() cuts strings off at 20 characters, which hid most of the answer.
least_frequent_zone.show(truncate=False)

+--------------------+
|                Zone|
+--------------------+
|Governor's Island...|
+--------------------+

