In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, broadcast, avg, coalesce, lit, count
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import time

# Create the notebook-wide SparkSession. getOrCreate() hands back the session
# that already exists if this cell is executed again, so it is only built once.
# NOTE(review): per the Spark docs a master of 'local' runs with a single
# worker thread -- confirm whether 'local[*]' was intended, given that the
# shuffle-partition setting below is meant to be tuned to the machine's cores.
spark = (
    SparkSession.builder
    .appName("lvb-spark")
    .config('spark.master', 'local')
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')
    .getOrCreate()
)

# Number of partitions used when shuffling data for joins and aggregations;
# tune it to the cores available on the machine running the notebook.
spark.conf.set("spark.sql.shuffle.partitions", "8")

In [None]:
# Enriched departures come from a Parquet file; report how long the read call takes.
start_time = time.time()
df = spark.read.parquet("data/enriched_01.parquet")
print(f"Loaded Parquet data in {time.time() - start_time:.2f} seconds")

# Stops reference data: a multi-line JSON document whose "stops" array is
# exploded into one row per stop, keeping only each stop's id and name.
start_time = time.time()
stops_df = (
    spark.read
    .option("multiLine", "true")
    .json("data/stops.json")
    .select(F.explode("stops").alias("stop"))
    .select(
        F.col("stop.id").alias("id"),
        F.col("stop.name").alias("name"),
    )
)
print(f"Loaded stops data in {time.time() - start_time:.2f} seconds")

In [None]:
# Count rows per stop and keep the 15 stops with the highest counts.
# The result is cached because the join that follows reuses it.
start_time = time.time()
top_stop_usage = (
    df.groupBy('stopId')
    .count()
    .orderBy(F.col('count').desc())
    .limit(15)
    .cache()
)

top_stop_usage.show()
print(f"Calculated top 15 stop usage in {time.time() - start_time:.2f} seconds")

# Attach human-readable stop names. stops_df is marked for broadcast so the
# join is hinted to ship the stops table instead of shuffling both sides.
# The join is an inner join, so a stop id without an entry in stops_df is dropped.
start_time = time.time()
top_stops_with_names = (
    top_stop_usage.join(
        F.broadcast(stops_df),
        on=top_stop_usage.stopId == stops_df.id,
        how="inner",
    )
    .select(top_stop_usage.stopId, "name", "count")
    .orderBy(F.col("count").desc())
    .cache()
)

top_stops_with_names.show(truncate=False)
print(f"Joined top 15 stop usage with names in {time.time() - start_time:.2f} seconds")

# Bring the (at most 15-row) result to the driver as a pandas DataFrame,
# which is what matplotlib consumes below.
start_time = time.time()
stop_usage_pd = top_stops_with_names.toPandas()
print(f"Converted top 15 to Pandas in {time.time() - start_time:.2f} seconds")

# Bar chart of the row count per stop, one bar per stop name.
plt.figure(figsize=(12, 8))
plt.bar(
    x=stop_usage_pd['name'],
    height=stop_usage_pd['count'],
    color='skyblue',
)
plt.xlabel('Stop Name')
plt.ylabel('Count')
plt.title('Top 15 Stop Usage')
# Slant the stop names so long labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Calculate top 15 stops with the most delay.
# The timer is started exactly once; it covers the aggregation, the join and
# the show() call below.
start_time = time.time()

# Mean of the 'delay' column per stop, highest first, limited to 15 stops.
# Cached because the join below reuses this result.
top_delay_stops = df.groupBy('stopId') \
    .agg(avg('delay').alias('avg_delay')) \
    .orderBy(col('avg_delay').desc()) \
    .limit(15) \
    .cache()

# Attach stop names via an inner join against the broadcast-hinted stops
# table; a stop id with no entry in stops_df is dropped from the result.
top_delay_stops_with_names = top_delay_stops.join(
    broadcast(stops_df),
    top_delay_stops.stopId == stops_df.id,
    "inner"
).select(
    top_delay_stops.stopId,
    "name",
    "avg_delay"
).orderBy(col("avg_delay").desc()) \
 .cache()

top_delay_stops_with_names.show(truncate=False)
print(f"Calculated top 15 stops with the most delay in {time.time() - start_time:.2f} seconds")

# Convert to Pandas for plotting.
# execution_duration keeps the elapsed seconds of the conversion; the message
# uses the same fixed two-decimal format as the notebook's other timing prints.
start_time = time.time()
delay_stops_pd = top_delay_stops_with_names.toPandas()
execution_duration = time.time() - start_time
print(f"Converted top 15 to Pandas in {execution_duration:.2f} seconds")

# Get date range: earliest and latest 'plannedWhen' across the whole dataset.
date_range = df.agg(
    F.min("plannedWhen").alias("min_date"),
    F.max("plannedWhen").alias("max_date")
).collect()[0]

min_date = date_range["min_date"]
max_date = date_range["max_date"]

# With no rows (or only null 'plannedWhen' values) the aggregates come back as
# None; fail with a clear message instead of a TypeError from None - None.
if min_date is None or max_date is None:
    raise ValueError("Cannot compute the date range: no non-null 'plannedWhen' values found")

# Number of calendar days covered, counting both the first and the last day.
# The difference is taken between the calendar dates rather than the raw
# timestamps: timedelta.days floors the elapsed time, which would drop a day
# whenever the last timestamp's time of day is earlier than the first's
# (e.g. Jan 1 06:00 -> Jan 7 05:00 spans 7 calendar days but only 5 full days).
days_diff = (max_date.date() - min_date.date()).days + 1

# Row counts for just the most-delayed stops, plus the same count normalised
# by the number of days in the dataset's date range.
stop_ids = delay_stops_pd['stopId'].tolist()
stop_usage = (
    df.filter(F.col('stopId').isin(stop_ids))
    .groupBy('stopId')
    .count()
    .withColumn('count_per_day', F.col('count') / days_diff)
    .orderBy(F.col('stopId'))
)

# Bring the usage figures to pandas and attach them to the delay table by
# stop id. The merge is an inner join (the pandas default, spelled out here).
stop_usage_pd = stop_usage.toPandas()
delay_stops_pd = pd.merge(delay_stops_pd, stop_usage_pd, on='stopId', how='inner')

# Dual-axis chart: per-day usage as bars (left axis) against average delay as
# a line (right axis), one x position per stop name.
fig, ax1 = plt.subplots(figsize=(14, 8))

# Left axis: translucent bars so the delay line drawn on top stays readable.
ax1.bar(
    delay_stops_pd['name'],
    delay_stops_pd['count_per_day'],
    color='skyblue',
    alpha=0.3,
)
ax1.set_xlabel('Stop Name')
ax1.set_ylabel('Average Stop Usage per Day', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

# Right axis: shares the x-axis with the bars but has its own y scale.
# NOTE(review): the axis label assumes 'delay' is stored in minutes -- confirm
# against the pipeline that produced the Parquet file.
ax2 = ax1.twinx()
ax2.plot(
    delay_stops_pd['name'],
    delay_stops_pd['avg_delay'],
    color='salmon',
    linestyle='-',
    marker='o',
)
ax2.set_ylabel('Average Delay (minutes)', color='salmon')
ax2.tick_params(axis='y', labelcolor='salmon')

# The title reports the calendar dates of the first and last 'plannedWhen'.
plt.title(
    'Top 15 Stops with Most Delay and Their Daily Usage\n'
    f'(Data from {min_date.date()} to {max_date.date()})'
)

# Slant the stop names and anchor the rotation at the label end so each
# label stays aligned with its tick.
plt.setp(ax1.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

# Let matplotlib fit labels and both y-axes inside the figure, then render.
fig.tight_layout()
plt.show()