In [None]:
from pyspark.sql.functions import lag, col, avg, concat_ws, count, split, collect_set
from pyspark.sql.window import Window

# Execute the shared connection notebook in this kernel. The `spark` session used
# below is presumably created there -- TODO confirm.
%run "/usr/local/spark/notebooks/00-spark-connection.ipynb"
# Only emit Spark log messages at ERROR level to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

# Load the departures dataset from parquet. Later cells read the columns
# tripId, plannedWhen, stopId, lineId, delay and added_delay from it.
departures_df = spark.read.parquet("data/enriched_01.parquet")

In [None]:
import json
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Schema for the lines lookup: a mandatory id plus optional name and product.
lines_schema = (
    StructType()
    .add("id", StringType(), False)
    .add("name", StringType(), True)
    .add("product", StringType(), True)
)

# The lines file is a JSON object keyed by line id.
with open("data/lines_with_stops.json", "r") as fh:
    lines_json = json.load(fh)

# One (id, name, product) tuple per line; a missing name/product becomes None.
lines = [
    (line_id, details.get("name"), details.get("product"))
    for line_id, details in lines_json.items()
]

lines_df = spark.createDataFrame(lines, schema=lines_schema)

# Schema for the stops lookup: id, display name and coordinates, all nullable.
stops_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("name", StringType(), True)
    .add("latitude", FloatType(), True)
    .add("longitude", FloatType(), True)
)

# The stops file holds its records under the top-level "stops" key.
with open("data/stops.json", "r") as fh:
    stops_json = json.load(fh)
stops = stops_json.get("stops", [])

# Every stop record must provide all four fields (a missing key raises KeyError).
stops_data = [
    tuple(stop[field] for field in ("id", "name", "latitude", "longitude"))
    for stop in stops
]
stops_df = spark.createDataFrame(stops_data, schema=stops_schema)

# Second handle on the same stops data, registered under the alias "s".
stops2_df = stops_df.alias("s")


In [None]:
# Order the departures of each trip by planned time so that lag() returns the
# stop visited immediately before the current one within the same trip.
window_spec = Window.partitionBy("tripId").orderBy("plannedWhen")

# A "route" is a pair of consecutive stops of one trip, encoded as
# "<prev_stopId> -> <stopId>". The first departure of every trip has no
# predecessor and is dropped. The left join adds the line's name and product
# (plus the lookup's `id` column) for readable output.
# Note: `departures_df` already carries `lineId`; re-selecting it alongside "*"
# would only add a second, identically named column, so no extra select is done.
routes_df = (
    departures_df
    .withColumn("prev_stopId", lag("stopId").over(window_spec))
    .filter(col("prev_stopId").isNotNull())
    .withColumn("route", concat_ws(" -> ", col("prev_stopId"), col("stopId")))
    .join(lines_df, col("lineId") == lines_df.id, "left")
    .cache()
)

# Per-route statistics: mean added delay, mean delay, number of departures and
# the distinct line names / products observed on the route. Sorted so that the
# most frequently served routes come first.
route_delay_df = (
    routes_df
    .groupBy("route")
    .agg(
        avg("added_delay").alias("avg_added_delay"),
        avg("delay").alias("avg_delay"),
        count("*").alias("count"),
        collect_set("name").alias("lines"),
        collect_set("product").alias("products"),
    )
    .orderBy(col("count").desc())
    .cache()
)

# Total = number of stop-to-stop hops; unique = number of distinct stop pairs.
total_routes = routes_df.count()
unique_routes = routes_df.select("route").distinct().count()
print(f"Total Routes: {total_routes}")
print(f"Unique Routes: {unique_routes}")

In [None]:
# Rank routes by average added delay, with human-readable stop names.

# Minimum number of departures for a route to be included (inclusive).
minimum_departures = 1000

# The route key has the form "<stop_a_id> -> <stop_b_id>".
route_stop_ids = split(col("route"), " -> ")

# Each end of the route gets its own, uniquely named projection of the stops
# lookup. Joining by column name on these projections avoids referencing the
# same underlying `id` column through two DataFrame handles, which is ambiguous
# once the stops data appears on both sides of a join.
stops_a_df = stops_df.select(
    col("id").alias("stop_a_id"),
    col("name").alias("stop_a_name"),
)
stops_b_df = stops_df.select(
    col("id").alias("stop_b_id"),
    col("name").alias("stop_b_name"),
)

route_delay_with_names = (
    route_delay_df
    # `>=` so that a route with exactly `minimum_departures` departures is kept.
    .filter(col("count") >= minimum_departures)
    .withColumn("stop_a_id", route_stop_ids.getItem(0))
    .withColumn("stop_b_id", route_stop_ids.getItem(1))
    # Left joins: routes whose stop id is missing from the lookup are kept with
    # a null stop name.
    .join(stops_a_df, on="stop_a_id", how="left")
    .join(stops_b_df, on="stop_b_id", how="left")
    .select(
        "route",
        "stop_a_name",
        "stop_b_name",
        "avg_added_delay",
        "avg_delay",
        "count",
        "lines",
        # Kept so that later cells can filter by product explicitly.
        "products",
    )
    .orderBy(col("avg_added_delay").desc())
    .cache()
)

In [None]:
import matplotlib.pyplot as plt

# Print the 20 routes with the highest average added delay, untruncated.
route_delay_with_names.show(20, truncate=False)

# Bring the ranked routes to the driver as a pandas DataFrame for plotting.
route_delay_pd = route_delay_with_names.toPandas()

# Number of routes to chart; the frame is already sorted by descending delay.
top_n = 5
top_routes = route_delay_pd.head(top_n)

# Horizontal bar chart labelled "<stop A> -> <stop B>".
bar_labels = top_routes['stop_a_name'] + " -> " + top_routes['stop_b_name']

fig, ax = plt.subplots(figsize=(12, 4))
ax.barh(bar_labels, top_routes['avg_added_delay'], color='skyblue')
ax.set_xlabel('Average Added Delay (seconds)')
ax.set_title(f'Top {top_n} Routes with Highest Average Added Delay (Min. {minimum_departures} Departures)')
# Put the first (highest) route at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
from pyspark.sql.functions import col, split, array_contains, lit
import matplotlib.pyplot as plt

# Tram/bus routes ranked by highest average added delay.
# A route qualifies when its set of products contains "tram" or "bus".
# The departure threshold was already applied when `route_delay_with_names`
# was built, so `minimum_departures` is reused here (for the title only)
# instead of being hard-coded a second time.
bus_or_tram_routes = (
    route_delay_with_names
    .filter(array_contains(col("products"), "tram") | array_contains(col("products"), "bus"))
    # Re-apply the ranking explicitly rather than relying on the parent's sort
    # order being carried through the filter.
    .orderBy(col("avg_added_delay").desc())
    .cache()
)

bus_or_tram_routes.show(20, truncate=False)
route_delay_pd = bus_or_tram_routes.toPandas()

# Plot the top_n routes with the highest average added delay.
top_n = 5
top_routes = route_delay_pd.head(top_n)

fig, ax = plt.subplots(figsize=(12, 4))
ax.barh(top_routes['stop_a_name'] + " -> " + top_routes['stop_b_name'], top_routes['avg_added_delay'], color='skyblue')
ax.set_xlabel('Average Added Delay (seconds)')
ax.set_title(f'Top {top_n} Tram or Bus Routes with Highest Average Added Delay (Min. {minimum_departures} Departures)')
# Put the first (highest) route at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
from pyspark.sql.functions import col, split, array_contains, lit
import matplotlib.pyplot as plt

# Tram/bus routes ranked by lowest average added delay.
# `bus_or_tram_routes` is already restricted to tram/bus routes that meet the
# departure threshold, so this cell only re-sorts it in ascending order.
# `minimum_departures` is reused (for the title only) instead of being
# hard-coded again.
asc_routes = bus_or_tram_routes.orderBy(col("avg_added_delay").asc())

asc_routes.show(20, truncate=False)
route_delay_pd = asc_routes.toPandas()

# Plot the top_n routes with the lowest average added delay.
top_n = 5
top_routes = route_delay_pd.head(top_n)

fig, ax = plt.subplots(figsize=(12, 4))
ax.barh(top_routes['stop_a_name'] + " -> " + top_routes['stop_b_name'], top_routes['avg_added_delay'], color='skyblue')
ax.set_xlabel('Average Added Delay (seconds)')
ax.set_title(f'Top {top_n} Tram or Bus Routes with Lowest Average Added Delay (Min. {minimum_departures} Departures)')
# Put the first (lowest) route at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
plt.show()