In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, when, unix_timestamp, lit
from pyspark.sql.window import Window
import time

# Create (or reuse) the notebook's Spark session: local master, with the
# MongoDB Spark connector package requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("lvb-spark")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .master("local[*]")
    .getOrCreate()
)

# Read from the filtered Parquet file and report how long the read call took.
# NOTE(review): Spark reads are normally lazy, so this presumably times only
# file/schema discovery rather than a full scan of the data - confirm.
filtered_path = "data/filtered_01.parquet"

start_time = time.time()
df = spark.read.parquet(filtered_path)
load_time = time.time() - start_time

print(f"Loaded Parquet data in {load_time:.2f} seconds")

In [None]:
from pyspark.sql.functions import col, lag, when, unix_timestamp, lit
from pyspark.sql.window import Window

# Added Delay
# For every row, compare its delay with the delay of the preceding row of the
# same trip (rows ordered by plannedWhen). The first row of a trip has no
# predecessor, so its added delay is set to 0.
window_spec = Window.partitionBy("tripId").orderBy("plannedWhen")

# Step 1: expose the previous row's delay as a temporary column.
previous_delay = lag("delay").over(window_spec)
df_with_prev_delay = df.withColumn("prev_delay", previous_delay)

# Step 2: derive added_delay from the temporary column, then discard it.
has_previous_row = col("prev_delay").isNotNull()
delay_difference = col("delay") - col("prev_delay")
added_delay_expr = when(has_previous_row, delay_difference).otherwise(lit(0))

enriched_df = (
    df_with_prev_delay
    .withColumn("added_delay", added_delay_expr)
    .drop("prev_delay")
)

# Mark the result for caching; it is reused by the following cells.
enriched_df.cache()

# Print the logical and physical plans for inspection.
enriched_df.explain(extended=True)

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, when, first, last

# Stop Type
# Label each row by comparing its stopId with the trip's first and last stopId:
#   "start" - stopId equals the stopId of the row with the earliest plannedWhen
#   "end"   - stopId equals the stopId of the row with the latest plannedWhen
#   "pass"  - everything else
window_spec_trip = Window.partitionBy("tripId")

# first()/last() over an unordered window return whichever row Spark happens
# to see first/last after the shuffle, so the labels would be arbitrary.
# Order the rows by plannedWhen (the same ordering used for added_delay) and
# span the whole partition explicitly: with an orderBy, the default frame ends
# at the current row, which would make last() simply return the current row.
window_spec_trip_ordered = (
    window_spec_trip
    .orderBy("plannedWhen")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

first_stop_id = first("stopId").over(window_spec_trip_ordered)
last_stop_id = last("stopId").over(window_spec_trip_ordered)

# NOTE(review): the "start" branch is checked first, so a trip that begins and
# ends at the same stopId gets "start" on both rows - confirm this is intended.
df_with_stop_type = enriched_df.withColumn(
    "stop_type",
    when(col("stopId") == first_stop_id, "start")
    .when(col("stopId") == last_stop_id, "end")
    .otherwise("pass"),
)

df_with_stop_type.show()

In [None]:
# Save to Parquet, replacing any existing output at the target path.
(
    df_with_stop_type.write
    .mode("overwrite")
    .parquet("data/enriched_01.parquet")
)
print("Data saved to Parquet")

In [None]:
# Read the enriched data back from Parquet and report how long the read call took.
# NOTE(review): Spark reads are normally lazy, so this presumably times only
# file/schema discovery rather than a full scan of the data - confirm.
enriched_path = "data/enriched_01.parquet"

start_time = time.time()
parquet_df = spark.read.parquet(enriched_path)
load_time = time.time() - start_time

print(f"Loaded Parquet data in {load_time:.2f} seconds")

In [None]:
from pyspark.sql.functions import rand

# Get a random tripId
# Shuffle the distinct tripIds with rand() and take the first one. first()
# returns None on an empty result (collect()[0] would raise a bare IndexError),
# which lets an empty dataset be reported with a descriptive error instead.
random_trip_row = parquet_df.select("tripId").distinct().orderBy(rand()).first()
if random_trip_row is None:
    raise ValueError("parquet_df contains no rows; cannot pick a random tripId")
random_trip = random_trip_row["tripId"]

# Filter the dataframe for the selected trip and order by plannedWhen
trip_stops = parquet_df.filter(col("tripId") == random_trip).orderBy("plannedWhen")

print(f"Stops for trip {random_trip}:")
# Pass the actual row count as n so every stop of the trip is printed,
# not just show()'s default number of rows.
trip_stops.show(truncate=False, n=trip_stops.count())
