In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode
import pandas as pd
import time

# Settings shared by both session builders and by the MongoDB reader below.
MONGO_CONNECTOR_PACKAGE = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0"
MONGO_URI = "mongodb://mongo:27017/"

# Stop any Spark application that is already running: getOrCreate() returns the
# existing session (or starts one if there is none), and that session is
# stopped right away so the builder below does not reuse it.
stale_session = (
    SparkSession.builder
    .config("spark.master", "local")
    .config("spark.jars.packages", MONGO_CONNECTOR_PACKAGE)
    .getOrCreate()
)
stale_session.stop()

# Build the "lvb-spark" session used by the rest of the notebook; the
# MongoDB connector is requested through spark.jars.packages.
session_builder = SparkSession.builder.appName("lvb-spark")
session_builder = session_builder.config("spark.jars.packages", MONGO_CONNECTOR_PACKAGE)
spark = session_builder.getOrCreate()

# Load the "departures" collection of the "lvb" database into a DataFrame.
departures_reader = (
    spark.read.format("mongo")
    .option("uri", MONGO_URI)
    .option("database", "lvb")
    .option("collection", "departures")
)
df = departures_reader.load()

In [None]:
# Stop usage: number of rows in the departures DataFrame per stopId,
# busiest stops first.

start_time = time.time()

# Aggregate first, then order, so each step has a name of its own.
rows_per_stop = df.groupBy("stopId").count()
stop_usage = rows_per_stop.orderBy("count", ascending=False)
stop_usage.show()

# Wall-clock seconds spent since start_time was taken above.
execution_duration = time.time() - start_time
print(f"Execution Duration: {round(execution_duration, 2)} seconds")

In [None]:
# Join stop_usage with real stop names based on stopId

# Restart the timer so the duration printed at the end covers this cell only.
# Without this, the value would be measured from the start_time set in the
# stop-usage cell and would include any idle time between running the two cells.
start_time = time.time()

# Read the stops.json file into a DataFrame (multiLine: the JSON document
# spans several lines rather than holding one record per line).
stops_raw = spark.read.option("multiLine", "true").json("data/stops.json")

# Explode the "stops" array into one row per stop and promote the fields of
# each stop struct to top-level columns.
stops_df = stops_raw.select(explode("stops").alias("stop")).select("stop.*")

# Inner join: keep only stops that appear in both stop_usage and stops.json,
# matching stop_usage.stopId against the "id" field of each stop.
stop_usage_with_names = stop_usage.join(stops_df, stop_usage.stopId == stops_df.id, "inner") \
    .select(stop_usage.stopId, "name", "count") \
    .orderBy("count", ascending=False)

# truncate=False prints the full stop names instead of shortening long values.
stop_usage_with_names.show(truncate=False)

execution_duration = time.time() - start_time
print(f"Execution Duration: {round(execution_duration, 2)} seconds")