In [None]:
# Filter out all lines and stops that are not relevant for the analysis

import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, row_number
from pyspark.sql.window import Window

# Create (or reuse) a Spark session running on a local master. The MongoDB
# Spark connector is requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("lvb-spark")
    .config("spark.master", "local")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

# Read the "departures" collection of the "lvb" database into a DataFrame.
df = (
    spark.read.format("mongo")
    .option("uri", "mongodb://mongo:27017/")
    .option("database", "lvb")
    .option("collection", "departures")
    .load()
)

# Load the relevant stops and lines from their JSON files.
# The files are opened with an explicit UTF-8 encoding so decoding does not
# depend on the platform's default locale (JSON text is UTF-8 by specification).
with open('data/stops_fromRelevantLines.json', 'r', encoding='utf-8') as f:
    relevant_stops = json.load(f)

with open('data/lines_with_stops.json', 'r', encoding='utf-8') as f:
    relevant_lines = json.load(f)

# The top-level keys of each JSON object are used as the IDs to keep.
# NOTE(review): JSON keys are always strings; this assumes stopId / lineId in
# the departures data are strings as well -- confirm against the collection schema.
relevant_stop_ids = list(relevant_stops.keys())
relevant_line_ids = list(relevant_lines.keys())

def _count_distinct(frame, column_name):
    """Return the number of distinct values of `column_name` in `frame`."""
    return frame.select(column_name).distinct().count()


# Step 1: keep only departures whose stopId is one of the relevant stops.
filtered_df = df.where(col("stopId").isin(relevant_stop_ids))

# Report how many distinct stops were dropped by the stop filter.
total_stops = _count_distinct(df, "stopId")
remaining_stops = _count_distinct(filtered_df, "stopId")
filtered_stops = total_stops - remaining_stops

print(f"Filtered out {filtered_stops} stops. {remaining_stops} stops remain.")

# Step 2: of those, keep only departures whose lineId is one of the relevant lines.
final_df = filtered_df.where(col("lineId").isin(relevant_line_ids))

# Report how many distinct lines were dropped, relative to the stop-filtered data.
total_lines = _count_distinct(filtered_df, "lineId")
remaining_lines = _count_distinct(final_df, "lineId")
filtered_lines = total_lines - remaining_lines

print(f"Filtered out {filtered_lines} lines. {remaining_lines} lines remain.")

# Persist the stop- and line-filtered departures as Parquet, replacing any
# previous output at the same location.
output_path = "data/filtered_01.parquet"
parquet_writer = final_df.write.mode("overwrite")
parquet_writer.parquet(output_path)

print(f"Filtered data saved to {output_path}")

# Print the first five rows of the filtered data as a quick sanity check.
final_df.show(n=5)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
# Read the stop/line-filtered departures written to Parquet by the filtering step.
parquet_df = spark.read.parquet("data/filtered_01.parquet")

# Deduplicate by tripId, stopId, and plannedWhen.
# Within each (tripId, stopId, plannedWhen) group, rows are ordered by
# crawlDate descending so that row number 1 is the most recent crawl.
window_spec = Window.partitionBy("tripId", "stopId", "plannedWhen").orderBy(col("crawlDate").desc())

# Rank the rows of the Parquet snapshot. The input is parquet_df rather than
# whatever `final_df` happens to be in the kernel, so this cell works on a
# fresh kernel and gives the same result when it is re-run.
final_df_with_row_num = parquet_df.withColumn("row_num", row_number().over(window_spec))

# Keep only the first row (latest crawlDate) for each window
final_df = final_df_with_row_num.filter(col("row_num") == 1).drop("row_num")

# Count the number of rows before and after deduplication.
# Adding the row_num column does not change the number of rows, so the
# "before" count is taken directly from the input frame.
rows_before = parquet_df.count()
rows_after = final_df.count()
removed_rows = rows_before - rows_after

print(f"Removed {removed_rows} duplicate rows. {rows_after} rows remain after deduplication.")

# Output as Parquet (overwrites any previous result at this path)
output_path = "data/filtered_02.parquet"
final_df.write.mode("overwrite").parquet(output_path)

print(f"Filtered data saved to {output_path}")

In [None]:
# Read the deduplicated Parquet dataset by its explicit path.
# `output_path` is reassigned by more than one earlier cell, so relying on it
# here would make the inspected file depend on which cell was executed last.
deduplicated_path = "data/filtered_02.parquet"
parquet_df = spark.read.parquet(deduplicated_path)

# Show sample of the filtered data from the Parquet file
print("Sample of data from the Parquet file:")
parquet_df.show(5)

# Optional: Display schema of the Parquet file
print("Schema of the Parquet file:")
parquet_df.printSchema()

# Optional: Get some basic statistics
print("Summary statistics:")
parquet_df.describe().show()


In [None]:
# Number of rows in the Parquet dataset currently held in parquet_df.
total_rows = parquet_df.count()

# Number of rows in the raw "departures" collection, read fresh from MongoDB.
mongo_departures = (
    spark.read.format("mongo")
    .option("uri", "mongodb://mongo:27017/")
    .option("database", "lvb")
    .option("collection", "departures")
    .load()
)
mongo_count = mongo_departures.count()

# Share of raw rows that are no longer present, as a percentage.
# An empty collection is reported as 0 to avoid dividing by zero.
if mongo_count > 0:
    reduced_percentage = ((mongo_count - total_rows) / mongo_count) * 100
else:
    reduced_percentage = 0

print(f"Total number of rows: {total_rows}")
print(f"Percentage of reduced data rows: {reduced_percentage:.2f}%")