In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

%run "/usr/local/spark/notebooks/00-spark-connection.ipynb"
spark.sparkContext.setLogLevel("ERROR")

# Define schema for weather data
weather_schema = StructType([
    StructField("time", TimestampType(), True),
    StructField("temperature_2m", DoubleType(), True),
    StructField("rain", DoubleType(), True),
    StructField("snowfall", DoubleType(), True),
    StructField("weather_code", StringType(), True)
])

# Load weather data
weather_df = spark.read.csv("data/open-meteo.csv",
                            schema=weather_schema,
                            header=True,
                            dateFormat="yyyy-MM-dd'T'HH:mm")

# Load departures data
departures_df = spark.read.parquet("data/enriched_01.parquet")

# Join weather and departures data
merged_df = departures_df.join(weather_df, departures_df['when'] == weather_df['time'], "inner")


In [None]:
# Pearson correlation between weather factors and delay.
# All coefficients are computed in a single aggregation (one Spark job) rather
# than one `stat.corr` job per column. The loop variable is deliberately not
# named `col`, so it cannot shadow a `pyspark.sql.functions.col` import in the
# notebook's global namespace.
weather_factors = ["temperature_2m", "rain", "snowfall"]
correlation_row = merged_df.select(
    *[F.corr("delay", factor).alias(factor) for factor in weather_factors]
).first()

print("Correlation between weather factors and delay:")
for factor in weather_factors:
    print(f"Correlation between delay and {factor}: {correlation_row[factor]}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Aggregate to one row per calendar day: mean delay and minimum temperature.
# The date is derived from the weather timestamp (`time`).
daily_stats_df = (
    merged_df
    .withColumn('date', F.to_date('time'))
    .groupBy('date')
    .agg(
        F.avg('delay').alias('avg_delay'),
        F.min('temperature_2m').alias('min_temp'),
    )
)

# Sort in Spark, then collect the per-day aggregate to pandas for plotting.
daily_stats_pd = daily_stats_df.orderBy('date').toPandas()

# Dual-axis plot: delay on the left axis, temperature on the right axis.
fig, ax1 = plt.subplots(figsize=(12, 6))

ax1.set_xlabel('Date')
ax1.set_ylabel('Average Delay', color='tab:red')
delay_line, = ax1.plot(
    daily_stats_pd['date'], daily_stats_pd['avg_delay'],
    color='tab:red', label='Average Delay',
)
ax1.tick_params(axis='y', labelcolor='tab:red')

ax2 = ax1.twinx()
ax2.set_ylabel('Minimum Temperature (°C)', color='tab:blue')
temp_line, = ax2.plot(
    daily_stats_pd['date'], daily_stats_pd['min_temp'],
    color='tab:blue', label='Minimum Temperature',
)
ax2.tick_params(axis='y', labelcolor='tab:blue')

# Set the title on the axes *before* tight_layout so the layout pass reserves
# room for it (setting it afterwards can leave the title clipped).
ax1.set_title('Average Delay per Day and Minimum Temperature')

# The series labels were defined but never shown; draw one combined legend on
# the top-most axes so neither line is painted over it.
ax2.legend(handles=[delay_line, temp_line], loc='upper left')

fig.tight_layout()
plt.show()


In [None]:
# Prepare data for machine learning
feature_cols = ["temperature_2m", "rain", "snowfall"]

# Drop incomplete rows BEFORE assembling. VectorAssembler's default
# handleInvalid="error" raises when it meets a null/NaN input during execution,
# so filtering on the assembled `features` column afterwards cannot protect
# against missing weather values (all feature columns are nullable in the schema).
ml_input = (
    merged_df
    .select("delay", *feature_cols)
    .na.drop(subset=["delay", *feature_cols])
)

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
ml_data = assembler.transform(ml_input)

# Split the data (fixed seed so the split is repeatable across runs)
train_data, test_data = ml_data.randomSplit([0.8, 0.2], seed=42)

# Train a linear regression model predicting `delay` from the weather features
lr = LinearRegression(featuresCol="features", labelCol="delay")
model = lr.fit(train_data)

# Make predictions on the held-out split
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="delay", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error: {rmse}")

# Show feature coefficients (same order as `feature_cols`, which fed the assembler)
coefficients = model.coefficients
for feature, coef in zip(feature_cols, coefficients):
    print(f"{feature}: {coef}")
