In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import time

%run "/usr/local/spark/notebooks/00-spark-connection.ipynb"

In [None]:
# Load the enriched records; `spark` is provided by the connection notebook
# that the first cell pulls in with %run.
df = spark.read.parquet('data/enriched_01.parquet')

# Extract hour from the departure time
df = df.withColumn('hour', F.hour('when'))

# Aggregate both metrics in a single pass: the number of rows (departures)
# and the mean of `delay` for each hour. One aggregation means one Spark job
# and one collection to the driver instead of two of each.
hourly_stats = (
    df.groupBy('hour')
    .agg(
        F.count('*').alias('count'),
        F.avg('delay').alias('avg_delay'),
    )
    .orderBy('hour')
)

# Convert to a Pandas DataFrame for plotting (at most one row per hour value).
hourly_pd = hourly_stats.toPandas()

# Per-metric views under their original names, kept so any later cell that
# refers to them continues to work. These are cheap projections of the
# combined result, not additional aggregations.
avg_departures = hourly_stats.select('hour', 'count')
avg_delay = hourly_stats.select('hour', 'avg_delay')
avg_departures_pd = hourly_pd[['hour', 'count']]
avg_delay_pd = hourly_pd[['hour', 'avg_delay']]

# Primary axis: departures per hour as bars.
# NOTE: `count` is the total number of rows per hour over the whole dataset,
# not a per-day average, so the labels say "Number of Departures". To plot a
# true daily average, divide `count` by the number of distinct days covered.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.bar(hourly_pd['hour'], hourly_pd['count'], color='skyblue', label='Departures')
ax1.set_title('Departures and Average Delay per Hour')
ax1.set_xlabel('Hour of the Day')
ax1.set_ylabel('Number of Departures')

# Secondary y-axis for delay, sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(hourly_pd['hour'], hourly_pd['avg_delay'], color='red', linewidth=2, label='Average Delay')
ax2.set_ylabel('Average Delay (minutes)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Legend: creating the twin axes makes it the "current" axes, so asking
# plt.gca() for handles would only return the delay line. Collect the
# entries from both axes explicitly and draw one combined legend on the top
# axes so it is not hidden behind the line plot.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper left')

fig.tight_layout()
plt.show()