In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) the Spark session for the flight-delay ETL.
# The vectorized Parquet reader is explicitly switched off through config.
session_builder = SparkSession.builder.appName("FlightDelayETL")
session_builder = session_builder.config(
    "spark.sql.parquet.enableVectorizedReader", "false"
)
spark = session_builder.getOrCreate()

# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")


In [11]:
# Flight records (Parquet directory)
flights_df = spark.read.parquet("data/source/flight_all")

# Weather (Open-Meteo JSON): multi-line documents, parsed in PERMISSIVE mode
weather_df = spark.read.json(
    "data/source/weather_all",
    multiLine=True,
    mode="PERMISSIVE",
)

# Airport reference data (CSV with a header row; column types are inferred)
airports_df = spark.read.csv(
    "data/source/airport-codes.csv",
    header=True,
    inferSchema=True,
)

# Report the row count of each source as a quick load check.
for label, frame in (
    ("Flights:", flights_df),
    ("Weather:", weather_df),
    ("Airports:", airports_df),
):
    print(label, frame.count())


Flights: 29193782
Weather: 91
Airports: 82808


In [12]:
from pyspark.sql.functions import (
    col, to_date, concat_ws, lpad, try_to_timestamp, lit
)

# Scheduled departure time as text, left-padded with zeros to four characters
# so it can be parsed with the "HHmm" pattern.
dep_hhmm = lpad(col("CRSDepTime").cast("string"), 4, "0")

# "<flight_date> <HHmm>" text that is parsed into the scheduled_dep timestamp.
dep_text = concat_ws(" ", col("flight_date").cast("string"), dep_hhmm)

# Keep non-cancelled flights, derive a proper date column, and build the
# scheduled departure timestamp. try_to_timestamp is used for the parse, so
# unparseable date/time text is expected to become NULL rather than an error.
# NOTE(review): no time zone handling is visible here — confirm that the
# flight times and the weather timestamps share the same time basis.
flights_clean = (
    flights_df
    .where(col("Cancelled") == False)
    .withColumn("flight_date", to_date(col("FlightDate")))
    .withColumn(
        "scheduled_dep",
        try_to_timestamp(dep_text, lit("yyyy-MM-dd HHmm")),
    )
)


In [13]:
# The "coordinates" column holds two comma-separated numbers in one string.
# NOTE(review): element 0 is treated as latitude and element 1 as longitude;
# confirm this order against the CSV, since the column itself does not say.
coord_parts = split(col("coordinates"), ",")

# Airports that have an IATA code, reduced to code, name and numeric
# coordinates.
# NOTE(review): this frame is later used as a join lookup keyed on iata_code;
# uniqueness of iata_code is not enforced here — verify against the data.
airports_fixed = (
    airports_df
    .where(col("iata_code").isNotNull())  # only usable airports
    .select(
        "iata_code",
        "name",
        trim(coord_parts.getItem(0)).cast("double").alias("latitude"),
        trim(coord_parts.getItem(1)).cast("double").alias("longitude"),
    )
)


In [14]:
def _airport_lookup(role):
    """Return the airport table with its columns prefixed for one flight end.

    Parameters
    ----------
    role : str
        Name of the flight column the lookup is joined on ("origin" or
        "dest"). It becomes the key column name and the prefix of the
        other columns.

    Returns
    -------
    DataFrame
        Columns ``<role>``, ``<role>_lat``, ``<role>_lon``, ``<role>_name``
        taken from ``airports_fixed``.
    """
    return airports_fixed.select(
        col("iata_code").alias(role),
        col("latitude").alias(f"{role}_lat"),
        col("longitude").alias(f"{role}_lon"),
        col("name").alias(f"{role}_name"),
    )


orig_airports = _airport_lookup("origin")
dest_airports = _airport_lookup("dest")

# The weather_clean pipeline that used to be duplicated in this cell is
# defined in its own cell further down and is not needed for these joins.

# Attach origin and destination airport attributes to every flight.
# Left joins keep flights whose airport code has no match in the lookup;
# the lookups are broadcast to every executor.
flights_enriched = (
    flights_clean
    .join(broadcast(orig_airports), "origin", "left")
    .join(broadcast(dest_airports), "dest", "left")
)


In [15]:
# Step 1: one row per (location, hourly timestamp). posexplode yields the
# position of each timestamp inside hourly.time; the measurement arrays are
# carried along whole so they can be indexed with that position afterwards.
weather_exploded = weather_df.select(
    col("latitude").alias("weather_lat"),
    col("longitude").alias("weather_lon"),
    posexplode("hourly.time").alias("idx", "weather_time"),
    col("hourly.temperature_2m").alias("temperature_2m"),
    col("hourly.precipitation").alias("precipitation"),
    col("hourly.windspeed_10m").alias("windspeed_10m"),
)

# Step 2: parse the timestamp, truncate it to the hour, and pick the
# measurement at the same array position as the timestamp.
weather_clean = weather_exploded.select(
    "weather_lat",
    "weather_lon",
    date_trunc("hour", to_timestamp(col("weather_time"))).alias("weather_hour"),
    col("temperature_2m").getItem(col("idx")).alias("temperature_2m"),
    col("precipitation").getItem(col("idx")).alias("precipitation"),
    col("windspeed_10m").getItem(col("idx")).alias("windspeed_10m"),
)


In [16]:
# Airport coordinates and weather coordinates come from different sources, so
# comparing them with exact floating-point equality does not reliably match
# (the weather columns came back NULL). Instead, each origin airport is mapped
# to its nearest weather point, and the join then uses that point's own
# coordinates, for which exact equality is safe.

# Largest allowed gap between an airport and its weather point, measured as a
# plain Euclidean distance on (lat, lon) in degrees.
# NOTE(review): 0.25 is an assumption — tune it to the weather grid spacing.
MAX_COORD_GAP_DEG = 0.25

# Distinct weather locations, renamed so they cannot clash with the
# weather_lat / weather_lon columns of weather_clean in the final join.
weather_points = (
    weather_clean
    .select(
        col("weather_lat").alias("nearest_weather_lat"),
        col("weather_lon").alias("nearest_weather_lon"),
    )
    .distinct()
)

gap_lat = col("origin_lat") - col("nearest_weather_lat")
gap_lon = col("origin_lon") - col("nearest_weather_lon")

# One row per origin code: the closest weather point within the tolerance.
# Origins with no weather point in range (or with NULL coordinates) are
# absent from this table and therefore end up with NULL weather values.
origin_weather_point = (
    orig_airports
    .select("origin", "origin_lat", "origin_lon")
    .crossJoin(broadcast(weather_points))
    .withColumn("coord_gap_sq", gap_lat * gap_lat + gap_lon * gap_lon)
    .where(col("coord_gap_sq") <= MAX_COORD_GAP_DEG ** 2)
    .groupBy("origin")
    .agg(
        min_by(
            struct("nearest_weather_lat", "nearest_weather_lon"),
            col("coord_gap_sq"),
        ).alias("nearest_point")
    )
    .select(
        "origin",
        col("nearest_point.nearest_weather_lat").alias("nearest_weather_lat"),
        col("nearest_point.nearest_weather_lon").alias("nearest_weather_lon"),
    )
)

# Left join keeps every flight; the helper columns are dropped afterwards so
# the resulting schema is the same as before (flight columns, scheduled_hour,
# then the weather_clean columns).
flights_final = (
    flights_enriched
    .withColumn("scheduled_hour", date_trunc("hour", col("scheduled_dep")))
    .join(broadcast(origin_weather_point), "origin", "left")
    .join(
        weather_clean,
        (col("nearest_weather_lat") == weather_clean.weather_lat) &
        (col("nearest_weather_lon") == weather_clean.weather_lon) &
        (col("scheduled_hour") == weather_clean.weather_hour),
        "left"
    )
    .drop("nearest_weather_lat", "nearest_weather_lon")
)


In [17]:


# Columns kept in the analysis-ready output, in display order.
output_columns = [
    "flight_date",
    "origin",
    "origin_name",
    "dest",
    "dest_name",
    "scheduled_dep",
    "temperature_2m",
    "precipitation",
    "windspeed_10m",
    "ArrDelay",
    "DepDelay",
]

final_df = flights_final.select(*output_columns)

# Preview a few rows without truncating long airport names, then report size.
final_df.show(n=10, truncate=False)
print("Final row count:", final_df.count())


+-----------+------+------------------------------------------------+----+------------------------------------------------+-------------------+--------------+-------------+-------------+--------+--------+
|flight_date|origin|origin_name                                     |dest|dest_name                                       |scheduled_dep      |temperature_2m|precipitation|windspeed_10m|ArrDelay|DepDelay|
+-----------+------+------------------------------------------------+----+------------------------------------------------+-------------------+--------------+-------------+-------------+--------+--------+
|2018-01-23 |ABY   |Southwest Georgia Regional Airport              |ATL |Hartsfield Jackson Atlanta International Airport|2018-01-23 12:02:00|NULL          |NULL         |NULL         |-8.0    |-5.0    |
|2018-01-24 |ABY   |Southwest Georgia Regional Airport              |ATL |Hartsfield Jackson Atlanta International Airport|2018-01-24 12:02:00|NULL          |NULL         |NULL    

In [18]:
# Sanity check: number of rows whose flight_date is missing.
final_df.filter(col("flight_date").isNull()).count()


0

In [19]:
# Shuffle by flight_date so that all rows of one date land in the same task.
# The write later in this notebook uses partitionBy("flight_date"), so this
# still yields a single file per date directory, but unlike coalesce(1) it
# does not collapse the upstream join work into one task.
final_df = final_df.repartition("flight_date")


In [20]:
# Commit-protocol settings requested for the upcoming write.
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably hands back that same session — confirm that the
# new app name and these settings actually take effect at this point.
committer_settings = {
    "spark.sql.sources.commitProtocolClass":
        "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol",
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "1",
}

session_builder = SparkSession.builder.appName("BigData")
for conf_key, conf_value in committer_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()


In [21]:
# Persist the enriched flights as Parquet, replacing any previous output and
# laying the files out in one directory per flight_date value.
final_df.write.parquet(
    "data/final/flights_enriched.parquet",
    mode="overwrite",
    partitionBy="flight_date",
)


In [22]:
import duckdb
import pandas as pd

# Connect to a persistent DuckDB database file
con = duckdb.connect(database="flights_analysis.duckdb", read_only=False)

# Load all Parquet files into a DuckDB table.
# The dataset was written partitioned by flight_date, so that column exists
# only in the directory names (flight_date=.../), not inside the files.
# hive_partitioning = true makes DuckDB rebuild it from the paths; without it
# the table came out with no flight_date column.
con.execute("""
    CREATE OR REPLACE TABLE flights_enriched AS
    SELECT *
    FROM read_parquet(
        'data/final/flights_enriched.parquet/**/*.parquet',
        hive_partitioning = true
    )
""")

print("Data loaded into DuckDB table: flights_enriched")

# Quick test query: show the resulting table schema
con.execute("DESCRIBE flights_enriched").df()

Data loaded into DuckDB table: flights_enriched


Unnamed: 0,column_name,column_type,null,key,default,extra
0,origin,VARCHAR,YES,,,
1,origin_name,VARCHAR,YES,,,
2,dest,VARCHAR,YES,,,
3,dest_name,VARCHAR,YES,,,
4,scheduled_dep,TIMESTAMP,YES,,,
5,temperature_2m,DOUBLE,YES,,,
6,precipitation,DOUBLE,YES,,,
7,windspeed_10m,DOUBLE,YES,,,
8,ArrDelay,DOUBLE,YES,,,
9,DepDelay,DOUBLE,YES,,,


In [23]:
# Example 1: Average delay by origin airport
# Ranks origins by mean departure delay (rows without a DepDelay excluded)
# and keeps the 15 worst.
# NOTE(review): there is no minimum flight count, so airports with only a
# handful of flights can top the ranking — consider whether that is intended.
delay_by_origin_sql = """
    SELECT 
        origin,
        origin_name,
        COUNT(*) AS flights,
        AVG(DepDelay) AS avg_dep_delay,
        AVG(ArrDelay) AS avg_arr_delay
    FROM flights_enriched
    WHERE DepDelay IS NOT NULL
    GROUP BY origin, origin_name
    ORDER BY avg_dep_delay DESC
    LIMIT 15
"""

avg_delay_by_origin = con.execute(delay_by_origin_sql).df()

avg_delay_by_origin

Unnamed: 0,origin,origin_name,flights,avg_dep_delay,avg_arr_delay
0,YNG,Youngstown Warren Regional Airport,2,63.0,75.0
1,PPG,Pago Pago International Airport,296,43.871622,45.307432
2,BIH,Eastern Sierra Regional Airport,286,30.101399,20.643357
3,HYA,Cape Cod Gateway Airport,365,28.493151,21.435616
4,PQI,Presque Isle International Airport,2133,24.602438,19.016015
5,MMH,Mammoth Yosemite Airport,1060,24.599057,23.09717
6,OTH,Southwest Oregon Regional Airport,1523,23.411031,17.892904
7,ILG,Wilmington Airport,158,22.955696,23.43038
8,ACK,Nantucket Memorial Airport,5786,21.510543,16.464292
9,DUT,Tom Madsen (Dutch Harbor) Airport,1157,21.016422,7.739394
