In [2]:
import pandas as pd
import numpy as np
# Load the raw flight records; the relative path resolves against the
# notebook's current working directory.
flights_df = pd.read_csv(filepath_or_buffer="../data/flights.csv")


In [3]:
# Parse both timestamp columns the same way.
# - errors="coerce": values that cannot be parsed become NaT instead of raising.
# - utc=True: every value is normalised to UTC (values without an offset are
#   interpreted as UTC), so rows carrying different UTC offsets still produce
#   a real datetime64[ns, UTC] column. Without it, mixed offsets yield an
#   object column and the .dt accessor below fails.
for time_col in ("scheduled_time", "actual_time"):
    flights_df[time_col] = pd.to_datetime(
        flights_df[time_col], errors="coerce", utc=True
    )

# Calendar date of the scheduled time; NaT where scheduled_time did not parse.
# NOTE(review): with UTC timestamps this is the UTC date, not the
# airport-local date - confirm that is the intended grouping key.
flights_df["delay_date"] = flights_df["scheduled_time"].dt.date


In [4]:
# Delay in minutes: actual time minus scheduled time.
# Negative differences (actual earlier than scheduled) are floored at 0;
# the rows themselves are kept, and missing values stay missing.
flights_df["delay_min"] = (
    flights_df["actual_time"]
    .sub(flights_df["scheduled_time"])
    .dt.total_seconds()
    .div(60)
    .clip(lower=0)
)


In [5]:
# Delayed flag: True when the delay is at least 15 minutes.
# A missing delay compares as False.
flights_df["is_delayed"] = flights_df["delay_min"].ge(15)

# Cancelled flag: exact, case-sensitive match on either spelling of the status.
flights_df["is_cancelled"] = flights_df["status"].isin({"Cancelled", "Canceled"})


In [7]:
# Per-airport, per-day delay summary.
# groupby drops rows whose origin_iata or delay_date is missing (default
# dropna=True), so those flights do not appear in any group.
airport_delays_df = (
    flights_df
    .groupby(["origin_iata", "delay_date"])
    .agg(
        # "size" counts every row in the group. "count" would skip rows with
        # a missing flight_number, while the delayed/cancelled sums below
        # still include them, so the totals could disagree.
        total_flights=("flight_number", "size"),
        # Summing a boolean column counts its True values.
        delayed_flights=("is_delayed", "sum"),
        # mean/median ignore NaN, so flights without a computed delay do not
        # influence these two statistics.
        avg_delay_min=("delay_min", "mean"),
        median_delay_min=("delay_min", "median"),
        canceled_flights=("is_cancelled", "sum"),
    )
    .reset_index()
)


In [8]:
# Rename the grouping key for the output table, then round both delay
# statistics to whole minutes. The nullable "Int64" dtype keeps missing
# values as <NA> instead of failing the integer cast.
airport_delays_df = (
    airport_delays_df
    .rename(columns={"origin_iata": "airport_iata"})
    .assign(
        avg_delay_min=lambda frame: (
            frame["avg_delay_min"].round().astype("Int64")
        ),
        median_delay_min=lambda frame: (
            frame["median_delay_min"].round().astype("Int64")
        ),
    )
)


In [9]:
# Write the summary table to disk; index=False leaves the row index out of the file.
airport_delays_df.to_csv(path_or_buf="../data/airport_delays.csv", index=False)


In [10]:
# Show the aggregated per-airport, per-day table as this cell's output.
airport_delays_df

Unnamed: 0,airport_iata,delay_date,total_flights,delayed_flights,avg_delay_min,median_delay_min,canceled_flights
0,ABJ,2026-01-02,2,0,0,0,0
1,ABX,2026-01-02,1,0,0,0,0
2,ABZ,2026-01-02,7,1,6,0,0
3,ACC,2026-01-02,1,0,0,0,0
4,ADB,2026-01-02,1,0,0,0,0
...,...,...,...,...,...,...,...
469,YYC,2026-01-02,3,0,0,0,0
470,YYZ,2026-01-02,11,1,27,0,0
471,ZAG,2026-01-02,4,1,6,0,0
472,ZNZ,2026-01-02,2,1,18,18,0


In [16]:
# Hand-picked airport codes to check against the flight data.
selected_airports = [
    # India
    "DEL", "BOM", "BLR", "HYD",
    "MAA", "CCU", "COK",
    # International
    "JFK", "LHR", "DXB", "SIN",
    "CDG", "HND", "SYD",
]

# Distinct origin codes that occur in the flight data (missing values excluded).
airports_in_data = set(flights_df["origin_iata"].dropna())


In [17]:
# Selected airports that occur as an origin in the data, sorted alphabetically.
present_airports = sorted(airports_in_data.intersection(selected_airports))
present_airports


['BLR',
 'BOM',
 'CCU',
 'CDG',
 'COK',
 'DEL',
 'DXB',
 'HND',
 'HYD',
 'JFK',
 'LHR',
 'MAA',
 'SIN',
 'SYD']

In [18]:
# Selected airports that never occur as an origin in the data, sorted alphabetically.
missing_airports = sorted(set(selected_airports).difference(airports_in_data))
missing_airports


[]

datetime64[ns, UTC]