# Setup

In [None]:
import pandas as pd
import numpy as np
import holidays
import requests
import os
import re

# Holiday calendar object from the `holidays` package (country code NO);
# add_time_features() checks each row's date against it.
no_holidays = holidays.NO()
# Input CSVs: flights with both scheduled and actual times, and the
# scheduled-only flights used as the prediction set.
DATA_PATH = "../data/raw_data/historical_flights.csv"
PREDICTION_PATH="../data/raw_data/scheduled_october2025.csv"

# Functions

In [None]:
def load_flights(path: str, prediction: bool) -> pd.DataFrame:
    """Read a flights CSV and parse its time columns as datetimes.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        prediction: When true only the scheduled columns (``std``, ``sta``)
            are parsed; otherwise the actual columns (``atd``, ``ata``) are
            parsed as well.

    Returns:
        The loaded frame. Values that cannot be parsed become ``NaT``.
    """
    frame = pd.read_csv(path)

    time_columns = ["std", "sta"] if prediction else ["std", "sta", "atd", "ata"]
    for name in time_columns:
        frame[name] = pd.to_datetime(frame[name], errors="coerce")

    return frame

def handle_wrong_times(df: pd.DataFrame) -> pd.DataFrame:
    """Drop flights whose timestamps look implausible and report the counts.

    A ``duration`` column (``sta - std``) is added to a copy of ``df``; the
    input frame itself is left untouched.

    Args:
        df: Flights with ``std``, ``sta``, ``atd`` and ``ata`` datetime columns.

    Returns:
        A copy containing only the rows that pass both filters below.
    """
    flights = df.copy()
    total = len(flights)

    # 1) Scheduled duration must lie between 0 and 10 hours (inclusive).
    flights['duration'] = flights['sta'] - flights['std']
    plausible = flights['duration'].between(pd.Timedelta(0), pd.Timedelta(hours=10))
    by_duration = flights[plausible].copy()
    after_duration = len(by_duration)

    # 2) Extremely early deviations: actual time more than 500 minutes
    #    ahead of schedule. Rows without an actual time are never flagged.
    limit = pd.Timedelta(minutes=-500)
    early_departure = by_duration["atd"].notna() & ((by_duration["atd"] - by_duration["std"]) < limit)
    early_arrival = by_duration["ata"].notna() & ((by_duration["ata"] - by_duration["sta"]) < limit)
    cleaned = by_duration.loc[~early_departure & ~early_arrival].copy()
    kept = len(cleaned)

    print(f"Totalt: {total}")
    print(f"  Fjernet på varighet: {total - after_duration}")
    print(f"  Fjernet på ekstremt tidlig dep/arr: {after_duration - kept}")
    print(f"Beholdt: {kept}")

    return cleaned

def build_full_grid(df: pd.DataFrame, prediction:bool) -> pd.DataFrame:
    """Build every (airport_group, hour) combination spanned by the flights.

    Args:
        df: Flights with ``dep_airport_group`` / ``arr_airport_group`` and
            the relevant time columns.
        prediction: When true the time range comes from ``std``/``sta`` only;
            otherwise ``atd``/``ata`` are included as well.

    Returns:
        A frame with columns ``airport_group`` and ``timestamp``: one row per
        non-null group and per hour between the earliest time (floored to
        the hour) and the latest time (ceiled to the hour).
    """
    time_columns = ["std", "sta"] if prediction else ["std", "sta", "atd", "ata"]

    both_ends = pd.concat([df["dep_airport_group"], df["arr_airport_group"]])
    group_values = both_ends.dropna().unique()

    earliest = df[time_columns].min(numeric_only=False).min()
    latest = df[time_columns].max(numeric_only=False).max()
    hour_range = pd.date_range(
        pd.to_datetime(earliest).floor("h"),
        pd.to_datetime(latest).ceil("h"),
        freq="h",
    )

    product = pd.MultiIndex.from_product(
        [group_values, hour_range], names=["airport_group","timestamp"]
    )
    return product.to_frame(index=False)


def make_intervals(df: pd.DataFrame, actual: bool = True) -> pd.DataFrame:
    """Turn each flight into a departure interval and an arrival interval.

    A departure spans 15 minutes before to 8 minutes after the departure
    time and is attributed to ``dep_airport_group``; an arrival spans 16
    minutes before to 5 minutes after the arrival time and is attributed to
    ``arr_airport_group``.

    Args:
        df: Flights frame.
        actual: Use the actual times (``atd``/``ata``) when true, the
            scheduled times (``std``/``sta``) otherwise.

    Returns:
        Departures followed by arrivals, with added ``start``, ``end``,
        ``airport_group`` and ``type`` columns. Flights lacking the relevant
        time, and intervals without an airport group, are dropped.
    """
    dep_time, arr_time = ("atd", "ata") if actual else ("std", "sta")

    departures = df.dropna(subset=[dep_time]).assign(
        start=lambda d: d[dep_time] - pd.Timedelta(minutes=15),
        end=lambda d: d[dep_time] + pd.Timedelta(minutes=8),
        airport_group=lambda d: d["dep_airport_group"],
        type="departure",
    )
    arrivals = df.dropna(subset=[arr_time]).assign(
        start=lambda d: d[arr_time] - pd.Timedelta(minutes=16),
        end=lambda d: d[arr_time] + pd.Timedelta(minutes=5),
        airport_group=lambda d: d["arr_airport_group"],
        type="arrival",
    )

    stacked = pd.concat([departures, arrivals], ignore_index=True)
    return stacked.dropna(subset=["airport_group"])


def expand_to_hours(intervals: pd.DataFrame) -> pd.DataFrame:
    """Repeat each interval once for every clock hour it touches.

    Args:
        intervals: Frame with ``start`` and ``end`` datetime columns.

    Returns:
        One row per (interval, hour) pair, carrying all original columns
        plus ``timestamp``, the start of that hour. Hours run from
        ``start`` floored to the hour through ``end`` floored to the hour.
        When nothing is produced, an empty frame that still has the input
        columns plus ``timestamp`` is returned, so a later group-by on
        ``timestamp`` does not fail on a missing column.
    """
    rows = []
    for _, row in intervals.iterrows():
        first_hour = row["start"].floor("h")
        last_hour = row["end"].floor("h")
        rows.extend(
            {**row, "timestamp": hour}
            for hour in pd.date_range(first_hour, last_hour, freq="h")
        )

    if not rows:
        # pd.DataFrame([]) would have no columns at all; keep the schema.
        columns = list(intervals.columns)
        if "timestamp" not in columns:
            columns.append("timestamp")
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(rows)

def hourly_overlap_group(group: pd.DataFrame) -> pd.DataFrame:
    """Flag whether any two intervals in one (airport_group, hour) group overlap.

    Args:
        group: Rows sharing one ``airport_group`` and ``timestamp``, each
            with ``start`` and ``end`` times.

    Returns:
        A single-row frame with ``airport_group``, ``timestamp`` and
        ``target``; ``target`` is 1 if at some point more than one interval
        is open at once, otherwise 0.
    """
    opening = [(moment, +1) for moment in group["start"]]
    closing = [(moment, -1) for moment in group["end"]]

    # Tuple ordering puts a close (-1) before an open (+1) at the same
    # instant, so intervals that merely touch are not counted as overlapping.
    timeline = sorted(opening + closing)

    open_count = np.cumsum([step for _, step in timeline])
    has_overlap = int((open_count > 1).any())

    summary = {
        "airport_group": group["airport_group"].iloc[0],
        "timestamp": group["timestamp"].iloc[0],
        "target": has_overlap,
    }
    return pd.DataFrame([summary])

def make_hourly_features_from(intervals_any: pd.DataFrame) -> pd.DataFrame:
    """Aggregate hour-expanded intervals into per-hour features.

    Generic over the interval source: works on intervals built from actual
    OR scheduled times.

    Args:
        intervals_any: Hour-expanded intervals with ``airport_group``,
            ``timestamp``, ``std``, ``sta`` and ``service_type`` columns,
            and optionally ``flight_id``.

    Returns:
        One row per (``airport_group``, ``timestamp``) with:
        ``flights_cnt`` (non-null flight ids, or plain row count when there
        is no ``flight_id`` column), ``avg_duration`` / ``max_duration`` of
        the scheduled duration in minutes, the share of rows whose
        ``service_type`` is ``"J"`` / ``"P"`` / ``"C"``, and ``airline``,
        the most frequent leading-letter prefix of ``flight_id`` (``""``
        when none is available).
    """

    def _most_common_or_blank(values: pd.Series) -> str:
        # mode() ignores missing values; compute it once per group.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else ""

    df = intervals_any.copy()
    df["duration_min"] = (df["sta"] - df["std"]).dt.total_seconds() / 60

    if "flight_id" in df:
        flight_ids = df["flight_id"]
        prefix = flight_ids.astype(str).str.extract(r"^([A-Za-z]+)", expand=False)
        # astype(str) turns a missing id into the text "nan"/"None", which
        # the regex would accept as an airline; keep those rows missing.
        df["airline"] = prefix.where(flight_ids.notna())
        count_column = "flight_id"
    else:
        df["airline"] = ""
        # No ids to count: the constant airline column is never null, so
        # counting it yields the number of rows in each group.
        count_column = "airline"

    feats = df.groupby(["airport_group", "timestamp"]).agg(
        flights_cnt     = (count_column, "count"),
        avg_duration    = ("duration_min", "mean"),
        max_duration    = ("duration_min", "max"),
        passenger_share = ("service_type", lambda x: (x == "J").mean()),
        cargo_share     = ("service_type", lambda x: (x == "P").mean()),
        charter_share   = ("service_type", lambda x: (x == "C").mean()),
        airline         = ("airline", _most_common_or_blank),
    ).reset_index()

    return feats

def add_time_features(feats: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from ``timestamp``.

    Args:
        feats: Hourly feature frame with ``airport_group``, ``timestamp``
            and ``flights_cnt`` columns.

    Returns:
        A new frame sorted by ``airport_group`` and ``timestamp`` with the
        added columns ``dow``, ``holiday``, ``month``, ``hournum``,
        ``weekend``, ``date`` and ``daily_flights_cnt``. The input frame is
        not modified.
    """
    # Work on a copy: the column assignments below would otherwise be
    # written into the caller's frame before the sorted copy is returned.
    feats = feats.copy()

    feats["dow"]     = feats["timestamp"].dt.dayofweek
    # Membership test against the module-level holiday calendar.
    feats["holiday"] = feats["timestamp"].apply(lambda x: x.date() in no_holidays)
    feats["month"]   = feats["timestamp"].dt.month
    feats["hournum"] = feats["timestamp"].dt.hour
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    feats["weekend"] = (feats["dow"] >= 5).astype(int)
    feats["date"] = feats["timestamp"].dt.normalize()
    # Total flights for the same airport group on the same calendar day.
    feats["daily_flights_cnt"] = feats.groupby(["airport_group", "date"])["flights_cnt"].transform("sum")

    return feats.sort_values(["airport_group", "timestamp"])

def add_prev_next(feats: pd.DataFrame) -> pd.DataFrame:
    """Attach the flight count of the neighbouring rows within each airport group.

    Args:
        feats: Frame with ``airport_group``, ``timestamp`` and
            ``flights_cnt`` columns.

    Returns:
        A copy sorted by ``airport_group`` and ``timestamp`` with integer
        columns ``flights_cnt_prev`` and ``flights_cnt_next``; rows with no
        neighbour in their group get 0.
    """
    ordered = feats.sort_values(["airport_group", "timestamp"])
    counts_by_group = ordered.groupby("airport_group")["flights_cnt"]

    for column, offset in (("flights_cnt_prev", 1), ("flights_cnt_next", -1)):
        ordered[column] = counts_by_group.shift(offset).fillna(0).astype(int)

    return ordered


# Load data

In [None]:
# Load both inputs: the historical file gets all four time columns parsed,
# the prediction file only the scheduled ones (std/sta).
df_raw = load_flights(DATA_PATH, prediction=False)
df_pred = load_flights(PREDICTION_PATH, prediction=True)
# Last expression of the cell: shown as the cell output.
df_raw

# Clean and filter

In [None]:
# Remove rows with implausible times from the historical data only;
# handle_wrong_times prints how many rows each filter removed.
df = handle_wrong_times(df_raw)

# Building full grid for all hours

In [None]:
# One row per (airport_group, hour) for each dataset; the prediction grid
# is spanned by scheduled times only.
grid = build_full_grid(df, prediction=False)
grid_pred = build_full_grid(df_pred, prediction=True)

# Find *actual* intervals and hourly overlap

In [None]:
# Intervals from the actual times, repeated once per touched hour.
intervals_actual = make_intervals(df, actual=True)
intervals_actual_expanded = expand_to_hours(intervals_actual)

# One overlap flag per (airport_group, hour), stored as target_actual.
actual_by_hour = intervals_actual_expanded.groupby(
    ["airport_group", "timestamp"], group_keys=False
)
hourly_actual = actual_by_hour.apply(hourly_overlap_group)
hourly_actual = hourly_actual.rename(columns={"target": "target_actual"})

hourly_actual.head()

# Find *scheduled* intervals and hourly overlap

In [None]:
# Intervals from the scheduled times, repeated once per touched hour.
intervals_sched = make_intervals(df, actual=False)
intervals_sched_expanded = expand_to_hours(intervals_sched)

# One overlap flag per (airport_group, hour), stored as target_sched.
sched_by_hour = intervals_sched_expanded.groupby(
    ["airport_group", "timestamp"], group_keys=False
)
hourly_sched = sched_by_hour.apply(hourly_overlap_group)
hourly_sched = hourly_sched.rename(columns={"target": "target_sched"})

# Find *scheduled* intervals and hourly overlap for the prediction data

In [None]:
# The prediction set has no actual times, so only scheduled intervals exist.
intervals_pred = make_intervals(df_pred, actual=False)
intervals_pred_expanded = expand_to_hours(intervals_pred)

# One overlap flag per (airport_group, hour), stored as target_sched.
pred_by_hour = intervals_pred_expanded.groupby(
    ["airport_group", "timestamp"], group_keys=False
)
hourly_pred = pred_by_hour.apply(hourly_overlap_group)
hourly_pred = hourly_pred.rename(columns={"target": "target_sched"})

# Merge targets datasets

In [None]:
# Left-join both targets onto the full grid; hours without any interval
# have no match and are filled with 0 (no overlap).
grid_keys = ["airport_group","timestamp"]
target_columns = ["target_actual","target_sched"]

hourly = grid.merge(hourly_actual, on=grid_keys, how="left")
hourly = hourly.merge(hourly_sched, on=grid_keys, how="left")
hourly[target_columns] = hourly[target_columns].fillna(0).astype(int)
hourly.head()

# Same for the prediction grid, which only has the scheduled target.
hourly_pred = grid_pred.merge(hourly_pred, on=grid_keys, how="left")
hourly_pred["target_sched"] = hourly_pred["target_sched"].fillna(0).astype(int)
hourly_pred.head()

# Feature engineering

### Feature-agg 

In [None]:
# Features are built from the SCHEDULED intervals of the historical data.
feat_raw = make_hourly_features_from(intervals_sched_expanded.copy())
feat_raw.head()

### Feature-agg for prediction dataset

In [None]:
# Same aggregation for the prediction set's scheduled intervals.
feat_pred_raw = make_hourly_features_from(intervals_pred_expanded.copy())
feat_pred_raw.head()

### Merge with the full hourly grid and fill in values for hours with no flights

In [None]:
# Put the aggregated features on the full grid; hours without flights have
# no match and come out as NaN.
feat_full = grid.merge(feat_raw, on=["airport_group","timestamp"], how="left")
feat_full_pred = grid_pred.merge(feat_pred_raw, on=["airport_group","timestamp"], how="left")

fill_columns = ["flights_cnt","avg_duration","max_duration",
                "passenger_share","cargo_share","charter_share","airline", "hour"]

for frame in (feat_full, feat_full_pred):
    for column in fill_columns:
        # Columns that are not present in the frame are skipped.
        if column not in frame.columns:
            continue
        if column == "airline":
            # Text column: blank instead of 0.
            frame[column] = frame[column].fillna("")
            continue
        filled = frame[column].fillna(0)
        # Only the count is converted back to an integer column.
        frame[column] = filled.astype(int) if column == "flights_cnt" else filled

### Time features

In [None]:
# Add the timestamp-derived features to both feature frames.
feat_full      = add_time_features(feat_full)
feat_full_pred = add_time_features(feat_full_pred)

### Next and previous hour flights count

In [None]:
# Add the flight counts of the neighbouring rows within each airport group.
feat_full      = add_prev_next(feat_full)
feat_full_pred = add_prev_next(feat_full_pred)

## Weather

#### Get latitude/longitude of airports

In [None]:
airport_locations_df = pd.read_csv("../data/raw_data/airports.csv")
airport_groups_df = pd.read_csv("../data/raw_data/airportgroups.csv")

# Keep only airports that belong to a group (inner join on the IATA code),
# then represent each group by the mean latitude/longitude of its airports.
airport_locations_df = (
    airport_locations_df
    .merge(airport_groups_df, left_on="iata_code", right_on="airport")
    .groupby("airport_group")[["latitude_deg", "longitude_deg"]]
    .mean()
)

#### Find relevant weather sensors

In [None]:
# For every airport group, ask the Frost sources endpoint for the sensor
# system(s) nearest to the group's mean coordinates and store them sorted
# by distance, closest first.
sensors = {}
endpoint = "https://frost.met.no/sources/v0.jsonld"
frost_auth = (os.getenv("FROST_ID"), '')

for (group, location) in airport_locations_df.iterrows():
    # Read the coordinates before formatting: reusing double quotes inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    longitude = location["longitude_deg"]
    latitude = location["latitude_deg"]
    parameters = {
        "types": "SensorSystem",
        "geometry": f"nearest(POINT({longitude} {latitude}))"
    }
    # A timeout keeps the cell from hanging on a stalled connection.
    r = requests.get(endpoint, parameters, auth=frost_auth, timeout=10)
    # Fail here with the HTTP status rather than later on a missing "data" key.
    r.raise_for_status()
    resp = r.json()

    sensors[group] = sorted(resp["data"], key=lambda v: v["distance"])

### Find weather observations on said sensors

In [None]:
# Fetch daily weather observations per (airport group, date) from Frost and
# copy them onto every hourly row of feat_full. Each (group, date) pair is
# requested at most once and cached in `weather`.
weather = {}
elements = ["mean(air_temperature P1D)", "sum(precipitation_amount P1D)"]
count = 0
frost_auth = (os.getenv("FROST_ID"), '')
endpoint = 'https://frost.met.no/observations/v0.jsonld'

for (i, row) in feat_full.iterrows():
    count += 1
    print(f"{count} of {feat_full.shape[0]}")

    airport_group = row["airport_group"]
    timestamp = row["timestamp"].strftime("%Y-%m-%d")
    group_weather = weather.setdefault(airport_group, {})

    if timestamp not in group_weather:
        # Start from NaN for every element so that a failed request, or a
        # response lacking an element, is cached as "missing" instead of
        # being requested again for each remaining hour of the day.
        day_values = dict.fromkeys(elements, float('nan'))

        parameters = {
            'sources': sensors[airport_group][0]["id"],
            'elements': ",".join(elements),
            'referencetime': timestamp,
        }
        r = requests.get(endpoint, parameters, auth=frost_auth, timeout=10)

        if r.status_code == 200:
            payload = r.json()
            # Only observations with time offset PT6H are used.
            for observation in payload["data"][0]["observations"]:
                if observation["timeOffset"] == "PT6H":
                    day_values[observation["elementId"]] = float(observation["value"])

        group_weather[timestamp] = day_values

    # Write only this row's own date. Looping over every cached date for
    # the group did redundant work per row and could leave a value from an
    # earlier day in place when the current day lacked that element.
    for (observation_k, observation_v) in group_weather[timestamp].items():
        feat_full.at[i, observation_k] = observation_v

### Find weather forecast on said sensors

In [None]:
# Download the subseasonal forecast for each airport group in the prediction
# set, keyed by date string, using the coordinates of the group's closest
# sensor. Replaces the observation cache held in `weather`.
weather = {}
endpoint = "https://api.met.no/weatherapi/subseasonal/1.0/complete"

for airport_group in set(feat_full_pred["airport_group"]):
    # A group whose request fails keeps this empty dict.
    weather[airport_group] = {}

    coordinates = sensors[airport_group][0]["geometry"]["coordinates"]
    parameters = {
        "lat": coordinates[1],
        "lon": coordinates[0]
    }
    # NOTE(review): the Frost client id is sent to api.met.no as well —
    # confirm this endpoint actually needs it.
    # timeout matches the Frost observation request, so a stalled
    # connection cannot hang the cell.
    r = requests.get(
        endpoint,
        parameters,
        auth=(os.getenv("FROST_ID"),''),
        headers= { "User-Agent": "uionowciabs" },  # random user agent
        timeout=10,
    )

    if r.status_code == 200:
        forecast = r.json()
        for record in forecast["properties"]["timeseries"]:
            k = pd.to_datetime(record["time"]).strftime("%Y-%m-%d")
            details = record["data"]["next_24_hours"]["details"]

            # Stored under the same names as the observation elements.
            weather[airport_group][k] = {
                "mean(air_temperature P1D)": details["air_temperature_mean"],
                "sum(precipitation_amount P1D)": details["precipitation_amount"],
            }

### Add weather forecast (closest to the prediction date)

In [None]:
# Attach a forecast to every prediction row: from noon onwards the next
# forecast date is used when one exists, otherwise the most recent forecast
# date at or before the row's timestamp.

# Parse and sort each group's forecast dates once instead of on every row.
forecast_dates_by_group = {
    group: sorted(pd.to_datetime(day) for day in forecasts)
    for group, forecasts in weather.items()
}

for (i, row) in feat_full_pred.iterrows():
    forecast_dates = forecast_dates_by_group.get(row["airport_group"], [])
    if not forecast_dates:
        # No forecast was downloaded for this group: leave the row's
        # weather values unset instead of failing on a missing key.
        continue

    prediction_time = row["timestamp"]
    earlier = [d for d in forecast_dates if d <= prediction_time]
    later = [d for d in forecast_dates if d > prediction_time]

    if prediction_time.hour >= 12 and later:
        closest_available_forecast = later[0]
    elif earlier:
        closest_available_forecast = earlier[-1]
    else:
        # The row precedes every forecast date; use the first one.
        # (Selecting "the previous entry" here used to pick the row's own
        # date, which has no forecast, and raised a KeyError.)
        closest_available_forecast = later[0]

    k = closest_available_forecast.strftime("%Y-%m-%d")

    for (forecast_k, forecast_v) in weather[row["airport_group"]][k].items():
        feat_full_pred.at[i, forecast_k] = forecast_v

# Combine datasets and split into training/validation/test

In [None]:
# Join targets and features on the grid keys, ordered by time.
merge_keys = ["airport_group", "timestamp"]
dataset = hourly.merge(feat_full, on=merge_keys, how="left").sort_values("timestamp")
pred = hourly_pred.merge(feat_full_pred, on=merge_keys, how="left").sort_values("timestamp")

for frame in (dataset, pred):
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])

# Chronological split: train before CUTOFF_VAL, validation from CUTOFF_VAL
# up to (not including) CUTOFF_TEST, test from CUTOFF_TEST onwards.
CUTOFF_VAL  = pd.Timestamp("2025-01-01")
CUTOFF_TEST = pd.Timestamp("2025-04-01")

when = dataset["timestamp"]
train = dataset[when < CUTOFF_VAL].copy()
val   = dataset[when.between(CUTOFF_VAL, CUTOFF_TEST, inclusive="left")].copy()
test  = dataset[when >= CUTOFF_TEST].copy()

print(train.shape, val.shape, test.shape, pred.shape)

# Export as `.csv` files

In [None]:
# Write each split, and the prediction set, to its own CSV without the index.
exports = (
    (train, '../data/processed_data/train.csv'),
    (val, '../data/processed_data/val.csv'),
    (test, '../data/processed_data/test.csv'),
    (pred, '../data/processed_data/predict_oct2025.csv'),
)
for frame, destination in exports:
    frame.to_csv(destination, index=False)