# Import

In [13]:
import pandas as pd
import holidays
# Holiday calendar for country code "NO" from the `holidays` package; the
# feature builder tests each hour's date for membership in this object.
no_holidays = holidays.NO()

# Parametre & stier

In [14]:
# CSV with the historical flight records, relative to the notebook's working directory.
DATA_PATH = "../data/raw_data/historical_flights.csv"
# Train/validation boundary: hours strictly before this date go to train,
# hours on or after it go to validation.
CUTOFF = "2024-01-01"

# Hjelpefunksjoner

In [15]:
def load_flights(path: str) -> pd.DataFrame:
    """Read the flight CSV, keep non-cancelled rows and parse the time columns.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A new DataFrame containing only rows where ``cancelled == 0``, with
        ``std``, ``sta``, ``atd`` and ``ata`` converted to datetimes. Values
        that cannot be parsed become ``NaT``.
    """
    raw = pd.read_csv(path)
    operated = raw.loc[raw["cancelled"] == 0]
    # errors="coerce" turns unparseable timestamps into NaT instead of raising.
    parsed_times = {
        name: pd.to_datetime(operated[name], errors="coerce")
        for name in ("std", "sta", "atd", "ata")
    }
    # assign() returns a fresh frame, so the filtered view is never mutated.
    return operated.assign(**parsed_times)

def handle_wrong_times(df: pd.DataFrame) -> pd.DataFrame:
    """Drop flights whose scheduled duration is implausible.

    Adds a ``duration`` column (``sta - std``) and keeps only rows where it
    lies between 0 and 10 hours, both bounds inclusive. Rows with a missing
    duration are dropped because both comparisons are False for ``NaT``.

    Args:
        df: Flights with datetime columns ``std`` and ``sta``.

    Returns:
        A filtered copy; the input frame is left untouched.
    """
    timed = df.assign(duration=df["sta"] - df["std"])
    shortest = pd.Timedelta(0)
    longest = pd.Timedelta(hours=10)
    plausible = (timed["duration"] >= shortest) & (timed["duration"] <= longest)
    return timed[plausible].copy()

def make_intervals(df: pd.DataFrame, actual: bool = True) -> pd.DataFrame:
    """Build one time interval per departure and per arrival.

    Each flight contributes up to two rows: a departure interval spanning
    15 minutes before to 8 minutes after the departure time, and an arrival
    interval spanning 16 minutes before to 5 minutes after the arrival time.

    Args:
        df: Flights with datetime columns ``std``, ``sta``, ``atd``, ``ata``
            and the columns ``dep_airport_group`` / ``arr_airport_group``.
        actual: If true, intervals are centred on the actual times
            (``atd``/``ata``) and ``delay`` is actual minus scheduled time in
            minutes. Otherwise the scheduled times (``std``/``sta``) are used
            and ``delay`` is 0.

    Returns:
        Departures followed by arrivals, with the added columns ``start``,
        ``end``, ``delay``, ``airport_group`` and ``type``. Rows lacking the
        reference time or the airport group are dropped.
    """
    # (type label, actual-time column, scheduled-time column,
    #  airport-group column, minutes before, minutes after)
    movement_specs = (
        ("departure", "atd", "std", "dep_airport_group", 15, 8),
        ("arrival", "ata", "sta", "arr_airport_group", 16, 5),
    )

    parts = []
    for label, actual_col, sched_col, group_col, before, after in movement_specs:
        ref_col = actual_col if actual else sched_col
        part = df.dropna(subset=[ref_col]).copy()
        reference = part[ref_col]

        part["start"] = reference - pd.Timedelta(minutes=before)
        part["end"] = reference + pd.Timedelta(minutes=after)
        if actual:
            part["delay"] = (reference - part[sched_col]).dt.total_seconds() / 60
        else:
            # Scheduled intervals carry no delay, for departures and arrivals alike.
            part["delay"] = 0
        part["airport_group"] = part[group_col]
        part["type"] = label
        parts.append(part)

    combined = pd.concat(parts, ignore_index=True)
    return combined.dropna(subset=["airport_group"])

def expand_to_hours(intervals: pd.DataFrame) -> pd.DataFrame:
    """Repeat every interval once for each clock hour it touches.

    An interval touches every hour from ``floor(start)`` through
    ``floor(end)`` inclusive. The expansion is vectorized, so it scales to
    hundreds of thousands of intervals and keeps the input column dtypes.

    Args:
        intervals: Frame with datetime columns ``start`` and ``end``.

    Returns:
        A frame with the input columns plus ``hour`` (the start of the clock
        hour), a fresh 0..n-1 index, and rows ordered by input row and then
        by hour. Rows with a missing ``start``/``end`` or with ``end`` in an
        earlier hour than ``start`` produce no output rows. Empty input
        yields an empty frame that still has all columns, including ``hour``.
    """
    valid = intervals.dropna(subset=["start", "end"]).reset_index(drop=True)
    if valid.empty:
        # Keep the schema so downstream groupby/merge on "hour" still works.
        return valid.assign(hour=pd.Series(dtype="datetime64[ns]"))

    one_hour = pd.Timedelta(hours=1)
    first_hour = valid["start"].dt.floor("h")
    last_hour = valid["end"].dt.floor("h")
    # Number of clock hours covered by each interval (0 if end precedes start).
    n_hours = ((last_hour - first_hour) // one_hour + 1).clip(lower=0)

    # Repeat each row n_hours times; the repeated label is the source row id.
    expanded = valid.loc[valid.index.repeat(n_hours.to_numpy())]
    # 0, 1, 2, ... within each source row = offset in hours from first_hour.
    offsets = expanded.groupby(level=0).cumcount().reset_index(drop=True)
    base = first_hour.loc[expanded.index].reset_index(drop=True)

    expanded = expanded.reset_index(drop=True)
    expanded["hour"] = base + pd.to_timedelta(offsets, unit="h")
    return expanded

def hourly_overlap(group: pd.DataFrame) -> pd.DataFrame:
    """Flag whether any two intervals in one hour bucket overlap in time.

    The hour is taken from the first row of ``group``, so every row is
    expected to belong to the same ``hour`` and ``airport_group``. Intervals
    are clipped to ``[hour, hour + 1h]`` before the check; intervals that
    merely touch end-to-start do not count as overlapping.

    Args:
        group: Non-empty frame with columns ``airport_group``, ``hour``,
            ``start`` and ``end``.

    Returns:
        A one-row frame with ``airport_group``, ``hour`` and ``target``
        (1 if at least two intervals are active simultaneously, else 0).
    """
    hour = group["hour"].iloc[0]
    window_open = hour
    window_close = hour + pd.Timedelta(hours=1)

    # Sweep line: +1 when an interval opens, -1 when it closes.
    boundaries = []
    for begin, finish in zip(group["start"], group["end"]):
        clipped_begin = max(begin, window_open)
        clipped_finish = min(finish, window_close)
        if clipped_begin < clipped_finish:
            boundaries.append((clipped_begin, +1))
            boundaries.append((clipped_finish, -1))
    # Tuples sort by time, then by delta, so a close at time t is processed
    # before an open at the same t.
    boundaries.sort()

    has_overlap = 0
    concurrent = 0
    for _, step in boundaries:
        concurrent += step
        if concurrent > 1:
            has_overlap = 1
            break

    record = {
        "airport_group": group["airport_group"].iloc[0],
        "hour": hour,
        "target": has_overlap,
    }
    return pd.DataFrame([record])

def make_hourly_features(intervals_actual: pd.DataFrame) -> pd.DataFrame:
    """Aggregate hour-expanded intervals into one feature row per airport group and hour.

    Args:
        intervals_actual: Hour-expanded intervals with the columns
            ``airport_group``, ``hour``, ``flight_id``, ``std``, ``sta``,
            ``delay`` and ``service_type``.

    Returns:
        A frame sorted by ``airport_group`` and ``hour`` with:

        * ``flights_cnt``: number of non-null ``flight_id`` values,
        * ``avg_duration`` / ``max_duration``: scheduled duration
          (``sta - std``) in minutes,
        * ``avg_delay`` / ``max_delay``: mean and max of ``delay``,
        * ``passenger_share`` / ``cargo_share`` / ``charter_share``: share
          of rows whose ``service_type`` equals "J" / "P" / "C",
        * ``dow``, ``holiday``, ``month``, ``hournum``, ``weekend``:
          calendar features derived from ``hour``,
        * ``flights_cnt_prev`` / ``flights_cnt_next``: ``flights_cnt`` of
          the same airport group in the clock hour immediately before /
          after, or 0 when that hour has no row.
    """
    intervals_actual = intervals_actual.copy()
    intervals_actual["duration_min"] = (
        (intervals_actual["sta"] - intervals_actual["std"]).dt.total_seconds() / 60
    )

    feats = intervals_actual.groupby(["airport_group", "hour"]).agg(
        flights_cnt     = ("flight_id", "count"),
        avg_duration    = ("duration_min", "mean"),
        max_duration    = ("duration_min", "max"),
        avg_delay       = ("delay", "mean"),
        max_delay       = ("delay", "max"),
        passenger_share = ("service_type", lambda x: (x == "J").mean()),
        cargo_share     = ("service_type", lambda x: (x == "P").mean()),
        charter_share   = ("service_type", lambda x: (x == "C").mean()),
    ).reset_index()

    feats["dow"]     = feats["hour"].dt.dayofweek
    # `no_holidays` is the module-level holiday calendar.
    feats["holiday"] = feats["hour"].apply(lambda x: x.date() in no_holidays)
    feats["month"]   = feats["hour"].dt.month
    feats["hournum"] = feats["hour"].dt.hour
    feats["weekend"] = (feats["dow"] >= 5).astype(int)

    feats = feats.sort_values(["airport_group", "hour"])

    # Rows exist only for hours that have traffic, so a positional shift would
    # pick up the previous/next *non-empty* hour (possibly days away). Look up
    # the truly adjacent clock hour instead and treat a missing hour as 0.
    counts = feats.set_index(["airport_group", "hour"])["flights_cnt"]
    one_hour = pd.Timedelta(hours=1)
    for column, offset in (("flights_cnt_prev", -one_hour), ("flights_cnt_next", one_hour)):
        neighbour_keys = pd.MultiIndex.from_arrays(
            [feats["airport_group"], feats["hour"] + offset]
        )
        feats[column] = counts.reindex(neighbour_keys).fillna(0).astype(int).to_numpy()
    return feats


# Last rådata

In [16]:
# Load the non-cancelled flights with parsed timestamps and preview the first rows.
df_raw = load_flights(DATA_PATH)
df_raw.head()

Unnamed: 0,flight_id,dep_airport,dep_airport_group,arr_airport,arr_airport_group,service_type,std,sta,cancelled,atd,ata
0,WF149,HOV,B,OSL,,J,2018-01-02 16:40:00,2018-01-02 17:15:00,0,NaT,2018-01-02 18:53:00
1,WF722,OSL,,MJF,D,J,2018-01-28 13:04:00,2018-01-28 14:50:00,0,NaT,NaT
2,WF188,FDE,A,OSL,,J,2018-04-07 07:10:00,2018-04-07 08:10:00,0,NaT,2018-04-07 07:55:00
3,WF176,HOV,B,OSL,,J,2018-04-07 11:00:00,2018-04-07 12:05:00,0,NaT,2018-04-07 12:00:00
4,WF148,HOV,B,OSL,,J,2018-04-30 08:25:00,2018-04-30 09:26:00,0,NaT,2018-04-30 09:36:00


# Rens og filter

In [17]:
# Remove flights whose scheduled duration is negative or longer than 10 hours.
df = handle_wrong_times(df_raw)
# Show the remaining row count and the share of missing values per column
# (rounded to 3 decimals, sorted ascending).
len(df), df.isna().mean().round(3).sort_values()

(399341,
 flight_id            0.000
 dep_airport          0.000
 arr_airport          0.000
 service_type         0.000
 std                  0.000
 sta                  0.000
 cancelled            0.000
 duration             0.000
 atd                  0.013
 ata                  0.015
 dep_airport_group    0.433
 arr_airport_group    0.434
 dtype: float64)

# Intervaller (actual) + hourly overlap (actual)

In [18]:
# Intervals around the actual departure/arrival times, repeated per clock hour.
intervals_actual = make_intervals(df, actual=True)
intervals_actual_expanded = expand_to_hours(intervals_actual)

# hourly_overlap() evaluates exactly one (airport_group, hour) bucket, so the
# grouping must include "hour"; grouping by airport_group alone yields a single
# row per airport group. Iterating the groups (instead of GroupBy.apply) hands
# each bucket over with its grouping columns intact.
hourly_actual = pd.concat(
    [
        hourly_overlap(bucket)
        for _, bucket in intervals_actual_expanded.groupby(["airport_group", "hour"])
    ],
    ignore_index=True,
).rename(columns={"target": "target_actual"})
hourly_actual.head()

KeyboardInterrupt: 

# Intervaller (scheduled) + hourly overlap (scheduled)

In [None]:
# Intervals around the scheduled departure/arrival times, repeated per clock hour.
intervals_sched = make_intervals(df, actual=False)
intervals_sched_expanded = expand_to_hours(intervals_sched)

# hourly_overlap() evaluates exactly one (airport_group, hour) bucket, so the
# grouping must include "hour"; grouping by airport_group alone yields a single
# row per airport group. Iterating the groups (instead of GroupBy.apply) hands
# each bucket over with its grouping columns intact.
hourly_sched = pd.concat(
    [
        hourly_overlap(bucket)
        for _, bucket in intervals_sched_expanded.groupby(["airport_group", "hour"])
    ],
    ignore_index=True,
).rename(columns={"target": "target_sched"})
hourly_sched.head()

# Merge targets

In [None]:
# Attach the scheduled target to every actual (airport_group, hour) row.
hourly = pd.merge(
    hourly_actual,
    hourly_sched,
    how="left",
    on=["airport_group", "hour"],
)
hourly = hourly.assign(
    # Hours with no scheduled row get target_sched = 0.
    target_sched=hourly["target_sched"].fillna(0).astype(int),
    hour=pd.to_datetime(hourly["hour"]),
)
hourly.head()

# Feature-agg (actual)

In [None]:
# Aggregate the hour-expanded actual intervals into per-(airport_group, hour) features.
hourly_features = make_hourly_features(intervals_actual_expanded.copy())
# Make sure the merge key is a datetime column.
hourly_features["hour"] = pd.to_datetime(hourly_features["hour"])
hourly_features.head()

# Samle datasett + split

In [None]:
# Join targets with features on (airport_group, hour) and order chronologically.
dataset = hourly.merge(hourly_features, on=["airport_group", "hour"], how="left").sort_values("hour")
# Time-based split: hours strictly before CUTOFF train the model, the rest validate it.
train = dataset[dataset["hour"] < CUTOFF]
val   = dataset[dataset["hour"] >= CUTOFF]

train.shape, val.shape


# Lagre mellomfiler

In [None]:
from pathlib import Path

# The raw data is read from ../data/raw_data, so the processed files belong in
# the sibling ../data/processed_data directory (a bare "data/..." path would
# resolve inside the notebook's own folder). Create it if it is missing so
# to_csv does not fail on a fresh checkout.
PROCESSED_DIR = Path("../data/processed_data")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

train.to_csv(PROCESSED_DIR / "train.csv", index=False)
val.to_csv(PROCESSED_DIR / "val.csv", index=False)