# Setup

In [None]:
import pandas as pd
import numpy as np
import holidays
import requests
import os
import re

no_holidays = holidays.NO()
DATA_PATH = "../data/raw_data/historical_flights.csv"
PREDICTION_PATH="../data/raw_data/scheduled_october2025.csv"

# Functions

In [None]:
def load_flights(path: str, prediction: bool) -> pd.DataFrame:
    df = pd.read_csv(path)
    if prediction:
        for col in ["std", "sta"]:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    else:
        for col in ["std", "sta", "atd", "ata"]:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    return df

def handle_wrong_times(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    n0 = len(df)

    # 1) Planlagt varighet
    df['duration'] = df['sta'] - df['std']
    df1 = df[(df['duration'] >= pd.Timedelta(0)) & (df['duration'] <= pd.Timedelta(hours=10))].copy()
    n1 = len(df1)

    # 2) Ekstremt tidlige avvik (delay < -500 min)
    dep_too_early = df1["atd"].notna() & ((df1["atd"] - df1["std"]) < pd.Timedelta(minutes=-500))
    arr_too_early = df1["ata"].notna() & ((df1["ata"] - df1["sta"]) < pd.Timedelta(minutes=-500))
    df2 = df1.loc[~(dep_too_early | arr_too_early)].copy()
    n2 = len(df2)

    print(f"Totalt: {n0}")
    print(f"  Fjernet på varighet: {n0 - n1}")
    print(f"  Fjernet på ekstremt tidlig dep/arr: {n1 - n2}")
    print(f"Beholdt: {n2}")

    return df2

def build_full_grid(df: pd.DataFrame, prediction:bool) -> pd.DataFrame:
    groups = pd.concat([df["dep_airport_group"], df["arr_airport_group"]]).dropna().unique()
    
    if prediction:
        tmin = pd.to_datetime(df[["std","sta"]].min(numeric_only=False).min()).floor("h")
        tmax = pd.to_datetime(df[["std","sta"]].max(numeric_only=False).max()).ceil("h")
    else:
        tmin = pd.to_datetime(df[["std","sta","atd","ata"]].min(numeric_only=False).min()).floor("h")
        tmax = pd.to_datetime(df[["std","sta","atd","ata"]].max(numeric_only=False).max()).ceil("h")

    all_hours = pd.date_range(tmin, tmax, freq="h")

    return pd.MultiIndex.from_product([groups, all_hours], names=["airport_group","timestamp"]).to_frame(index=False)


def make_intervals(df: pd.DataFrame, actual: bool = True) -> pd.DataFrame:
    if actual:
        dep = df.dropna(subset=["atd"]).copy()
        dep["start"] = dep["atd"] - pd.to_timedelta(15, "m")
        dep["end"]   = dep["atd"] + pd.to_timedelta(8, "m")

        arr = df.dropna(subset=["ata"]).copy()
        arr["start"] = arr["ata"] - pd.to_timedelta(16, "m")
        arr["end"]   = arr["ata"] + pd.to_timedelta(5, "m")
    else:
        dep = df.dropna(subset=["std"]).copy()
        dep["start"] = dep["std"] - pd.to_timedelta(15, "m")
        dep["end"]   = dep["std"] + pd.to_timedelta(8, "m")

        arr = df.dropna(subset=["sta"]).copy()
        arr["start"] = arr["sta"] - pd.to_timedelta(16, "m")
        arr["end"]   = arr["sta"] + pd.to_timedelta(5, "m")

    dep["airport_group"] = dep["dep_airport_group"]
    dep["type"] = "departure"
    arr["airport_group"] = arr["arr_airport_group"]
    arr["type"] = "arrival"

    intervals = pd.concat([dep, arr], ignore_index=True)
    intervals = intervals.dropna(subset=["airport_group"])

    return intervals


def expand_to_hours(intervals: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, row in intervals.iterrows():
        hour_start = row["start"].floor("h")
        hour_end = row["end"].floor("h")
        hours = pd.date_range(hour_start, hour_end, freq="h")
        for h in hours:
            rows.append({**row, "timestamp": h})
    return pd.DataFrame(rows)

def hourly_overlap_group(group: pd.DataFrame) -> pd.DataFrame:
    hour = group["timestamp"].iloc[0]
    airport = group["airport_group"].iloc[0]
    events = []

    for _, row in group.iterrows():
        events.append((row["start"], +1))
        events.append((row["end"], -1))

    events.sort()
    active, overlap = 0, 0

    for _, change in events:
        active += change
        if active > 1:
            overlap = 1
            break

    return pd.DataFrame([{"airport_group": airport, "timestamp": hour, "target": overlap}])

def make_hourly_features_from(intervals_any: pd.DataFrame) -> pd.DataFrame:
    """
    Feature-aggregater, men generisk (kan bruke actual ELLER scheduled intervaller).
    """

    df = intervals_any.copy()
    df["duration_min"] = ((df["sta"] - df["std"]).dt.total_seconds() / 60)
    
    if "flight_id" in df:
        df["airline"] = df["flight_id"].astype(str).str.extract(r"^([A-Za-z]+)")
    else:
        df["airline"] = ""

    feats = df.groupby(["airport_group", "timestamp"]).agg(
        flights_cnt     = ("flight_id", "count"),
        avg_duration    = ("duration_min", "mean"),
        max_duration    = ("duration_min", "max"),
        passenger_share = ("service_type", lambda x: (x == "J").mean()),
        cargo_share     = ("service_type", lambda x: (x == "P").mean()),
        charter_share   = ("service_type", lambda x: (x == "C").mean()),
        airline = ("airline", lambda x: x.mode().iloc[0] if not x.mode().empty else "")
    ).reset_index()

    return feats

def add_time_features(feats: pd.DataFrame) -> pd.DataFrame:
    feats["dow"]     = feats["timestamp"].dt.dayofweek
    feats["holiday"] = feats["timestamp"].apply(lambda x: x.date() in no_holidays)
    feats["month"]   = feats["timestamp"].dt.month
    feats["hournum"] = feats["timestamp"].dt.hour
    feats["weekend"] = (feats["dow"] >= 5).astype(int)
    feats["date"] = feats["timestamp"].dt.normalize()
    feats["daily_flights_cnt"] = feats.groupby(["airport_group", "date"])["flights_cnt"].transform("sum")
    feats = feats.sort_values(["airport_group", "timestamp"])
    
    return feats

def add_prev_next(feats: pd.DataFrame) -> pd.DataFrame:
    feats = feats.sort_values(["airport_group", "timestamp"])
    feats["flights_cnt_prev"] = feats.groupby("airport_group")["flights_cnt"].shift(1)
    feats["flights_cnt_next"] = feats.groupby("airport_group")["flights_cnt"].shift(-1)
    feats[["flights_cnt_prev", "flights_cnt_next"]] = feats[["flights_cnt_prev", "flights_cnt_next"]].fillna(0).astype(int)

    return feats


# Load data

In [4]:
df_raw = load_flights(DATA_PATH, prediction=False)
df_pred = load_flights(PREDICTION_PATH, prediction=True)
df_raw

Unnamed: 0,flight_id,dep_airport,dep_airport_group,arr_airport,arr_airport_group,service_type,std,sta,cancelled,atd,ata
0,WF149,HOV,B,OSL,,J,2018-01-02 16:40:00,2018-01-02 17:15:00,0,NaT,2018-01-02 18:53:00
1,WF722,OSL,,MJF,D,J,2018-01-28 13:04:00,2018-01-28 14:50:00,0,NaT,NaT
2,WF188,FDE,A,OSL,,J,2018-04-07 07:10:00,2018-04-07 08:10:00,0,NaT,2018-04-07 07:55:00
3,WF176,HOV,B,OSL,,J,2018-04-07 11:00:00,2018-04-07 12:05:00,0,NaT,2018-04-07 12:00:00
4,WF148,HOV,B,OSL,,J,2018-04-30 08:25:00,2018-04-30 09:26:00,0,NaT,2018-04-30 09:36:00
...,...,...,...,...,...,...,...,...,...,...,...
410437,WF153,SOG,A,HOV,B,J,2025-05-03 09:25:00,2025-05-03 10:00:00,0,2025-05-03 09:47:00,2025-05-03 10:19:00
410438,WF153,BGO,,SOG,A,J,2025-05-03 08:35:00,2025-05-03 09:10:00,0,2025-05-03 08:29:00,2025-05-03 09:18:00
410439,WF158,OSL,,HOV,B,J,2025-05-03 14:40:00,2025-05-03 15:50:00,0,2025-05-03 14:35:00,2025-05-03 15:39:00
410440,WF721,SSJ,D,TRD,,J,2025-05-03 08:50:00,2025-05-03 09:45:00,0,2025-05-03 08:49:00,2025-05-03 09:39:00


# Clean and filter

In [5]:
df = handle_wrong_times(df_raw)

Totalt: 410442
  Fjernet på varighet: 86
  Fjernet på ekstremt tidlig dep/arr: 97
Beholdt: 410259


# Building full grid for all hours

In [None]:
grid = build_full_grid(df, prediction=False)
grid_pred = build_full_grid(df_pred, prediction=True)

# Find *actual* intervals and hourly overlap

In [7]:
intervals_actual = make_intervals(df, actual=True)
intervals_actual_expanded = expand_to_hours(intervals_actual)

hourly_actual = (
    intervals_actual_expanded
    .groupby(["airport_group", "timestamp"], group_keys=False)
    .apply(hourly_overlap_group)
    .rename(columns={"target": "target_actual"})
)

hourly_actual.head()

  .apply(hourly_overlap_group)


Unnamed: 0,airport_group,timestamp,target_actual
0,A,2018-01-01 12:00:00,0
0,A,2018-01-01 13:00:00,1
0,A,2018-01-01 14:00:00,0
0,A,2018-01-01 16:00:00,1
0,A,2018-01-01 17:00:00,1


# Find *scheduled* intervals and hourly overlap

In [8]:
intervals_sched = make_intervals(df, actual=False)
intervals_sched_expanded = expand_to_hours(intervals_sched)

hourly_sched = (
    intervals_sched_expanded
    .groupby(["airport_group", "timestamp"], group_keys=False)
    .apply(hourly_overlap_group)
    .rename(columns={"target": "target_sched"})
)

  .apply(hourly_overlap_group)


# Find *actual* + *scheduled* intervals and hourly overlap for prediction data

In [9]:
intervals_pred = make_intervals(df_pred, actual=False)
intervals_pred_expanded = expand_to_hours(intervals_pred)

hourly_pred=(
    intervals_pred_expanded
    .groupby(["airport_group", "timestamp"], group_keys=False)
    .apply(hourly_overlap_group)
    .rename(columns={"target": "target_sched"})
)

  .apply(hourly_overlap_group)


# Merge targets datasets

In [10]:
hourly = (grid
    .merge(hourly_actual, on=["airport_group","timestamp"], how="left")
    .merge(hourly_sched,  on=["airport_group","timestamp"], how="left"))
hourly[["target_actual","target_sched"]] = hourly[["target_actual","target_sched"]].fillna(0).astype(int)
hourly.head()

hourly_pred=grid_pred.merge(hourly_pred, on=["airport_group","timestamp"], how="left")
hourly_pred[["target_sched"]] = hourly_pred[["target_sched"]].fillna(0).astype(int)
hourly_pred.head()

Unnamed: 0,airport_group,timestamp,target_sched
0,G,2025-10-01 03:00:00,1
1,G,2025-10-01 04:00:00,0
2,G,2025-10-01 05:00:00,0
3,G,2025-10-01 06:00:00,1
4,G,2025-10-01 07:00:00,0


# Feature engineering

### Feature-agg 

In [11]:
feat_raw = make_hourly_features_from(intervals_sched_expanded.copy())
feat_raw.head()

Unnamed: 0,airport_group,timestamp,flights_cnt,avg_duration,max_duration,passenger_share,cargo_share,charter_share,airline
0,A,2018-01-01 11:00:00,3,56.666667,60.0,1.0,0.0,0.0,
1,A,2018-01-01 12:00:00,3,46.666667,60.0,1.0,0.0,0.0,
2,A,2018-01-01 15:00:00,2,60.0,60.0,1.0,0.0,0.0,
3,A,2018-01-01 16:00:00,4,46.25,65.0,1.0,0.0,0.0,
4,A,2018-01-01 17:00:00,4,42.5,60.0,1.0,0.0,0.0,


### Feature-agg for prediction dataset

In [12]:
feat_pred_raw = make_hourly_features_from(intervals_pred_expanded.copy())
feat_pred_raw.head()

Unnamed: 0,airport_group,timestamp,flights_cnt,avg_duration,max_duration,passenger_share,cargo_share,charter_share,airline
0,A,2025-10-01 03:00:00,2,47.5,65.0,1.0,0.0,0.0,
1,A,2025-10-01 04:00:00,6,40.833333,65.0,1.0,0.0,0.0,
2,A,2025-10-01 06:00:00,3,51.666667,65.0,1.0,0.0,0.0,
3,A,2025-10-01 07:00:00,6,41.666667,65.0,1.0,0.0,0.0,
4,A,2025-10-01 08:00:00,2,47.5,65.0,1.0,0.0,0.0,


### Merging with full grid for every hour, and fills in values for hours where there's no planes

In [13]:
feat_full = grid.merge(feat_raw, on=["airport_group","timestamp"], how="left")
feat_full_pred = grid_pred.merge(feat_pred_raw, on=["airport_group","timestamp"], how="left")

for df_ in (feat_full, feat_full_pred):
    for c in ["flights_cnt","avg_duration","max_duration",
              "passenger_share","cargo_share","charter_share","airline", "hour"]:
        if c in df_.columns:
            if c == "flights_cnt":
                df_[c] = df_[c].fillna(0).astype(int)
            elif c == "airline":
                df_[c] = df_[c].fillna("")
            else:
                df_[c] = df_[c].fillna(0)

### Time features

In [14]:
feat_full      = add_time_features(feat_full)
feat_full_pred = add_time_features(feat_full_pred)

### Next and previous hour flights count

In [15]:
feat_full      = add_prev_next(feat_full)
feat_full_pred = add_prev_next(feat_full_pred)

## Weather

#### Get latitude/longitude of airports

In [16]:
airport_locations_df = pd.read_csv("../data/raw_data/airports.csv")
airport_groups_df = pd.read_csv("../data/raw_data/airportgroups.csv")

airport_locations_df = airport_locations_df.merge(airport_groups_df, left_on="iata_code", right_on="airport")
airport_locations_df = airport_locations_df.groupby(by = "airport_group")[["latitude_deg", "longitude_deg"]].mean()

#### Find relevant weather sensors

In [17]:
sensors = {}

for (group, location) in airport_locations_df.iterrows():
    endpoint = "https://frost.met.no/sources/v0.jsonld"
    parameters = {
        "types": "SensorSystem",
        "geometry": f"nearest(POINT({location["longitude_deg"]} {location["latitude_deg"]}))"
    }
    r = requests.get(endpoint, parameters, auth=(os.getenv("FROST_ID"),''))
    resp = r.json()

    sensors[group] = resp

for key in sensors.keys():
    sensors[key] = sorted(sensors[key]["data"], key=lambda v: v["distance"])

### Find weather observations on said sensors

In [None]:
weather = {}
elements = ["mean(air_temperature P1D)", "sum(precipitation_amount P1D)"]
count = 0

for (i, row) in feat_full.iterrows():
    count += 1
    print(f"{count} of {feat_full.shape[0]}")
    if (count < 271672): # 259206
        continue

    timestamp = row["timestamp"].strftime("%Y-%m-%d")

    if weather.get(row["airport_group"]) == None:
        weather[row["airport_group"]] = {}
    if weather[row["airport_group"]].get(timestamp) == None:
        weather[row["airport_group"]][timestamp] = {}

    if len(weather[row["airport_group"]][timestamp].keys()) == 0:
        endpoint = 'https://frost.met.no/observations/v0.jsonld'
        parameters = {
            'sources': sensors[row["airport_group"]][0]["id"],
            'elements': ",".join(elements),
            'referencetime': timestamp,
        }
        r = requests.get(endpoint, parameters, auth=(os.getenv("FROST_ID"),''), timeout=10)
    
        if r.status_code == 200:
            json = r.json()

            for (k, v) in list(map(lambda v: (v["elementId"], v["value"]), filter(lambda v: v["timeOffset"] == "PT6H", json["data"][0]["observations"]))):
                weather[row["airport_group"]][timestamp][k] = float(v)
        else:
            for k in elements:
                weather[row["airport_group"]][timestamp][k] = float('nan')
    
    for observations in weather[row["airport_group"]].values():
        for (observation_k, observation_v) in observations.items():
            feat_full.at[i, observation_k] = observation_v

1 of 465143
2 of 465143
3 of 465143
4 of 465143
5 of 465143
6 of 465143
7 of 465143
8 of 465143
9 of 465143
10 of 465143
11 of 465143
12 of 465143
13 of 465143
14 of 465143
15 of 465143
16 of 465143
17 of 465143
18 of 465143
19 of 465143
20 of 465143
21 of 465143
22 of 465143
23 of 465143
24 of 465143
25 of 465143
26 of 465143
27 of 465143
28 of 465143
29 of 465143
30 of 465143
31 of 465143
32 of 465143
33 of 465143
34 of 465143
35 of 465143
36 of 465143
37 of 465143
38 of 465143
39 of 465143
40 of 465143
41 of 465143
42 of 465143
43 of 465143
44 of 465143
45 of 465143
46 of 465143
47 of 465143
48 of 465143
49 of 465143
50 of 465143
51 of 465143
52 of 465143
53 of 465143
54 of 465143
55 of 465143
56 of 465143
57 of 465143
58 of 465143
59 of 465143
60 of 465143
61 of 465143
62 of 465143
63 of 465143
64 of 465143
65 of 465143
66 of 465143
67 of 465143
68 of 465143
69 of 465143
70 of 465143
71 of 465143
72 of 465143
73 of 465143
74 of 465143
75 of 465143
76 of 465143
77 of 465143
78 of 46

KeyboardInterrupt: 

### Find weather forecast on said sensors

In [19]:
weather = {}

for airport_group in set(feat_full_pred["airport_group"]):
    weather[airport_group] = {}

    endpoint = "https://api.met.no/weatherapi/subseasonal/1.0/complete"
    parameters = {
        "lat": sensors[airport_group][0]["geometry"]["coordinates"][1],
        "lon": sensors[airport_group][0]["geometry"]["coordinates"][0]
    }
    r = requests.get(endpoint, parameters, auth=(os.getenv("FROST_ID"),''), headers= { "User-Agent": "uionowciabs" }) # random user agent

    if r.status_code == 200:
        json = r.json()
        for record in json["properties"]["timeseries"]:
            k = pd.to_datetime(record["time"]).strftime("%Y-%m-%d")
            weather[airport_group][k] = {}

            weather[airport_group][k]["mean(air_temperature P1D)"] = record["data"]["next_24_hours"]["details"]["air_temperature_mean"]
            weather[airport_group][k]["sum(precipitation_amount P1D)"] = record["data"]["next_24_hours"]["details"]["precipitation_amount"]

### Add weather forecast (closest to the prediction date)

In [20]:
for (i, row) in feat_full_pred.iterrows():
    available_forecast_dates = list(enumerate(map(lambda v: pd.to_datetime(v), weather[row["airport_group"]].keys())))
    prediction_date = (len(available_forecast_dates), row["timestamp"])

    dates = available_forecast_dates + [prediction_date]
    dates = sorted(dates, key=lambda v: v[1])
    prediction_date_index = dates.index(prediction_date)

    closest_available_forecast_index = -1
    if prediction_date[1].hour >= 12 and prediction_date_index != len(dates) - 1:
        closest_available_forecast_index = min(len(dates) - 1, prediction_date_index + 1)
    else:
        closest_available_forecast_index = max(0, prediction_date_index - 1)
    
    closest_available_forecast = dates[closest_available_forecast_index][1]
    k = closest_available_forecast.strftime("%Y-%m-%d")

    for (forecast_k, forecast_v) in weather[row["airport_group"]][k].items():
        feat_full_pred.at[i, forecast_k] = forecast_v

# Combine datasets and split into training/validation/test

In [21]:
dataset = hourly.merge(feat_full, on=["airport_group", "timestamp"], how="left").sort_values("timestamp")
pred = hourly_pred.merge(feat_full_pred,on=["airport_group","timestamp"], how="left").sort_values("timestamp")

dataset["timestamp"] = pd.to_datetime(dataset["timestamp"])
pred["timestamp"]    = pd.to_datetime(pred["timestamp"])

CUTOFF_VAL  = pd.Timestamp("2025-01-01")  
CUTOFF_TEST = pd.Timestamp("2025-04-01")

train = dataset[dataset["timestamp"] <  CUTOFF_VAL].copy()
val   = dataset[(dataset["timestamp"] >= CUTOFF_VAL) & 
                (dataset["timestamp"] <  CUTOFF_TEST)].copy()
test  = dataset[dataset["timestamp"] >= CUTOFF_TEST].copy()

print(train.shape, val.shape, test.shape, pred.shape)

(429527, 22) (15120, 22) (20496, 22) (5026, 21)


# Export as `.csv` files

In [22]:
train.to_csv('../data/processed_data/train.csv', index=False)
val.to_csv('../data/processed_data/val.csv', index=False)
test.to_csv('../data/processed_data/test.csv', index=False)
pred.to_csv('../data/processed_data/predict_oct2025.csv', index=False)

In [23]:
feat_full.to_csv('../data/processed_data/temp.csv', index=False)

In [50]:
train_cache = pd.read_csv("../data/processed_data/train.csv")
val_cache = pd.read_csv("../data/processed_data/val.csv")
test_cache = pd.read_csv("../data/processed_data/test.csv")

In [51]:
cache_combined = pd.concat([pd.concat([train_cache, val_cache]), test_cache])

In [29]:
cache_combined.shape

(465143, 22)

In [52]:
temp_cache = pd.read_csv("../data/processed_data/temp.csv")

In [31]:
temp_cache.shape

(465143, 20)

In [53]:
temp_cache = temp_cache.sort_values(["airport_group", "timestamp"]).reset_index()
cache_combined = cache_combined.sort_values(["airport_group", "timestamp"]).reset_index()

In [54]:
cache_combined.head(10)

Unnamed: 0,index,airport_group,timestamp,target_actual,target_sched,flights_cnt,avg_duration,max_duration,passenger_share,cargo_share,...,holiday,month,hournum,weekend,date,daily_flights_cnt,flights_cnt_prev,flights_cnt_next,mean(air_temperature P1D),sum(precipitation_amount P1D)
0,2,A,2018-01-01 07:00:00,0,0,0,0.0,0.0,0.0,0.0,...,True,1,7,0,2018-01-01,25,0,0,1.0,0.3
1,13,A,2018-01-01 08:00:00,0,0,0,0.0,0.0,0.0,0.0,...,True,1,8,0,2018-01-01,25,0,0,1.0,0.3
2,20,A,2018-01-01 09:00:00,0,0,0,0.0,0.0,0.0,0.0,...,True,1,9,0,2018-01-01,25,0,0,1.0,0.3
3,25,A,2018-01-01 10:00:00,0,0,0,0.0,0.0,0.0,0.0,...,True,1,10,0,2018-01-01,25,0,3,1.0,0.3
4,31,A,2018-01-01 11:00:00,0,1,3,56.666667,60.0,1.0,0.0,...,True,1,11,0,2018-01-01,25,0,3,1.0,0.3
5,41,A,2018-01-01 12:00:00,0,1,3,46.666667,60.0,1.0,0.0,...,True,1,12,0,2018-01-01,25,3,0,1.0,0.3
6,45,A,2018-01-01 13:00:00,1,0,0,0.0,0.0,0.0,0.0,...,True,1,13,0,2018-01-01,25,3,0,1.0,0.3
7,54,A,2018-01-01 14:00:00,0,0,0,0.0,0.0,0.0,0.0,...,True,1,14,0,2018-01-01,25,0,2,1.0,0.3
8,60,A,2018-01-01 15:00:00,0,0,2,60.0,60.0,1.0,0.0,...,True,1,15,0,2018-01-01,25,0,4,1.0,0.3
9,69,A,2018-01-01 16:00:00,1,1,4,46.25,65.0,1.0,0.0,...,True,1,16,0,2018-01-01,25,2,4,1.0,0.3


In [57]:
for (i, row) in cache_combined.iterrows():
    if pd.isna(row["mean(air_temperature P1D)"]) and pd.isna(row["sum(precipitation_amount P1D)"]):
        cache_combined.at[i, "mean(air_temperature P1D)"] = temp_cache.at[i, "mean(air_temperature P1D)"]
        cache_combined.at[i, "sum(precipitation_amount P1D)"] = temp_cache.at[i, "sum(precipitation_amount P1D)"]

In [62]:
cache_combined = cache_combined.drop('index', axis=1)
cache_combined.to_csv('../data/processed_data/final.csv', index=False)

In [64]:
cache_combined.isna().sum()

airport_group                         0
timestamp                             0
target_actual                         0
target_sched                          0
flights_cnt                           0
avg_duration                          0
max_duration                          0
passenger_share                       0
cargo_share                           0
charter_share                         0
airline                          465143
dow                                   0
holiday                               0
month                                 0
hournum                               0
weekend                               0
date                                  0
daily_flights_cnt                     0
flights_cnt_prev                      0
flights_cnt_next                      0
mean(air_temperature P1D)        332245
sum(precipitation_amount P1D)    265796
dtype: int64