In [40]:
# install required packages
!pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [55]:
import pandas as pd
import numpy as np

PATH = "../test_data/data/processed/preprocessed_data.csv"  # <— change if needed
TARGET = "DELAY_FLAG_15"

df = pd.read_csv(PATH)
print(df.shape)
df.head(3)

(735750, 15)


Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,DELAY_FLAG_15,dep_rain,dep_ice,dep_wind,arr_rain,arr_ice,arr_wind,day_of_week,month,hour_of_day,is_bank_holiday
0,Delta Air Lines Inc.,MSP,SEA,1399.0,0,0.0,0.0,0.0,0.0,0.0,0.0,5,11,21,0
1,Delta Air Lines Inc.,MSP,SFO,1589.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0,3,16,0
2,American Airlines Inc.,DCA,BOS,399.0,0,1.0,0.0,0.0,0.0,0.0,0.0,6,6,10,0


In [56]:
# class balance
df[TARGET].value_counts(normalize=True).rename("share")


DELAY_FLAG_15
0    0.781256
1    0.218744
Name: share, dtype: float64

In [57]:
# basic NA check
df.isna().mean().sort_values(ascending=False).head(10)

dep_rain    0.753813
dep_ice     0.753813
dep_wind    0.753813
arr_rain    0.753813
arr_ice     0.753813
arr_wind    0.753813
AIRLINE     0.000000
ORIGIN      0.000000
DEST        0.000000
DISTANCE    0.000000
dtype: float64

In [43]:
# Categorical identifiers (keep raw strings here)
candidate_cats = ["AIRLINE", "ORIGIN", "DEST"]

# Weather flags (we'll impute missing with 0)
candidate_weather = ["dep_rain","dep_ice","dep_wind","arr_rain","arr_ice","arr_wind"]

# Other numeric features (common ones; add/remove as your file dictates)
candidate_num = ["day_of_week", "month", "hour_of_day", "is_bank_holiday", "DISTANCE"]

# Keep only those that actually exist in the DataFrame
cat_cols     = [c for c in candidate_cats if c in df.columns]
weather_cols = [c for c in candidate_weather if c in df.columns]
num_cols     = [c for c in candidate_num if c in df.columns and c not in weather_cols]

print("Categorical:", cat_cols)
print("Weather:", weather_cols)
print("Numeric:", num_cols)

# Make sure target is int
df[TARGET] = df[TARGET].astype("int8")

Categorical: ['AIRLINE', 'ORIGIN', 'DEST']
Weather: ['dep_rain', 'dep_ice', 'dep_wind', 'arr_rain', 'arr_ice', 'arr_wind']
Numeric: ['day_of_week', 'month', 'hour_of_day', 'is_bank_holiday', 'DISTANCE']


In [44]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.shape, X_test.shape, y_train.mean(), y_test.mean()

((1119948, 14), (279988, 14), 0.21170000750034823, 0.21169835850107863)

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

pre = ColumnTransformer(
    transformers=[
        ("weather", SimpleImputer(strategy="constant", fill_value=0), weather_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
    ],
    remainder="drop"
)

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

models = {
    "dummy": Pipeline([("pre", pre), ("clf", DummyClassifier(strategy="most_frequent"))]),
    "logreg": Pipeline([
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=500, class_weight="balanced", n_jobs=-1))
    ]),
    "lgbm": Pipeline([
        ("pre", pre),
        ("clf", LGBMClassifier(
            n_estimators=800,
            learning_rate=0.05,
            num_leaves=63,
            subsample=0.8,
            colsample_bytree=0.8,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        ))
    ]),
}

In [47]:
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    precision_recall_fscore_support, classification_report
)
import numpy as np

def evaluate(model, Xtr, ytr, Xte, yte, name="model"):
    model.fit(Xtr, ytr)
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(Xte)[:, 1]
    elif hasattr(model, "decision_function"):
        s = model.decision_function(Xte)
        proba = (s - s.min()) / (s.max() - s.min() + 1e-9)  # normalize to 0..1
    else:
        proba = model.predict(Xte)
    pred = (proba >= 0.5).astype(int)

    metrics = {
        "acc": accuracy_score(yte, pred),
        "f1": f1_score(yte, pred),
        "roc_auc": roc_auc_score(yte, proba),
        "pr_auc": average_precision_score(yte, proba),
    }
    print(f"\n{name}: {metrics}")
    print(classification_report(yte, pred, digits=3))
    return model, metrics, proba, pred

In [48]:
results = {}
fitted = {}

for name, mdl in models.items():
    m, met, proba, pred = evaluate(mdl, X_train, y_train, X_test, y_test, name=name)
    results[name] = met
    fitted[name] = (m, proba, pred)

pd.DataFrame(results).T.sort_values("pr_auc", ascending=False)


dummy: {'acc': 0.7883016414989213, 'f1': 0.0, 'roc_auc': 0.5, 'pr_auc': 0.21169835850107863}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0      0.788     1.000     0.882    220715
           1      0.000     0.000     0.000     59273

    accuracy                          0.788    279988
   macro avg      0.394     0.500     0.441    279988
weighted avg      0.621     0.788     0.695    279988



KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, ConfusionMatrixDisplay

best_name = max(results, key=lambda k: results[k]["pr_auc"])
best_model = fitted[best_name][0]
proba = fitted[best_name][1]
pred  = fitted[best_name][2]

print("Best model:", best_name)

fig, ax = plt.subplots()
PrecisionRecallDisplay.from_predictions(y_test, proba, ax=ax)
plt.title(f"Precision-Recall: {best_name}")
plt.show()

fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, proba, ax=ax)
plt.title(f"ROC: {best_name}")
plt.show()

fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_test, pred, normalize="true", ax=ax)
plt.title(f"Confusion Matrix (normalized): {best_name}")
plt.show()