In [78]:
# install required packages
!pip install -r ../requirements.txt

Collecting xgboost (from -r ../requirements.txt (line 14))
  Downloading xgboost-3.0.3-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.3-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [69]:
# Cell 1: imports & data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    classification_report
)

from collections import Counter

In [87]:
# Cell 1 — imports & load
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    classification_report
)
from xgboost import XGBClassifier
import joblib

# Load your preprocessed CSV
df = pd.read_csv("../test_date/data/processed/preprocessed_data.csv")  # <- adjust if needed

TARGET = "DELAY_FLAG_15"
cat_cols = ["AIRLINE", "ORIGIN", "DEST"]
num_cols = [
    "DISTANCE", "day_of_week", "month", "hour_of_day", "is_bank_holiday",
    "dep_rain", "dep_ice", "dep_wind", "arr_rain", "arr_ice", "arr_wind"
]

use_cols = [c for c in cat_cols + num_cols + [TARGET] if c in df.columns]
df = df[use_cols].dropna()

X = df[cat_cols + num_cols]
y = df[TARGET].astype(int)

print("Class balance (all):", Counter(y))

Class balance (all): Counter({0: 1104914, 1: 296762})


In [88]:
# Cell 2 — split + undersample TRAIN only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

def undersample_even_split(X_tr, y_tr, random_state=42):
    cnt = Counter(y_tr)
    n_min = min(cnt.values())
    idx_0 = np.where(y_tr.values == 0)[0]
    idx_1 = np.where(y_tr.values == 1)[0]
    rng = np.random.default_rng(random_state)
    if len(idx_0) > len(idx_1):
        keep = np.concatenate([rng.choice(idx_0, n_min, replace=False), idx_1])
    else:
        keep = np.concatenate([idx_0, rng.choice(idx_1, n_min, replace=False)])
    rng.shuffle(keep)
    return X_tr.iloc[keep].reset_index(drop=True), y_tr.iloc[keep].reset_index(drop=True)

X_tr_bal, y_tr_bal = undersample_even_split(X_train, y_train, random_state=42)
print("TRAIN original:", Counter(y_train))
print("TRAIN balanced:", Counter(y_tr_bal))
print("TEST:", Counter(y_test))

TRAIN original: Counter({0: 883931, 1: 237409})
TRAIN balanced: Counter({1: 237409, 0: 237409})
TEST: Counter({0: 220983, 1: 59353})


In [89]:
# Cell 3 — fit the encoder, transform to matrices
from sklearn.utils.sparsefuncs import inplace_column_scale

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0
)

print("Fitting encoder...")
preprocess.fit(X_tr_bal)
print("Transforming...")
Xtr_enc = preprocess.transform(X_tr_bal)   # sparse
Xte_enc = preprocess.transform(X_test)     # sparse

# Optional: you can standardize numeric columns if you want (usually not needed for trees)
# Leaving as-is for simplicity.

Fitting encoder...
Transforming...


In [71]:
# Cell 2: split
X = df[cat_cols + num_cols]
y = df[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print("Train balance:", Counter(y_train))
print("Test balance:", Counter(y_test))

Train balance: Counter({0: 883931, 1: 237409})
Test balance: Counter({0: 220983, 1: 59353})


In [91]:
# Cell 4 — train XGBoost with live logs + early stopping (compatible with older versions)
xgb = XGBClassifier(
    n_estimators=500,      # high cap; early stopping will halt earlier
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="aucpr",    # better for imbalanced data
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

eval_set = [(Xtr_enc, y_tr_bal), (Xte_enc, y_test)]

print("Training XGBoost with progress...")
trained_with_es = False
try:
    # Newer versions support early_stopping_rounds + verbose directly
    xgb.fit(
        Xtr_enc, y_tr_bal,
        eval_set=eval_set,
        early_stopping_rounds=30,
        verbose=10   # print every 10 boosting rounds
    )
    trained_with_es = True
except TypeError:
    # Older versions: remove early stopping but keep logs
    print("Early stopping not supported in this XGBoost version. Training without it...")
    xgb.fit(
        Xtr_enc, y_tr_bal,
        eval_set=eval_set,
        verbose=10
    )

print("Done. Best ntree limit:", getattr(xgb, "best_ntree_limit", None))

Training XGBoost with progress...
Early stopping not supported in this XGBoost version. Training without it...
[0]	validation_0-aucpr:0.63248	validation_1-aucpr:0.32009
[10]	validation_0-aucpr:0.65868	validation_1-aucpr:0.34946
[20]	validation_0-aucpr:0.66485	validation_1-aucpr:0.35626
[30]	validation_0-aucpr:0.66680	validation_1-aucpr:0.35833
[40]	validation_0-aucpr:0.67048	validation_1-aucpr:0.36254
[50]	validation_0-aucpr:0.67321	validation_1-aucpr:0.36533
[60]	validation_0-aucpr:0.67577	validation_1-aucpr:0.36782
[70]	validation_0-aucpr:0.67779	validation_1-aucpr:0.36964
[80]	validation_0-aucpr:0.67982	validation_1-aucpr:0.37142
[90]	validation_0-aucpr:0.68141	validation_1-aucpr:0.37280
[100]	validation_0-aucpr:0.68275	validation_1-aucpr:0.37394
[110]	validation_0-aucpr:0.68419	validation_1-aucpr:0.37513
[120]	validation_0-aucpr:0.68537	validation_1-aucpr:0.37620
[130]	validation_0-aucpr:0.68661	validation_1-aucpr:0.37725
[140]	validation_0-aucpr:0.68741	validation_1-aucpr:0.37779


In [92]:
# Cell 5 — evaluate
# Use the best_ntree_limit if early stopping was used
use_ntree = getattr(xgb, "best_ntree_limit", 0) if trained_with_es else 0

if use_ntree and use_ntree > 0:
    proba = xgb.predict_proba(Xte_enc, ntree_limit=use_ntree)[:, 1]
else:
    proba = xgb.predict_proba(Xte_enc)[:, 1]

pred = (proba >= 0.5).astype(int)

metrics = {
    "accuracy": accuracy_score(y_test, pred),
    "f1": f1_score(y_test, pred),
    "roc_auc": roc_auc_score(y_test, proba),
    "pr_auc": average_precision_score(y_test, proba),
}
print("Metrics:", metrics)
print("\nClassification report:\n", classification_report(y_test, pred, digits=3))

Metrics: {'accuracy': 0.6504908395639518, 'f1': 0.4283613960163825, 'roc_auc': 0.6911627260247225, 'pr_auc': 0.38633868297701235}

Classification report:
               precision    recall  f1-score   support

           0      0.865     0.659     0.748    220983
           1      0.328     0.619     0.428     59353

    accuracy                          0.650    280336
   macro avg      0.597     0.639     0.588    280336
weighted avg      0.752     0.650     0.681    280336



In [93]:
# Cell 6 — save artifacts (encoder + model)
joblib.dump(preprocess, "models/preprocess.pkl")
# Save the XGBoost model as JSON (version-stable)
xgb.save_model("models/xgb_model.json")
print("Saved models/preprocess.pkl and models/xgb_model.json")

Saved models/preprocess.pkl and models/xgb_model.json


In [94]:
# Cell 7 — quick inference helper (optional)
def predict_delay_proba(batch_df: pd.DataFrame):
    """
    batch_df must have the same feature columns: cat_cols + num_cols
    """
    enc = joblib.load("models/preprocess.pkl")
    from xgboost import XGBClassifier
    mdl = XGBClassifier()
    mdl.load_model("models/xgb_model.json")
    Xb = enc.transform(batch_df[cat_cols + num_cols])
    return mdl.predict_proba(Xb)[:, 1]

# Example:
sample = X_test.iloc[:5].copy()
predict_delay_proba(sample)

array([0.33312893, 0.494758  , 0.6988505 , 0.3631203 , 0.6668398 ],
      dtype=float32)