In [1]:
import pandas as pd

df = pd.read_csv("data.csv")

In [2]:
# Single cell: Restored pipeline (top-N -> group mean/count -> rolling -> models)
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, f1_score, classification_report, confusion_matrix
)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ---------- load df ----------
try:
    df
except NameError:
    candidates = ["data.csv", "./data.csv", "/mnt/data/data.csv"]
    loaded = False
    for p in candidates:
        if os.path.exists(p):
            df = pd.read_csv(p)
            loaded = True
            print("Loaded:", p)
            break
    if not loaded:
        raise FileNotFoundError("No `df` and no data.csv found. Place file or define `df`.")

df = df.copy()
print("Initial shape:", df.shape)

# ---------- basic cleaning ----------
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df[(df['CANCELLED'].astype(float) == 0)]
if 'DIVERTED' in df.columns:
    df = df[df['DIVERTED'].astype(float) == 0]

TARGET = 'ArrivalDelay'
df[TARGET] = pd.to_numeric(df[TARGET], errors='coerce')
df = df.dropna(subset=[TARGET, 'Date']).reset_index(drop=True)
print("After filtering:", df.shape)

# ---------- train/test time split ----------
df = df.sort_values('Date').reset_index(drop=True)
split_idx = int(0.8 * len(df))
train_df = df.iloc[:split_idx].copy()
test_df  = df.iloc[split_idx:].copy()
print("Train / Test sizes:", train_df.shape, test_df.shape)

# ---------- top-N cardinality reduction (do this BEFORE group stats & rolling) ----------
def top_n_map(series, n=30):
    top = series.value_counts().nlargest(n).index
    return series.where(series.isin(top), other='OTHER')

# Ensure columns exist and are string typed before mapping
for c, n in [('Airline',10), ('ORIGIN',20), ('DEST',20), ('Route',50)]:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype(str)
        test_df[c]  = test_df[c].astype(str)
        train_df[c] = top_n_map(train_df[c], n=n)
        test_df[c]  = top_n_map(test_df[c], n=n)

# ---------- leakage-safe group mean/count features (train-only stats -> map to test) ----------
for col in ['Airline', 'ORIGIN', 'DEST', 'Route']:
    if col in train_df.columns:
        stats = train_df.groupby(col)[TARGET].agg(['mean','count']).rename(columns={'mean':'stat_mean','count':'stat_count'})
        # map to train
        train_df[f'{col}_mean'] = train_df[col].map(stats['stat_mean'])
        train_df[f'{col}_count'] = train_df[col].map(stats['stat_count'])
        # map to test using the same train stats
        test_df[f'{col}_mean'] = test_df[col].map(stats['stat_mean'])
        test_df[f'{col}_count'] = test_df[col].map(stats['stat_count'])
        # fill missing in test (group unseen in train)
        test_df[f'{col}_mean'] = test_df[f'{col}_mean'].fillna(train_df[f'{col}_mean'].mean())
        test_df[f'{col}_count'] = test_df[f'{col}_count'].fillna(0)

# ---------- robust rolling helper (idempotent) ----------
def add_train_rolling_features(train, test, group_col, target_col='ArrivalDelay', windows_days=(7,14)):
    helper = f"_date_day_{group_col}"
    # remove helper if present
    for d in (train, test):
        if helper in d.columns:
            d.drop(columns=[helper], inplace=True)
    # create helper day column
    train[helper] = train['Date'].dt.floor('D')
    test[helper]  = test['Date'].dt.floor('D')

    # daily aggregation from TRAIN only
    daily = (train.groupby([group_col, helper])[target_col]
             .agg(['mean', 'count']).reset_index().rename(columns={'mean':'daily_mean','count':'daily_count'}))

    blocks = []
    for gval, g in daily.groupby(group_col):
        g = g.sort_values(helper)
        idx = pd.date_range(g[helper].min(), g[helper].max(), freq='D')
        mean_s = g.set_index(helper)['daily_mean'].reindex(idx)
        count_s = g.set_index(helper)['daily_count'].reindex(idx).fillna(0)
        tmp = pd.DataFrame({helper: idx, group_col: gval,
                            'daily_mean': mean_s.values, 'daily_count': count_s.values})
        for w in windows_days:
            tmp[f'{group_col}_rolling_mean_{w}d'] = tmp['daily_mean'].rolling(window=w, min_periods=1).mean()
            tmp[f'{group_col}_rolling_count_{w}d'] = tmp['daily_count'].rolling(window=w, min_periods=1).sum()
        blocks.append(tmp)
    if len(blocks) == 0:
        # cleanup and return
        train.drop(columns=[helper], inplace=True)
        test.drop(columns=[helper], inplace=True)
        return train, test

    rolling_all = pd.concat(blocks, ignore_index=True)
    roll_cols = [c for c in rolling_all.columns if 'rolling_' in c]

    # join rolling features into TRAIN by group + day
    train = train.merge(rolling_all[[group_col, helper] + roll_cols], on=[group_col, helper], how='left')

    # For TEST: map last available rolling values from train (no leakage)
    last_vals = rolling_all.sort_values(helper).groupby(group_col).last()
    for col in roll_cols:
        test[col] = test[group_col].map(last_vals[col])
        # if still missing, fallback to global train mean of that rolling column
        if col in train.columns:
            test[col] = test[col].fillna(train[col].mean())

    # cleanup helper
    train.drop(columns=[helper], inplace=True)
    test.drop(columns=[helper], inplace=True)
    return train, test

# Apply rolling features (after top-n and group stats)
for g in ['Route','Airline']:
    if g in train_df.columns:
        train_df, test_df = add_train_rolling_features(train_df, test_df, g, TARGET, windows_days=(7,14))

print("After rolling features: train cols =", train_df.shape[1], "test cols =", test_df.shape[1])

# ---------- basic feature engineering ----------
num_candidates = ['TAXI_OUT','TAXI_IN','AIR_TIME','DISTANCE','CRS_ELAPSED_TIME','ELAPSED_TIME',
                  'DepartureHour','ArrivalHour','ScheduledDep']
for c in num_candidates:
    if c in train_df.columns:
        train_df[c] = pd.to_numeric(train_df[c], errors='coerce')
        test_df[c]  = pd.to_numeric(test_df[c], errors='coerce')

if 'ScheduledDep' in train_df.columns:
    train_df['ScheduledDepHour'] = (train_df['ScheduledDep'] // 100).astype('Int64')
    test_df['ScheduledDepHour']  = (test_df['ScheduledDep'] // 100).astype('Int64')

for d in (train_df, test_df):
    d['month'] = d['Date'].dt.month
    d['dayofweek'] = d['Date'].dt.dayofweek
    d['is_weekend'] = d['dayofweek'].isin([5,6]).astype(int)

# ---------- feature lists ----------
base_numeric = ['TAXI_OUT','TAXI_IN','AIR_TIME','DISTANCE','CRS_ELAPSED_TIME','ELAPSED_TIME',
                'ScheduledDepHour','DepartureHour','ArrivalHour','month','dayofweek','is_weekend']
numeric_features = [c for c in base_numeric if c in train_df.columns]
# add rolling cols
numeric_features += [c for c in train_df.columns if ('rolling_mean' in c) or ('rolling_count' in c)]
# add group mean/count columns if present
for suf in ['_mean','_count']:
    for grp in ['Airline','ORIGIN','DEST','Route']:
        col = f'{grp}{suf}'
        if col in train_df.columns:
            numeric_features.append(col)

categorical_features = [c for c in ['Airline','ORIGIN','DEST','Route'] if c in train_df.columns]

print("Numeric features used:", len(numeric_features))
print("Categorical features used:", len(categorical_features))

# ---------- prepare X/y ----------
HEAVY_THRESH = 30.0

X_train_full = train_df[numeric_features + categorical_features].copy()
X_test_full  = test_df[numeric_features + categorical_features].copy()

y_train_class = (train_df[TARGET] > HEAVY_THRESH).astype(int)
y_test_class  = (test_df[TARGET] > HEAVY_THRESH).astype(int)

train_idx_norm = train_df[train_df[TARGET] <= HEAVY_THRESH].index
train_idx_heavy = train_df[train_df[TARGET] > HEAVY_THRESH].index

y_norm_s = np.log1p(np.clip(train_df.loc[train_idx_norm, TARGET], a_min=0, a_max=None))
y_heavy_s = np.log1p(np.clip(train_df.loc[train_idx_heavy, TARGET], a_min=0, a_max=None))

y_norm_s = pd.Series(y_norm_s, index=train_idx_norm).replace([np.inf, -np.inf], np.nan).dropna()
y_heavy_s = pd.Series(y_heavy_s, index=train_idx_heavy).replace([np.inf, -np.inf], np.nan).dropna()

X_train_norm = X_train_full.loc[y_norm_s.index].copy()
X_train_heavy = X_train_full.loc[y_heavy_s.index].copy()
y_train_norm = y_norm_s.values
y_train_heavy = y_heavy_s.values

print("Normal train samples:", X_train_norm.shape[0], "Heavy train samples:", X_train_heavy.shape[0])

# ---------- preprocessor ----------
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('ohe', ohe)
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop', n_jobs=-1)

# ---------- classifier ----------
clf_pipe = Pipeline([
    ('preproc', preprocessor),
    ('clf', HistGradientBoostingClassifier(random_state=RANDOM_SEED))
])
print("Training heavy-delay classifier...")
clf_pipe.fit(X_train_full, y_train_class)
y_test_pred_class = clf_pipe.predict(X_test_full)
print("Classifier metrics (test): acc=%.3f f1=%.3f" % (accuracy_score(y_test_class, y_test_pred_class), f1_score(y_test_class, y_test_pred_class)))
print(classification_report(y_test_class, y_test_pred_class, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test_class, y_test_pred_class))

# ---------- regressors (tuning & train) ----------
USE_XGB = False
try:
    import xgboost as xgb
    from xgboost import XGBRegressor
    USE_XGB = True
    print("XGBoost available:", xgb.__version__)
except Exception:
    print("XGBoost unavailable; using HistGradientBoostingRegressor fallback.")
    USE_XGB = False

tscv = TimeSeriesSplit(n_splits=3)

def tune_and_train_regressor(X_sub, y_sub, model_name='normal', n_iter=12):
    if X_sub.shape[0] < 100:
        print(f"[{model_name}] Too few samples ({X_sub.shape[0]}). Fitting default regressor without tuning.")
        if USE_XGB:
            reg = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=RANDOM_SEED, n_jobs=-1, verbosity=0)
        else:
            reg = HistGradientBoostingRegressor(max_iter=200, random_state=RANDOM_SEED)
        pipe = Pipeline([('preproc', preprocessor), ('reg', reg)])
        pipe.fit(X_sub, y_sub)
        return pipe

    if USE_XGB:
        base = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED, n_jobs=-1, verbosity=0)
        param_dist = {
            'reg__n_estimators': [100,200,400],
            'reg__max_depth': [3,5,7],
            'reg__learning_rate': [0.01,0.03,0.05],
            'reg__subsample': [0.6,0.8,1.0],
            'reg__colsample_bytree': [0.6,0.8,1.0]
        }
    else:
        base = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
        param_dist = {
            'reg__max_iter': [100,200,400],
            'reg__max_leaf_nodes': [15,31,63],
            'reg__learning_rate': [0.01,0.05,0.1]
        }

    pipe = Pipeline([('preproc', preprocessor), ('reg', base)])
    rsearch = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=n_iter, cv=tscv,
                                 scoring='neg_mean_absolute_error', n_jobs=-1, random_state=RANDOM_SEED, verbose=1, refit=True)
    print(f"[{model_name}] Running RandomizedSearchCV (n_iter={n_iter}) ...")
    rsearch.fit(X_sub, y_sub)
    print(f"[{model_name}] Best params:", rsearch.best_params_)
    return rsearch.best_estimator_

normal_reg_pipe = tune_and_train_regressor(X_train_norm, y_train_norm, model_name='normal', n_iter=12)
heavy_reg_pipe = tune_and_train_regressor(X_train_heavy, y_train_heavy, model_name='heavy', n_iter=8)

# ---------- predict & combine ----------
print("Predicting on test set...")
clf_pred = clf_pipe.predict(X_test_full)

pred_norm_log = normal_reg_pipe.predict(X_test_full)
pred_heavy_log = heavy_reg_pipe.predict(X_test_full)

pred_norm = np.expm1(pred_norm_log)
pred_heavy = np.expm1(pred_heavy_log)

pred = np.where(clf_pred == 1, pred_heavy, pred_norm)
pred = np.clip(pred, a_min=0.0, a_max=None)

# ---------- evaluation ----------
y_test = test_df[TARGET].values
rmse = np.sqrt(mean_squared_error(y_test, pred))
mae  = mean_absolute_error(y_test, pred)
r2   = r2_score(y_test, pred)
print("\nCombined model evaluation (test):")
print(f"Samples (test): {len(y_test)}")
print(f"RMSE : {rmse:.3f}")
print(f"MAE  : {mae:.3f}")
print(f"R²   : {r2:.3f}")

mask_normal = (test_df[TARGET] <= HEAVY_THRESH).values
mask_heavy  = (test_df[TARGET] > HEAVY_THRESH).values

def seg_metrics(y_true, y_pred):
    return {
        'count': len(y_true),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)) if len(y_true)>0 else np.nan,
        'MAE': mean_absolute_error(y_true, y_pred) if len(y_true)>0 else np.nan,
        'R2': r2_score(y_true, y_pred) if len(y_true)>1 else np.nan
    }

print("\nPerformance by true-segment:")
print(" Normal (<=30min):", seg_metrics(y_test[mask_normal], pred[mask_normal]))
print(" Heavy  (>30min):", seg_metrics(y_test[mask_heavy], pred[mask_heavy]))

print("\nClassifier confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test_class, clf_pred))

print("\nDone.")

Initial shape: (275272, 37)
After filtering: (267329, 37)
Train / Test sizes: (213863, 37) (53466, 37)
After rolling features: train cols = 53 test cols = 53
Numeric features used: 28
Categorical features used: 4
Normal train samples: 191160 Heavy train samples: 22703
Training heavy-delay classifier...
Classifier metrics (test): acc=0.927 f1=0.698
              precision    recall  f1-score   support

           0      0.933     0.986     0.959     45711
           1      0.874     0.582     0.698      7755

    accuracy                          0.927     53466
   macro avg      0.903     0.784     0.828     53466
weighted avg      0.924     0.927     0.921     53466

Confusion matrix:
 [[45058   653]
 [ 3245  4510]]
XGBoost available: 3.0.5
[normal] Running RandomizedSearchCV (n_iter=12) ...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[normal] Best params: {'reg__subsample': 0.8, 'reg__n_estimators': 400, 'reg__max_depth': 5, 'reg__learning_rate': 0.05, 'reg__colsampl

## Save the model

In [3]:
# Save models + artifacts (single cell)
import os, json, sys
from pathlib import Path
from joblib import dump, load
import numpy as np
import pandas as pd
import sklearn
import platform

# required objects (should exist in notebook)
# clf_pipe, normal_reg_pipe, heavy_reg_pipe, numeric_features, categorical_features, HEAVY_THRESH, TARGET
# optional: test_df for sample inference

ARTIFACT_DIR = Path("model_artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

# 1) save trained pipelines
print("Saving pipelines...")
dump(clf_pipe, ARTIFACT_DIR / "clf_pipe.joblib")
dump(normal_reg_pipe, ARTIFACT_DIR / "normal_reg_pipe.joblib")
dump(heavy_reg_pipe, ARTIFACT_DIR / "heavy_reg_pipe.joblib")

# 2) save feature lists & constants
print("Saving feature metadata...")
meta = {
    "numeric_features": list(map(str, numeric_features)),
    "categorical_features": list(map(str, categorical_features)),
    "heavy_threshold": float(HEAVY_THRESH),
    "target": str(TARGET)
}
with open(ARTIFACT_DIR / "features.json", "w") as f:
    json.dump(meta, f, indent=2)

# 3) save training & evaluation metadata (fill with values you reported)
eval_metrics = {
    "data": {
        "initial_shape": [275272, 37],
        "after_filtering": [267329, 37],
        "train_shape": [213863, 37],
        "test_shape": [53466, 37],
        "rolling_train_cols": 53,
        "numeric_features_used": len(numeric_features),
        "categorical_features_used": len(categorical_features),
        "normal_train_samples": int((train_df[TARGET] <= HEAVY_THRESH).sum()) if 'train_df' in globals() else None,
        "heavy_train_samples": int((train_df[TARGET] > HEAVY_THRESH).sum()) if 'train_df' in globals() else None
    },
    "classifier": {
        "accuracy": 0.927,
        "f1_heavy": 0.698,
        "precision_heavy": 0.874,
        "recall_heavy": 0.582,
        "confusion_matrix": [[45058, 653], [3245, 4510]]
    },
    "regression": {
        "RMSE": 38.226,
        "MAE": 16.313,
        "R2": 0.604,
        "normal_segment": {"MAE": 12.543, "RMSE": 15.552},
        "heavy_segment": {"MAE": 38.538, "RMSE": 92.999, "R2": 0.406}
    }
}
with open(ARTIFACT_DIR / "metadata.json", "w") as f:
    json.dump(eval_metrics, f, indent=2)

# 4) save runtime & library versions
versions = {
    "python": sys.version.splitlines()[0],
    "sklearn": sklearn.__version__,
    "joblib": getattr(sys.modules.get('joblib'), '__version__', None),
}
try:
    import xgboost
    versions["xgboost"] = xgboost.__version__
except Exception:
    versions["xgboost"] = None

with open(ARTIFACT_DIR / "versions.json", "w") as f:
    json.dump(versions, f, indent=2)

# 5) create a tiny inference helper module (predictor.py) saved into artifacts
print("Creating predictor helper module...")
predictor_code = f'''
import json
import numpy as np
import pandas as pd
from joblib import load
from pathlib import Path

ART_DIR = Path("{ARTIFACT_DIR.as_posix()}")

# load artifacts
clf = load(ART_DIR / "clf_pipe.joblib")
reg_norm = load(ART_DIR / "normal_reg_pipe.joblib")
reg_heavy = load(ART_DIR / "heavy_reg_pipe.joblib")
with open(ART_DIR / "features.json", "r") as f:
    features = json.load(f)

HEAVY_THRESH = {HEAVY_THRESH}

def predict_single(row_dict):
    """
    row_dict: mapping of feature_name -> value (must include categorical keys and numeric keys used by the model)
    Returns: dict with predicted_delay (minutes) and heavy_flag (0/1)
    """
    # build DataFrame
    df = pd.DataFrame([row_dict])
    X = df[features['numeric_features'] + features['categorical_features']]

    heavy_prob = clf.predict(X)[0]
    # regressors expect the whole feature set; they output log1p predictions
    pred_norm_log = reg_norm.predict(X)[0]
    pred_heavy_log = reg_heavy.predict(X)[0]
    pred_norm = float(np.expm1(pred_norm_log))
    pred_heavy = float(np.expm1(pred_heavy_log))
    pred = pred_heavy if heavy_prob == 1 else pred_norm
    pred = max(0.0, pred)
    return {{
        "predicted_delay_min": pred,
        "heavy_flag": int(heavy_prob),
        "pred_norm": pred_norm,
        "pred_heavy": pred_heavy
    }}
'''

with open(ARTIFACT_DIR / "predictor.py", "w") as f:
    f.write(predictor_code)

# 6) Small smoke test (if test_df available)
smoke = {}
if 'test_df' in globals():
    try:
        sample_row = X_test_full.iloc[0].to_dict()
        from importlib import import_module
        spec = __import__(f"model_artifacts.predictor", fromlist=['predict_single'])
    except Exception:
        # try calling the saved module directly by loading predictor.py
        import importlib.util, importlib.machinery
        spec = importlib.util.spec_from_file_location("predictor_mod", str(ARTIFACT_DIR / "predictor.py"))
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        smoke = mod.predict_single(X_test_full.iloc[0].to_dict())
print("Saved artifacts to:", ARTIFACT_DIR)
print("Artifacts list:", list(ARTIFACT_DIR.iterdir()))
print("Smoke test prediction (example):", smoke)

Saving pipelines...
Saving feature metadata...
Creating predictor helper module...
Saved artifacts to: model_artifacts
Artifacts list: [WindowsPath('model_artifacts/clf_pipe.joblib'), WindowsPath('model_artifacts/features.json'), WindowsPath('model_artifacts/heavy_reg_pipe.joblib'), WindowsPath('model_artifacts/metadata.json'), WindowsPath('model_artifacts/normal_reg_pipe.joblib'), WindowsPath('model_artifacts/predictor.py'), WindowsPath('model_artifacts/versions.json'), WindowsPath('model_artifacts/__pycache__')]
Smoke test prediction (example): {}
