In [1]:
# ----------------------------
# 1. Install & import packages
# ----------------------------
!pip -q install xgboost scikit-learn pandas numpy

import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import joblib

# ----------------------------
# 2. Generate synthetic data
#    (replace with your CSV later)
# ----------------------------
n = 300
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "order_id": range(n),
    "carrier" : rng.choice(["CarrierA","CarrierB","CarrierC"], n),
    "zip"     : rng.integers(10000, 19999, n),
    "delay_minutes": rng.exponential(20, n)     # skewed like real delays
})
df["is_late"] = (df.delay_minutes > 30).astype(int)

# ----------------------------
# 3. Prep features (very simple)
# ----------------------------
X = pd.get_dummies(df[["carrier", "zip", "delay_minutes"]], drop_first=True)
y = df["is_late"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------
# 4. Train baseline model
# ----------------------------
model = XGBClassifier(
    n_estimators=150,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42
)
model.fit(X_train, y_train)

# ----------------------------
# 5. Evaluate
# ----------------------------
preds = model.predict_proba(X_test)[:,1]
auc   = roc_auc_score(y_test, preds)
print("ROC-AUC:", round(auc, 3))

# ----------------------------
# 6. Save artefacts
# ----------------------------
joblib.dump(model, "delay_model.pkl")
df.head(3)


ROC-AUC: 1.0


Unnamed: 0,order_id,carrier,zip,delay_minutes,is_late
0,0,CarrierA,13636,21.936925,0
1,1,CarrierC,18262,3.850732,0
2,2,CarrierB,10099,23.313052,0
