ML MODELS

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import joblib

In [2]:
# ============================
# Configuration
# ============================
CSV_PATH = "cleaned_flights.csv"  # input table, resolved relative to the working directory
RANDOM_SEED = 42                  # single seed shared by the split and by model training
BATCH_SIZE = 1024                 # used for both fit() and predict()
EPOCHS = 30                       # upper bound; EarlyStopping may end training sooner
VALIDATION_SPLIT = 0.1            # fraction of the training rows Keras holds out for validation

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only NumPy
# and TensorFlow leaves the Python `random` module unseeded, so runs are not
# fully repeatable.
tf.keras.utils.set_random_seed(RANDOM_SEED)

In [3]:
# ============================
# Load data
# ============================
df = pd.read_csv(CSV_PATH)

# ============================
# Targets
# ============================
# Each target is the first column from its preference list that exists in the file.
target_reg = next((c for c in ["DEP_DELAY", "DEP_DELAY_NEW", "ARR_DELAY"] if c in df.columns), None)
target_clf = next((c for c in ["CANCELLED", "CANCELLED_IND"] if c in df.columns), None)
if not target_reg or not target_clf:
    raise ValueError("Required target columns missing")

# ============================
# Features
# ============================
candidate_features = ["AIRLINE", "ORIGIN", "DEST", "CRS_DEP_TIME_HOUR", "CRS_ARR_TIME_HOUR",
                      "Month", "DayOfWeek", "IsWeekend", "DISTANCE", "FL_NUMBER", "Route"]

# Candidates are matched to the file's columns case-insensitively; `features`
# always holds the spelling actually used in `df`.
cols_lower = {c.lower(): c for c in df.columns}
features = [cols_lower[f.lower()] for f in candidate_features if f.lower() in cols_lower]

# Create Route if missing. The lookup uses the same case-insensitive mapping as
# the feature list, so an existing "route" column is not duplicated and
# lower-case origin/destination columns still yield a derived route.
origin_col = cols_lower.get("origin")
dest_col = cols_lower.get("dest")
if "route" not in cols_lower and origin_col is not None and dest_col is not None:
    df["Route"] = df[origin_col].astype(str) + "_" + df[dest_col].astype(str)
    features.append("Route")

In [4]:

# ============================
# Prepare X, y
# ============================
# Every non-numeric feature is treated as categorical. Testing for
# "not numeric" instead of `dtype == "object"` also catches string and
# category dtypes, which would otherwise reach `astype(float)` below and fail.
cat_feats = [f for f in features if not pd.api.types.is_numeric_dtype(df[f])]
num_feats = [f for f in features if f not in cat_feats]

# Missing categories become their own "Unknown" level. The cast to object
# lets the fill value be written regardless of the column's original dtype.
df[cat_feats] = df[cat_feats].astype(object).fillna("Unknown")

# Regression target: missing delays count as 0 and negative values are clipped to 0.
y_reg = df[target_reg].fillna(0).clip(lower=0).astype(float)
# Classification target: missing flags count as 0.
y_clf = df[target_clf].fillna(0).astype(int)

# One-hot encode the categoricals before splitting so train and test share the
# same columns. `dtype=float` keeps the whole design matrix numeric; without
# it recent pandas versions emit bool dummy columns next to the float features.
X = pd.concat([
    df[num_feats].fillna(0).astype(float),
    pd.get_dummies(df[cat_feats].astype(str), drop_first=True, dtype=float)
    if cat_feats else pd.DataFrame(index=df.index)
], axis=1)


In [5]:
# ============================
# Train/test split
# ============================
# Without a flight date the split is random; with one it is chronological:
# the earliest 80% of rows train the models and the latest 20% test them.
if "FL_DATE" not in df.columns:
    (X_train, X_test,
     y_train_reg, y_test_reg,
     y_train_clf, y_test_clf) = train_test_split(
        X, y_reg, y_clf, test_size=0.2, random_state=RANDOM_SEED
    )
else:
    # Unparseable dates are coerced to NaT instead of raising.
    df["FL_DATE_dt"] = pd.to_datetime(df["FL_DATE"], errors="coerce")
    df_sorted_idx = df.sort_values("FL_DATE_dt").index
    split_idx = int(len(df_sorted_idx) * 0.8)
    train_idx = df_sorted_idx[:split_idx]
    test_idx = df_sorted_idx[split_idx:]

    # Select the same rows from the feature matrix and from both targets.
    X_train, y_train_reg, y_train_clf = (part.loc[train_idx] for part in (X, y_reg, y_clf))
    X_test, y_test_reg, y_test_clf = (part.loc[test_idx] for part in (X, y_reg, y_clf))


In [6]:

# ============================
# Scale numeric features
# ============================
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
if num_feats:
    # Work on explicit copies so the assignments below never write into a
    # slice of X.
    X_train, X_test = X_train.copy(), X_test.copy()
    # Always read the unscaled values from X (same index labels as the split
    # frames). Re-running this cell therefore gives the same result instead of
    # re-fitting the scaler on already-scaled data.
    X_train[num_feats] = scaler.fit_transform(X.loc[X_train.index, num_feats])
    X_test[num_feats] = scaler.transform(X.loc[X_test.index, num_feats])

# ============================
# Build models
# ============================
def build_regressor(input_dim):
    """Build and compile the dense network used to predict delay.

    Architecture: Dense(256) -> Dropout(0.2) -> Dense(128) -> Dropout(0.15)
    -> Dense(64) -> Dense(1, linear), with ReLU on every hidden layer.

    Args:
        input_dim: number of input features (width of the design matrix).

    Returns:
        A Keras model compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    inputs = layers.Input(shape=(input_dim,))

    # (units, dropout rate after the layer); None means no dropout follows.
    hidden = inputs
    for units, drop_rate in ((256, 0.2), (128, 0.15), (64, None)):
        hidden = layers.Dense(units, activation="relu")(hidden)
        if drop_rate is not None:
            hidden = layers.Dropout(drop_rate)(hidden)

    prediction = layers.Dense(1, activation="linear")(hidden)

    network = models.Model(inputs, prediction)
    network.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return network

In [7]:

def build_classifier(input_dim):
    """Build and compile the dense network used to predict cancellation.

    Architecture: Dense(256) -> Dropout(0.2) -> Dense(128) -> Dropout(0.15)
    -> Dense(64) -> Dense(1, sigmoid), with ReLU on every hidden layer.

    Args:
        input_dim: number of input features (width of the design matrix).

    Returns:
        A Keras model compiled with the Adam optimizer and binary cross-entropy
        loss, reporting binary accuracy and an AUC metric named "auc".
    """
    inputs = layers.Input(shape=(input_dim,))

    # (units, dropout rate after the layer); None means no dropout follows.
    hidden = inputs
    for units, drop_rate in ((256, 0.2), (128, 0.15), (64, None)):
        hidden = layers.Dense(units, activation="relu")(hidden)
        if drop_rate is not None:
            hidden = layers.Dropout(drop_rate)(hidden)

    probability = layers.Dense(1, activation="sigmoid")(hidden)

    network = models.Model(inputs, probability)
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["binary_accuracy", tf.keras.metrics.AUC(name="auc")],
    )
    return network

# Both networks take one input per column of the design matrix.
input_dim = len(X_train.columns)


In [12]:
# ============================
# Regression: train, checkpoint to .h5, evaluate
# ============================
reg_model = build_regressor(input_dim)

reg_callbacks = []
# Stop after 4 epochs without a better val_loss and roll back to the best weights.
reg_callbacks.append(
    callbacks.EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)
)
# Keep only the best model on disk; the .h5 suffix selects the HDF5 format.
reg_callbacks.append(
    callbacks.ModelCheckpoint("delay_regressor.h5", save_best_only=True)
)

fit_options = dict(
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=reg_callbacks,
    verbose=2,
)
reg_model.fit(X_train, y_train_reg, **fit_options)

# Held-out metrics; RMSE is derived by hand from the MSE.
y_pred_reg = np.ravel(reg_model.predict(X_test, batch_size=BATCH_SIZE))
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"[Regression] MAE: {mae:.3f}, RMSE: {rmse:.3f}, R2: {r2:.3f}")


Epoch 1/30




9/9 - 6s - 690ms/step - loss: 2012.8074 - mae: 11.8889 - val_loss: 3531.7546 - val_mae: 16.0253
Epoch 2/30




9/9 - 5s - 543ms/step - loss: 1978.2733 - mae: 12.4291 - val_loss: 3448.1440 - val_mae: 16.7004
Epoch 3/30




9/9 - 1s - 159ms/step - loss: 1904.7917 - mae: 14.3953 - val_loss: 3297.0520 - val_mae: 19.2786
Epoch 4/30




9/9 - 1s - 146ms/step - loss: 1853.7703 - mae: 18.4510 - val_loss: 3242.2856 - val_mae: 21.5016
Epoch 5/30
9/9 - 1s - 147ms/step - loss: 1826.6141 - mae: 18.1655 - val_loss: 3262.5554 - val_mae: 20.0022
Epoch 6/30
9/9 - 3s - 278ms/step - loss: 1801.0557 - mae: 16.8782 - val_loss: 3248.7905 - val_mae: 20.3825
Epoch 7/30




9/9 - 1s - 148ms/step - loss: 1766.5382 - mae: 17.5090 - val_loss: 3229.8726 - val_mae: 21.3498
Epoch 8/30
9/9 - 2s - 171ms/step - loss: 1728.2485 - mae: 17.5614 - val_loss: 3234.6072 - val_mae: 21.2820
Epoch 9/30
9/9 - 3s - 300ms/step - loss: 1678.0627 - mae: 16.7240 - val_loss: 3242.3499 - val_mae: 21.4552
Epoch 10/30
9/9 - 2s - 238ms/step - loss: 1615.0557 - mae: 16.1313 - val_loss: 3259.7776 - val_mae: 21.7691
Epoch 11/30
9/9 - 1s - 153ms/step - loss: 1543.1633 - mae: 15.2084 - val_loss: 3292.1013 - val_mae: 21.8710
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[Regression] MAE: 20.187, RMSE: 46.899, R2: 0.012


In [10]:

# ============================
# Classification
# ============================
# Balanced class weights offset the class imbalance in the training labels.
# The dict is keyed by the actual class label (not by array position), so the
# weights stay attached to the right class whatever labels are present.
classes = np.unique(y_train_clf)
cw = class_weight.compute_class_weight(class_weight="balanced", classes=classes, y=y_train_clf)
class_weights = {int(label): float(weight) for label, weight in zip(classes, cw)}

clf_model = build_classifier(input_dim)
clf_callbacks = [
    # AUC is better when larger; state the direction explicitly so early
    # stopping and checkpointing agree on what "best" means.
    callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=4, restore_best_weights=True),
    callbacks.ModelCheckpoint("cancel_classifier.h5", monitor="val_auc", save_best_only=True, mode="max"),
]
clf_model.fit(X_train, y_train_clf, validation_split=VALIDATION_SPLIT, epochs=EPOCHS,
              batch_size=BATCH_SIZE, class_weight=class_weights, callbacks=clf_callbacks, verbose=2)

# Held-out evaluation at a 0.5 decision threshold.
y_proba = clf_model.predict(X_test, batch_size=BATCH_SIZE).flatten()
y_pred = (y_proba >= 0.5).astype(int)

# ROC AUC is undefined when the test labels contain a single class; report NaN
# in that case instead of letting roc_auc_score raise.
if np.unique(y_test_clf).size > 1:
    auc = roc_auc_score(y_test_clf, y_proba)
else:
    auc = float("nan")

print(f"[Classification] AUC: {auc:.4f}, "
      f"Acc: {accuracy_score(y_test_clf,y_pred):.4f}, "
      f"Prec: {precision_score(y_test_clf,y_pred,zero_division=0):.4f}, "
      f"Rec: {recall_score(y_test_clf,y_pred,zero_division=0):.4f}, "
      f"F1: {f1_score(y_test_clf,y_pred,zero_division=0):.4f}")


Epoch 1/30




9/9 - 5s - 537ms/step - auc: 0.5568 - binary_accuracy: 0.4029 - loss: 0.7051 - val_auc: 0.5487 - val_binary_accuracy: 0.8593 - val_loss: 0.6591
Epoch 2/30




9/9 - 1s - 151ms/step - auc: 0.7464 - binary_accuracy: 0.5543 - loss: 0.6746 - val_auc: 0.5770 - val_binary_accuracy: 0.8955 - val_loss: 0.5821
Epoch 3/30




9/9 - 2s - 270ms/step - auc: 0.8063 - binary_accuracy: 0.6553 - loss: 0.6086 - val_auc: 0.5947 - val_binary_accuracy: 0.9538 - val_loss: 0.3973
Epoch 4/30
9/9 - 1s - 140ms/step - auc: 0.8779 - binary_accuracy: 0.7471 - loss: 0.4951 - val_auc: 0.5903 - val_binary_accuracy: 0.9548 - val_loss: 0.2683
Epoch 5/30
9/9 - 1s - 134ms/step - auc: 0.9533 - binary_accuracy: 0.8571 - loss: 0.3389 - val_auc: 0.5697 - val_binary_accuracy: 0.9166 - val_loss: 0.2380
Epoch 6/30
9/9 - 1s - 135ms/step - auc: 0.9844 - binary_accuracy: 0.9238 - loss: 0.1881 - val_auc: 0.5590 - val_binary_accuracy: 0.9256 - val_loss: 0.1908
Epoch 7/30
9/9 - 1s - 134ms/step - auc: 0.9918 - binary_accuracy: 0.9551 - loss: 0.1020 - val_auc: 0.5521 - val_binary_accuracy: 0.9568 - val_loss: 0.1502




[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 54ms/step 



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[Classification] AUC: 0.4672, Acc: 0.6785, Prec: 0.0140, Rec: 0.3056, F1: 0.0268


In [13]:

# ============================
# Save scaler & features
# ============================
# Persist the fitted scaler together with the feature lists and the final
# column order of the design matrix.
feature_meta = {
    "num_feats": num_feats,
    "cat_feats": cat_feats,
    "features": X.columns.tolist(),
}
for artifact, path in ((scaler, "scaler.joblib"), (feature_meta, "feature_meta.joblib")):
    joblib.dump(artifact, path)

print("Saved models: delay_regressor.h5, cancel_classifier.h5, scaler.joblib, feature_meta.joblib")

Saved models: delay_regressor.h5, cancel_classifier.h5, scaler.joblib, feature_meta.joblib
