In [54]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix,
    f1_score, roc_auc_score
)
from sklearn.preprocessing import OrdinalEncoder

In [60]:
# =====================
# Load & Preprocess
# =====================

# Full path to dataset
file_path = r"flights_transformed.csv"

# Read CSV
df = pd.read_csv(file_path, dtype=str, low_memory=False)

# Convert numeric-looking columns to numbers
df = df.apply(pd.to_numeric, errors="ignore")

# ---- Target: binary classification ----
target_column = "delay_bucket"   # (1 = delayed >= 15 mins, 0 = not delayed)

# Drop rows with missing target
df = df.dropna(subset=[target_column])

# Drop leakage / identifier columns
drop_cols = [
    "y_dep_bucket"   # derived labels
]

X = df.drop(columns=drop_cols + [target_column], errors="ignore")
y = df[target_column]

print("Shape of features:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

  df = df.apply(pd.to_numeric, errors="ignore")


Shape of features: (342757, 67)
Target distribution:
 delay_bucket
1    0.793294
2    0.133468
3    0.043667
4    0.021479
5    0.008093
Name: proportion, dtype: float64


In [61]:
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

if len(cat_cols) > 0:
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    X[cat_cols] = enc.fit_transform(X[cat_cols])

print("Encoded feature matrix shape:", X.shape)

# =====================
# Train/test split
# =====================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Encoded feature matrix shape: (342757, 67)


In [62]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    random_state=42,
    max_depth=6,             # keep shallow for interpretability (optional)
    n_estimators=200         # number of trees in the forest
)


In [63]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [64]:
clf.fit(X_train_res, y_train_res)

In [65]:
# =====================
# Predictions & Eval
# =====================
y_pred = clf.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average="weighted"))

# Only compute ROC AUC if binary
if len(np.unique(y)) == 2:
    print("ROC AUC:", roc_auc_score(y_test, y_pred))

print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 0.7264995915509395
F1 Score: 0.6994272237359765

Classification Report:
               precision    recall  f1-score   support

           1       0.82      0.90      0.86     54382
           2       0.27      0.09      0.13      9150
           3       0.08      0.02      0.04      2993
           4       0.05      0.07      0.06      1472
           5       0.02      0.08      0.03       555

    accuracy                           0.73     68552
   macro avg       0.25      0.23      0.22     68552
weighted avg       0.69      0.73      0.70     68552

Confusion Matrix:
 [[48768  1904   605  1340  1765]
 [ 7126   811   160   440   613]
 [ 2249   219    71   177   277]
 [ 1082    94    34   108   154]
 [  418    22    21    49    45]]


In [None]:
unique, counts = np.unique(y_pred, return_counts=True)
print("\nPredicted class distribution:")
for cls, cnt in zip(unique, counts):
    print(f"Class {cls}: {cnt}")


Predicted class distribution:
Class 0: 49470
Class 1: 19082


In [None]:

# =====================
# Feature Importances
# =====================
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1][:15]

plt.figure(figsize=(10,5))
plt.bar(range(len(indices)), importances[indices], align="center")
plt.xticks(range(len(indices)), [X.columns[i] for i in indices], rotation=90)
plt.title("Top 15 Feature Importances")
plt.show()

# =====================
# Decision Rules
# =====================
tree_rules = export_text(clf, feature_names=list(X.columns))
print("\nDecision Tree Rules:\n")
print(tree_rules)


  df = df.apply(pd.to_numeric, errors="ignore")


KeyboardInterrupt: 