In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import joblib
from imblearn.combine import SMOTETomek
from xgboost import plot_importance

ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'

In [None]:
# Build a reproducible synthetic flight-delay dataset, label it, and save it to CSV.
# Seeds NumPy's legacy global RNG so every draw below is repeatable.
np.random.seed(42)

num_samples = 50000  # Data size

# NOTE(review): one row per calendar day starting 2024-01-01, so 50,000 rows
# span roughly 137 years of dates — confirm this is intended.
df_synthetic = pd.DataFrame({
    "FL_DATE": pd.date_range(start="2024-01-01", periods=num_samples, freq="D"),
    "AIRLINE": np.random.choice(["Delta", "American", "United", "Southwest", "Alaska", "JetBlue", "Spirit", "Frontier"], num_samples),
    "ORIGIN": np.random.choice(["JFK", "LAX", "ORD", "ATL", "SEA", "DFW", "MIA", "SFO", "DEN", "PHX"], num_samples),
    "DEST": np.random.choice(["BOS", "LAS", "IAH", "MSP", "CLT", "DTW", "EWR", "FLL", "BWI", "SLC"], num_samples),
    "CRS_DEP_TIME": np.random.randint(500, 2359, num_samples),
    "NUM_PREVIOUS_FLIGHTS_LATE": np.random.randint(0, 10, num_samples),  # More variability
    "AVG_GATE_WAIT_TIME": np.random.uniform(0, 60, num_samples),
})

# randint(500, 2359) is meant as an HHMM clock time, but its last two digits can
# be 60-99 (e.g. 1075), which is not a valid time. Keep the drawn hour and replace
# the minute part with a valid 0-59 value. The minutes are drawn *after* all other
# columns so that every other column (and therefore the DELAYED label) is
# unchanged for this seed.
dep_hours = df_synthetic["CRS_DEP_TIME"] // 100
dep_minutes = np.random.randint(0, 60, num_samples)
df_synthetic["CRS_DEP_TIME"] = dep_hours * 100 + dep_minutes

# Simulate delays based on broader conditions:
# a flight is delayed when at least 5 previous flights were late
# or the average gate wait exceeds 50.
df_synthetic["DELAYED"] = (
    (df_synthetic["NUM_PREVIOUS_FLIGHTS_LATE"] >= 5) |
    (df_synthetic["AVG_GATE_WAIT_TIME"] > 50)
).astype(int)

# Save for future use
df_synthetic.to_csv("expanded_flight_delays.csv", index=False)

print("New Dataset Size:", df_synthetic.shape)

In [None]:
# Feature engineering: turn the raw frame into an all-numeric feature table.
# NOTE: this cell rebinds df_synthetic and drops columns it reads, so it cannot be
# re-run on its own — re-execute the data-generation cell first.

# Encode the flight date as whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second does not depend on the
# platform width of `int` (a plain `.astype(int)` can resolve to a 32-bit integer
# on some platforms, e.g. Windows with NumPy 1.x) nor on the datetime column's
# storage resolution.
unix_epoch = pd.Timestamp("1970-01-01")
df_synthetic["FL_DATE"] = (
    pd.to_datetime(df_synthetic["FL_DATE"]) - unix_epoch
) // pd.Timedelta("1s")

# CRS_DEP_TIME is an HHMM-style integer; integer division by 100 keeps the hour.
df_synthetic["HOUR"] = df_synthetic["CRS_DEP_TIME"] // 100

# One-hot encode the categorical columns, then drop the raw departure time
# (only the derived HOUR is kept as a feature).
df_synthetic = pd.get_dummies(df_synthetic, columns=["AIRLINE", "ORIGIN", "DEST"])
df_synthetic = df_synthetic.drop(columns=["CRS_DEP_TIME"])

In [None]:
# Separate the label from the features.
target_column = "DELAYED"
y = df_synthetic[target_column]
X = df_synthetic.drop(columns=[target_column])

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same delayed / on-time proportions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Cast every feature column to float64.
X_train = X_train.astype("float64")
X_test = X_test.astype("float64")

# Make the test columns follow the training columns; any column missing from
# the test split would be filled with 0.
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

In [None]:
# Rebalance the training split only (the test split is left untouched).
smote_tomek = SMOTETomek(random_state=42)
resampled = smote_tomek.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled

# Per-class row counts after resampling (index 0 = not delayed, 1 = delayed).
balanced_counts = np.bincount(y_train_balanced)
print("Balanced class distribution:", balanced_counts)

In [None]:
# Random-forest hyperparameters, gathered in one place for easy tuning.
rf_params = {
    "n_estimators": 200,
    "class_weight": "balanced_subsample",
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)

# Train on the resampled training data.
model.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict on the held-out test split.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Confusion Matrix, drawn on an explicit Axes (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Persist the fitted classifier to disk.
# NOTE(review): absolute, Windows-style path — presumably requires C:\JN to
# already exist; consider a configurable relative path.
model_path = "C:\\JN\\flight_delay_classifier.pkl"
joblib.dump(model, model_path)

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores on the test split.
F1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(F1)

In [None]:
# Create a series with feature names and their importance scores
importances = pd.Series(model.feature_importances_, index=X_train.columns)

# Sort and show the top 20 features
top_features = importances.sort_values(ascending=False).head(20)
print(top_features)
