In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load


# Load dataset
df = pd.read_csv('flights_sample_3m.csv')

# Never impute the target: filling a missing `status` with the most frequent
# label would fabricate ground truth for those rows. Drop unlabeled rows so
# the categorical imputation below only ever touches feature columns.
df = df.dropna(subset=['status'])

# NOTE(review): the mode/median statistics below are computed on the full
# dataset, i.e. before the train/test split performed later in this notebook,
# so test rows influence the imputation values. Consider fitting the
# imputation on the training split only.

# Fill missing categorical columns with mode
for col in df.select_dtypes(include='object').columns:
    modes = df[col].mode()
    if modes.empty:
        # No non-null value exists in this column, so there is no mode to
        # impute with (indexing the empty result would raise).
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Fill missing numeric columns with median.
# 'number' covers every numeric dtype, not only float64/int64.
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

# --- Encoding ---

# Fit a separate encoder for every categorical feature and keep each one.
# Re-fitting a single shared encoder would overwrite its learned classes on
# every iteration, leaving only the last column's category -> integer mapping
# available for inverse_transform or for encoding new data.
label_encoders = {}
le = LabelEncoder()  # stays bound even when there is no column to encode
for col in df.select_dtypes(include='object').columns:
    if col == 'status':
        # The target column is converted to a binary flag below instead
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Create binary target column for delay: 1 if delayed, 0 otherwise
df['status_Delayed'] = (df['status'] == 'Delayed').astype(int)

# --- Prepare features and target ---

# Target is the binary delay flag; features are every remaining column once
# the raw label and the flag derived from it are removed.
y = df['status_Delayed']
X = df.drop(['status', 'status_Delayed'], axis=1)

# --- Train/test split ---

# Hold out 30% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# --- Handle class imbalance by undersampling majority class ---

# Combine X_train and y_train for easier resampling
train_data = pd.concat([X_train, y_train], axis=1)

# Split the training rows by class label
not_delayed = train_data[train_data.status_Delayed == 0]
delayed = train_data[train_data.status_Delayed == 1]

if not_delayed.empty or delayed.empty:
    raise ValueError(
        "Training split must contain both classes of status_Delayed "
        "to build a balanced sample"
    )

# Pick majority/minority from the actual counts rather than assuming class 0
# is the larger one: sampling without replacement fails when asked for more
# rows than the source frame holds.
if len(not_delayed) >= len(delayed):
    majority, minority = not_delayed, delayed
else:
    majority, minority = delayed, not_delayed

print("Before undersampling:", majority.status_Delayed.value_counts(), minority.status_Delayed.value_counts())

# Downsample majority class to the size of the minority class
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=len(minority),
    random_state=42
)

# Combine minority class with downsampled majority class
undersampled = pd.concat([majority_downsampled, minority])

print("After undersampling:", undersampled.status_Delayed.value_counts())

# Split back into X and y
X_train_bal = undersampled.drop('status_Delayed', axis=1)
y_train_bal = undersampled['status_Delayed']

# --- Train RandomForestClassifier model ---

# Fit on the class-balanced training set with a fixed seed
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train_bal, y_train_bal)

# --- Evaluate model ---

# Predict on the held-out test split, which was not undersampled
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1, printed with four decimals
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)


In [None]:
# --- Feature Importance from trained model ---
importances = clf.feature_importances_
feature_names = X_train.columns

# Pair every feature name with its importance score, highest score first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
feature_importance_df = feature_importance_df.sort_values(
    'Importance', ascending=False
)

# The ten highest-ranked rows feed both the printout and the chart
top_features = feature_importance_df.head(10)
print("Top 10 Important Features:")
print(top_features)

# --- Plotting ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_features,
    x='Importance',
    y='Feature',
    hue='Feature',
    dodge=False,
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()


In [None]:
# Persist the fitted classifier to disk, then read it straight back so that
# `clf` refers to the copy deserialized from the file.
model_path = 'flight_delay_model.joblib'
dump(clf, model_path)
clf = load(model_path)


