In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFECV
from scipy.stats import uniform

In [74]:
df = pd.read_csv('../cleaning-preprocessing/cleaned_flight_data_with_target.csv')
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,CRS_ARR_M,Temperature,Dew Point,Humidity,...,Condition_Light Snow / Windy,Condition_Mostly Cloudy,Condition_Mostly Cloudy / Windy,Condition_Partly Cloudy,Condition_Partly Cloudy / Windy,Condition_Rain,Condition_Rain / Windy,Condition_Snow,Condition_Wintry Mix,Condition_Wintry Mix / Windy
0,11,1,5,124,636,324,448,48,34,58,...,False,False,False,False,False,False,False,False,False,False
1,11,1,5,371,2475,340,531,48,34,58,...,False,False,False,False,False,False,False,False,False,False
2,11,1,5,181,1069,301,482,48,34,58,...,False,False,False,False,False,False,False,False,False,False
3,11,1,5,168,944,345,513,48,34,58,...,False,False,False,False,False,False,False,False,False,False
4,11,1,5,139,760,360,499,46,32,58,...,False,False,False,False,False,False,False,False,False,False


In [75]:
X = df.drop(columns=["DEP_DELAY"])
y = df["DEP_DELAY"]

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42)

xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_test)

print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

MSE: 989.783567432119
R-squared: 0.32107269763946533


### Feature Selection

RFE with CV

In [78]:
# RFECV with cross-validation
rfecv = RFECV(estimator=xgb_reg, cv=5, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

# Fit RFECV
rfecv.fit(X_train, y_train)

# Get selected features
selected_features = X_train.columns[rfecv.support_]
print("Optimal number of features:", rfecv.n_features_)
print("Selected Features:", list(selected_features))

# Reduce dataset to selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator

In [79]:
xgb_selected = XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_selected.fit(X_train_selected, y_train)

# Predict and evaluate
y_pred_selected = xgb_selected.predict(X_test_selected)
print("MSE after RFECV:", mean_squared_error(y_test, y_pred_selected))
print("R-squared after RFECV:", r2_score(y_test, y_pred_selected))

MSE after RFECV: 910.7393565046915
R-squared after RFECV: 0.3752918243408203


Feature Importance Threshold

In [80]:
# Get feature importance
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': xgb_reg.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance)

                             Feature  Importance
86          Condition_Cloudy / Windy    0.073257
104  Condition_Partly Cloudy / Windy    0.057789
17              OP_UNIQUE_CARRIER_DL    0.052601
5                          CRS_DEP_M    0.047614
64                          DEST_PIT    0.041765
..                               ...         ...
42                          DEST_HNL    0.000000
46                          DEST_JAC    0.000000
62                          DEST_PHL    0.000000
59                          DEST_ORH    0.000000
83                          DEST_STT    0.000000

[110 rows x 2 columns]


In [81]:
# Define importance threshold
threshold = 0.01

# Select important features
selected_features_threshold = feature_importance[feature_importance["Importance"] > threshold]["Feature"].tolist()

print("Selected Features:", selected_features_threshold)

# Filter dataset
X_train_selected_threshold = X_train[selected_features_threshold]
X_test_selected_threshold = X_test[selected_features_threshold]

Selected Features: ['Condition_Cloudy / Windy', 'Condition_Partly Cloudy / Windy', 'OP_UNIQUE_CARRIER_DL', 'CRS_DEP_M', 'DEST_PIT', 'DEST_BQN', 'sch_dep', 'Condition_Mostly Cloudy', 'DAY_OF_MONTH', 'Condition_Fair', 'DISTANCE', 'DEST_SAT', 'Temperature', 'DEST_PSE', 'Dew Point', 'Wind Gust', 'DEST_JAX', 'DEST_BWI', 'DAY_OF_WEEK', 'Pressure', 'DEST_SRQ', 'MONTH', 'CRS_ELAPSED_TIME', 'Condition_Light Rain', 'Wind Speed', 'CRS_ARR_M', 'DEST_FLL', 'Condition_Heavy Rain', 'Condition_Light Drizzle', 'DEST_ORD']


In [82]:
# Retrain XGBoost on filtered features
xgb_reg_selected_threshold = XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_reg_selected_threshold.fit(X_train_selected_threshold, y_train)

# Evaluate new model
y_pred_selected_threshold = xgb_reg_selected_threshold.predict(X_test_selected_threshold)
print("MSE after Importance Threshold:", mean_squared_error(y_test, y_pred_selected_threshold))
print("R-squared after Importance Threshold:", r2_score(y_test, y_pred_selected_threshold))

MSE after Importance Threshold: 960.9808298480054
R-squared after Importance Threshold: 0.3408294916152954


### Hyperparameter Tuning

Using features selected from RFECV

In [83]:
params = {
    'max_depth': np.arange(5,30,5),
    'n_estimators': np.arange(50,210,50),
    'learning_rate': np.arange(0.01,0.31,0.1)
}

grid_search = GridSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42), params, scoring='neg_mean_absolute_error', cv=3, verbose=2)
grid_search.fit(X_train_selected, y_train)

print(grid_search.best_params_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_e

In [84]:
xgb_reg_best = grid_search.best_estimator_
y_pred_best = xgb_reg_best.predict(X_test_selected)

print('MSE:', mean_squared_error(y_test, y_pred_best))
print('R-squared:', r2_score(y_test, y_pred_best))

MSE: 764.9723898005906
R-squared: 0.4752785563468933


In [85]:
params = {
    'max_depth': [3, 5, 7, 10, 15],
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

# RandomizedSearchCV with 100 iterations
random_search = RandomizedSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42),
                                   param_distributions=params, n_iter=100, scoring='neg_mean_absolute_error',
                                   cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train_selected, y_train)

print("Best Parameters:", random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'colsample_bytree': 0.7553073535959489, 'gamma': 0.1, 'learning_rate': 0.08978137748927167, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 250, 'reg_alpha': 10, 'reg_lambda': 1, 'subsample': 0.9094485142059234}


In [86]:
xgb_reg_best_2 = random_search.best_estimator_
y_pred_best_2 = xgb_reg_best_2.predict(X_test_selected)

print('MSE:', mean_squared_error(y_test, y_pred_best_2))
print('R-squared:', r2_score(y_test, y_pred_best_2))

MSE: 787.171317343001
R-squared: 0.4600515365600586


Using features selected from Importance Threshold

In [88]:
params = {
    'max_depth': np.arange(5,30,5),
    'n_estimators': np.arange(50,210,50),
    'learning_rate': np.arange(0.01,0.31,0.1)
}

grid_search_threshold = GridSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42), params, scoring='neg_mean_absolute_error', cv=3, verbose=2)
grid_search_threshold.fit(X_train_selected_threshold, y_train)

print(grid_search_threshold.best_params_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_e

In [89]:
xgb_reg_best = grid_search_threshold.best_estimator_
y_pred_best = xgb_reg_best.predict(X_test_selected_threshold)

print('MSE:', mean_squared_error(y_test, y_pred_best))
print('R-squared:', r2_score(y_test, y_pred_best))

MSE: 847.9101268630432
R-squared: 0.4183886647224426


In [90]:
params = {
    'max_depth': [3, 5, 7, 10, 15],
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

# RandomizedSearchCV with 100 iterations
random_search_threshold = RandomizedSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42),
                                   param_distributions=params, n_iter=100, scoring='neg_mean_absolute_error',
                                   cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search_threshold.fit(X_train_selected_threshold, y_train)

print("Best Parameters:", random_search_threshold.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'colsample_bytree': 0.7553073535959489, 'gamma': 0.1, 'learning_rate': 0.08978137748927167, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 250, 'reg_alpha': 10, 'reg_lambda': 1, 'subsample': 0.9094485142059234}


In [91]:
xgb_reg_best_2 = random_search_threshold.best_estimator_
y_pred_best_2 = xgb_reg_best_2.predict(X_test_selected_threshold)

print('MSE:', mean_squared_error(y_test, y_pred_best_2))
print('R-squared:', r2_score(y_test, y_pred_best_2))

MSE: 666.0222229914132
R-squared: 0.5431519746780396
