In [105]:
import pandas as pd
import xgboost as xgb

from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

## Importing the Data

In [106]:
flight_report = pd.read_csv("flight_report_merged.csv")

In [107]:
flight_report

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,Unnamed: 32,AIRLINE_ID,CARRIER_NAME,AIRPORT_NAME,DATE,AWND,PRCP,SNOW,SNWD,TMAX
0,1,2,3,9E,N320PQ,3281,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
1,1,2,3,9E,N935XJ,3300,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
2,1,2,3,9E,N331PQ,3348,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
3,1,2,3,9E,N314PQ,3369,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
4,1,2,3,9E,N232PQ,3389,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,12,29,7,UA,N14231,1104,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868481,12,29,7,UA,N36247,239,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868482,12,29,7,DL,N3761R,565,10299,ANC,"Anchorage, AK",14747,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868483,12,29,7,DL,N553NW,1601,10299,ANC,"Anchorage, AK",13487,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0


## Feature Selection

### Manual Selection
In this section we will be dropping many unneeded features in order to predict airline delay using weather data and general airline details

In [108]:
# dropping unneeded features
# dropping delay related features aside from our target
df_manual = flight_report.drop(columns= [ 
    "DATE", "ARR_TIME", "ARR_DELAY_NEW", "CANCELLATION_CODE",
    "ACTUAL_ELAPSED_TIME", "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", 
    "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY", "Unnamed: 32", "CRS_DEP_TIME",
    "DEP_DELAY_NEW", "CRS_ELAPSED_TIME", "OP_CARRIER_FL_NUM", "MONTH",
    "DAY_OF_MONTH"])


### Transforming Categorical Features

In [109]:
df_manual.dtypes

DAY_OF_WEEK            int64
OP_UNIQUE_CARRIER     object
TAIL_NUM              object
ORIGIN_AIRPORT_ID      int64
ORIGIN                object
ORIGIN_CITY_NAME      object
DEST_AIRPORT_ID        int64
DEST                  object
DEST_CITY_NAME        object
DEP_TIME             float64
DEP_DEL15            float64
DEP_TIME_BLK          object
CRS_ARR_TIME           int64
ARR_TIME_BLK          object
CANCELLED            float64
DISTANCE             float64
DISTANCE_GROUP         int64
AIRLINE_ID             int64
CARRIER_NAME          object
AIRPORT_NAME          object
AWND                 float64
PRCP                 float64
SNOW                 float64
SNWD                 float64
TMAX                 float64
dtype: object

In [110]:
cat_features = ["TAIL_NUM", "CARRIER_NAME", "AIRPORT_NAME", "ARR_TIME_BLK", "DEP_TIME_BLK", 
"DEST", "DEST_CITY_NAME", "ORIGIN_CITY_NAME", "ORIGIN", "OP_UNIQUE_CARRIER"]

le = LabelEncoder()

for feature in cat_features:
    label = le.fit_transform(df_manual[feature])
    df_manual.drop(feature, axis=1, inplace=True)
    df_manual[feature] = label

df_manual

Unnamed: 0,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_TIME,DEP_DEL15,CRS_ARR_TIME,CANCELLED,DISTANCE,DISTANCE_GROUP,AIRLINE_ID,...,TAIL_NUM,CARRIER_NAME,AIRPORT_NAME,ARR_TIME_BLK,DEP_TIME_BLK,DEST,DEST_CITY_NAME,ORIGIN_CITY_NAME,ORIGIN,OP_UNIQUE_CARRIER
0,3,11986,13487,1829.0,0.0,1928,0.0,408.0,2,20363,...,1192,7,14,14,13,133,118,13,15,0
1,3,11986,11433,1014.0,0.0,1124,0.0,120.0,1,20363,...,4900,7,14,6,5,59,53,13,15,0
2,3,11986,13487,1320.0,0.0,1415,0.0,408.0,2,20363,...,1251,7,14,9,8,133,118,13,15,0
3,3,11986,11433,1746.0,0.0,1849,0.0,120.0,1,20363,...,1157,7,14,13,12,59,53,13,15,0
4,3,11986,11433,1925.0,0.0,2040,0.0,120.0,1,20363,...,744,7,14,15,14,59,53,13,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,7,10299,11292,2344.0,0.0,658,0.0,2405.0,10,19977,...,321,16,2,1,18,54,51,2,2,13
868481,7,10299,11292,2209.0,0.0,522,0.0,2405.0,10,19977,...,1423,16,2,0,17,54,51,2,2,13
868482,7,10299,14747,24.0,0.0,501,0.0,1448.0,6,19790,...,1556,6,2,0,0,177,174,2,2,4
868483,7,10299,13487,2154.0,0.0,608,0.0,2519.0,11,19790,...,2264,6,2,1,16,133,118,2,2,4


### Splitting into Train, Validation, and Test 
Our target 'DEP_DEL15' = 1 if the airplane is delayed for at least 15 minutes

In [111]:
X = df_manual.drop(columns=['DEP_DEL15'])

y = df_manual['DEP_DEL15']

In [112]:
# split away the test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

# split the training into train and validation
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (607939, 24) (607939,)
Validation set shapes: (130273, 24) (130273,)
Testing set shapes: (130273, 24) (130273,)


In [113]:
#Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

### LASSO Regression

In [114]:
# create the lasso model
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)

lasso_cv.fit(X_train, y_train)

print('The optimal alpha is', lasso_cv.alpha_)

# finding the predicted probability
y_pred_proba = lasso_cv.predict(X_valid)
# converting to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)   # might need to lower threshold
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

The optimal alpha is 0.001
Accuracy: 0.8413869335933002


In [115]:
#Feature selection in Lasso
col_names = list(X.columns)
selected_features = [col_names[i] for i in range(len(col_names)) if lasso_cv.coef_[i] != 0]
selected_features = selected_features + ["DEP_DEL15"]

lassoCoef = pd.Series(lasso_cv.coef_, index=X.columns)
print(f'The selected features are \n{lassoCoef[lassoCoef != 0]}')

The selected features are 
ORIGIN_AIRPORT_ID   -0.004365
DEP_TIME             0.457261
CRS_ARR_TIME        -0.004614
DISTANCE_GROUP       0.008379
AIRLINE_ID           0.009190
AWND                 0.009363
PRCP                 0.007885
SNOW                 0.007873
SNWD                 0.004555
TAIL_NUM            -0.001581
AIRPORT_NAME        -0.000026
DEP_TIME_BLK        -0.376044
DEST                 0.000418
DEST_CITY_NAME       0.000278
ORIGIN_CITY_NAME     0.001692
OP_UNIQUE_CARRIER    0.007148
dtype: float64


In [116]:
df_final = df_manual[selected_features]

In [117]:
df_final.head()

Unnamed: 0,ORIGIN_AIRPORT_ID,DEP_TIME,CRS_ARR_TIME,DISTANCE_GROUP,AIRLINE_ID,AWND,PRCP,SNOW,SNWD,TAIL_NUM,AIRPORT_NAME,DEP_TIME_BLK,DEST,DEST_CITY_NAME,ORIGIN_CITY_NAME,OP_UNIQUE_CARRIER,DEP_DEL15
0,11986,1829.0,1928,2,20363,10.07,0.09,0.8,0.0,1192,14,13,133,118,13,0,0.0
1,11986,1014.0,1124,1,20363,10.07,0.09,0.8,0.0,4900,14,5,59,53,13,0,0.0
2,11986,1320.0,1415,2,20363,10.07,0.09,0.8,0.0,1251,14,8,133,118,13,0,0.0
3,11986,1746.0,1849,1,20363,10.07,0.09,0.8,0.0,1157,14,12,59,53,13,0,0.0
4,11986,1925.0,2040,1,20363,10.07,0.09,0.8,0.0,744,14,14,59,53,13,0,0.0


## Machine Learning Experiments
All models have been hyperparameter tuned using an exhaustive search or randomized search

### Splitting into Train, Validation, and Test 
Our target 'DEP_DEL15' = 1 if the airplane is delayed for longer than 15 minutes

In [118]:
X = df_final.drop(columns=['DEP_DEL15'])
y = df_final['DEP_DEL15']


# split away the test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

# split the training into train and validation
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (607939, 16) (607939,)
Validation set shapes: (130273, 16) (130273,)
Testing set shapes: (130273, 16) (130273,)


In [119]:
print("Before undersampling:")
print("Cases Without a Delay:", sum(y_train == 0))
print("Cases With a Delay:", sum(y_train == 1))

Before undersampling:
Cases Without a Delay: 506781
Cases With a Delay: 101158


In [120]:
# undersampling the data
rus = RandomUnderSampler()
X_train, y_train = rus.fit_resample(X_train, y_train)

print("After undersampling:")
print("Cases Without a Delay:", sum(y_train == 0))
print("Cases With a Delay:", sum(y_train == 1))

After undersampling:
Cases Without a Delay: 101158
Cases With a Delay: 101158


In [121]:
# Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [122]:
accuracy_list = []
recall_list = []
precision_list = []
f1_list = []

### Logistic Regression

In [123]:
lr = LogisticRegression(C=0.001)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.68      0.78    108468
         1.0       0.31      0.71      0.43     21805

    accuracy                           0.69    130273
   macro avg       0.62      0.70      0.61    130273
weighted avg       0.82      0.69      0.73    130273



##### Testing on Unseen Data

In [124]:
y_pred = lr.predict(X_test)


lr_accuracy = accuracy_score(y_test, y_pred)
lr_recall = recall_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_f1 = f1_score(y_test, y_pred)
print("Accuracy", lr_accuracy)
print("Recall", lr_recall)
print("Precision", lr_precision)
print("F1-Score", lr_f1)

Accuracy 0.6877864177534869
Recall 0.7093596059113301
Precision 0.30959652788940684
F1-Score 0.43105932381205503


In [125]:
accuracy_list.append(lr_accuracy)
recall_list.append(lr_recall)
precision_list.append(lr_precision)
f1_list.append(lr_f1)

### K-Nearest-Neigbors

In [126]:
knn = KNeighborsClassifier(n_neighbors=25)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.68      0.78    108468
         1.0       0.29      0.66      0.40     21805

    accuracy                           0.67    130273
   macro avg       0.60      0.67      0.59    130273
weighted avg       0.80      0.67      0.71    130273



##### Testing on Unseen Data

In [127]:
y_pred = knn.predict(X_test)


knn_accuracy = accuracy_score(y_test, y_pred)
knn_recall = recall_score(y_test, y_pred)
knn_precision = precision_score(y_test, y_pred)
knn_f1 = f1_score(y_test, y_pred)
print("Accuracy", knn_accuracy)
print("Recall", knn_recall)
print("Precision", knn_precision)
print("F1-Score", knn_f1)

Accuracy 0.6714591665195397
Recall 0.6570599880300171
Precision 0.28760856860729905
F1-Score 0.4000897062121552


In [128]:
accuracy_list.append(knn_accuracy)
recall_list.append(knn_recall)
precision_list.append(knn_precision)
f1_list.append(knn_f1)

### Random Forest

In [129]:
rf = RandomForestClassifier(n_estimators=1200, random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96    108468
         1.0       0.77      0.92      0.84     21805

    accuracy                           0.94    130273
   macro avg       0.88      0.93      0.90    130273
weighted avg       0.95      0.94      0.94    130273



##### Testing on Unseen Data

In [130]:
y_pred = rf.predict(X_test)


rf_accuracy = accuracy_score(y_test, y_pred)
rf_recall = recall_score(y_test, y_pred)
rf_precision = precision_score(y_test, y_pred)
rf_f1 = f1_score(y_test, y_pred)
print("Accuracy", rf_accuracy)
print("Recall", rf_recall)
print("Precision", rf_precision)
print("F1-Score", rf_f1)

Accuracy 0.9404558120255156
Recall 0.9203996132774734
Precision 0.7683320522674866
F1-Score 0.837519113550198


In [131]:
accuracy_list.append(rf_accuracy)
recall_list.append(rf_recall)
precision_list.append(rf_precision)
f1_list.append(rf_f1)

### XGBoost

In [132]:
xgb_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=9, n_estimators=1200, random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98    108468
         1.0       0.84      0.95      0.89     21805

    accuracy                           0.96    130273
   macro avg       0.91      0.96      0.93    130273
weighted avg       0.96      0.96      0.96    130273



##### Testing on Unseen Data

In [133]:
y_pred = xgb_model.predict(X_test)


xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_recall = recall_score(y_test, y_pred)
xgb_precision = precision_score(y_test, y_pred)
xgb_f1 = f1_score(y_test, y_pred)
print("Accuracy", xgb_accuracy)
print("Recall", xgb_recall)
print("Precision", xgb_precision)
print("F1-Score", xgb_f1)

Accuracy 0.9619337852049158
Recall 0.9523502601169376
Precision 0.8405526208858187
F1-Score 0.8929658328117243


In [134]:
accuracy_list.append(xgb_accuracy)
recall_list.append(xgb_recall)
precision_list.append(xgb_precision)
f1_list.append(xgb_f1)

### CatBoost

In [135]:
# Evaluate the model performance
cat = CatBoostClassifier(depth = 9, iterations = 974, learning_rate = 0.2508856945382033, l2_leaf_reg= 1.0952191624451117, random_state = 42, verbose = False)

cat.fit(X_train, y_train)

y_pred = cat.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97    108468
         1.0       0.82      0.94      0.88     21805

    accuracy                           0.96    130273
   macro avg       0.91      0.95      0.93    130273
weighted avg       0.96      0.96      0.96    130273



##### Testing on Unseen Data

In [136]:
y_pred = cat.predict(X_test)


cat_accuracy = accuracy_score(y_test, y_pred)
cat_recall = recall_score(y_test, y_pred)
cat_precision = precision_score(y_test, y_pred)
cat_f1 = f1_score(y_test, y_pred)
print("Accuracy", cat_accuracy)
print("Recall", cat_recall)
print("Precision", cat_precision)
print("F1-Score", cat_f1)

Accuracy 0.9559693873634598
Recall 0.9402882003590994
Precision 0.8214616096207216
F1-Score 0.8768675940236991


In [137]:
accuracy_list.append(cat_accuracy)
recall_list.append(cat_recall)
precision_list.append(cat_precision)
f1_list.append(cat_f1)

### Results

In [138]:
# create a table/chart with the metrics of all the models and figure out which performs the best
data = [accuracy_list, precision_list, recall_list, f1_list]
column_list = ['Logistic Regression', 'K-Nearest Neighbors', 'Random Forest', 'XGBoost', 'CatBoost']

df_metrics = pd.DataFrame(data=data, columns=column_list, 
                          index=['Accuracy', 'Precision', 'Recall', 'F1-Score'])
df_metrics

Unnamed: 0,Logistic Regression,K-Nearest Neighbors,Random Forest,XGBoost,CatBoost
Accuracy,0.687786,0.671459,0.940456,0.961934,0.955969
Precision,0.309597,0.287609,0.768332,0.840553,0.821462
Recall,0.70936,0.65706,0.9204,0.95235,0.940288
F1-Score,0.431059,0.40009,0.837519,0.892966,0.876868
