In [93]:
import pandas as pd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import warnings
warnings.filterwarnings('ignore')

## Importing the Data

In [94]:
flight_report = pd.read_csv("flight_report_merged.csv")

In [95]:
flight_report

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,Unnamed: 32,AIRLINE_ID,CARRIER_NAME,AIRPORT_NAME,DATE,AWND,PRCP,SNOW,SNWD,TMAX
0,1,2,3,9E,N320PQ,3281,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
1,1,2,3,9E,N935XJ,3300,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
2,1,2,3,9E,N331PQ,3348,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
3,1,2,3,9E,N314PQ,3369,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
4,1,2,3,9E,N232PQ,3389,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,12,29,7,UA,N14231,1104,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868481,12,29,7,UA,N36247,239,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868482,12,29,7,DL,N3761R,565,10299,ANC,"Anchorage, AK",14747,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868483,12,29,7,DL,N553NW,1601,10299,ANC,"Anchorage, AK",13487,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0


## Feature Selection

### Manual Selection
In this section we will be dropping many unneeded features in order to predict airline delay using weather data and general airline details

In [96]:
# dropping unneeded features
# dropping delay related features aside from our target
df_manual = flight_report.drop(columns= [ 
    "DATE", "ARR_TIME", "ARR_DELAY_NEW", "CANCELLATION_CODE",
    "ACTUAL_ELAPSED_TIME", "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", 
    "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY", "Unnamed: 32", "CRS_DEP_TIME",
    "DEP_DELAY_NEW", "CRS_ELAPSED_TIME", "OP_CARRIER_FL_NUM", "MONTH",
    "DAY_OF_WEEK", "DAY_OF_MONTH", "DISTANCE_GROUP", "DEP_TIME"])


### Transforming Categorical Features

In [97]:
df_manual.dtypes

OP_UNIQUE_CARRIER     object
TAIL_NUM              object
ORIGIN_AIRPORT_ID      int64
ORIGIN                object
ORIGIN_CITY_NAME      object
DEST_AIRPORT_ID        int64
DEST                  object
DEST_CITY_NAME        object
DEP_DEL15            float64
DEP_TIME_BLK          object
CRS_ARR_TIME           int64
ARR_TIME_BLK          object
CANCELLED            float64
DISTANCE             float64
AIRLINE_ID             int64
CARRIER_NAME          object
AIRPORT_NAME          object
AWND                 float64
PRCP                 float64
SNOW                 float64
SNWD                 float64
TMAX                 float64
dtype: object

In [98]:
cat_features = ["TAIL_NUM", "CARRIER_NAME", "AIRPORT_NAME", "ARR_TIME_BLK", "DEP_TIME_BLK", 
"DEST", "DEST_CITY_NAME", "ORIGIN_CITY_NAME", "ORIGIN", "OP_UNIQUE_CARRIER"]

le = LabelEncoder()

for feature in cat_features:
    label = le.fit_transform(df_manual[feature])
    df_manual.drop(feature, axis=1, inplace=True)
    df_manual[feature] = label

df_manual

Unnamed: 0,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_DEL15,CRS_ARR_TIME,CANCELLED,DISTANCE,AIRLINE_ID,AWND,PRCP,SNOW,...,TAIL_NUM,CARRIER_NAME,AIRPORT_NAME,ARR_TIME_BLK,DEP_TIME_BLK,DEST,DEST_CITY_NAME,ORIGIN_CITY_NAME,ORIGIN,OP_UNIQUE_CARRIER
0,11986,13487,0.0,1928,0.0,408.0,20363,10.07,0.09,0.8,...,1192,7,14,14,13,133,118,13,15,0
1,11986,11433,0.0,1124,0.0,120.0,20363,10.07,0.09,0.8,...,4900,7,14,6,5,59,53,13,15,0
2,11986,13487,0.0,1415,0.0,408.0,20363,10.07,0.09,0.8,...,1251,7,14,9,8,133,118,13,15,0
3,11986,11433,0.0,1849,0.0,120.0,20363,10.07,0.09,0.8,...,1157,7,14,13,12,59,53,13,15,0
4,11986,11433,0.0,2040,0.0,120.0,20363,10.07,0.09,0.8,...,744,7,14,15,14,59,53,13,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,10299,11292,0.0,658,0.0,2405.0,19977,7.83,0.02,0.3,...,321,16,2,1,18,54,51,2,2,13
868481,10299,11292,0.0,522,0.0,2405.0,19977,7.83,0.02,0.3,...,1423,16,2,0,17,54,51,2,2,13
868482,10299,14747,0.0,501,0.0,1448.0,19790,7.83,0.02,0.3,...,1556,6,2,0,0,177,174,2,2,4
868483,10299,13487,0.0,608,0.0,2519.0,19790,7.83,0.02,0.3,...,2264,6,2,1,16,133,118,2,2,4


### Splitting into Train, Validation, and Test 
Our target 'DEP_DEL15' = 1 if the airplane is delayed for at least 15 minutes

In [99]:
X = df_manual.drop(columns=['DEP_DEL15'])

y = df_manual['DEP_DEL15']

In [100]:
# split away the test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

# split the training into train and validation
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (607939, 21) (607939,)
Validation set shapes: (130273, 21) (130273,)
Testing set shapes: (130273, 21) (130273,)


In [101]:
#Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

### LASSO Regression

In [102]:
# create the lasso model
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)

lasso_cv.fit(X_train, y_train)

print('The optimal alpha is', lasso_cv.alpha_)

# finding the predicted probability
y_pred_proba = lasso_cv.predict(X_valid)
# converting to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)   # might need to lower threshold
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

The optimal alpha is 0.001
Accuracy: 0.8344937170403691


In [103]:
#Feature selection in Lasso
col_names = list(X.columns)
selected_features = [col_names[i] for i in range(len(col_names)) if lasso_cv.coef_[i] != 0]
selected_features = selected_features + ["DEP_DEL15"]

lassoCoef = pd.Series(lasso_cv.coef_, index=X.columns)
print(f'The selected features are \n{lassoCoef[lassoCoef != 0]}')

The selected features are 
ORIGIN_AIRPORT_ID   -0.003224
DISTANCE             0.003370
AIRLINE_ID           0.010682
AWND                 0.011064
PRCP                 0.009595
SNOW                 0.009323
SNWD                 0.003848
TAIL_NUM            -0.001492
ARR_TIME_BLK         0.024906
DEP_TIME_BLK         0.041614
DEST                 0.001917
ORIGIN_CITY_NAME     0.002858
OP_UNIQUE_CARRIER    0.009938
dtype: float64


In [104]:
df_final = df_manual[selected_features]

In [105]:
df_final.head()

Unnamed: 0,ORIGIN_AIRPORT_ID,DISTANCE,AIRLINE_ID,AWND,PRCP,SNOW,SNWD,TAIL_NUM,ARR_TIME_BLK,DEP_TIME_BLK,DEST,ORIGIN_CITY_NAME,OP_UNIQUE_CARRIER,DEP_DEL15
0,11986,408.0,20363,10.07,0.09,0.8,0.0,1192,14,13,133,13,0,0.0
1,11986,120.0,20363,10.07,0.09,0.8,0.0,4900,6,5,59,13,0,0.0
2,11986,408.0,20363,10.07,0.09,0.8,0.0,1251,9,8,133,13,0,0.0
3,11986,120.0,20363,10.07,0.09,0.8,0.0,1157,13,12,59,13,0,0.0
4,11986,120.0,20363,10.07,0.09,0.8,0.0,744,15,14,59,13,0,0.0


## Machine Learning Experiments
All models have been hyperparameter tuned using an exhaustive search or randomized search

### Splitting into Train, Validation, and Test 
Our target 'DEP_DEL15' = 1 if the airplane is delayed for longer than 15 minutes

In [106]:
X = df_final.drop(columns=['DEP_DEL15'])
y = df_final['DEP_DEL15']


# split away the test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

# split the training into train and validation
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (607939, 13) (607939,)
Validation set shapes: (130273, 13) (130273,)
Testing set shapes: (130273, 13) (130273,)


In [107]:
# Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

### Logistic Regression

In [111]:
lr = LogisticRegression(C=0.001)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_valid)


print("Accuracy", accuracy_score(y_valid, y_pred))
print("Recall", recall_score(y_valid, y_pred))
print("Precision", precision_score(y_valid, y_pred))
print("F1-Score", f1_score(y_valid, y_pred))

Accuracy 0.8327972795590798
Recall 4.590735894963963e-05
Precision 1.0
F1-Score 9.181050312155711e-05


#### Testing on Unseen Data

### XGBoost

In [109]:
param_grid = {"learning_rate": [],
              "max_depth": []}


xgb_model = xgb.XGBClassifier()

xgb_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)

xgb_search.fit(X_train, y_train)

y_pred = xgb_search.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy", accuracy)
print("Recall", recall)
print("Precision", precision)
print("F1-Score", f1)

ValueError: Parameter grid for parameter 'learning_rate' need to be a non-empty sequence, got: []

#### Testing on Unseen Data