In [224]:
import pandas as pd

from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the whole session, including convergence and deprecation
# warnings raised by the libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

## Importing the Data

In [225]:
# Read the merged flight report CSV (path is relative to the notebook's
# working directory) into a DataFrame.
flight_report_csv = "flight_report_merged.csv"
flight_report = pd.read_csv(flight_report_csv)

In [226]:
# Preview the loaded frame; the notebook renders a truncated view showing only
# the first and last rows and columns.
flight_report

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,Unnamed: 32,AIRLINE_ID,CARRIER_NAME,AIRPORT_NAME,DATE,AWND,PRCP,SNOW,SNWD,TMAX
0,1,2,3,9E,N320PQ,3281,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
1,1,2,3,9E,N935XJ,3300,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
2,1,2,3,9E,N331PQ,3348,11986,GRR,"Grand Rapids, MI",13487,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
3,1,2,3,9E,N314PQ,3369,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
4,1,2,3,9E,N232PQ,3389,11986,GRR,"Grand Rapids, MI",11433,...,,20363,Endeavor Air Inc.,GRAND RAPIDS GERALD R FORD INTERNATIONAL AIRPO...,2019-01-02,10.07,0.09,0.8,0.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,12,29,7,UA,N14231,1104,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868481,12,29,7,UA,N36247,239,10299,ANC,"Anchorage, AK",11292,...,,19977,United Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868482,12,29,7,DL,N3761R,565,10299,ANC,"Anchorage, AK",14747,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0
868483,12,29,7,DL,N553NW,1601,10299,ANC,"Anchorage, AK",13487,...,,19790,Delta Air Lines Inc.,"ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A...",2019-12-29,7.83,0.02,0.3,7.1,29.0


## Feature Selection

### Manual Selection
In this section we drop the features that are not needed to predict airline delays from weather data and general airline details.

In [227]:
# Columns removed before modelling: the raw date, arrival/elapsed-time
# outcomes, the per-cause delay breakdown, the "Unnamed: 32" column, the
# scheduled departure/elapsed times, the flight number and the calendar fields.
# The target column DEP_DEL15 is deliberately kept.
# NOTE(review): DEP_TIME stays in the frame while CRS_DEP_TIME is dropped. If
# DEP_TIME is the actual departure time, keeping it next to DEP_TIME_BLK may
# leak the delay into the features -- confirm before trusting model scores.
columns_to_drop = [
    "DATE",
    "ARR_TIME",
    "ARR_DELAY_NEW",
    "CANCELLATION_CODE",
    "ACTUAL_ELAPSED_TIME",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "Unnamed: 32",
    "CRS_DEP_TIME",
    "DEP_DELAY_NEW",
    "CRS_ELAPSED_TIME",
    "OP_CARRIER_FL_NUM",
    "MONTH",
    "DAY_OF_WEEK",
    "DAY_OF_MONTH",
]
df_manual = flight_report.drop(columns=columns_to_drop)


### Transforming Categorical Features

In [228]:
# Inspect the column dtypes; the object-typed columns are the ones that get
# integer-encoded in the next cell.
df_manual.dtypes

OP_UNIQUE_CARRIER     object
TAIL_NUM              object
ORIGIN_AIRPORT_ID      int64
ORIGIN                object
ORIGIN_CITY_NAME      object
DEST_AIRPORT_ID        int64
DEST                  object
DEST_CITY_NAME        object
DEP_TIME             float64
DEP_DEL15            float64
DEP_TIME_BLK          object
CRS_ARR_TIME           int64
ARR_TIME_BLK          object
CANCELLED            float64
DISTANCE             float64
DISTANCE_GROUP         int64
AIRLINE_ID             int64
CARRIER_NAME          object
AIRPORT_NAME          object
AWND                 float64
PRCP                 float64
SNOW                 float64
SNWD                 float64
TMAX                 float64
dtype: object

In [229]:
# Object-typed columns to be replaced by integer codes.
cat_features = [
    "TAIL_NUM",
    "CARRIER_NAME",
    "AIRPORT_NAME",
    "DEP_TIME_BLK",
    "ARR_TIME_BLK",
    "DEST",
    "DEST_CITY_NAME",
    "ORIGIN_CITY_NAME",
    "ORIGIN",
    "OP_UNIQUE_CARRIER",
]

# A single encoder is re-fitted for each column, so no per-column mapping is
# retained after the loop.
le = LabelEncoder()

for feature in cat_features:
    # pop() removes the string column from the frame; assigning the encoded
    # values under the same name appends it again as the right-most column.
    df_manual[feature] = le.fit_transform(df_manual.pop(feature))

df_manual

Unnamed: 0,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_TIME,DEP_DEL15,CRS_ARR_TIME,CANCELLED,DISTANCE,DISTANCE_GROUP,AIRLINE_ID,AWND,...,TAIL_NUM,CARRIER_NAME,AIRPORT_NAME,DEP_TIME_BLK,ARR_TIME_BLK,DEST,DEST_CITY_NAME,ORIGIN_CITY_NAME,ORIGIN,OP_UNIQUE_CARRIER
0,11986,13487,1829.0,0.0,1928,0.0,408.0,2,20363,10.07,...,1192,7,14,13,14,133,118,13,15,0
1,11986,11433,1014.0,0.0,1124,0.0,120.0,1,20363,10.07,...,4900,7,14,5,6,59,53,13,15,0
2,11986,13487,1320.0,0.0,1415,0.0,408.0,2,20363,10.07,...,1251,7,14,8,9,133,118,13,15,0
3,11986,11433,1746.0,0.0,1849,0.0,120.0,1,20363,10.07,...,1157,7,14,12,13,59,53,13,15,0
4,11986,11433,1925.0,0.0,2040,0.0,120.0,1,20363,10.07,...,744,7,14,14,15,59,53,13,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868480,10299,11292,2344.0,0.0,658,0.0,2405.0,10,19977,7.83,...,321,16,2,18,1,54,51,2,2,13
868481,10299,11292,2209.0,0.0,522,0.0,2405.0,10,19977,7.83,...,1423,16,2,17,0,54,51,2,2,13
868482,10299,14747,24.0,0.0,501,0.0,1448.0,6,19790,7.83,...,1556,6,2,0,0,177,174,2,2,4
868483,10299,13487,2154.0,0.0,608,0.0,2519.0,11,19790,7.83,...,2264,6,2,16,1,133,118,2,2,4


### Splitting into Train, Validation, and Test 
Our target, `DEP_DEL15`, equals 1 if the flight's departure was delayed by 15 minutes or more, and 0 otherwise.

In [230]:
# Separate the target column from the predictor columns.
target_col = 'DEP_DEL15'

y = df_manual[target_col]

X = df_manual.drop(columns=[target_col])

In [238]:
# Fixed seed so the three partitions (and every result computed from them) are
# identical each time the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

# Hold out 30% of the rows; the remaining 70% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Split the held-out 30% in half: one half for validation, one half for test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (607939, 23) (607939,)
Validation set shapes: (130273, 23) (130273,)
Testing set shapes: (130273, 23) (130273,)


In [232]:
# Feature standardization: the scaler's statistics are learned from the
# training partition only and then applied unchanged to all three partitions.
# This overwrites X_train / X_valid / X_test with the scaled values, so re-run
# the split cell before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(partition) for partition in (X_train, X_valid, X_test)
)

### LASSO Regression

In [233]:
# Fit a Lasso regression, letting 5-fold cross-validation choose the
# regularisation strength from a fixed grid of candidates.
alpha_grid = [0.001, 0.01, 0.1, 1, 10]
lasso_cv = LassoCV(alphas=alpha_grid, cv=5).fit(X_train, y_train)

print('The optimal alpha is', lasso_cv.alpha_)

# LassoCV is a regressor, so predict() returns a continuous score for the 0/1
# target rather than a true probability.
y_pred_proba = lasso_cv.predict(X_valid)
# Scores above 0.5 are labelled as delayed (1), everything else as on time (0).
y_pred = (y_pred_proba > 0.5).astype(int)   # might need to lower threshold
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

The optimal alpha is 0.001
Accuracy: 0.842502618935809


In [234]:
# Feature selection with Lasso: keep every predictor whose fitted coefficient
# is non-zero. Coefficients are labelled with X.columns, i.e. the column order
# the model was trained on.
lassoCoef = pd.Series(lasso_cv.coef_, index=X.columns)
nonzero_coef = lassoCoef[lassoCoef != 0]

# The target is appended so it is carried along with the selected predictors.
selected_features = nonzero_coef.index.tolist() + ["DEP_DEL15"]

print(f'The selected features are \n{nonzero_coef}')

The selected features are 
ORIGIN_AIRPORT_ID   -0.003614
DEP_TIME             0.451364
CRS_ARR_TIME        -0.004693
DISTANCE_GROUP       0.007989
AIRLINE_ID           0.009749
AWND                 0.009010
PRCP                 0.008225
SNOW                 0.008050
SNWD                 0.004800
TAIL_NUM            -0.001666
AIRPORT_NAME        -0.000132
DEP_TIME_BLK        -0.369656
DEST                 0.000458
DEST_CITY_NAME       0.000045
ORIGIN_CITY_NAME     0.001076
OP_UNIQUE_CARRIER    0.006882
dtype: float64


In [235]:
# Restrict the encoded frame to the Lasso-selected predictors plus the target.
df_final = df_manual.loc[:, selected_features]

In [236]:
# Write the selected columns to disk, omitting the row index.
selected_csv_path = "flight_report_selected.csv"
df_final.to_csv(selected_csv_path, index=False)