## This notebook is dedicated to predicting our target: whether a given flight got delayed or not

In [1]:
import pandas as pd
import numpy as np

### Start by importing the basic data libraries and loading our clean data

In [2]:
# Read the cleaned airlines CSV into a DataFrame.
clean = pd.read_csv(filepath_or_buffer='../Data/clean/cleaned_airlines.csv')

In [3]:
# Show the loaded DataFrame as this cell's output.
clean

Unnamed: 0,airline,flight_id,airport_from,airport_to,day_of_week,departure_hour,flight_length,delay,part_of_day
0,CO,269,SFO,IAH,3,0,205,1,night
1,US,1558,PHX,CLT,3,0,222,1,night
2,AA,2400,LAX,DFW,3,0,165,1,night
3,AA,2466,SFO,DFW,3,0,195,1,night
4,AS,108,ANC,SEA,3,0,202,0,night
...,...,...,...,...,...,...,...,...,...
539378,CO,178,OGG,SNA,5,23,326,0,night
539379,FL,398,SEA,ATL,5,23,305,0,night
539380,FL,609,SFO,MKE,5,23,255,0,night
539381,UA,78,HNL,SFO,5,23,313,1,night


### Split into X and y and check right now the class imbalance

In [4]:
# Target: the delay flag column.
y = clean['delay']
# Features: everything except the target and the flight identifier.
X = clean.drop(columns=['delay', 'flight_id'])

In [5]:
# Target values, followed by the share of each class as a percentage.
display(y)
display(y.value_counts().div(len(y)).mul(100))
print()
# Feature frame that will be fed into the train/test split.
display(X)

0         1
1         1
2         1
3         1
4         0
         ..
539378    0
539379    0
539380    0
539381    1
539382    1
Name: delay, Length: 539383, dtype: int64

0    55.455771
1    44.544229
Name: delay, dtype: float64




Unnamed: 0,airline,airport_from,airport_to,day_of_week,departure_hour,flight_length,part_of_day
0,CO,SFO,IAH,3,0,205,night
1,US,PHX,CLT,3,0,222,night
2,AA,LAX,DFW,3,0,165,night
3,AA,SFO,DFW,3,0,195,night
4,AS,ANC,SEA,3,0,202,night
...,...,...,...,...,...,...,...
539378,CO,OGG,SNA,5,23,326,night
539379,FL,SEA,ATL,5,23,305,night
539380,FL,SFO,MKE,5,23,255,night
539381,UA,HNL,SFO,5,23,313,night


### The flight id was dropped here: it has far too many distinct values to be a useful feature, and as a rule of thumb we shouldn't keep id columns in prediction models  
### Another interesting thing we noticed is that while we have a slight class imbalance (roughly 55% / 45%), it's small enough that we'll choose NOT to balance it

In [6]:
from sklearn.model_selection import train_test_split

### Let's do our train test split. Since we have a huge dataset with over half a million rows, we can use a bigger test size

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pickle

### After the train test split, let's do some prep work for the scaling and encoding

In [9]:
# Numeric columns of the training features.
X_train_num = X_train.select_dtypes(include=np.number)
# Object-dtype (string) columns of the training features.
X_train_cat = X_train.select_dtypes(include=object)

In [10]:
# Preview the first rows of the categorical and numerical training frames.
display(X_train_cat.head(), X_train_num.head())

Unnamed: 0,airline,airport_from,airport_to,part_of_day
511906,XE,IAH,CLT,morning
519215,B6,SRQ,JFK,afternoon
445387,MQ,DFW,LIT,afternoon
146144,EV,IAD,ROC,morning
290903,WN,SEA,OAK,afternoon


Unnamed: 0,day_of_week,departure_hour,flight_length
511906,4,10,139
519215,4,17,164
445387,7,17,70
146144,4,12,73
290903,5,17,125


### We have now split the train set into categorical (for encoding) and numerical (for scaling)

In [11]:
# flight_length is the only numeric column that gets standardised; keep it as
# a one-column DataFrame (double brackets), not a Series.
X_train_scaling = X_train_num[['flight_length']]
# The remaining numeric columns are passed through unscaled.
X_train_no_scaling = X_train_num.drop(columns=['flight_length'])

In [12]:
# Preview the column to be scaled and the columns left untouched.
display(X_train_scaling.head(), X_train_no_scaling.head())

Unnamed: 0,flight_length
511906,139
519215,164
445387,70
146144,73
290903,125


Unnamed: 0,day_of_week,departure_hour
511906,4,10
519215,4,17
445387,7,17
146144,4,12
290903,5,17


### Since only flight length needs scaling, we split it off from the other numerical columns first; we'll now repeat the same steps for the test set

In [13]:
# Numeric columns of the test features.
X_test_num = X_test.select_dtypes(include=np.number)
# Object-dtype (string) columns of the test features.
X_test_cat = X_test.select_dtypes(include=object)

In [14]:
# Same separation as for the training set: flight_length (to be scaled) as a
# one-column DataFrame, the other numeric columns left as they are.
X_test_scaling = X_test_num[['flight_length']]
X_test_no_scaling = X_test_num.drop(columns=['flight_length'])

In [15]:
# Preview the three test-set pieces: categorical, to-scale, and unscaled.
display(
    X_test_cat.head(),
    X_test_scaling.head(),
    X_test_no_scaling.head(),
)

Unnamed: 0,airline,airport_from,airport_to,part_of_day
448599,B6,BOS,TPA,night
497739,AA,DFW,DTW,afternoon
65638,9E,ATL,HPN,afternoon
205461,UA,SFO,BOS,night
209825,AS,SFO,SEA,morning


Unnamed: 0,flight_length
448599,199
497739,155
65638,143
205461,340
209825,134


Unnamed: 0,day_of_week,departure_hour
448599,7,20
497739,3,14
65638,6,17
205461,7,22
209825,1,9


### With all of the data from both sets neatly split, it's about time we start doing some scaling and encoding

In [16]:
# Fit the scaler on the training flight_length only, so the test set does not
# influence the learned mean and standard deviation.
scaler = StandardScaler()
scaler.fit(X_train_scaling)

# Persist the fitted scaler to disk.
with open("../Scalers/StandardScaler.pkl", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [17]:
# Standardise flight_length in both sets with the scaler fitted on the
# training data.
num_train_stand = scaler.transform(X_train_scaling)
num_test_stand = scaler.transform(X_test_scaling)

# Wrap the transformed arrays in DataFrames that keep the original column
# name. No index is passed, so both frames get a fresh 0..n-1 index.
num_train_pd = pd.DataFrame(data=num_train_stand, columns=X_train_scaling.columns)
num_test_pd = pd.DataFrame(data=num_test_stand, columns=X_test_scaling.columns)

In [18]:
# Summary statistics of the scaled column, one row per feature, for the
# training set and then the test set.
display(num_train_pd.describe().transpose())
print()
display(num_test_pd.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
flight_length,377568.0,-5.4471410000000006e-17,1.000001,-1.884034,-0.729946,-0.245515,0.424141,7.448399





Unnamed: 0,count,mean,std,min,25%,50%,75%,max
flight_length,161815.0,-0.001403,0.996751,-1.884034,-0.729946,-0.245515,0.424141,7.448399


### With the scaling out of the way, let's do the encoding next

In [19]:
# Fit the one-hot encoder on the training categoricals only; drop='first'
# removes one level per feature.
# handle_unknown='ignore' makes transform encode a category that was not seen
# during fit (e.g. an airport that only occurs in the test split or in new
# data scored with the pickled encoder) as all zeros for that feature instead
# of raising. With the default ('error') a single unseen value aborts the
# whole transform.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat)

# Persist the fitted encoder to disk.
with open("../Encoders/OneHotEncoder.pkl", 'wb') as file:
    pickle.dump(encoder, file)

In [20]:
# Encode both sets with the encoder fitted on the training data; .toarray()
# converts the encoder's sparse output to a dense array.
cat_train_encoded = encoder.transform(X_train_cat).toarray()
cat_test_encoded = encoder.transform(X_test_cat).toarray()

# Wrap the dense arrays in DataFrames labelled with the generated feature
# names. No index is passed, so both frames get a fresh 0..n-1 index.
cat_train_pd = pd.DataFrame(data=cat_train_encoded, columns=encoder.get_feature_names_out())
cat_test_pd = pd.DataFrame(data=cat_test_encoded, columns=encoder.get_feature_names_out())

In [21]:
# Preview the first five encoded rows of the training set, then the test set.
display(cat_train_pd.head(5))
print()
display(cat_test_pd.head(5))

Unnamed: 0,airline_AA,airline_AS,airline_B6,airline_CO,airline_DL,airline_EV,airline_F9,airline_FL,airline_HA,airline_MQ,...,airport_to_TYS,airport_to_UTM,airport_to_VLD,airport_to_VPS,airport_to_WRG,airport_to_XNA,airport_to_YAK,airport_to_YUM,part_of_day_morning,part_of_day_night
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0





Unnamed: 0,airline_AA,airline_AS,airline_B6,airline_CO,airline_DL,airline_EV,airline_F9,airline_FL,airline_HA,airline_MQ,...,airport_to_TYS,airport_to_UTM,airport_to_VLD,airport_to_VPS,airport_to_WRG,airport_to_XNA,airport_to_YAK,airport_to_YUM,part_of_day_morning,part_of_day_night
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Discard the original row labels so these frames share the 0..n-1 index of
# the scaled and encoded frames; the column-wise concat that follows aligns
# rows on the index.
X_train_no_scaling = X_train_no_scaling.reset_index(drop=True)
X_test_no_scaling = X_test_no_scaling.reset_index(drop=True)

In [23]:
# Assemble the final feature matrices side by side: unscaled numerics,
# scaled flight_length, then the one-hot columns.
X_train_f = pd.concat(
    [X_train_no_scaling, num_train_pd, cat_train_pd],
    axis='columns',
)
X_test_f = pd.concat(
    [X_test_no_scaling, num_test_pd, cat_test_pd],
    axis='columns',
)

In [24]:
# Show the assembled training feature matrix as this cell's output.
X_train_f

Unnamed: 0,day_of_week,departure_hour,flight_length,airline_AA,airline_AS,airline_B6,airline_CO,airline_DL,airline_EV,airline_F9,...,airport_to_TYS,airport_to_UTM,airport_to_VLD,airport_to_VPS,airport_to_WRG,airport_to_XNA,airport_to_YAK,airport_to_YUM,part_of_day_morning,part_of_day_night
0,4,10,0.096437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4,17,0.452637,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,17,-0.886674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,12,-0.843930,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,17,-0.103035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377563,6,5,-1.043402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
377564,7,21,0.025197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
377565,7,10,2.547091,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
377566,7,19,-0.430739,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Show the assembled test feature matrix as this cell's output.
X_test_f

Unnamed: 0,day_of_week,departure_hour,flight_length,airline_AA,airline_AS,airline_B6,airline_CO,airline_DL,airline_EV,airline_F9,...,airport_to_TYS,airport_to_UTM,airport_to_VLD,airport_to_VPS,airport_to_WRG,airport_to_XNA,airport_to_YAK,airport_to_YUM,part_of_day_morning,part_of_day_night
0,7,20,0.951316,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,14,0.324405,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,17,0.153429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,22,2.960283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,9,0.025197,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161810,1,13,0.324405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161811,7,11,-0.174275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
161812,7,15,-0.245515,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161813,7,8,0.409893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
# Sanity check: the row counts before and after preprocessing should match
# for each set (train first, then test).
print(len(X_train), len(X_train_f), sep='\n')
print()
print(len(X_test), len(X_test_f), sep='\n')

377568
377568

161815
161815


### Currently we have our data very nicely prepared to train our prediction models, but we still have one problem: after the encoding we now have over 600 features
### We need to reduce this number, so we'll use PCA to extract the more relevant ones

In [27]:
from sklearn.decomposition import PCA

In [28]:
# Fit a PCA with default settings (no n_components given) on the assembled
# training features.
# NOTE(review): day_of_week and departure_hour enter X_train_f on their raw
# scales, while flight_length is standardised and the one-hot columns are 0/1.
# PCA is variance-based, so the two unscaled columns likely dominate the
# leading components. Confirm whether they should be scaled before this step.
pca = PCA()
pca.fit(X_train_f)

In [29]:
# Print the fraction of total variance explained by each principal component.
print(pca.explained_variance_ratio_)

[7.36915248e-01 1.25951066e-01 3.49718189e-02 5.77488331e-03
 4.37570968e-03 3.47383332e-03 3.03801710e-03 2.95670585e-03
 2.59252071e-03 2.52020169e-03 2.39418422e-03 2.36508612e-03
 2.17709794e-03 1.94671277e-03 1.60746929e-03 1.59569072e-03
 1.50847875e-03 1.47231513e-03 1.42212138e-03 1.34424268e-03
 1.28024662e-03 1.15534104e-03 1.09793409e-03 1.08572306e-03
 1.04571466e-03 1.02353937e-03 9.92769695e-04 9.55959235e-04
 9.34703256e-04 8.45649666e-04 8.06666871e-04 7.93848561e-04
 7.91090370e-04 7.46867162e-04 7.23472639e-04 7.14894396e-04
 7.09119671e-04 7.00338048e-04 6.95454211e-04 6.80356521e-04
 6.69870948e-04 6.59964703e-04 6.40585288e-04 6.26525128e-04
 6.22604309e-04 6.11021010e-04 6.07577729e-04 5.90806670e-04
 5.68842041e-04 5.52731711e-04 5.33847802e-04 5.25697461e-04
 5.09178105e-04 5.07216487e-04 5.03700013e-04 4.80861003e-04
 4.74700404e-04 4.59514682e-04 4.55948493e-04 4.51351098e-04
 4.27858598e-04 4.21546232e-04 4.14464957e-04 4.09443382e-04
 4.06325198e-04 4.047226