In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
import category_encoders as ce


In [48]:
# Load the flight records CSV into a DataFrame.
# NOTE(review): the path points into the current user's ~/Downloads folder,
# so this cell only runs on a machine that has the file at that location.
flight_df = pd.read_csv('~/Downloads/flight_data.csv')

In [49]:
# Display the column labels of the loaded frame.
flight_df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
       'Origin', 'Dest', 'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOff',
       'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDel15',
       'Cancelled', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime',
       'AirTime', 'Flights', 'Distance', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Carrier',
       'Full-time', 'Part-time', 'Grand Total'],
      dtype='object')

In [50]:
# Display summary statistics (count, mean, std, quartiles, min/max) per column.
flight_df.describe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Flight_Number_Reporting_Airline,DepTime,DepDelay,TaxiOut,WheelsOff,...,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Full-time,Part-time,Grand Total
count,17412880.0,17412880.0,17412880.0,17412880.0,17412880.0,17412880.0,16855120.0,16854880.0,16849360.0,16849360.0,...,17412876.0,17412880.0,2819049.0,2819049.0,2819049.0,2819049.0,2819049.0,17360740.0,17360740.0,17360740.0
mean,2021.117,2.504496,6.516729,15.75754,3.996971,2553.394,1325.829,8.725153,16.11165,1348.385,...,1.0,807.3822,26.71508,4.121156,11.53856,0.172152,23.48876,42070.56,3975.982,46046.54
std,0.8012217,1.132727,3.480794,8.792261,2.003826,1765.038,490.8947,47.14331,8.695804,492.0616,...,0.0,582.6777,74.41445,32.71135,29.21865,3.915622,54.12038,33029.34,4485.664,36234.38
min,2020.0,1.0,1.0,1.0,1.0,1.0,1.0,-128.0,1.0,1.0,...,1.0,29.0,0.0,0.0,0.0,0.0,0.0,2357.0,0.0,2374.0
25%,2020.0,1.0,3.0,8.0,2.0,1065.0,919.0,-6.0,11.0,934.0,...,1.0,383.0,0.0,0.0,0.0,0.0,0.0,10996.0,998.0,13191.0
50%,2021.0,3.0,7.0,16.0,4.0,2189.0,1320.0,-3.0,14.0,1332.0,...,1.0,660.0,6.0,0.0,0.0,0.0,0.0,53367.0,1805.0,54812.0
75%,2022.0,4.0,10.0,23.0,6.0,3914.0,1731.0,5.0,19.0,1744.0,...,1.0,1045.0,26.0,0.0,16.0,0.0,26.0,69124.0,5655.0,78809.0
max,2022.0,4.0,12.0,31.0,7.0,9888.0,2400.0,3890.0,256.0,2400.0,...,1.0,5812.0,3864.0,2363.0,1740.0,1245.0,2361.0,97373.0,16424.0,109108.0


## Shared Preprocessing

In [51]:
# Keep only flights that were neither cancelled nor diverted, then show
# how many rows remain.
flight_df = flight_df.loc[(flight_df['Cancelled'] == 0) & (flight_df['Diverted'] == 0)]
len(flight_df)

16809809

In [52]:
# Reducing dimensionality of Carrier with custom grouping:
# every carrier that appears in at most 9000 rows is collapsed into 'Other'.
value_counts = flight_df['Carrier'].value_counts()
to_remove = value_counts[value_counts <= 9000].index
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column. That form is chained assignment: it operates on an
# intermediate Series and is not guaranteed to update flight_df (it never does
# under pandas Copy-on-Write).
flight_df = flight_df.assign(Carrier=flight_df['Carrier'].replace(to_remove, 'Other'))

In [53]:
def binary_encode(df, columns):
    """Swap the given columns of ``df`` for their binary-encoded form.

    A ``category_encoders.BinaryEncoder`` is fitted on ``df[columns]``; the
    original columns are removed and the encoder's output columns are
    appended on the right.

    Args:
        df: Source DataFrame.
        columns: List of column names to encode.

    Returns:
        A new DataFrame holding the untouched columns followed by the
        encoded ones.
    """
    encoded_part = ce.BinaryEncoder(cols=columns).fit_transform(df[columns])
    untouched_part = df.drop(columns=columns)
    return pd.concat([untouched_part, encoded_part], axis=1)

In [54]:
def label_encode_airport_codes(df):
    """Replace the 'Origin' and 'Dest' columns with shared integer codes.

    A single LabelEncoder is fitted on the union of both columns, so a given
    airport code maps to the same integer whether it appears as an origin or
    as a destination.

    Args:
        df: DataFrame containing 'Origin' and 'Dest' columns.

    Returns:
        A new DataFrame without 'Origin'/'Dest' and with 'Origin_encoded' and
        'Dest_encoded' appended. The input frame is left unmodified.
    """
    label_encoder = LabelEncoder()
    # Series.append was removed in pandas 2.0; pd.concat is its replacement.
    label_encoder.fit(pd.concat([df['Origin'], df['Dest']]).unique())

    # Write the encoded columns onto the dropped copy so the caller's frame
    # does not silently gain extra columns.
    encoded = df.drop(['Dest', 'Origin'], axis=1)
    encoded['Origin_encoded'] = label_encoder.transform(df['Origin'])
    encoded['Dest_encoded'] = label_encoder.transform(df['Dest'])
    return encoded

## Model run 1

Timezones instead of airports

In [55]:
# Predictor columns for this run; the cell displays how many there are.
features = [
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    'Dest', 'Origin', 'Distance', 'Carrier',
    'Full-time', 'Part-time', 'Grand Total',
    'CRSArrTime', 'CRSElapsedTime',
]
len(features)

14

In [56]:
# Restrict the frame to the predictors plus the target column and discard
# every row that has a missing value in any of them.
temp_features = [*features, 'ArrDel15']
flight_df = flight_df[temp_features].dropna()

# Split into the predictor matrix and the target column.
X = flight_df[features]
y = flight_df['ArrDel15']

In [57]:
# Encoding categorical variables: airports are binary-encoded, carriers are
# one-hot encoded.
X = binary_encode(X, ['Dest', 'Origin'])
# pandas >= 2.0 produces boolean dummy columns by default; mixed with the
# numeric columns that yields an object array when the frame is converted for
# Keras. Pinning uint8 (the pre-2.0 default) keeps the matrix numeric on every
# pandas version.
X = pd.get_dummies(X, columns=['Carrier'], dtype='uint8')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [58]:
# Feed-forward binary classifier: two ReLU hidden layers (128 and 64 units)
# followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs, holding back 20% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 28137/335237 [=>............................] - ETA: 2:07 - loss: 0.4520 - accuracy: 0.8324

KeyboardInterrupt: 

In [59]:
# Score the trained model on the held-out test split and print both metrics.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

# NOTE(review): this rebinds the global `features` to the same list defined
# for this run; presumably kept here as a record of which feature set
# produced the numbers below - confirm.
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
        'Dest', 'Origin', 'Distance', 'Carrier',
        'Full-time', 'Part-time', 'Grand Total', 'CRSArrTime', 'CRSElapsedTime']
        
# Results pasted from this cell's output:
# Test accuracy: 0.8320582509040833
# Test loss: 0.45254066586494446

Test accuracy: 0.8320582509040833
Test loss: 0.45254066586494446


In [46]:
# Score the model on the held-out test split and print both metrics.
# NOTE(review): this cell's execution count (46) is lower than every cell
# above it, so its output presumably comes from an earlier run that used the
# feature list below - confirm before comparing it with the run above.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)


# Running this cell rebinds the global `features` to a different list than
# the one defined for "Model run 1".
# NOTE(review): DepTime, DepDelay, TaxiOut and WheelsOff look like values that
# are only known once the flight has departed - verify they are meant to be
# available at prediction time.
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
       'Dest', 'Origin',
       'DepTime', 'DepDelay','TaxiOut', 'WheelsOff', 
       'Distance', 'Carrier','Full-time', 'Part-time', 'Grand Total', 'CRSArrTime']
# Results pasted from this cell's output:
# Test accuracy: 0.949720561504364
# Test loss: 0.1481556445360183

Test accuracy: 0.949720561504364
Test loss: 0.14815564453601837
