<a href="https://colab.research.google.com/github/MichaelJP-DS/Flight-Arrival-Classification-Models/blob/main/Flight_Arrival_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Classification Model

Based on limited information regarding Month, Day, Day_of_Week, Origin Airport/Destination Airport, Scheduled Departure Time, how long the flight is going to be, how accurately can we predict whether a flight will be delayed.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [177]:
# Import Statements

# Libraries
import pandas as pd
import numpy as np

# Data Processing
from sklearn.preprocessing import minmax_scale
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# ML Boosted Model
from xgboost import XGBClassifier

# Neural Network/ML
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import relu, softmax
from keras.layers import Dropout

# Viz
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import confusion_matrix
import seaborn as sns



In [89]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [161]:
df_flights = pd.read_csv('/content/drive/MyDrive/Classification Data/flights.csv', low_memory=False)

In [162]:
df_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0,0,,,,,,


In [182]:
df_flights['DEPARTURE_DELAY'].describe()

count    5.732926e+06
mean     9.370158e+00
std      3.708094e+01
min     -8.200000e+01
25%     -5.000000e+00
50%     -2.000000e+00
75%      7.000000e+00
max      1.988000e+03
Name: DEPARTURE_DELAY, dtype: float64

In [197]:
def wrangle(df):

  # Eliminate columns that may result in data leakage
  df = df.drop(columns= ['WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'ARRIVAL_TIME', 'YEAR', 'DAY'])

  # Drop rows when flight was cancelled but keep data when flight was not
  df = df.drop(df[df.CANCELLED == 1].index)

  df = df.drop(df[df.DIVERTED == 1].index)

  # Now drop cancelled column the resulting rows where not impacted by cancelled flight
  df = df.drop(columns= ['CANCELLED', 'CANCELLATION_REASON'])

  # Drop rows where the flight is delayed greater than 30 minutes - too leaky may tinker with this
  df = df.drop(df[df.DEPARTURE_DELAY >= 15.0].index)

  # Drop rows related to reason for departure delay & high null value count.
  df = df.drop(columns= ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRLINE_DELAY','DIVERTED'])

  # Drop High Cardinality Column
  df = df.drop(columns= ['TAIL_NUMBER', 'FLIGHT_NUMBER'])

  # Label encode Airline, Origin Airport and Destination Airport
  enc = preprocessing.LabelEncoder()
  df['AIRLINE'] = enc.fit_transform(df['AIRLINE'])
  df['ORIGIN_AIRPORT'] = enc.fit_transform(df['ORIGIN_AIRPORT'])
  df['DESTINATION_AIRPORT'] = enc.fit_transform(df['DESTINATION_AIRPORT'])

  # Impute limited null values with column mean
  df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(4)
  df['TAXI_IN'] = df['TAXI_IN'].fillna(7.4)  

  # Set up target column in classes
  df.loc[df['ARRIVAL_DELAY'] <= 0.0, 'ARRIVAL' ] = 0
  df.loc[(df['ARRIVAL_DELAY'] > 0) & (df['ARRIVAL_DELAY'] <= 15.0), 'ARRIVAL'] = 1
  df.loc[df['ARRIVAL_DELAY'] > 15.0, 'ARRIVAL' ] = 2
  
  # Convert arrival to Integer for NN
  df['ARRIVAL'] = df['ARRIVAL'].astype(int)

  # Drop modified column
  df = df.drop(columns='ARRIVAL_DELAY')

  # Normalize data for processing

  df['MONTH'] = minmax_scale(df['MONTH'])      
  df['DAY_OF_WEEK'] = minmax_scale(df['DAY_OF_WEEK'])
  df['AIRLINE'] = minmax_scale(df['AIRLINE'])
  df['ORIGIN_AIRPORT'] = minmax_scale(df['ORIGIN_AIRPORT'])
  df['DESTINATION_AIRPORT'] = minmax_scale(df['DESTINATION_AIRPORT'])
  df['SCHEDULED_DEPARTURE'] = minmax_scale(df['SCHEDULED_DEPARTURE'])
  df['DEPARTURE_TIME'] = minmax_scale(df['DEPARTURE_TIME'])
  df['DEPARTURE_DELAY'] = minmax_scale(df['DEPARTURE_DELAY'])  
  df['TAXI_OUT'] = minmax_scale(df['TAXI_OUT'])
  df['SCHEDULED_TIME'] = minmax_scale(df['SCHEDULED_TIME'])      
  df['DISTANCE'] = minmax_scale(df['DISTANCE'])            
  df['TAXI_IN'] = minmax_scale(df['TAXI_IN'])                
  df['SCHEDULED_ARRIVAL'] = minmax_scale(df['SCHEDULED_ARRIVAL'])
  df['ARRIVAL'] = minmax_scale(df['ARRIVAL'])                

  return df

## Scratch Area

In [198]:
new_df = wrangle(df_flights)

In [199]:
new_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,SCHEDULED_TIME,DISTANCE,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL
0,0.0,0.5,0.076923,0.515152,0.929936,0.001696,0.980825,0.739583,0.114286,0.267143,0.286147,0.012146,0.181934,0.0
1,0.0,0.5,0.0,0.76874,0.863057,0.003817,0.000417,0.770833,0.062857,0.374286,0.464257,0.012146,0.317642,0.0
2,0.0,0.5,0.846154,0.931419,0.593949,0.008058,0.007086,0.833333,0.085714,0.382857,0.457391,0.040486,0.341391,0.5
3,0.0,0.5,0.0,0.76874,0.812102,0.008058,0.005836,0.802083,0.08,0.381429,0.46668,0.02834,0.340967,0.0
4,0.0,0.5,0.076923,0.929825,0.515924,0.010178,0.009587,0.84375,0.057143,0.31,0.286147,0.016194,0.135284,0.0


In [186]:
new_df.isnull().sum()

MONTH                  0
DAY_OF_WEEK            0
AIRLINE                0
ORIGIN_AIRPORT         0
DESTINATION_AIRPORT    0
SCHEDULED_DEPARTURE    0
SCHEDULED_TIME         0
AIR_TIME               0
DISTANCE               0
SCHEDULED_ARRIVAL      0
DELAY                  0
dtype: int64

In [187]:
new_df['SCHEDULED_TIME'].mean()

0.17699139153099414

In [83]:
def classcol(val):
  if val <= 0.0:
    return 1.0
  elif ((val > 0.0) & (val < 31.0)):
    return 2.0
  else: 
    return 3.0

## Train, Test, Split

In [201]:
y = new_df['ARRIVAL']

X = new_df.drop(columns='ARRIVAL')

In [202]:
y.shape

(4663586,)

In [203]:
X.shape

(4663586, 13)

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [205]:
y_test.value_counts()

0.0    357687
0.5     87712
1.0     20960
Name: ARRIVAL, dtype: int64

## Gradient Boosting Classifier

## Build Fully Connected Neural Network
Use early stopping and dropout
Use Relu activation function
Use softmax for output
multiclass crossentropy for loss function

In [207]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

model = Sequential()
model.add(Dense(1000, input_dim=13, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1000, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='Adam', 
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [209]:
model.fit(X_train, y_train, 
          validation_data=(X_test, y_test),
          workers=-1, 
          epochs=1, 
          batch_size=256, 
          verbose=1,
          callbacks=[callback])



<keras.callbacks.History at 0x7fa4448ce890>