<a href="https://colab.research.google.com/github/MichaelJP-DS/Flight-Arrival-Classification-Models/blob/main/Flight_Arrival_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Classification Model

Classify flight arrivals using a gradient boosting model and a simple neural network. This notebook is a work in progress; more to come.

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The flights CSV loaded later in this notebook is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Import Statements

# Libraries
import pandas as pd
import numpy as np

# Data Processing
from sklearn.preprocessing import minmax_scale
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# ML Boosted Model
from xgboost import XGBClassifier

# Neural Network/ML
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import relu, softmax
from keras.layers import Dropout

# Viz
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import confusion_matrix



In [89]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [14]:
# Load the raw flights table from the mounted Drive folder.
# low_memory=False is passed through to pandas' CSV parser
# (presumably to avoid mixed-dtype inference on this large file; confirm against the pandas docs).
df_flights = pd.read_csv('/content/drive/MyDrive/Classification Data/flights.csv', low_memory=False)

In [15]:
def wrangle(df, max_departure_delay=30.0, arrival_delay_fill=4, taxi_in_fill=7.4):
  """Clean the raw flights frame and return scaled features plus the ARRIVAL target.

  Steps, in order:
    1. Keep only rows that are not cancelled and whose DEPARTURE_DELAY is below
       ``max_departure_delay`` (rows where either value is missing are kept).
    2. Drop the post-departure, cancellation, delay-reason and high-cardinality
       columns.
    3. Label encode AIRLINE, ORIGIN_AIRPORT and DESTINATION_AIRPORT.
    4. Fill missing ARRIVAL_DELAY and TAXI_IN values with fixed constants.
    5. Bin ARRIVAL_DELAY into the integer target ARRIVAL and drop ARRIVAL_DELAY:
       0 -> delay <= 0, 1 -> 0 < delay <= 15, 2 -> delay > 15.
    6. Min-max scale each listed feature column (minmax_scale's default range).

  Parameters
  ----------
  df : pandas.DataFrame
      Raw flights table containing every column referenced below.
  max_departure_delay : float, default 30.0
      Rows with DEPARTURE_DELAY greater than or equal to this are removed.
  arrival_delay_fill : float, default 4
      Constant used for missing ARRIVAL_DELAY values. With the default, such
      rows end up in class 1 (0 < 4 <= 15).
  taxi_in_fill : float, default 7.4
      Constant used for missing TAXI_IN values.

  Returns
  -------
  pandas.DataFrame
      New frame (the input is not modified) holding the scaled feature columns
      followed by the integer ARRIVAL column. Original index labels are kept.

  Raises
  ------
  KeyError
      If an expected column is absent from ``df``.

  Notes
  -----
  Scaling statistics come from the frame passed in, so calling this on the
  full dataset before a train/test split lets test rows influence the scaling.
  """
  # Columns that may result in data leakage (original grouping; also removes YEAR and DAY).
  leaky_cols = ['WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'ARRIVAL_TIME', 'YEAR', 'DAY']
  # Only needed for the row filter below; uninformative once cancelled flights are gone.
  cancellation_cols = ['CANCELLED', 'CANCELLATION_REASON']
  # Reasons for the delay, with a high null value count.
  delay_reason_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRLINE_DELAY']
  # High cardinality identifiers.
  high_cardinality_cols = ['TAIL_NUMBER', 'FLIGHT_NUMBER']

  categorical_cols = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
  # NOTE(review): TAXI_IN is presumably only known after landing, which would make
  # it as leaky as the columns dropped above - confirm before relying on it.
  scaled_cols = [
      'MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
      'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
      'SCHEDULED_TIME', 'DISTANCE', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'DIVERTED',
  ]

  # Row filters. Boolean masks (instead of dropping by index label) stay correct
  # even when the incoming index contains duplicate labels. Comparisons against
  # NaN are False, so rows with a missing value are kept.
  was_cancelled = df['CANCELLED'] == 1
  long_departure_delay = df['DEPARTURE_DELAY'] >= max_departure_delay
  df = df.loc[~(was_cancelled | long_departure_delay)]

  # drop() returns a new frame, so the column assignments below never touch the caller's data.
  df = df.drop(columns=leaky_cols + cancellation_cols + delay_reason_cols + high_cardinality_cols)

  # Label encode Airline, Origin Airport and Destination Airport.
  # fit_transform refits the encoder for every column, so one instance is enough.
  enc = preprocessing.LabelEncoder()
  for col in categorical_cols:
    df[col] = enc.fit_transform(df[col])

  # Impute the limited null values with fixed constants (not a mean computed from this frame).
  # NOTE(review): ARRIVAL_DELAY is the source of the target, so this assigns a
  # label to rows whose outcome is unknown - confirm this is intended.
  df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(arrival_delay_fill)
  df['TAXI_IN'] = df['TAXI_IN'].fillna(taxi_in_fill)

  # Set up the target column in classes. Right-closed bins: (-inf, 0] -> 0, (0, 15] -> 1, (15, inf) -> 2.
  class_edges = [float('-inf'), 0.0, 15.0, float('inf')]
  df['ARRIVAL'] = pd.cut(df['ARRIVAL_DELAY'], bins=class_edges, labels=False, right=True)

  # Convert arrival to integer for the NN.
  df['ARRIVAL'] = df['ARRIVAL'].astype(int)

  # Drop the column the target was derived from.
  df = df.drop(columns='ARRIVAL_DELAY')

  # Normalize data for processing.
  for col in scaled_cols:
    df[col] = minmax_scale(df[col])

  return df

## Scratch Area

In [16]:
# Run the cleaning / encoding / scaling pipeline on the raw flights frame.
# new_df holds the feature columns plus the integer ARRIVAL target.
new_df = wrangle(df_flights)

In [6]:
# Peek at the first rows of the wrangled frame.
new_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,SCHEDULED_TIME,DISTANCE,TAXI_IN,SCHEDULED_ARRIVAL,DIVERTED,ARRIVAL
0,0.0,0.5,0.076923,0.515152,0.929936,0.001696,0.980825,0.63964,0.11236,0.267143,0.286147,0.012146,0.178825,0.0,0
1,0.0,0.5,0.0,0.76874,0.863057,0.003817,0.000417,0.666667,0.061798,0.374286,0.464257,0.012146,0.312213,0.0,0
2,0.0,0.5,0.846154,0.931419,0.593949,0.008058,0.007086,0.720721,0.08427,0.382857,0.457391,0.040486,0.335556,0.0,1
3,0.0,0.5,0.0,0.76874,0.812102,0.008058,0.005836,0.693694,0.078652,0.381429,0.46668,0.02834,0.33514,0.0,0
4,0.0,0.5,0.076923,0.929825,0.515924,0.010178,0.009587,0.72973,0.05618,0.31,0.286147,0.016194,0.132972,0.0,0


In [25]:
# value_counts() yields the per-class row counts; .sum() collapses them into
# the total number of labelled rows.
new_df['ARRIVAL'].value_counts().sum()

5074310

In [83]:
def classcol(val):
  """Map a scalar delay value to a class label.

  Returns 1.0 when ``val`` is at most 0, 2.0 when it lies strictly between
  0 and 31, and 3.0 for everything else (including NaN, for which every
  comparison is false).
  """
  # Guard clause for the first class.
  if val <= 0.0:
    return 1.0
  # Chained comparison covers the middle band; anything else falls through to 3.0.
  return 2.0 if 0.0 < val < 31.0 else 3.0

## Train, Test, Split

In [7]:
# Feature matrix: every column of the wrangled frame except the label.
X = new_df.drop(columns=['ARRIVAL'])

# Target vector: the integer ARRIVAL class.
y = new_df['ARRIVAL']

In [8]:
# Number of target labels.
y.shape

(5074310,)

In [9]:
# (rows, feature columns) of the feature matrix; the row count should match y.
X.shape

(5074310, 14)

In [10]:
# Hold out 10% of the rows as the test set; random_state is fixed so the split is repeatable.
# NOTE(review): no stratify= argument is given, so class proportions in each
# part are left to the shuffle - consider stratify=y given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [75]:
# Class distribution of the held-out labels (classes 0 / 1 / 2 as built in wrangle).
y_test.value_counts()

0    362577
1    105112
2     39742
Name: ARRIVAL, dtype: int64

## Gradient Boosting Classifier

## Build Fully Connected Neural Network
- Use early stopping and dropout.
- Use the ReLU activation function in the hidden layers.
- Use softmax for the output layer.
- Use sparse categorical cross-entropy as the loss function (multiclass, integer labels).

In [24]:
# Stop training once the validation loss has failed to improve for 3 epochs in a row.
# Monitoring the training loss would not detect overfitting; 'val_loss' requires
# validation data to be passed to model.fit.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Take the input width from the training data so the network follows the
# feature set instead of a hard-coded column count.
n_features = X_train.shape[1]

# Fully connected network: two ReLU hidden layers, each followed by 20% dropout,
# and a 3-way softmax output (one unit per ARRIVAL class).
model = Sequential()
model.add(Dense(8, input_dim=n_features, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

# Sparse categorical cross-entropy takes the integer class labels directly
# (no one-hot encoding needed).
model.compile(optimizer='Adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [27]:
# Train the network and keep the History object so the per-epoch loss and
# accuracy can be inspected afterwards via history.history.
#
# The former `workers=1` argument is omitted: it only applied to generator /
# Sequence inputs, 1 was its default, and newer Keras releases reject it in fit().
#
# NOTE(review): with epochs=2 the EarlyStopping patience of 3 can never be
# reached, so the callback is currently a no-op.
# NOTE(review): X_test / y_test double as the validation set here, so they are
# no longer an untouched hold-out for the final evaluation.
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=2,
                    batch_size=8,
                    verbose=1,
                    callbacks=[callback])

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa46878cc10>