# Machine learning model
This notebook tries to predict the target variables (`TARGET_1`, `TARGET_2` and `TARGET_3`) using a machine learning model, namely a multilayer perceptron. Other ideas might be implemented later.

As is often a good idea with machine learning, we will split the data into a training and validation set.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import warnings
import pickle

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Read in the data. Depending on which data set is read in, make sure to change the
# boolean indicator 'WITH_AIRPORT_CLASSES' below accordingly!
df = pd.read_csv('Data//modified_data_with_classes.csv')

# Use all data or work on subset?
USE_ALL_DATA = True
WRITE_RESULTS = True
WITH_AIRPORT_CLASSES = True

if not USE_ALL_DATA:
    # Seed the draw so the subset (and every model trained on it) is the same on each run,
    # and cap the sample size so a file with fewer than 10000 rows does not raise ValueError.
    df = df.sample(n=min(10000, len(df)), replace=False, random_state=1)
    df = df.reset_index(drop=True)

df.head()

Unnamed: 0,DEPARTURE_MONTH,DEPARTURE_DAY,DEPARTURE_DOW,AIRLINE,ORIGIN_CLASS,DESTINATION_CLASS,SCHEDULED_TIME,DISTANCE,CANCELLED,TOTAL_DELAY,ELAPSED_TIME_SELF,TARGET_1,TARGET_2,TARGET_3
0,3,1,4,UA,Moderate,Moderate,96.0,455,0,1.0,96.0,1.0,1.0,1.0
1,3,1,4,OO,Moderate,Moderate,98.0,455,0,-4.0,98.0,-4.0,-4.0,-4.0
2,3,1,4,UA,Moderate,Moderate,98.0,455,0,12.0,98.0,12.0,12.0,12.0
3,3,1,4,OO,Moderate,Moderate,95.0,455,0,126.0,95.0,126.0,126.0,126.0
4,3,1,4,UA,Moderate,Moderate,98.0,455,0,18.0,98.0,18.0,18.0,18.0


## Get the data ready for analysis

As our machine learning model cannot handle categorical variables, we first encode them into a one-hot encoding. After having encoded these columns, we drop the original ones.

In [3]:
# Map every categorical column to the prefix its indicator columns should carry.
# Which airport columns exist depends on the data set that was loaded.
if WITH_AIRPORT_CLASSES:
    categorical_prefixes = {
        'AIRLINE': 'AIRLINE',
        'ORIGIN_CLASS': 'ORIGIN',
        'DESTINATION_CLASS': 'DESTINATION',
    }
else:
    categorical_prefixes = {
        'AIRLINE': 'AIRLINE',
        'ORIGIN_AIRPORT': 'OR_AIR',
        'DESTINATION_AIRPORT': 'DEST_AIR',
    }

# One-hot encode all of them in a single pass. With `columns=`, `pd.get_dummies`
# removes the encoded source columns and appends the indicator columns at the end
# in the order the columns are listed, so no separate merge/drop steps are needed
# (and, unlike an index merge, rows cannot be duplicated if the index is not unique).
df = pd.get_dummies(
    df,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
)

df.head()

Unnamed: 0,DEPARTURE_MONTH,DEPARTURE_DAY,DEPARTURE_DOW,SCHEDULED_TIME,DISTANCE,CANCELLED,TOTAL_DELAY,ELAPSED_TIME_SELF,TARGET_1,TARGET_2,...,AIRLINE_UA,AIRLINE_US,AIRLINE_VX,AIRLINE_WN,ORIGIN_High,ORIGIN_Low,ORIGIN_Moderate,DESTINATION_High,DESTINATION_Low,DESTINATION_Moderate
0,3,1,4,96.0,455,0,1.0,96.0,1.0,1.0,...,1,0,0,0,0,0,1,0,0,1
1,3,1,4,98.0,455,0,-4.0,98.0,-4.0,-4.0,...,0,0,0,0,0,0,1,0,0,1
2,3,1,4,98.0,455,0,12.0,98.0,12.0,12.0,...,1,0,0,0,0,0,1,0,0,1
3,3,1,4,95.0,455,0,126.0,95.0,126.0,126.0,...,0,0,0,0,0,0,1,0,0,1
4,3,1,4,98.0,455,0,18.0,98.0,18.0,18.0,...,1,0,0,0,0,0,1,0,0,1


## Train the model (MLPRegressor)

We train the model for the selected target variable. Because training does not always converge within the maximum number of iterations, the training function emits warnings that would clutter the output. We therefore suppress them.

In order to make prof. De Spiegeleer proud, we also make use of pipelines and tune the hyperparameters.

We make use of the 'adam' learning algorithm to train the model. This way we avoid having to choose some important hyperparameters like the (initial) learning rate and learning rate decay. Note that 'adam' is the default solver in `MLPRegressor`.

In [4]:
# Set some useful parameters
MAX_ITERATIONS = 1000
TARGET = 'TARGET_3'             # 'TARGET_1', 'TARGET_2', 'TARGET_3' or 'ALL'
# NOTE: this silences *all* warnings for the rest of the session, not only the
# convergence warnings the training step is expected to produce.
warnings.filterwarnings("ignore")

# Columns that must never be fed to the model as predictors.
NON_FEATURE_COLUMNS = [
    'Unnamed: 0',       # row number carried over from the CSV, not a property of the flight
    'SCHEDULED_TIME',
    'CANCELLED',
    # Observed outcome: in the data preview TOTAL_DELAY is identical to every TARGET_*
    # column, so keeping it as an input would hand the model the answer (target leakage).
    'TOTAL_DELAY',
    'TARGET_1',
    'TARGET_2',
    'TARGET_3',
]

# Create the design matrix (no column of ones in front).
# `isin` simply ignores listed names that are absent from the frame.
X = df.loc[:, ~df.columns.isin(NON_FEATURE_COLUMNS)]

# Define the pipeline. The concrete 'scaler' and 'reduce_dim' steps are placeholders:
# the grid below swaps them out (or disables them with "passthrough").
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', MLPRegressor(random_state=1, max_iter=MAX_ITERATIONS)),
])

# Define the parameters to test
params = [
    # Without dimensionality reduction: scaling is optional.
    {
        'scaler': ["passthrough", StandardScaler(), RobustScaler()],
        'reduce_dim': ["passthrough"],
        'regressor__hidden_layer_sizes': [(30,), (100,), (100, 30), (70, 10)],
    },
    # With PCA: always scale first.
    {
        'scaler': [StandardScaler(), RobustScaler()],
        'reduce_dim': [PCA()],
        'regressor__hidden_layer_sizes': [(30,), (100,), (100, 30), (70, 10)],
    },
]

# Penalty function: mean squared error
def MSE(pred, target):
    """Return the mean of the squared differences between `pred` and `target`.

    The mean is taken along axis 0, so one-dimensional inputs yield a single
    number and two-dimensional inputs yield one value per column.
    """
    residuals = pred - target
    squared_residuals = np.square(residuals)
    return squared_residuals.mean(axis=0)
    
# Grid-search the pipeline for one target column and report goodness-of-fit measures
# of the selected model, among other things.
def find_model_for(target):
    """Tune the pipeline for one target column and print how well it fits held-out data.

    Reads the module-level `df`, `X`, `pipe` and `params`.

    Parameters
    ----------
    target : str
        Name of the column in `df` that should be predicted.

    Returns
    -------
    GridSearchCV
        The fitted grid-search object.
    """
    response = df[target]
    # Fixed random_state keeps the train/test partition identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, response, random_state=1)

    search = GridSearchCV(pipe, params, verbose=1)
    search.fit(X_train, y_train)

    print("Model for target variable %s:" % target)
    r_squared = search.score(X_test, y_test)
    print("Coefficient of Determination =", r_squared)
    test_predictions = search.predict(X_test)
    print("Mean Squared Error =", MSE(test_predictions, y_test))
    print(search.best_params_)

    return search
    
    
def _save_model(fitted_model, target_name):
    """Pickle `fitted_model` to '<target_name>_with_classes.sav' in the working directory."""
    # NOTE(review): the '_with_classes' suffix is used regardless of WITH_AIRPORT_CLASSES.
    filename = target_name + "_with_classes" + ".sav"
    # The context manager guarantees the file is flushed and closed, even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)


if TARGET == 'ALL':
    model1 = find_model_for('TARGET_1')
    model2 = find_model_for('TARGET_2')
    model3 = find_model_for('TARGET_3')

    if WRITE_RESULTS:
        # Each file receives the model trained for its own target (previously the
        # TARGET_3 file was written with the TARGET_2 model).
        for target_name, fitted_model in (
            ('TARGET_1', model1),
            ('TARGET_2', model2),
            ('TARGET_3', model3),
        ):
            _save_model(fitted_model, target_name)
else:
    model = find_model_for(TARGET)

    if WRITE_RESULTS:
        _save_model(model, TARGET)
    

Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 