# Machine learning model
This script tries to predict the data using a machine learning model, namely a multilayer perceptron. Other ideas might be implemented later.

As is often a good idea with machine learning, we will split the data into a training and validation set.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Read in the data
df = pd.read_csv('Data//modified_training_data.csv')

# Use all data or work on subset?
USE_ALL_DATA = False
WRITE_DATA = False

if not USE_ALL_DATA:
    df = df.sample(n=10000, replace=False)
    df.reset_index(inplace=True, drop=True)
    
# Since the data is not fully prepped, we do it here. This section will be moved to Data_modification.ipynb

# Remove first index column. This should not be necessary anymore in the new Data_modification.ipynb
df = df.iloc[: , 1:]

# Create a 'day' and 'month' column instead of datetimes
df['SCHEDULED_DEPARTURE'] = pd.to_datetime(df['SCHEDULED_DEPARTURE'])
df['DEPARTURE_DAY'] = df['SCHEDULED_DEPARTURE'].apply(lambda x: x.weekday())
df['DEPARTURE_MONTH'] = df['SCHEDULED_DEPARTURE'].apply(lambda x: x.month)
df.drop(columns=['SCHEDULED_DEPARTURE'], inplace=True)

## Construct the training and test sets

To construct the training and validation set, we make use of in-build functions in sklearn. __Make sure to take note of the selected target variable!__.

As our machine learning model cannot handle categorical variables, we first encode them into a one-hot encoding. Furthermore, we try training the MLP both on standerdized and unstanderdized data.

In [3]:
airline_dummies = pd.get_dummies(df['AIRLINE'], prefix = 'AIRLINE')
or_airport_dummies = pd.get_dummies(df['ORIGIN_AIRPORT'], prefix = 'OR_AIR')
dest_airport_dummies = pd.get_dummies(df['DESTINATION_AIRPORT'], prefix = 'DEST_AIR')

df = pd.merge(
    left=df,
    right=airline_dummies,
    left_index=True,
    right_index=True,
)

df = pd.merge(
    left=df,
    right=or_airport_dummies,
    left_index=True,
    right_index=True,
)

df = pd.merge(
    left=df,
    right=dest_airport_dummies,
    left_index=True,
    right_index=True,
)

df.drop(columns=['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'], inplace=True)
df

Unnamed: 0,SCHEDULED_ARRIVAL,SCHEDULED_TIME,ELAPSED_TIME,DISTANCE,CANCELLED,TOTAL_DELAY,TARGET_1,TARGET_2,DEPARTURE_DAY,DEPARTURE_MONTH,...,DEST_AIR_TVC,DEST_AIR_TXK,DEST_AIR_TYR,DEST_AIR_TYS,DEST_AIR_UST,DEST_AIR_VLD,DEST_AIR_VPS,DEST_AIR_WRG,DEST_AIR_XNA,DEST_AIR_YAK
0,19:08:00,228.0,211.0,1400,0,27.0,27.0,27.0,0,6,...,0,0,0,0,0,0,0,0,0,0
1,07:50:00,80.0,86.0,369,0,-12.0,-12.0,-12.0,3,5,...,0,0,0,0,0,0,0,0,0,0
2,20:45:00,80.0,76.0,371,0,-2.0,-2.0,-2.0,0,5,...,0,0,0,0,0,0,0,0,0,0
3,17:25:00,140.0,156.0,872,0,16.0,16.0,16.0,4,6,...,0,0,0,0,0,0,0,0,0,0
4,10:00:00,145.0,143.0,763,0,-4.0,-4.0,-4.0,3,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,09:17:00,237.0,219.0,1635,0,1.0,1.0,1.0,6,6,...,0,0,0,0,0,0,0,0,0,0
9996,17:50:00,95.0,87.0,453,0,112.0,112.0,112.0,3,4,...,0,0,0,0,0,0,0,0,0,0
9997,16:58:00,52.0,43.0,77,0,-5.0,-5.0,-5.0,6,7,...,0,0,0,0,0,0,0,0,0,0
9998,00:15:00,105.0,84.0,484,0,-1.0,-1.0,-1.0,3,7,...,0,0,0,0,0,0,0,0,0,0


In [4]:
target = 'TARGET_1'

X = df.loc[:, ~df.columns.isin(['ELAPSED_TIME', 'SCHEDULED_ARRIVAL', 'CANCELLED', 'TARGET_1', 'TARGET_2', 'TARGET_3'])]
y = df.loc[:, target]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train the model (MPLRegressor)

In [12]:
regr = MLPRegressor(random_state=1, max_iter=500, verbose=True).fit(X_train, y_train)
regr_scaled = MLPRegressor(random_state=1, max_iter=2000, verbose=True).fit(X_train_scaled, y_train)

Iteration 1, loss = 5992.63553789
Iteration 2, loss = 4673.02925629
Iteration 3, loss = 3857.16390762
Iteration 4, loss = 3570.69105279
Iteration 5, loss = 3511.95778552
Iteration 6, loss = 3507.30000308
Iteration 7, loss = 3495.71279981
Iteration 8, loss = 3490.72813422
Iteration 9, loss = 3484.36957127
Iteration 10, loss = 3482.13913976
Iteration 11, loss = 3479.24952611
Iteration 12, loss = 3468.48543717
Iteration 13, loss = 3470.25914318
Iteration 14, loss = 3467.49845164
Iteration 15, loss = 3452.26561801
Iteration 16, loss = 3467.05895604
Iteration 17, loss = 3460.55316890
Iteration 18, loss = 3452.10028450
Iteration 19, loss = 3434.37148729
Iteration 20, loss = 3435.68845198
Iteration 21, loss = 3431.69501499
Iteration 22, loss = 3429.31098219
Iteration 23, loss = 3424.75883080
Iteration 24, loss = 3417.14457934
Iteration 25, loss = 3410.99514168
Iteration 26, loss = 3407.24311280
Iteration 27, loss = 3415.18488846
Iteration 28, loss = 3416.82563999
Iteration 29, loss = 3408.141

Iteration 232, loss = 3086.55794637
Iteration 233, loss = 3081.67256522
Iteration 234, loss = 3108.00717926
Iteration 235, loss = 3109.15239549
Iteration 236, loss = 3094.12048306
Iteration 237, loss = 3089.07727555
Iteration 238, loss = 3099.53657783
Iteration 239, loss = 3087.40546305
Iteration 240, loss = 3106.60201300
Iteration 241, loss = 3072.94065376
Iteration 242, loss = 3096.51482570
Iteration 243, loss = 3103.86133683
Iteration 244, loss = 3081.64493771
Iteration 245, loss = 3065.32199837
Iteration 246, loss = 3074.65564507
Iteration 247, loss = 3091.18653150
Iteration 248, loss = 3066.57356941
Iteration 249, loss = 3080.89868878
Iteration 250, loss = 3128.77023297
Iteration 251, loss = 3084.19609243
Iteration 252, loss = 3067.96054196
Iteration 253, loss = 3093.37528932
Iteration 254, loss = 3074.44979848
Iteration 255, loss = 3070.92476041
Iteration 256, loss = 3054.39984659
Iteration 257, loss = 3052.99616627
Iteration 258, loss = 3070.57163783
Iteration 259, loss = 3086.6

Iteration 460, loss = 2841.12979470
Iteration 461, loss = 2835.08469023
Iteration 462, loss = 2906.58131480
Iteration 463, loss = 2863.24633048
Iteration 464, loss = 2834.21698629
Iteration 465, loss = 2853.62624637
Iteration 466, loss = 2823.84365439
Iteration 467, loss = 2828.06546666
Iteration 468, loss = 2824.43546769
Iteration 469, loss = 2850.75875721
Iteration 470, loss = 2817.12519946
Iteration 471, loss = 2826.29229754
Iteration 472, loss = 2852.55872886
Iteration 473, loss = 2820.48366784
Iteration 474, loss = 2842.80372304
Iteration 475, loss = 2857.85738496
Iteration 476, loss = 2855.60914888
Iteration 477, loss = 2830.89908719
Iteration 478, loss = 2823.48684220
Iteration 479, loss = 2836.05146613
Iteration 480, loss = 2817.05069457
Iteration 481, loss = 2819.13867142
Iteration 482, loss = 2841.42750714
Iteration 483, loss = 2812.52150713
Iteration 484, loss = 2810.75987196
Iteration 485, loss = 2812.51338264
Iteration 486, loss = 2831.50552448
Iteration 487, loss = 2824.7



Iteration 2, loss = 7184.94183814
Iteration 3, loss = 6985.30420596
Iteration 4, loss = 6761.27637315
Iteration 5, loss = 6549.03809724
Iteration 6, loss = 6376.36274891
Iteration 7, loss = 6240.19171481
Iteration 8, loss = 6118.98514214
Iteration 9, loss = 5997.93891427
Iteration 10, loss = 5872.33874190
Iteration 11, loss = 5738.38126525
Iteration 12, loss = 5590.82484752
Iteration 13, loss = 5435.69188787
Iteration 14, loss = 5261.15132225
Iteration 15, loss = 5079.39843831
Iteration 16, loss = 4885.43972187
Iteration 17, loss = 4688.75846771
Iteration 18, loss = 4495.07390743
Iteration 19, loss = 4307.73178186
Iteration 20, loss = 4128.37692001
Iteration 21, loss = 3961.00630153
Iteration 22, loss = 3808.23751294
Iteration 23, loss = 3669.85502929
Iteration 24, loss = 3547.69480618
Iteration 25, loss = 3441.02801972
Iteration 26, loss = 3349.08421282
Iteration 27, loss = 3266.66195797
Iteration 28, loss = 3202.85749940
Iteration 29, loss = 3151.14306412
Iteration 30, loss = 3104.90

Iteration 234, loss = 1737.64576898
Iteration 235, loss = 1734.41326333
Iteration 236, loss = 1736.16614524
Iteration 237, loss = 1733.39604499
Iteration 238, loss = 1731.05443495
Iteration 239, loss = 1724.76484709
Iteration 240, loss = 1724.09061772
Iteration 241, loss = 1720.25086173
Iteration 242, loss = 1718.00887834
Iteration 243, loss = 1718.94037445
Iteration 244, loss = 1715.74785594
Iteration 245, loss = 1710.87595491
Iteration 246, loss = 1707.12615176
Iteration 247, loss = 1704.19317223
Iteration 248, loss = 1700.87122864
Iteration 249, loss = 1696.56058087
Iteration 250, loss = 1695.68172117
Iteration 251, loss = 1694.12650070
Iteration 252, loss = 1686.14970520
Iteration 253, loss = 1683.84297903
Iteration 254, loss = 1680.78786518
Iteration 255, loss = 1681.23110327
Iteration 256, loss = 1682.01329645
Iteration 257, loss = 1679.19851179
Iteration 258, loss = 1673.79429059
Iteration 259, loss = 1672.08180277
Iteration 260, loss = 1671.02520339
Iteration 261, loss = 1666.1

Iteration 463, loss = 1281.42880009
Iteration 464, loss = 1280.87783605
Iteration 465, loss = 1280.18016291
Iteration 466, loss = 1275.65740711
Iteration 467, loss = 1272.24859979
Iteration 468, loss = 1273.97698761
Iteration 469, loss = 1270.70082415
Iteration 470, loss = 1272.00011491
Iteration 471, loss = 1271.16158816
Iteration 472, loss = 1265.58615998
Iteration 473, loss = 1261.96256850
Iteration 474, loss = 1266.97208581
Iteration 475, loss = 1262.78555606
Iteration 476, loss = 1261.18955355
Iteration 477, loss = 1258.38196536
Iteration 478, loss = 1254.66331697
Iteration 479, loss = 1257.45713602
Iteration 480, loss = 1252.92851573
Iteration 481, loss = 1251.94629069
Iteration 482, loss = 1250.52046334
Iteration 483, loss = 1254.17321769
Iteration 484, loss = 1253.22961508
Iteration 485, loss = 1252.71361092
Iteration 486, loss = 1246.51184312
Iteration 487, loss = 1241.14141228
Iteration 488, loss = 1239.90189792
Iteration 489, loss = 1243.46286981
Iteration 490, loss = 1240.3

Iteration 691, loss = 997.17155066
Iteration 692, loss = 993.66967852
Iteration 693, loss = 992.05931444
Iteration 694, loss = 995.30626451
Iteration 695, loss = 989.67902724
Iteration 696, loss = 987.15683223
Iteration 697, loss = 982.62587777
Iteration 698, loss = 986.23357803
Iteration 699, loss = 987.31586707
Iteration 700, loss = 986.80690892
Iteration 701, loss = 984.07342718
Iteration 702, loss = 982.43771496
Iteration 703, loss = 983.07028668
Iteration 704, loss = 982.75949117
Iteration 705, loss = 984.31151091
Iteration 706, loss = 977.95167402
Iteration 707, loss = 973.94605341
Iteration 708, loss = 974.74221819
Iteration 709, loss = 973.01849284
Iteration 710, loss = 971.50372271
Iteration 711, loss = 971.28901419
Iteration 712, loss = 972.28768759
Iteration 713, loss = 974.64990797
Iteration 714, loss = 971.16866542
Iteration 715, loss = 965.35112244
Iteration 716, loss = 974.07121317
Iteration 717, loss = 972.39965956
Iteration 718, loss = 965.73523416
Iteration 719, loss 

Iteration 926, loss = 797.27880093
Iteration 927, loss = 795.81497026
Iteration 928, loss = 797.02898989
Iteration 929, loss = 794.25628617
Iteration 930, loss = 795.43229152
Iteration 931, loss = 792.34141046
Iteration 932, loss = 790.56838591
Iteration 933, loss = 794.34081263
Iteration 934, loss = 793.41707474
Iteration 935, loss = 789.82889312
Iteration 936, loss = 788.69671618
Iteration 937, loss = 792.31637818
Iteration 938, loss = 791.55441057
Iteration 939, loss = 788.51684126
Iteration 940, loss = 785.87791655
Iteration 941, loss = 785.49588831
Iteration 942, loss = 780.82036652
Iteration 943, loss = 789.69715305
Iteration 944, loss = 785.55567721
Iteration 945, loss = 780.36416737
Iteration 946, loss = 785.41942702
Iteration 947, loss = 789.00702486
Iteration 948, loss = 787.09234839
Iteration 949, loss = 782.40564794
Iteration 950, loss = 784.92592832
Iteration 951, loss = 787.91265226
Iteration 952, loss = 781.59141348
Iteration 953, loss = 780.70512422
Iteration 954, loss 

In [13]:
# Define penalty function
def MSE(pred, target):
    return (np.square(pred - target)).mean(axis=0)

R_sq = regr.score(X_test, y_test)
y_test_pred = regr.predict(X_test)
mse = MSE(y_test_pred, y_test)

R_sq_scaled = regr_scaled.score(X_test_scaled, y_test)
y_test_pred_scaled = regr_scaled.predict(X_test_scaled)
mse_scaled = MSE(y_test_pred_scaled, y_test)

print("Results for unscaled input:")
print("Coefficient of Determination =", R_sq)
print("Mean Squared Error =", mse)
print("")
print("Results for scaled input:")
print("Coefficient of Determination =", R_sq_scaled)
print("Mean Squared Error =", mse_scaled)

Results for unscaled input:
Coefficient of Determination = 0.3345749327278581
Mean Squared Error = 7654.1445591819165

Results for scaled input:
Coefficient of Determination = -0.39239932627042795
Mean Squared Error = 16016.267272694511
