In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import (log_loss,
                            roc_auc_score, 
                            recall_score, 
                            precision_score, 
                            average_precision_score, 
                            f1_score, classification_report, 
                            accuracy_score, plot_roc_curve, 
                            plot_precision_recall_curve, 
                            plot_confusion_matrix
                        )

In [2]:
flights = pd.read_csv("Starting_Features_df.csv")

In [3]:
flights.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,op_unique_carrier,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,taxi_in,crs_arr_time,arr_time,arr_delay,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,Weather
0,0,2018-12-01,AA,AA_CODESHARE,YV,5948,14730,11298,1229,1337.0,...,9.0,1355,1501.0,66.0,0,146,144.0,123.0,733,Rainy
1,1,2018-12-01,NK,NK,NK,395,14100,12889,1945,1941.0,...,6.0,2244,2215.0,-29.0,0,359,334.0,313.0,2176,Cloudy
2,2,2018-12-01,NK,NK,NK,396,12889,14100,1110,1107.0,...,12.0,1855,1856.0,1.0,0,285,289.0,245.0,2176,Sunny
3,3,2018-12-01,NK,NK,NK,403,10397,11697,1739,1736.0,...,4.0,1930,1915.0,-15.0,0,111,99.0,84.0,581,Rainy
4,4,2018-12-01,NK,NK,NK,403,12892,10397,930,926.0,...,7.0,1649,1614.0,-35.0,0,259,228.0,209.0,1947,Rainy


In [4]:
flights.drop("Unnamed: 0", axis=1, inplace=True)

In [5]:
flights.columns.tolist()

['fl_date',
 'mkt_unique_carrier',
 'branded_code_share',
 'op_unique_carrier',
 'op_carrier_fl_num',
 'origin_airport_id',
 'dest_airport_id',
 'crs_dep_time',
 'dep_time',
 'dep_delay',
 'taxi_out',
 'wheels_off',
 'wheels_on',
 'taxi_in',
 'crs_arr_time',
 'arr_time',
 'arr_delay',
 'diverted',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'air_time',
 'distance',
 'Weather']

In [6]:
features = flights[['mkt_unique_carrier', 'taxi_out', 'taxi_in', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
            'distance', 'Weather', 'arr_delay']] 

In [7]:
features.head()

Unnamed: 0,mkt_unique_carrier,taxi_out,taxi_in,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,Weather,arr_delay
0,AA,12.0,9.0,0,146,144.0,123.0,733,Rainy,66.0
1,NK,15.0,6.0,0,359,334.0,313.0,2176,Cloudy,-29.0
2,NK,32.0,12.0,0,285,289.0,245.0,2176,Sunny,1.0
3,NK,11.0,4.0,0,111,99.0,84.0,581,Rainy,-15.0
4,NK,12.0,7.0,0,259,228.0,209.0,1947,Rainy,-35.0


In [8]:
df = pd.get_dummies(features, columns=['mkt_unique_carrier', 'Weather'], drop_first=True)

In [9]:
df.head()

Unnamed: 0,taxi_out,taxi_in,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,arr_delay,mkt_unique_carrier_AS,mkt_unique_carrier_B6,mkt_unique_carrier_DL,mkt_unique_carrier_F9,mkt_unique_carrier_G4,mkt_unique_carrier_HA,mkt_unique_carrier_NK,mkt_unique_carrier_UA,mkt_unique_carrier_WN,Weather_Rainy,Weather_Snowy,Weather_Sunny
0,12.0,9.0,0,146,144.0,123.0,733,66.0,0,0,0,0,0,0,0,0,0,1,0,0
1,15.0,6.0,0,359,334.0,313.0,2176,-29.0,0,0,0,0,0,0,1,0,0,0,0,0
2,32.0,12.0,0,285,289.0,245.0,2176,1.0,0,0,0,0,0,0,1,0,0,0,0,1
3,11.0,4.0,0,111,99.0,84.0,581,-15.0,0,0,0,0,0,0,1,0,0,1,0,0
4,12.0,7.0,0,259,228.0,209.0,1947,-35.0,0,0,0,0,0,0,1,0,0,1,0,0


In [10]:
# separate numerical and categorical columns
numerical_cols = ['taxi_out', 'taxi_in', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'distance']
categorical_cols = list(set(df.columns) - set(numerical_cols) - {'arr_delay'})

# Modeling

The goal is to predict arrival delay. We will separate the arr_delay from the main dataframe and store it in another variable.

In [11]:
y = df['arr_delay']
X = df.drop("arr_delay", axis=1)

In [12]:
seed = 100

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [13]:
# scale data on the numerical columns in X_train
scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(X_train[numerical_cols])

In [14]:
# add categorical/dummy arrays back to X_train
X_cat = X_train[categorical_cols].to_numpy()
X_train = np.hstack((X_cat, df_num_scaled))

In [15]:
# PCA model

## No PCA
#### Linear Regression

In [16]:
line_reg = LinearRegression()

line_reg.fit(X_train, y_train)

yhat_line = line_reg.predict(X_test)



In [17]:
print(f'R2 score: {metrics.r2_score(y_test, yhat_line)}')
print(f'Mean Squared Error: {metrics.mean_squared_error(y_test, yhat_line)}')
print(f'Mean Absolute Error: {metrics.mean_absolute_error(y_test, yhat_line)}')
print(f'Root MSE: {metrics.mean_squared_error(y_test, yhat_line, squared=False)}')

R2 score: -40053.25310732143
Mean Squared Error: 80368211.96359807
Mean Absolute Error: 7278.273427118312
Root MSE: 8964.831954007732


#### Random Forest

In [18]:
rfc = RandomForestRegressor(n_estimators=10, max_depth=8, random_state=seed)

rfc.fit(X_train, y_train)

yhat_rfc = rfc.predict(X_test)



In [19]:
print(f'R2 score: {metrics.r2_score(y_test, yhat_rfc)}')
print(f'Mean Squared Error: {metrics.mean_squared_error(y_test, yhat_rfc)}')
print(f'Mean Absolute Error: {metrics.mean_absolute_error(y_test, yhat_rfc)}')
print(f'Root MSE: {metrics.mean_squared_error(y_test, yhat_rfc, squared=False)}')

R2 score: -0.7013004832952112
Mean Squared Error: 3413.6319428772154
Mean Absolute Error: 38.088351535156995
Root MSE: 58.42629496106368


#### SVM

In [20]:
# clf = svm.SVC(kernel="rbf")

# clf.fit(X_train, y_train)

# yhat_svm = clf.predict(X_test)