## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
# from sklearn.utils import resample

# modules for building model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

### For this question, I would use reduced-aggregated data

In [4]:
# Read the reduced data from csv file
df = pd.read_csv("data/reduced-aggregated.csv")
df.head(5)

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,Unnamed: 13
0,2.0,5.0,2017-02-03,B6,33.0,BTV,"Burlington, VT",JFK,"New York, NY",1907.0,0.0,90.0,266.0,
1,2.0,7.0,2017-02-12,B6,33.0,BTV,"Burlington, VT",JFK,"New York, NY",1907.0,1.0,90.0,266.0,
2,2.0,1.0,2017-02-13,B6,33.0,BTV,"Burlington, VT",JFK,"New York, NY",1907.0,1.0,90.0,266.0,
3,2.0,1.0,2017-02-27,B6,33.0,DCA,"Washington, DC",PBI,"West Palm Beach/Palm Beach, FL",1730.0,0.0,157.0,857.0,
4,2.0,2.0,2017-02-14,B6,34.0,JFK,"New York, NY",BTV,"Burlington, VT",1706.0,0.0,81.0,266.0,


### Droping useless Data

In [5]:
# Drop the column Unnamed:13
df = df.iloc[:,:-1]

# Checking the null(Nan) value in all columns
df.isnull().any()

MONTH               False
DAY_OF_WEEK         False
FL_DATE             False
UNIQUE_CARRIER      False
FL_NUM              False
ORIGIN              False
ORIGIN_CITY_NAME    False
DEST                False
DEST_CITY_NAME      False
CRS_DEP_TIME        False
ARR_DEL15            True
CRS_ELAPSED_TIME    False
DISTANCE            False
dtype: bool

In [6]:
# Delete the row that target value is Nan
df.dropna(inplace=True, subset=['ARR_DEL15'])

### Handling FL_DATE

In [7]:
# Split the FL_DATE into YEAR and DAY_OF_MONTY then delete FL_DATE column
date = df.FL_DATE.str.split("-", n = 2, expand = True)
df['YEAR'] = pd.to_numeric(date[0])
df['DAY_OF_MONTH'] = pd.to_numeric(date[2])
df.drop(inplace=True, columns=['FL_DATE'])

### Handling all Locations include ARIPORT CODE and CITY

In [8]:
# Since one location may have different airport, combining airport code and city name would make location unique.
# Combine Airport CODE with CITY_NAME and Drop columns related to CITY_NAME
df['ORIGIN'] = df['ORIGIN'].str.cat(df['ORIGIN_CITY_NAME'], sep ="-")
df['DEST'] = df['DEST'].str.cat(df['DEST_CITY_NAME'], sep ="-")
# Dropping the Old columns ORIGIN_CITY_NAME, DEST_CITY_NAME
df.drop(inplace=True, columns=['ORIGIN_CITY_NAME', 'DEST_CITY_NAME'])
df.head(5)

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,YEAR,DAY_OF_MONTH
0,2.0,5.0,B6,33.0,"BTV-Burlington, VT","JFK-New York, NY",1907.0,0.0,90.0,266.0,2017,3
1,2.0,7.0,B6,33.0,"BTV-Burlington, VT","JFK-New York, NY",1907.0,1.0,90.0,266.0,2017,12
2,2.0,1.0,B6,33.0,"BTV-Burlington, VT","JFK-New York, NY",1907.0,1.0,90.0,266.0,2017,13
3,2.0,1.0,B6,33.0,"DCA-Washington, DC","PBI-West Palm Beach/Palm Beach, FL",1730.0,0.0,157.0,857.0,2017,27
4,2.0,2.0,B6,34.0,"JFK-New York, NY","BTV-Burlington, VT",1706.0,0.0,81.0,266.0,2017,14


In [9]:
# Encode the ORIGIN and DEST location with the same encoder
locs = pd.Series(np.concatenate([df.DEST.unique(), df.ORIGIN.unique()])).unique()
le_loc = LabelEncoder()
le_loc.fit(locs)
ORIGIN = le_loc.transform(df.ORIGIN)
DEST = le_loc.transform(df.DEST)
df.ORIGIN = ORIGIN
df.DEST = DEST

In [10]:
# Encode the UNIQUE_CARRIER
le_carrier = LabelEncoder()
le_carrier.fit(df.UNIQUE_CARRIER)
carriers = le_carrier.transform(df.UNIQUE_CARRIER)
df.UNIQUE_CARRIER = carriers

### Final Data

In [11]:
df.head(5)

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,YEAR,DAY_OF_MONTH
0,2.0,5.0,2,33.0,47,158,1907.0,0.0,90.0,266.0,2017,3
1,2.0,7.0,2,33.0,47,158,1907.0,1.0,90.0,266.0,2017,12
2,2.0,1.0,2,33.0,47,158,1907.0,1.0,90.0,266.0,2017,13
3,2.0,1.0,2,33.0,77,226,1730.0,0.0,157.0,857.0,2017,27
4,2.0,2.0,2,34.0,158,47,1706.0,0.0,81.0,266.0,2017,14


In [12]:
df.corr()

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,YEAR,DAY_OF_MONTH
MONTH,1.0,0.009906,0.006127,0.008026,-0.002013,-0.000911,0.000246,-0.011899,-0.00796,-0.002247,-0.718502,0.016294
DAY_OF_WEEK,0.009906,1.0,-0.004483,0.016537,0.005775,0.004899,0.004588,0.003171,0.015837,0.017476,-0.015624,-0.014552
UNIQUE_CARRIER,0.006127,-0.004483,1.0,0.186976,0.07286,0.074449,0.003881,0.012879,-0.133466,-0.100875,0.000791,-8.9e-05
FL_NUM,0.008026,0.016537,0.186976,1.0,-0.007806,-0.012911,-0.005804,-0.00392,-0.291242,-0.30694,-0.005612,0.012466
ORIGIN,-0.002013,0.005775,0.07286,-0.007806,1.0,0.021395,-0.035397,0.00494,0.064451,0.090876,0.00226,-0.002071
DEST,-0.000911,0.004899,0.074449,-0.012911,0.021395,1.0,0.036251,0.026547,0.094968,0.088292,0.002083,0.001435
CRS_DEP_TIME,0.000246,0.004588,0.003881,-0.005804,-0.035397,0.036251,1.0,0.13981,-0.01815,-0.012061,-0.003358,-0.002342
ARR_DEL15,-0.011899,0.003171,0.012879,-0.00392,0.00494,0.026547,0.13981,1.0,0.028218,0.026575,0.014153,-0.001898
CRS_ELAPSED_TIME,-0.00796,0.015837,-0.133466,-0.291242,0.064451,0.094968,-0.01815,0.028218,1.0,0.98493,0.013479,0.003495
DISTANCE,-0.002247,0.017476,-0.100875,-0.30694,0.090876,0.088292,-0.012061,0.026575,0.98493,1.0,-0.000383,0.002727


In [13]:
df.describe()

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,YEAR,DAY_OF_MONTH
count,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0,562422.0
mean,6.883778,3.927446,5.730624,2076.653541,154.24342,154.385792,1329.084872,0.180229,145.726385,852.726698,2016.167268,15.711715
std,3.370589,1.993089,4.064407,1697.277459,85.838353,85.862764,488.14958,0.384379,76.579691,622.259876,0.373215,8.766291
min,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,18.0,31.0,2016.0,1.0
25%,4.0,2.0,3.0,706.0,78.0,78.0,915.0,0.0,89.0,391.0,2016.0,8.0
50%,7.0,4.0,5.0,1639.0,166.0,166.0,1322.0,0.0,126.0,679.0,2016.0,16.0
75%,10.0,6.0,9.0,2885.0,227.0,227.0,1735.0,0.0,178.0,1096.0,2016.0,23.0
max,12.0,7.0,11.0,7439.0,309.0,309.0,2359.0,1.0,712.0,4983.0,2017.0,31.0


### Split data into features and label

In [14]:
# Get the dependent variable and independent variable
X = df.iloc[:,[0,1,2,3,4,5,6,8,9,10,11]]
y = df.iloc[:,7]

### Label proportion

In [15]:
# The proportion of the Data
len(df[df.ARR_DEL15==1])/len(df)

0.18022943625960577

**Since the data is imbalanced, I would do the resampling or set weight for model to handle it**

## Models

**This is a binary classification problem, I would try different classification model to find out the best model**

### Logistic Regression

**Doing 10-fold cross validation with Logistic regression and get all the hypotheses, in this model I would use over smapling to handle the imbalanced data**

In [17]:
lr = LogisticRegression(n_jobs=-1)
skf = StratifiedKFold(n_splits=10, shuffle=True)
start = time.time()

# initialize the result with -1
result_lr = np.full((len(y),), fill_value=-1)

# K-Folds cross validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Resample train data using over sampling
    ros = RandomOverSampler()
    X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
#     rus = RandomUnderSampler()
#     X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    # Train and fit the model
    lr.fit(X_train_res, y_train_res)
    hyps = lr.predict(X_test)
    
    # append the test hyps to result
    result_lr[test_index.tolist()] = hyps
    
    # Print out metrics for each test case
    print('CV Accuracy: ', lr.score(X_test, y_test))
    print('CV Bal Accuracy:', balanced_accuracy_score(y_test, hyps))
    print('CV Precision: ', precision_score(y_test, hyps))
    print('CV Recall: ', recall_score(y_test, hyps))
    print('CV F1-Score: ', f1_score(y_test, hyps))
    print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, hyps)))
    print("----------------")
end = time.time()
# Print out the final result
print("----------------")
print('\nComplete in {:.0f}m {:.0f}s'.format((end - start) // 60, (end - start) % 60))
print('Final Accuracy: ', accuracy_score(y, result_lr))
print('Final Bal Accuracy:', balanced_accuracy_score(y, result_lr))
print('Final Precision: ', precision_score(y, result_lr))
print('Final Recall: ', recall_score(y, result_lr))
print('Final F1-Score: ', f1_score(y, result_lr))
print('Final Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_lr)))

CV Accuracy:  0.5817435058585069
CV Bal Accuracy: 0.5831621009409624
CV Precision:  0.23496337358938824
CV Recall:  0.585380290026635
CV F1-Score:  0.33533001808318263
CV Confusion matrix:
        0      1
0  26785  19321
1   4203   5934
----------------
CV Accuracy:  0.5826858453496435
CV Bal Accuracy: 0.5840062212196444
CV Precision:  0.23560437817258884
CV Recall:  0.586070829634014
CV F1-Score:  0.3360959465957627
CV Confusion matrix:
        0      1
0  26831  19275
1   4196   5941
----------------
CV Accuracy:  0.5796632469818467
CV Bal Accuracy: 0.584048147801485
CV Precision:  0.23504944278763146
CV Recall:  0.5909046068856664
CV F1-Score:  0.3363184638275175
CV Confusion matrix:
        0      1
0  26612  19494
1   4147   5990
----------------
CV Accuracy:  0.5840549046103515
CV Bal Accuracy: 0.5892664169810395
CV Precision:  0.23871654381331547
CV Recall:  0.5974154088980961
CV F1-Score:  0.3411254435869994
CV Confusion matrix:
        0      1
0  26793  19313
1   4081   6056

### Decision Tree

**Doing 10-fold cross validation with decision tree using entropy as the criterion and get all the hypotheses, in this model I would use under smapling to handle the imbalanced data**

In [18]:
dt = DecisionTreeClassifier(criterion='entropy')
skf = StratifiedKFold(n_splits=10, shuffle=True)
start = time.time()

# initialize the result with -1
result_dt = np.full((len(y),), fill_value=-1)

# K-Folds cross validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Resample train data
#     ros = RandomOverSampler()
#     X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler()
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    # Train and fit the model
    dt.fit(X_train_res, y_train_res)
    hyps = dt.predict(X_test)
    
    # append the test hyps to result
    result_dt[test_index.tolist()] = hyps
    
    # Print out the metrics for each test case
    print('CV Accuracy: ', dt.score(X_test, y_test))
    print('CV Bal Accuracy:', balanced_accuracy_score(y_test, hyps))
    print('CV Precision: ', precision_score(y_test, hyps))
    print('CV Recall: ', recall_score(y_test, hyps))
    print('CV F1-Score: ', f1_score(y_test, hyps))
    print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, hyps)))
    print("----------------")
end = time.time()
# Print out the final result
print("----------------")
print('\nComplete in {:.0f}m {:.0f}s'.format((end - start) // 60, (end - start) % 60))
print('Final Accuracy: ', accuracy_score(y, result_dt))
print('Final Bal Accuracy:', balanced_accuracy_score(y, result_dt))
print('Final Precision: ', precision_score(y, result_dt))
print('Final Recall: ', recall_score(y, result_dt))
print('Final F1-Score: ', f1_score(y, result_dt))
print('Final Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_dt)))

CV Accuracy:  0.5784542076347279
CV Bal Accuracy: 0.5811943341475763
CV Precision:  0.23327568587375205
CV Recall:  0.585478938541975
CV F1-Score:  0.3336237668287473
CV Confusion matrix:
        0      1
0  26599  19507
1   4202   5935
----------------
CV Accuracy:  0.5681418132034208
CV Bal Accuracy: 0.5723648213120984
CV Precision:  0.22668984163769795
CV Recall:  0.5789681365295453
CV F1-Score:  0.32581119715768725
CV Confusion matrix:
        0      1
0  26085  20021
1   4268   5869
----------------
CV Accuracy:  0.576071688921288
CV Bal Accuracy: 0.5797026813426456
CV Precision:  0.23203253304137014
CV Recall:  0.585380290026635
CV F1-Score:  0.3323345747808798
CV Confusion matrix:
        0      1
0  26466  19640
1   4203   5934
----------------
CV Accuracy:  0.5713777714560034
CV Bal Accuracy: 0.5748002902465006
CV Precision:  0.2285481113011037
CV Recall:  0.5801519187136234
CV F1-Score:  0.3279154701831665
CV Confusion matrix:
        0      1
0  26255  19851
1   4256   5881


### Random Forest

**Doing 10-fold cross validation with Random Forest using entropy as the criterion and get all the hypotheses, in this model I would use under smapling to handle the imbalanced data**

In [19]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', min_samples_split=50)
skf = StratifiedKFold(n_splits=10, shuffle=True)
start = time.time()

# initialize the result with -1
result_rf = np.full((len(y),), fill_value=-1)

# K-Folds cross validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Resample train data using under sampling
#     ros = RandomOverSampler()
#     X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler()
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    # Train and fit the model
    rf.fit(X_train_res, y_train_res)
    hyps = rf.predict(X_test)
    
    # append the test hyps to result
    result_rf[test_index.tolist()] = hyps
    
    # Print out the metrics for cv
    print('CV Accuracy: ', rf.score(X_test, y_test))
    print('CV Bal Accuracy:', balanced_accuracy_score(y_test, hyps))
    print('CV Precision: ', precision_score(y_test, hyps))
    print('CV Recall: ', recall_score(y_test, hyps))
    print('CV F1-Score: ', f1_score(y_test, hyps))
    print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, hyps)))
    print("----------------")
end = time.time()
# Print out the final result
print("----------------")
print('\nComplete in {:.0f}m {:.0f}s'.format((end - start) // 60, (end - start) % 60))
print('Final Accuracy: ', accuracy_score(y, result_rf))
print('Final Bal Accuracy:', balanced_accuracy_score(y, result_rf))
print('Final Precision: ', precision_score(y, result_rf))
print('Final Recall: ', recall_score(y, result_rf))
print('Final F1-Score: ', f1_score(y, result_rf))
print('Final Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_rf)))

CV Accuracy:  0.6523834077129599
CV Bal Accuracy: 0.6547995237124897
CV Precision:  0.29324431169287535
CV Recall:  0.6585774884087995
CV F1-Score:  0.4057988633255326
CV Confusion matrix:
        0      1
0  30016  16090
1   3461   6676
----------------
CV Accuracy:  0.6494674892875558
CV Bal Accuracy: 0.6492884841571054
CV Precision:  0.2893648838845883
CV Recall:  0.6490085824208346
CV F1-Score:  0.4002676968941076
CV Confusion matrix:
        0      1
0  29949  16157
1   3558   6579
----------------
CV Accuracy:  0.656472805504685
CV Bal Accuracy: 0.6563702637164133
CV Precision:  0.2958022056207755
CV Recall:  0.6562099240406432
CV F1-Score:  0.4077854406130268
CV Confusion matrix:
        0      1
0  30270  15836
1   3485   6652
----------------
CV Accuracy:  0.6460359511405864
CV Bal Accuracy: 0.6490040261371965
CV Precision:  0.2877991573643748
CV Recall:  0.6536450626418072
CV F1-Score:  0.39963811821471656
CV Confusion matrix:
        0      1
0  29709  16397
1   3511   6626


### Gradient Boosting

**Doing 10-fold cross validation with Gradient Boosting using MSE as the criterion and get all the hypotheses, in this model I would use under smapling to handle the imbalanced data**

In [18]:
gbc =GradientBoostingClassifier(max_depth=6, criterion='mse')
skf = StratifiedKFold(n_splits=10, shuffle=True)
start = time.time()

# initialize the result with -1
result_gbc = np.full((len(y),), fill_value=-1)

# K-Folds cross validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Resample train data
#     ros = RandomOverSampler()
#     X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler()
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    # Train and fit the model
    gbc.fit(X_train_res, y_train_res)
    hyps = gbc.predict(X_test)
    
    # append the test hyps to result
    result_gbc[test_index.tolist()] = hyps
    
    # Print out the metrics
    print('CV Accuracy: ', gbc.score(X_test, y_test))
    print('CV Bal Accuracy:', balanced_accuracy_score(y_test, hyps))
    print('CV Precision: ', precision_score(y_test, hyps))
    print('CV Recall: ', recall_score(y_test, hyps))
    print('CV F1-Score: ', f1_score(y_test, hyps))
    print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, hyps)))
    print("----------------")
end = time.time()
# Print out the final result
print("----------------")
print('\nComplete in {:.0f}m {:.0f}s'.format((end - start) // 60, (end - start) % 60))
print('Final Accuracy: ', accuracy_score(y, result_gbc))
print('Final Bal Accuracy:', balanced_accuracy_score(y, result_gbc))
print('Final Precision: ', precision_score(y, result_gbc))
print('Final Recall: ', recall_score(y, result_gbc))
print('Final F1-Score: ', f1_score(y, result_gbc))
print('Final Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_gbc)))

CV Accuracy:  0.655566025994346
CV Bal Accuracy: 0.6531620922969682
CV Precision:  0.2938708093388688
CV Recall:  0.6494031764821939
CV F1-Score:  0.4046345811051693
CV Confusion matrix:
        0      1
0  30288  15818
1   3554   6583
----------------
CV Accuracy:  0.6552282061767687
CV Bal Accuracy: 0.6559189808853941
CV Precision:  0.2950296801630194
CV Recall:  0.6569991121633619
CV F1-Score:  0.40720247011708605
CV Confusion matrix:
        0      1
0  30192  15914
1   3477   6660
----------------
CV Accuracy:  0.6539302668776559
CV Bal Accuracy: 0.6524722288039964
CV Precision:  0.2928162068505931
CV Recall:  0.6501923646049127
CV F1-Score:  0.40378606873736445
CV Confusion matrix:
        0      1
0  30188  15918
1   3546   6591
----------------
CV Accuracy:  0.6527212275305372
CV Bal Accuracy: 0.6533894239984952
CV Precision:  0.292723822971363
CV Recall:  0.654434250764526
CV F1-Score:  0.40451219512195125
CV Confusion matrix:
        0      1
0  30077  16029
1   3503   6634
-

### XGBoost

**Using XGBoost with weight 4 to do the 10-fold cross validation without resampling**

In [20]:
xgb = XGBClassifier(n_jobs=-1, scale_pos_weight=4)
result_xgb = cross_val_predict(xgb, X, y, cv=10, n_jobs=-1)
print('CV Accuracy: ', accuracy_score(y, result_xgb))
print('CV Bal Accuracy:', balanced_accuracy_score(y, result_xgb))
print('CV Precision: ', precision_score(y, result_xgb))
print('CV Recall: ', recall_score(y, result_xgb))
print('CV F1-Score: ', f1_score(y, result_xgb))
print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_xgb)))

CV Accuracy:  0.5510044059442909
CV Bal Accuracy: 0.5127241550303315
CV Precision:  0.18893279005638555
CV Recall:  0.45286834706259554
CV F1-Score:  0.26662988078470096
CV Confusion matrix:
         0       1
0  263992  197065
1   55460   45905


### Neural Network

**10-fold cross validation on 3 hidden layers neural network with 0.001 learning rate, adam optimizer and logistic sigmoid ativation function**

In [21]:
mc = MLPClassifier(hidden_layer_sizes=(20,100,50,), early_stopping=True, max_iter=1000, 
                   learning_rate_init=0.001, solver='adam', activation='relu')
skf = StratifiedKFold(n_splits=10, shuffle=True)
start = time.time()

# initialize the result with -1
result_mc = np.full((len(y),), fill_value=-1)

# K-Folds cross validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Resample train data
#     ros = RandomOverSampler()
#     X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
    rus = RandomUnderSampler()
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    # Train and test the model
    mc.fit(X_train_res, y_train_res)
    hyps = mc.predict(X_test)
    
    # append the test hyps to result
    result_mc[test_index.tolist()] = hyps
    
    # Print out the metrics
    print('CV Accuracy: ', mc.score(X_test, y_test))
    print('CV Bal Accuracy:', balanced_accuracy_score(y_test, hyps))
    print('CV Precision: ', precision_score(y_test, hyps))
    print('CV Recall: ', recall_score(y_test, hyps))
    print('CV F1-Score: ', f1_score(y_test, hyps))
    print('CV Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, hyps)))
    print("----------------")
end = time.time()
# Print out the final result
print("----------------")
print('\nComplete in {:.0f}m {:.0f}s'.format((end - start) // 60, (end - start) % 60))
print('Final Accuracy: ', accuracy_score(y, result_mc))
print('Final Bal Accuracy:', balanced_accuracy_score(y, result_mc))
print('Final Precision: ', precision_score(y, result_mc))
print('Final Recall: ', recall_score(y, result_mc))
print('Final F1-Score: ', f1_score(y, result_mc))
print('Final Confusion matrix:\n', pd.DataFrame(confusion_matrix(y, result_mc)))

CV Accuracy:  0.5418629873939869
CV Bal Accuracy: 0.5826566412336818
CV Precision:  0.22804148106904232
CV Recall:  0.6464437210219987
CV F1-Score:  0.3371491780927636
CV Confusion matrix:
        0      1
0  23923  22183
1   3584   6553
----------------
CV Accuracy:  0.5374179897942856
CV Bal Accuracy: 0.5931440283172804
CV Precision:  0.2324076570504179
CV Recall:  0.6802801617835652
CV F1-Score:  0.34645431937501575
CV Confusion matrix:
        0      1
0  23330  22776
1   3241   6896
----------------
CV Accuracy:  0.5473747844176164
CV Bal Accuracy: 0.5885581186724651
CV Precision:  0.23177393374886196
CV Recall:  0.6529545230344284
CV F1-Score:  0.3421113838997286
CV Confusion matrix:
        0      1
0  24167  21939
1   3518   6619
----------------
CV Accuracy:  0.4479135181267002
CV Bal Accuracy: 0.5582540322810652
CV Precision:  0.20733277357962496
CV Recall:  0.7307882016375653
CV F1-Score:  0.3230209082782829
CV Confusion matrix:
        0      1
0  17784  28322
1   2729   74

## Conclusion

**Gradient Boosting:**
```
Complete in 8m 42s
Final Accuracy:  0.6548285806742979
Final Bal Accuracy: 0.6538099644947999
Final Precision:  0.29384286482570415
Final Recall:  0.6522172347457209
Final F1-Score:  0.40515265538246575
Final Confusion matrix:
         0       1
0  302178  158879
1   35253   66112
```

**Random Forest:**
```
Complete in 5m 36s
Final Accuracy:  0.6518432778234138
Final Bal Accuracy: 0.6517428633282708
Final Precision:  0.291548587016977
Final Recall:  0.6515858531051152
Final F1-Score:  0.4028459288761752
Final Confusion matrix:
         0       1
0  300563  160494
1   35317   66048
```

**After training the models above, Random forest and Gradient Boosting come out with best performence based ont the balanced acc and f1 score. Although Gradient Boosting provide a relative high performence, I would choose the model using random forest, since its training time is relative smaller than gradient boosting, it would be much obvious when using the larger datasets. The result is showing above**

### Choosen model

```
Random Forest
hyperprarameters: n_estimators=100, criterion='entropy', min_samples_split=50
```

## Write Hyps to file

In [40]:
# write to dui-test-hypotheses.csv
hyps = pd.DataFrame()
hyps['ARR_DEL15'] = result_rf
hyps.to_csv("pred1.txt", index=False)