# Evaluate Random Forest Model

In [104]:
import pandas as pd
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn import metrics

## 1. Load the training data

In [80]:
flight_sample = pd.read_csv('flight_sample1.zip', compression='zip')
flight_sample.head()


Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,early_count,early_avgDryDays,early_avgRainDays,early_avgCloud,early_avgSnowDays,delay_count,delay_avgDryDays,delay_avgRainDays,delay_avgCloud,delay_avgSnowDays
0,DL,DL_CODESHARE,4016,OO,N291SY,4016,12953,13342,1747,1944.0,...,55.5,17,11,3,0,30.8,18,10,3,0
1,DL,DL_CODESHARE,4016,OO,N292SY,4016,12953,13342,1747,1929.0,...,55.5,17,11,3,0,30.8,18,10,3,0
2,DL,DL_CODESHARE,4016,OO,N278SY,4016,12953,13342,1747,1750.0,...,55.5,17,11,3,0,30.8,18,10,3,0
3,DL,DL_CODESHARE,3616,OO,N268SY,3616,12953,11995,1729,1851.0,...,55.5,17,11,3,0,30.8,18,10,3,0
4,DL,DL_CODESHARE,3607,OO,N276SY,3607,12953,11995,910,906.0,...,55.5,17,11,3,0,30.8,18,10,3,0


In [81]:
flight_sample = flight_sample.sample(n=30000, random_state=1)

In [82]:
# Calculate Correlated features from sample
corr = flight_sample.corr(method ='pearson')

In [83]:
# Drop 100% correlational values(diagonal values in matrix)
corr1 = corr[corr < 1].unstack().transpose().sort_values(ascending=False).drop_duplicates()

In [84]:
# Get all correlations above threshold value
threshold = 0.80
corr1 = corr1[corr1 < threshold]

# Extract all feature names from correlations above threshold
corr_data = corr1.index.values

# Flatten into list format
lst = [corr_data[x][y] for x in range(len(corr_data)) for y in range(2)]

# Isolate Unique feature names
unique_features = list(set(lst))
unique_features

['first_dep_time',
 'crs_arr_time',
 'is_delayed',
 'mkt_carrier_fl_num',
 'dep_time',
 'air_time',
 'arr_delay',
 'early_avgDryDays',
 'early_avgSnowDays',
 'avg_passengers',
 'distance',
 'total_passengers',
 'flight_speed',
 'delay_avgRainDays',
 'delay_avgDryDays',
 'crs_dep_time',
 'departure_hour_of_day',
 'arrival_hour_of_day',
 'total_distance',
 'percent_flights',
 'early_avgRainDays',
 'avgCloud',
 'early_count',
 'actual_elapsed_time',
 'taxi_in',
 'carrier_delay',
 'cancelled',
 'arr_time',
 'flight_haul_type',
 'avg_flights',
 'flight_month',
 'op_carrier_fl_num',
 'delay_avgCloud',
 'avgRainDays',
 'delay_avgSnowDays',
 'longest_add_gtime',
 'avgSnowDays',
 'crs_elapsed_time',
 'total_add_gtime',
 'origin_airport_id',
 'avgDryDays',
 'dest_airport_id',
 'avg_fuel',
 'early_avgCloud',
 'nas_delay',
 'wheels_on',
 'wheels_off',
 'weather_delay',
 'percent_delay',
 'diverted',
 'security_delay',
 'avgThunderDays',
 'dep_delay',
 'delay_count',
 'late_aircraft_delay',
 'taxi_

### Prune Data Sample

In [85]:
# Keep features with high correlations
pruned_data = flight_sample[unique_features]

In [86]:
# Get columns with NaN values present
NaN_cols = pruned_data.columns[pruned_data.isnull().any()]

# Adjust NaN values by replacing them with respective column means to prevent data loss
for column in NaN_cols:
    pruned_data[column].fillna(value=pruned_data[column].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [87]:
pruned_data[NaN_cols].isna().sum()

first_dep_time         0
dep_time               0
air_time               0
arr_delay              0
actual_elapsed_time    0
taxi_in                0
carrier_delay          0
arr_time               0
longest_add_gtime      0
total_add_gtime        0
nas_delay              0
wheels_on              0
wheels_off             0
weather_delay          0
security_delay         0
dep_delay              0
late_aircraft_delay    0
taxi_out               0
dtype: int64

In [88]:
pruned_data.head()

Unnamed: 0,first_dep_time,crs_arr_time,is_delayed,mkt_carrier_fl_num,dep_time,air_time,arr_delay,early_avgDryDays,early_avgSnowDays,avg_passengers,...,wheels_off,weather_delay,percent_delay,diverted,security_delay,avgThunderDays,dep_delay,delay_count,late_aircraft_delay,taxi_out
864377,1352.896714,1530,0,2997,1404.0,114.0,-19.0,19,2,1090.0,...,1413.0,4.085318,0.003063,0.0,0.030039,1,-1.0,28.5,27.809101,9.0
554769,1352.896714,1511,0,3536,1118.0,125.0,-13.0,18,1,293.0,...,1128.0,4.085318,0.000986,0.0,0.030039,6,-7.0,26.5,27.809101,10.0
956373,1352.896714,848,1,5815,1046.0,72.0,255.0,20,3,30.5,...,1101.0,221.0,7.3e-05,0.0,0.0,0,221.0,31.8,0.0,15.0
335821,1352.896714,2301,1,2098,2125.0,80.0,9.0,17,0,940.5,...,2143.0,4.085318,0.004916,0.0,0.030039,4,7.0,35.4,27.809101,18.0
430891,1352.896714,600,0,1114,2213.0,236.0,-34.0,21,4,929.0,...,2225.0,4.085318,0.002454,0.0,0.030039,1,-2.0,26.1,27.809101,12.0


### Drop all delay columns

In [89]:
pruned_data = pruned_data.drop(columns=['first_dep_time', 'dep_time', 'air_time', 'is_delayed', 'actual_elapsed_time', 'taxi_in', 'carrier_delay', 'arr_time','nas_delay', 'wheels_on', 'wheels_off','weather_delay', 'security_delay', 'dep_delay','late_aircraft_delay', 'taxi_out'])
pruned_data.shape

(30000, 40)

In [90]:
X = pruned_data.drop(columns=['arr_delay'])
X.shape

(30000, 39)

In [113]:
pruned_data.describe()

Unnamed: 0,crs_arr_time,mkt_carrier_fl_num,arr_delay,early_avgDryDays,early_avgSnowDays,avg_passengers,distance,total_passengers,flight_speed,delay_avgRainDays,...,total_add_gtime,origin_airport_id,avgDryDays,dest_airport_id,avg_fuel,early_avgCloud,percent_delay,diverted,avgThunderDays,delay_count
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1487.787033,2628.4471,5.510859,18.717933,1.1793,485.289767,791.318667,489.238633,298.208217,8.5552,...,39.887324,12686.3295,16.935233,12685.993467,138433300.0,3.051267,0.00159,0.002467,2.196933,28.82557
std,517.751711,1825.750138,51.143783,1.595992,1.501786,366.78779,592.76016,372.630243,94.695521,1.445184,...,3.162906,1522.04873,6.571875,1526.177045,129499400.0,0.862204,0.001389,0.049605,3.106167,3.650017
min,1.0,1.0,-62.0,17.0,0.0,1.0,31.0,1.0,0.0,6.0,...,1.0,10135.0,0.0,10135.0,0.0,2.0,3e-06,0.0,0.0,22.1
25%,1104.0,1083.0,-15.0,17.0,0.0,168.0,362.0,184.0,240.0,7.0,...,39.887324,11292.0,12.0,11292.0,8790766.0,2.0,0.000458,0.0,0.0,26.5
50%,1515.0,2223.0,-6.0,18.0,0.0,485.5,629.0,469.0,309.0,9.0,...,39.887324,12889.0,17.0,12889.0,176971000.0,3.0,0.001259,0.0,1.0,28.5
75%,1920.0,4009.25,7.0,20.0,2.0,675.5,1020.0,675.0,365.947063,10.0,...,39.887324,14057.0,21.0,14057.0,297848100.0,4.0,0.002277,0.0,3.0,31.8
max,2359.0,9392.0,1495.0,21.0,4.0,1445.0,5095.0,1680.0,543.0,11.0,...,221.0,16218.0,30.0,16218.0,303891700.0,4.0,0.006384,1.0,23.0,35.4


In [91]:
y = pruned_data['arr_delay']

In [92]:
y.shape

(30000,)

## Convert to Arrays

In [93]:
X = X.to_numpy()

In [110]:
y = y.to_numpy()

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [111]:
print("The mean of all the target values is: ",y.mean())

The mean of all the target values is:  5.510858851918567


### Split the data

In [95]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25,random_state=109) # 75% training and 30% test

### Scale the data

In [96]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [101]:
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.svm import SVC

def runGridSearch(model, param_grid, X_train, y_train, X_test, y_test):
    grid = GridSearchCV(estimator=model, 
        param_grid=param_grid, 
        scoring='r2', 
        verbose=1, 
        n_jobs=-1) 
    
    best_model = grid.fit(X_train, y_train)


    y_pred = best_model.predict(X_test)
    
    print("MSE: ", metrics.mean_squared_error(y_test, y_pred))
    print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))
    print("Variance Score: ", metrics.explained_variance_score(y_test, y_pred))
    print("R2: ", metrics.r2_score(y_test, y_pred))
    
    

## Cross Validate a Base Model

In [108]:
# RandomForestRegressor
param_grid_rf = {
    'n_estimators': [5,10, 100, 500,1000],
    'max_depth': [5,10,100]
}

# DecisionTree
param_grid_dt = {    
    'max_depth': [5,10,100]
}

param_grid_xgb = {
    'objective':['reg:squarederror'], 
    'colsample_bytree':[0.3], 
    'learning_rate':[0.1],
    'max_depth':[5,10,100], 
    'alpha':[1,0.1,0.01], 
    'n_estimators':[5,10,100,500]
    
}

In [99]:
# Decision Tree
runGridSearch(DecisionTreeRegressor(), param_grid_dt, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    3.7s finished


MSE:  2431.1544557747807
MAE:  24.274185669232267
Variance Score:  -0.0674057471492211
R2:  -0.06742932268625479


In [100]:
# Random Forest
runGridSearch(RandomForestRegressor(), param_grid_rf, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 20.5min finished


MSE:  2081.120721437787
MAE:  23.865794264853385
Variance Score:  0.08625910173551765
R2:  0.08625744578424288


In [114]:
# XGBoost
runGridSearch(xgb.XGBRegressor(), param_grid_xgb, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 11.3min finished


MSE:  2160.4561706060585
MAE:  23.43066506047705
Variance Score:  0.05293231801297904
R2:  0.051424206551110796


| Model | MSE | MAE | Variance | R2 | 
| --- | --- | --- | --- | --- |
| Random Forest | 2081.120721437787 | 23.865794264853385 | 0.08625910173551765 |  0.08625744578424288 |
| Decision Tree | 2317.290228192625 | 23.379388770971 | 0.07616203926823162 | 0.07615860147119247 |

In [None]:
## Fit the 

In [23]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [25]:
y_pred = model.predict(X_test)

In [35]:
f1_score = metrics.f1_score(y_test, y_pred)
#roc_auc = metrics.auc(y_test, y_pred)
#print("F1_Score for {0}: {1}".format(cutoff, f1_score))
#print("Roc Auc Score for {0}: {1}".format(cutoff, roc_auc))
print( np.unique( y_pred ) )
f1_score = metrics.f1_score(y_test, y_pred)
print("F1_Score: {0}".format(f1_score))
print("R2: ", metrics.r2_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Roc Auc: ", metrics.roc_auc_score(y_test, y_pred))

[0 1]
F1_Score: 0.9999759849185289
R2:  0.9999266292287783
Precision:  0.9999639778103312
Recall:  0.9999879923150816
Roc Auc:  0.9999844936457102
