### Random Forest Modeling using Train-Test-Split

In [1]:
# import packages needed for data handling 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
sns.set(style="darkgrid")

# import packages to split the data 
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# import Random Forest
from sklearn.ensemble import RandomForestClassifier

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [2]:
# read in modeling data file
cols = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'Y1', 'Y2']

df = pd.read_csv('data/modeling.csv', header=0, names=cols)
df.head(2)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f70,f71,f72,f73,f74,f75,f76,f77,Y1,Y2
0,-0.01821,-0.010433,-0.018399,-0.018279,-2.896385,-0.024231,-0.02066,4.079933,-1.414801,-3.011022,...,0,0,0,0,0,0,0,9.0,0,0
1,-0.01821,-3.1822,-3.260786,-3.270119,-2.037297,-0.024231,-0.02066,3.366161,-3.683655,-3.011022,...,0,0,0,0,0,0,0,9.0,0,0


In [3]:
# Splitting up our data into features and target
# for this modeling approach I will only be using 1 target at a time 
# the first target will be Accept (ACCP)
X = df.iloc[:, :-2] # Features
Y = df.Y1 # Target

# Split dataset into training set and test set using a 70/30 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

# import the RandomOverSampler package from imblearn
# this package will help address the imbalanced nature of the ACCP target 
from imblearn.over_sampling import RandomOverSampler

# define the RandomOverSampler (ros) model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)



### Target Y1 (ACCP)

In [7]:
# build decision tree 
rfy1 = RandomForestClassifier(n_estimators=100, random_state=2019)

# Train Decision Tree Classifer
rfy1 = rfy1.fit(X_train_resample, Y_train_resample)

# Predict the target for the test data 
Y_pred = rfy1.predict(X_test)

In [8]:
# Model Accuracy, how often is the Decision Tree correct?
print("F1 Score:",metrics.f1_score(Y_test, Y_pred, average='weighted'))
print("AUC Score:",metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

# We are going to look at the classification report and also the confusion matrix for the Decision Tree  
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

F1 Score: 0.9371593942103368
AUC Score: 0.9341124108633746
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      5612
           1       0.96      0.94      0.95     10224

    accuracy                           0.94     15836
   macro avg       0.93      0.93      0.93     15836
weighted avg       0.94      0.94      0.94     15836

[[5187  425]
 [ 573 9651]]


## Target Y2 (CONF)

In [9]:
# Splitting up our data into features and target
# for this modeling approach I will only be using 1 target at a time 
# the first target will be Accept (ACCP)
X = df.iloc[:, :-2] # Features
Y = df.Y2 # Target

# Split dataset into training set and test set using a 70/30 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

# import the RandomOverSampler package from imblearn
# this package will help address the imbalanced nature of the ACCP target 
from imblearn.over_sampling import RandomOverSampler

# define the RandomOverSampler (ros) model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)

In [10]:
# build decision tree 
rfy2 = RandomForestClassifier(n_estimators=100, random_state=2019)

# Train Decision Tree Classifer
rfy2 = rfy2.fit(X_train_resample, Y_train_resample)

# Predict the target for the test data 
Y_pred = rfy2.predict(X_test)

In [11]:
# Model Accuracy, how often is the Decision Tree correct?
print("F1 Score:",metrics.f1_score(Y_test, Y_pred, average='weighted'))
print("AUC Score:",metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

# We are going to look at the classification report and also the confusion matrix for the Decision Tree  
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

F1 Score: 0.8537552685252987
AUC Score: 0.5304778812623758
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     14304
           1       0.20      0.11      0.14      1532

    accuracy                           0.87     15836
   macro avg       0.55      0.53      0.53     15836
weighted avg       0.84      0.87      0.85     15836

[[13626   678]
 [ 1366   166]]
