#### Step-Wise Modeling using Cross Validation

In [1]:
# import basic packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
sns.set(style="darkgrid")

# import packages for Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# import modeling packages 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read in modeling data file
cols = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'Y1', 'Y2']

df = pd.read_csv('data/modeling.csv', header=0, names=cols)
df.head(2)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f70,f71,f72,f73,f74,f75,f76,f77,Y1,Y2
-0.01821,-0.010433,-0.018399,-0.018279,-2.896385,-0.024231,-0.02066,4.079933,-1.414801,-3.011022,1,...,0,0,0,0,0,0,0,9.0,0,0
-0.01821,-3.1822,-3.260786,-3.270119,-2.037297,-0.024231,-0.02066,3.366161,-3.683655,-3.011022,1,...,0,0,0,0,0,0,0,9.0,0,0


### Target: Y1 (AACP)

In [3]:
# Splitting up our data into features and target
# for this modeling approach I will only be using 1 target at a time 
# the first target will be Accept (ACCP)
X = df.iloc[:, :-2] # Features
Y = df.Y1 # Target

# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)



In [4]:
# save features and targets as the X_resample and Y_resample variables 
features, targets = X_resample, Y_resample

# define an empty list that the following models will feed into
models = []

# append the list with all the desired models 

models.append(('LogisticRegression', LogisticRegression(solver='liblinear', random_state=2019)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=2019)))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
models.append(('SVC', SVC(kernel='rbf',gamma='auto',random_state=2019)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=2019)))
models.append(('XGBoost', xgb.XGBClassifier(n_estimators=100, random_state=2019)))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100, random_state=2019)))

# use Cross Validation in the model with a 'stratify' option using the StratifiedKFolds package from sklearn
# specifiy that the scoring method is F1 
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv, scoring='f1')
    print("Model:{0}, F1 Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))
    
    
# Same as above but the scoring option has beeen changed to AUC
cv1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv1, scoring='roc_auc')
    print("Model:{0}, AUC Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:LogisticRegression, F1 Score: mean=0.93905, var=0.00001
Model:DecisionTreeClassifier, F1 Score: mean=0.95268, var=0.00013
Model:KNeighborsClassifier, F1 Score: mean=0.92529, var=0.00003
Model:SVC, F1 Score: mean=0.94224, var=0.00001
Model:AdaBoostClassifier, F1 Score: mean=0.94708, var=0.00001
Model:Random Forest, F1 Score: mean=0.96112, var=0.00004
Model:LogisticRegression, AUC Score: mean=0.98279, var=0.00000
Model:DecisionTreeClassifier, AUC Score: mean=0.95960, var=0.00014
Model:KNeighborsClassifier, AUC Score: mean=0.96979, var=0.00002
Model:SVC, AUC Score: mean=0.97833, var=0.00000
Model:AdaBoostClassifier, AUC Score: mean=0.98633, var=0.00000
Model:XGBoost, AUC Score: mean=0.98802, var=0.00000
Model:Random Forest, AUC Score: mean=0.98964, var=0.00001


In [10]:
dt = DecisionTreeClassifier(random_state=2019)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
score = cross_val_score(dt, features, targets, cv=cv, scoring='f1')
print(score.mean())

0.9422998845235453


In [11]:
# define the feature importance variable 
feature_imp_accp = pd.Series(dt.feature_importances_,index=df.iloc[:,:-2].columns).sort_values(ascending=False)

# print the results 
feature_imp_accp

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

### Target: Y2 (CONF)

In [5]:
# Reassign the target to Y2
X = df.iloc[:, :-2] # Features
Y = df.Y2 # Target

# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)

In [7]:
# save features and targets as the X_resample and Y_resample variables 
features, targets = X_resample, Y_resample

# define an empty list that the following models will feed into
models = []

# append the list with all the desired models 

models.append(('LogisticRegression', LogisticRegression(solver='liblinear', random_state=2019)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=2019)))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
#models.append(('SVC', SVC(kernel='rbf',gamma='auto',random_state=2019)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=2019)))
models.append(('XGBoost', xgb.XGBClassifier(n_estimators=100, random_state=2019)))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100, random_state=2019)))

# use Cross Validation in the model with a 'stratify' option using the StratifiedKFolds package from sklearn
# specifiy that the scoring method is F1 
cv2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv2, scoring='f1')
    print("Model:{0}, F1 Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))
    
    
# Same as above but the scoring option has beeen changed to AUC
cv3 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv3, scoring='roc_auc')
    print("Model:{0}, AUC Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:LogisticRegression, F1 Score: mean=0.68019, var=0.00002
Model:DecisionTreeClassifier, F1 Score: mean=0.94230, var=0.00001
Model:KNeighborsClassifier, F1 Score: mean=0.87646, var=0.00001
Model:AdaBoostClassifier, F1 Score: mean=0.69346, var=0.00003
Model:XGBoost, F1 Score: mean=0.71162, var=0.00002
Model:Random Forest, F1 Score: mean=0.96954, var=0.00000
Model:LogisticRegression, AUC Score: mean=0.71848, var=0.00003
Model:DecisionTreeClassifier, AUC Score: mean=0.94746, var=0.00001
Model:KNeighborsClassifier, AUC Score: mean=0.93654, var=0.00001
Model:AdaBoostClassifier, AUC Score: mean=0.72858, var=0.00006
Model:XGBoost, AUC Score: mean=0.75494, var=0.00004
Model:Random Forest, AUC Score: mean=0.99593, var=0.00000
