In [123]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
import numpy as np
import random

In [124]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

In [125]:
import warnings
warnings.filterwarnings('ignore')

In [126]:
# Dataset with 100% of the features.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
with open('big_cards', 'rb') as cards_file:
    big_cards = pickle.load(cards_file)

# Dataset restricted to the top 50% of features.
with open('big_top_50', 'rb') as top_50_file:
    big_top_50 = pickle.load(top_50_file)

In [127]:
def quick_test(model, X, y, random_state=None):
    """Fit ``model`` on a 60/20/20 train/validation/test split and report its scores.

    The data is split twice with ``train_test_split``: 20% is held out as the
    test set, then 25% of the remaining 80% (20% of the total) becomes the
    validation set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        It is fitted in place: first on the 60% training split, then refitted
        on the 80% train+validation split, which is the state it is left in.
    X, y : features and target, in any form ``train_test_split`` accepts.
    random_state : optional seed forwarded to both ``train_test_split`` calls.
        ``None`` (the default) keeps the original unseeded behaviour, so
        every call draws a different split.

    Returns
    -------
    list
        ``[val_score, train_score, overfit_score, test_score]``. The three
        scores are rounded to 3 decimals. ``overfit_score`` is the absolute
        train/validation gap rounded to 4 decimals.
    """
    # 60% Train, 20% Val, 20% Test
    xtrain_val, xtest, ytrain_val, ytest = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    xtrain, xval, ytrain, yval = train_test_split(
        xtrain_val, ytrain_val, test_size=0.25, random_state=random_state)

    # Validation and train scores must be read while the model is still fitted
    # on the 60% split; the refit below overwrites that state.
    fitted_on_train = model.fit(xtrain, ytrain)
    val_raw = fitted_on_train.score(xval, yval)
    train_raw = fitted_on_train.score(xtrain, ytrain)

    val_score = np.round(val_raw, 3)
    train_score = np.round(train_raw, 3)
    # Take the gap from the unrounded scores: subtracting two values already
    # rounded to 3 decimals makes the 4th decimal meaningless and can collapse
    # small gaps to 0.0.
    overfit_score = np.round(abs(val_raw - train_raw), 4)

    # Refit the same estimator on train+val (80%) for the final test score.
    fitted_on_train_val = model.fit(xtrain_val, ytrain_val)
    test_score = np.round(fitted_on_train_val.score(xtest, ytest), 3)

    return [val_score, train_score, overfit_score, test_score]

In [128]:
def quick_predict(model, X, y, test_size=0.3, random_state=None):
    """Fit ``model`` on a random train split and predict the held-out rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X, y : features and target, in any form ``train_test_split`` accepts.
    test_size : fraction held out for prediction (default 0.3, the value that
        was previously hard-coded).
    random_state : optional seed forwarded to ``train_test_split``. ``None``
        (the default) keeps the original unseeded behaviour. Pass a fixed
        value to make the held-out rows repeatable between calls.

    Returns
    -------
    tuple
        ``(model.predict(xtest), ytest)`` for the held-out split.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(xtrain, ytrain)
    return model.predict(xtest), ytest

def quick_probas(model, X, y, test_size=0.3, random_state=None):
    """Fit ``model`` on a random train split and return probabilities for the held-out rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; fitted in place.
    X, y : features and target, in any form ``train_test_split`` accepts.
    test_size : fraction held out (default 0.3, the value that was previously
        hard-coded).
    random_state : optional seed forwarded to ``train_test_split``. ``None``
        (the default) keeps the original unseeded behaviour. Pass a fixed
        value to make the held-out rows repeatable between calls.

    Returns
    -------
    tuple
        ``(model.predict_proba(xtest), ytest)`` for the held-out split.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(xtrain, ytrain)
    return model.predict_proba(xtest), ytest

In [129]:
# Full feature set: every column except the target is a feature,
# and 'Binary_Rank' is the target.
X_100 = big_cards.drop(labels='Binary_Rank', axis='columns')
y_100 = big_cards['Binary_Rank']

In [130]:
# Top-50% feature set: every column except the target is a feature,
# and 'Binary_Rank' is the target.
X_50 = big_top_50.drop(labels='Binary_Rank', axis='columns')
y_50 = big_top_50['Binary_Rank']

In [131]:
#xtrain_100, xtest_100, ytrain_100, ytest_100 = train_test_split(X_100, y_100, test_size=0.3)

In [132]:
#xtrain_50, xtest_50, ytrain_50, ytest_50 = train_test_split(X_50, y_50, test_size=0.3)

In [133]:
# Logistic regression candidates at three values of C: 0.1, 1 and 10.
logreg_C_tenth, logreg_C_one, logreg_C_ten = (
    LogisticRegression(C=c_value) for c_value in (0.1, 1, 10)
)

In [65]:
# Logistic regression with C = 0.1 on 100% of the features.
lr_10th = quick_test(logreg_C_tenth, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_10th)), sep='\n')

Validation Score:... 0.7667831658657347
Train Score:........ 0.7702482949442975
Overfit Score:...... 0.0035
Test Score:......... 0.7698412698412699


In [66]:
# Logistic regression with C = 1 on 100% of the features.
lr_1 = quick_test(logreg_C_one, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_1)), sep='\n')

Validation Score:... 0.7736274938109801
Train Score:........ 0.7806363923205747
Overfit Score:...... 0.007
Test Score:......... 0.762341633901267


In [71]:
# Logistic regression with C = 10 on 100% of the features.
lr_10 = quick_test(logreg_C_ten, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_10)), sep='\n')

Validation Score:... 0.7571719819426241
Train Score:........ 0.760248537656853
Overfit Score:...... 0.0031
Test Score:......... 0.7699868938401049


In [72]:
# Logistic regression with C = 0.1 on the top 50% of features.
lr_10th_50 = quick_test(logreg_C_tenth, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_10th_50)), sep='\n')

Validation Score:... 0.7780690257754478
Train Score:........ 0.783160602897988
Overfit Score:...... 0.0051
Test Score:......... 0.7831658657346731


In [73]:
# Logistic regression with C = 1 on the top 50% of features.
lr_1_50 = quick_test(logreg_C_one, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_1_50)), sep='\n')

Validation Score:... 0.7834571137323431
Train Score:........ 0.791509914807893
Overfit Score:...... 0.0081
Test Score:......... 0.7862967817096257


In [74]:
# Logistic regression with C = 10 on the top 50% of features.
lr_10_50 = quick_test(logreg_C_ten, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    lr_10_50)), sep='\n')

Validation Score:... 0.7886267656909859
Train Score:........ 0.7875294288973569
Overfit Score:...... 0.0011
Test Score:......... 0.7848405417212757


In [82]:
# Decision Trees:
dt_depth_None = DecisionTreeClassifier(max_depth=None)
dt_depth_16 = DecisionTreeClassifier(max_depth=16)
dt_depth_4 = DecisionTreeClassifier(max_depth=4)

In [77]:
# Decision tree with max_depth=None on 100% of the features.
dt_None = quick_test(dt_depth_None, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_None)), sep='\n')

Validation Score:... 0.8144750254841998
Train Score:........ 0.9921118419455839
Overfit Score:...... 0.1776
Test Score:......... 0.8234309014125528


In [83]:
# Decision tree with max_depth=16 on 100% of the features.
dt_16 = quick_test(dt_depth_16, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_16)), sep='\n')

Validation Score:... 0.8256152613950779
Train Score:........ 0.8765806655178272
Overfit Score:...... 0.051
Test Score:......... 0.8262705693898355


In [79]:
# Decision tree with max_depth=4 on 100% of the features.
dt_4 = quick_test(dt_depth_4, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_4)), sep='\n')

Validation Score:... 0.7512013979903888
Train Score:........ 0.7468265333365695
Overfit Score:...... 0.0044
Test Score:......... 0.7546963739624291


In [85]:
# Decision tree with max_depth=None on the top 50% of features.
dt_None_50 = quick_test(dt_depth_None, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_None_50)), sep='\n')

Validation Score:... 0.795
Train Score:........ 0.898
Overfit Score:...... 0.103
Test Score:......... 0.805


In [86]:
# Decision tree with max_depth=16 on the top 50% of features.
dt_16_50 = quick_test(dt_depth_16, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_16_50)), sep='\n')

Validation Score:... 0.803
Train Score:........ 0.82
Overfit Score:...... 0.017
Test Score:......... 0.796


In [87]:
# Decision tree with max_depth=4 on the top 50% of features.
dt_4_50 = quick_test(dt_depth_4, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    dt_4_50)), sep='\n')

Validation Score:... 0.725
Train Score:........ 0.725
Overfit Score:...... 0.0
Test Score:......... 0.73


In [88]:
# Highest Eval Metric for decision trees determined to be at Max Depth = 16.
# Therefore, Max Depth = 16 is the reference setting for the random forests below; Max Depth = None and Max Depth = 4 are also run for comparison.

In [98]:
# Random forest candidates: 10 / 100 / 1000 trees at each max_depth setting.

# max_depth=None
rf_n_10_depth_none, rf_n_100_depth_none, rf_n_1000_depth_none = (
    RandomForestClassifier(n_estimators=n_trees, max_depth=None)
    for n_trees in (10, 100, 1000)
)

# max_depth=16
rf_n_10_depth_16, rf_n_100_depth_16, rf_n_1000_depth_16 = (
    RandomForestClassifier(n_estimators=n_trees, max_depth=16)
    for n_trees in (10, 100, 1000)
)

# max_depth=4
rf_n_10_depth_4, rf_n_100_depth_4, rf_n_1000_depth_4 = (
    RandomForestClassifier(n_estimators=n_trees, max_depth=4)
    for n_trees in (10, 100, 1000)
)

In [99]:
# Random forest, n_estimators=10, max_depth=None, on 100% of the features.
rf_n_10_d_None = quick_test(rf_n_10_depth_none, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_d_None)), sep='\n')

Validation Score:... 0.842
Train Score:........ 0.985
Overfit Score:...... 0.143
Test Score:......... 0.848


In [100]:
# Random forest, n_estimators=100, max_depth=None, on 100% of the features.
rf_n_100_d_None = quick_test(rf_n_100_depth_none, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_d_None)), sep='\n')

Validation Score:... 0.853
Train Score:........ 0.992
Overfit Score:...... 0.139
Test Score:......... 0.862


In [101]:
# Random forest, n_estimators=1000, max_depth=None, on 100% of the features.
rf_n_1000_d_None = quick_test(rf_n_1000_depth_none, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_d_None)), sep='\n')

Validation Score:... 0.859
Train Score:........ 0.992
Overfit Score:...... 0.133
Test Score:......... 0.857


In [103]:
# Random forest, n_estimators=10, max_depth=None, on the top 50% of features.
rf_n_10_50_d_None = quick_test(rf_n_10_depth_none, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_50_d_None)), sep='\n')

Validation Score:... 0.804
Train Score:........ 0.895
Overfit Score:...... 0.091
Test Score:......... 0.808


In [104]:
# Random forest, n_estimators=100, max_depth=None, on the top 50% of features.
rf_n_100_50_d_None = quick_test(rf_n_100_depth_none, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_50_d_None)), sep='\n')

Validation Score:... 0.813
Train Score:........ 0.898
Overfit Score:...... 0.085
Test Score:......... 0.816


In [105]:
# Random forest, n_estimators=1000, max_depth=None, on the top 50% of features.
rf_n_1000_50_d_None = quick_test(rf_n_1000_depth_none, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_50_d_None)), sep='\n')

Validation Score:... 0.815
Train Score:........ 0.898
Overfit Score:...... 0.083
Test Score:......... 0.816


In [90]:
# Random forest, n_estimators=10, max_depth=16, on 100% of the features.
rf_n_10_d_16 = quick_test(rf_n_10_depth_16, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_d_16)), sep='\n')

Validation Score:... 0.79
Train Score:........ 0.81
Overfit Score:...... 0.02
Test Score:......... 0.793


In [91]:
# Random forest, n_estimators=100, max_depth=16, on 100% of the features.
rf_n_100_d_16 = quick_test(rf_n_100_depth_16, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_d_16)), sep='\n')

Validation Score:... 0.803
Train Score:........ 0.821
Overfit Score:...... 0.018
Test Score:......... 0.802


In [92]:
# Random forest, n_estimators=1000, max_depth=16, on 100% of the features.
rf_n_1000_d_16 = quick_test(rf_n_1000_depth_16, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_d_16)), sep='\n')

Validation Score:... 0.801
Train Score:........ 0.822
Overfit Score:...... 0.021
Test Score:......... 0.803


In [95]:
# Random forest, n_estimators=10, max_depth=16, on the top 50% of features.
rf_n_10_50_d_16 = quick_test(rf_n_10_depth_16, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_50_d_16)), sep='\n')

Validation Score:... 0.775
Train Score:........ 0.793
Overfit Score:...... 0.018
Test Score:......... 0.778


In [96]:
# Random forest, n_estimators=100, max_depth=16, on the top 50% of features.
rf_n_100_50_d_16 = quick_test(rf_n_100_depth_16, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_50_d_16)), sep='\n')

Validation Score:... 0.792
Train Score:........ 0.799
Overfit Score:...... 0.007
Test Score:......... 0.794


In [97]:
# Random forest, n_estimators=1000, max_depth=16, on the top 50% of features.
rf_n_1000_50_d_16 = quick_test(rf_n_1000_depth_16, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_50_d_16)), sep='\n')

Validation Score:... 0.792
Train Score:........ 0.799
Overfit Score:...... 0.007
Test Score:......... 0.786


In [106]:
# Random forest, n_estimators=10, max_depth=4, on 100% of the features.
rf_n_10_d_4 = quick_test(rf_n_10_depth_4, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_d_4)), sep='\n')

Validation Score:... 0.703
Train Score:........ 0.701
Overfit Score:...... 0.002
Test Score:......... 0.707


In [107]:
# Random forest, n_estimators=100, max_depth=4, on 100% of the features.
rf_n_100_d_4 = quick_test(rf_n_100_depth_4, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_d_4)), sep='\n')

Validation Score:... 0.731
Train Score:........ 0.724
Overfit Score:...... 0.007
Test Score:......... 0.706


In [108]:
# Random forest, n_estimators=1000, max_depth=4, on 100% of the features.
rf_n_1000_d_4 = quick_test(rf_n_1000_depth_4, X_100, y_100)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_d_4)), sep='\n')

Validation Score:... 0.728
Train Score:........ 0.722
Overfit Score:...... 0.006
Test Score:......... 0.71


In [109]:
# Random forest, n_estimators=10, max_depth=4, on the top 50% of features.
rf_n_10_50_d_4 = quick_test(rf_n_10_depth_4, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_10_50_d_4)), sep='\n')

Validation Score:... 0.708
Train Score:........ 0.709
Overfit Score:...... 0.001
Test Score:......... 0.715


In [110]:
# Random forest, n_estimators=100, max_depth=4, on the top 50% of features.
rf_n_100_50_d_4 = quick_test(rf_n_100_depth_4, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_100_50_d_4)), sep='\n')

Validation Score:... 0.712
Train Score:........ 0.717
Overfit Score:...... 0.005
Test Score:......... 0.72


In [111]:
# Random forest, n_estimators=1000, max_depth=4, on the top 50% of features.
rf_n_1000_50_d_4 = quick_test(rf_n_1000_depth_4, X_50, y_50)

# Print the four scores in the order quick_test returns them.
print(*(f'{caption} {score!s}' for caption, score in zip(
    ('Validation Score:...', 'Train Score:........',
     'Overfit Score:......', 'Test Score:.........'),
    rf_n_1000_50_d_4)), sep='\n')

Validation Score:... 0.72
Train Score:........ 0.72
Overfit Score:...... 0.0
Test Score:......... 0.721
