In [47]:
###Logistic Regression & RForests

from collections import Counter
import pandas as pd

#Handle Imbalance Data
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import smote_variants as sv

#ML ALgorithims
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, plot_importance
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

In [48]:
#Reading our data

train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [49]:
train.columns

Index(['player_name', 'school', 'conference', 'GP', 'Min_per', 'ORtg', 'usg',
       'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM',
       'FTA', 'FT_per', '2PM', '2PA', '2P_per', '3PM', '3PA', '3P_per',
       'blk_per', 'stl_per', 'ftr', 'yr', 'ht', 'num', 'porpag', 'adjoe',
       'pfr', 'year', 'pid', 'type', 'rec-rk', 'ast/tov', 'rimmade',
       'rimmade + rimmiss', 'midmade', 'midmade + midmiss',
       'rimmade/(rimmade+rimmiss)', 'midmade/(midmade+mismiss)', 'dunksmade',
       'dunksmiss + dunksmade', 'dunksmade/(dunksmade+dunksmiss)', 'pick',
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', 'pos', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'ht_in',
       'drafted', 'yr_cat', 'ATH', 'GP_adj', 'BBIQ'],
      dtype='object')

In [50]:
#stats to look at

per_stats = ['Min_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FT_per', '2P_per', '3P_per', 
             'blk_per', 'stl_per', 'GP', 'ht_in', 'yr_cat',  'ATH', 'BBIQ']
box_stats = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg','oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'FTA', 'FTM', 'ftr',  'ATH', 'BBIQ']
adv_stats = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM', 'ftr' ,'ATH', 'BBIQ']

per_adv = ['Min_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FT_per', '2P_per', '3P_per', 
             'blk_per', 'stl_per', 'GP', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'ATH', 'BBIQ']

all_stats = ['GP', 'Min_per', 'ORtg', 'usg',
       'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM',
       'FTA', 'FT_per', '2PM', '2PA', '2P_per', '3PM', '3PA', '3P_per',
       'blk_per', 'stl_per', 'ftr', 'porpag', 'adjoe',
       'pfr','year', 'rec-rk', 'ast/tov', 
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'yr_cat', 'ht_in']

In [51]:
#delete all missing values
train = train.dropna(axis=0, subset=per_stats)
train = train.dropna(axis=0, subset=box_stats)
train = train.dropna(axis=0, subset=adv_stats)

#delete mssing value in test set as well

test = test.dropna(axis=0, subset=per_stats)
test = test.dropna(axis=0, subset=box_stats)
test = test.dropna(axis=0, subset=adv_stats)

In [52]:
#set our 2 test-set aside
test_2018 = test[test['year']==2018]
test_2019 = test[test['year']==2019]

In [53]:
train[train['drafted']==1]['bpm'].describe()

count    390.000000
mean       7.188642
std        2.765719
min       -0.608598
25%        5.415707
50%        7.235225
75%        8.920550
max       17.668000
Name: bpm, dtype: float64

In [54]:
###DROP BY BPM

train_2 = train[train['bpm']>= -0.6]

In [63]:
#with Undersample LogREG####

tests = [per_stats, box_stats, adv_stats, per_adv]

for i in tests:
    
    model = LogisticRegression(max_iter=10000)
    under = RandomUnderSampler(sampling_strategy=0.05)
    X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.3401360544217687 recall: 0.49019607843137253 precision: 0.2604166666666667 predictions: 96.0
F1: 0.39080459770114945 recall: 0.6666666666666666 precision: 0.2764227642276423 predictions: 123.0
F1: 0.38666666666666666 recall: 0.5686274509803921 precision: 0.29292929292929293 predictions: 99.0
F1: 0.4 recall: 0.49019607843137253 precision: 0.33783783783783783 predictions: 74.0


In [64]:
###WITH Undersample XGBOOOST#####

tests = [per_stats, box_stats, adv_stats, per_adv] #all_stats]

for i in tests:
    
    model = XGBClassifier(max_depth=4)
    under = RandomUnderSampler(sampling_strategy=0.05)
    X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.3404255319148936 recall: 0.47058823529411764 precision: 0.26666666666666666 predictions: 90.0
F1: 0.45637583892617445 recall: 0.6666666666666666 precision: 0.3469387755102041 predictions: 98.0
F1: 0.45588235294117646 recall: 0.6078431372549019 precision: 0.36470588235294116 predictions: 85.0
F1: 0.3968253968253968 recall: 0.49019607843137253 precision: 0.3333333333333333 predictions: 75.0


In [65]:
###WITH Undersample MLP#####

tests = [per_stats, box_stats, adv_stats, per_adv] #all_stats]

for i in tests:
    
    model = MLPClassifier()
    under = RandomUnderSampler(sampling_strategy=0.05)
    X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

F1: 0.18867924528301885 recall: 0.19607843137254902 precision: 0.18181818181818182 predictions: 55.0
F1: 0.2368421052631579 recall: 0.17647058823529413 precision: 0.36 predictions: 25.0
F1: 0.40944881889763785 recall: 0.5098039215686274 precision: 0.34210526315789475 predictions: 76.0
F1: 0.3953488372093023 recall: 0.3333333333333333 precision: 0.4857142857142857 predictions: 35.0


In [66]:
###WITH Undersample ADABOOST#####

tests = [per_stats, box_stats, adv_stats, per_adv] #all_stats]

for i in tests:
    model = AdaBoostClassifier()
    under = RandomUnderSampler(sampling_strategy=0.05)
    X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

F1: 0.2893081761006289 recall: 0.45098039215686275 precision: 0.21296296296296297 predictions: 108.0
F1: 0.3636363636363636 recall: 0.5490196078431373 precision: 0.27184466019417475 predictions: 103.0
F1: 0.4358974358974359 recall: 0.6666666666666666 precision: 0.3238095238095238 predictions: 105.0
F1: 0.3401360544217687 recall: 0.49019607843137253 precision: 0.2604166666666667 predictions: 96.0


In [67]:
####WITH UNDERSAMPLE BY BPM

tests = [per_stats, box_stats, adv_stats, per_adv]

for i in tests:
    
    model = LogisticRegression(max_iter=10000)
    #under = RandomUnderSampler(sampling_strategy=0.05)
    #X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(train_2[i], train_2['drafted'])
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))


F1: 0.03636363636363636 recall: 0.0196078431372549 precision: 0.25 predictions: 4.0
F1: 0.25 recall: 0.19607843137254902 precision: 0.3448275862068966 predictions: 29.0
F1: 0.2631578947368421 recall: 0.19607843137254902 precision: 0.4 predictions: 25.0
F1: 0.3333333333333333 recall: 0.21568627450980393 precision: 0.7333333333333333 predictions: 15.0


In [68]:
#ADA BOOST Undersample BPM EXP_ID: 8


tests = [per_stats, box_stats, adv_stats, per_adv]

for i in tests:
    
    model = AdaBoostClassifier()
    #under = RandomUnderSampler(sampling_strategy=0.05)
    #X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(train_2[i], train_2['drafted'])
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

F1: 0.2278481012658228 recall: 0.17647058823529413 precision: 0.32142857142857145 predictions: 28.0
F1: 0.3181818181818182 recall: 0.27450980392156865 precision: 0.3783783783783784 predictions: 37.0
F1: 0.44680851063829785 recall: 0.4117647058823529 precision: 0.4883720930232558 predictions: 43.0
F1: 0.379746835443038 recall: 0.29411764705882354 precision: 0.5357142857142857 predictions: 28.0


In [69]:
####WITH UNDERSAMPLE BY BPM

tests = [per_stats, box_stats, adv_stats, per_adv]

for i in tests:
    
    model = MLPClassifier()
    #under = RandomUnderSampler(sampling_strategy=0.05)
    #X, y = under.fit_resample(train[i], train['drafted'])
    model.fit(train_2[i], train_2['drafted'])
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))


F1: 0.037037037037037035 recall: 0.0196078431372549 precision: 0.3333333333333333 predictions: 3.0
F1: 0.3 recall: 0.23529411764705882 precision: 0.41379310344827586 predictions: 29.0
F1: 0.46341463414634143 recall: 0.37254901960784315 precision: 0.6129032258064516 predictions: 31.0
F1: 0.1111111111111111 recall: 0.058823529411764705 precision: 1.0 predictions: 3.0


In [70]:
#XGB UNDER SAMPLE BY BPM EXP_ID: 9

tests = [per_stats, box_stats, adv_stats, per_adv]

for i in tests:
    
    model = XGBClassifier()
    under = RandomUnderSampler(sampling_strategy=0.10)
    X, y = under.fit_resample(train_2[i], train_2['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

F1: 0.2962962962962963 recall: 0.3137254901960784 precision: 0.2807017543859649 predictions: 57.0
F1: 0.47407407407407404 recall: 0.6274509803921569 precision: 0.38095238095238093 predictions: 84.0
F1: 0.47540983606557374 recall: 0.5686274509803921 precision: 0.4084507042253521 predictions: 71.0
F1: 0.44660194174757284 recall: 0.45098039215686275 precision: 0.4423076923076923 predictions: 52.0


In [71]:
###WITH SMOTE XGBOOOST##### & UNDERSAMPLING EXP_ID: 10

tests = [per_stats, box_stats, adv_stats, per_adv] #all_stats]

for i in tests:
    
    model = XGBClassifier(max_depth=4)
    over = SMOTE(sampling_strategy=0.10)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(train_2[i], train_2['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.2888888888888889 recall: 0.2549019607843137 precision: 0.3333333333333333 predictions: 39.0
F1: 0.4181818181818182 recall: 0.45098039215686275 precision: 0.3898305084745763 predictions: 59.0
F1: 0.44660194174757284 recall: 0.45098039215686275 precision: 0.4423076923076923 predictions: 52.0
F1: 0.45833333333333326 recall: 0.43137254901960786 precision: 0.4888888888888889 predictions: 45.0


In [72]:
for i in tests:
    
    model = MLPClassifier()
    over = SMOTE(sampling_strategy=0.10)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(train_2[i], train_2['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.24175824175824173 recall: 0.43137254901960786 precision: 0.16793893129770993 predictions: 131.0
F1: 0.46956521739130436 recall: 0.5294117647058824 precision: 0.421875 predictions: 64.0
F1: 0.45544554455445546 recall: 0.45098039215686275 precision: 0.46 predictions: 50.0
F1: 0.4444444444444445 recall: 0.39215686274509803 precision: 0.5128205128205128 predictions: 39.0


In [73]:
for i in tests:
    
    model = AdaBoostClassifier()
    over = SMOTE(sampling_strategy=0.10)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(train_2[i], train_2['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.2878787878787879 recall: 0.37254901960784315 precision: 0.2345679012345679 predictions: 81.0
F1: 0.36231884057971014 recall: 0.49019607843137253 precision: 0.28735632183908044 predictions: 87.0
F1: 0.453125 recall: 0.5686274509803921 precision: 0.37662337662337664 predictions: 77.0
F1: 0.36507936507936506 recall: 0.45098039215686275 precision: 0.30666666666666664 predictions: 75.0


In [74]:
for i in tests:
    
    model = RandomForestClassifier()
    over = SMOTE(sampling_strategy=0.10)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(train_2[i], train_2['drafted'])
    model.fit(X, y)
    y_pred = model.predict(test_2019[i])
    predictions = [round(value) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))



F1: 0.1643835616438356 recall: 0.11764705882352941 precision: 0.2727272727272727 predictions: 22.0
F1: 0.3870967741935484 recall: 0.35294117647058826 precision: 0.42857142857142855 predictions: 42.0
F1: 0.4000000000000001 recall: 0.35294117647058826 precision: 0.46153846153846156 predictions: 39.0
F1: 0.3846153846153846 recall: 0.29411764705882354 precision: 0.5555555555555556 predictions: 27.0
