In [1]:
###Logistic Regression & RForests

from collections import Counter
import pandas as pd
import numpy as np

#Handle Imbalance Data
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import smote_variants as sv

#ML ALgorithims
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, plot_importance
from sklearn.neural_network import MLPClassifier

In [2]:
#Reading our data

train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train.columns

Index(['player_name', 'school', 'conference', 'GP', 'Min_per', 'ORtg', 'usg',
       'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM',
       'FTA', 'FT_per', '2PM', '2PA', '2P_per', '3PM', '3PA', '3P_per',
       'blk_per', 'stl_per', 'ftr', 'yr', 'ht', 'num', 'porpag', 'adjoe',
       'pfr', 'year', 'pid', 'type', 'rec-rk', 'ast/tov', 'rimmade',
       'rimmade + rimmiss', 'midmade', 'midmade + midmiss',
       'rimmade/(rimmade+rimmiss)', 'midmade/(midmade+mismiss)', 'dunksmade',
       'dunksmiss + dunksmade', 'dunksmade/(dunksmade+dunksmiss)', 'pick',
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', 'pos', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'ht_in',
       'drafted', 'yr_cat', 'ATH', 'GP_adj', 'BBIQ'],
      dtype='object')

In [4]:
#stats to look at

per_stats = ['Min_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FT_per', '2P_per', '3P_per', 
             'blk_per', 'stl_per', 'GP', 'ht_in', 'yr_cat',  'ATH', 'BBIQ']
box_stats = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg','oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'FTA', 'FTM', 'ftr',  'ATH', 'BBIQ']
adv_stats = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM', 'ftr' ,'ATH', 'BBIQ']

per_adv = ['Min_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FT_per', '2P_per', '3P_per', 
             'blk_per', 'stl_per', 'GP', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'ATH', 'BBIQ']

all_stats = ['GP', 'Min_per', 'ORtg', 'usg',
       'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM',
       'FTA', 'FT_per', '2PM', '2PA', '2P_per', '3PM', '3PA', '3P_per',
       'blk_per', 'stl_per', 'ftr', 'porpag', 'adjoe',
       'pfr','year', 'rec-rk', 'ast/tov', 
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'yr_cat', 'ht_in']

In [5]:
adv_stats = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM', 'ftr' ,'ATH', 'BBIQ']

adv_stats2 = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'dreb', 'treb', 'ftr',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM','ATH']

adv_stats3 = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'treb', 'ftr',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM','ATH']

adv_stats4 = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'treb', 'ftr',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'FTA', 'FTM','ATH']

adv_stats5 = ['GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PM_pg', 'oreb', 'treb', 'ftr',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'FTA', 'FTM','ATH']

tests = [adv_stats, adv_stats2, adv_stats3, adv_stats4, adv_stats5]

In [6]:
#delete all missing values
train = train.dropna(axis=0, subset=per_stats)
train = train.dropna(axis=0, subset=box_stats)
train = train.dropna(axis=0, subset=adv_stats)

#delete mssing value in test set as well

test = test.dropna(axis=0, subset=per_stats)
test = test.dropna(axis=0, subset=box_stats)
test = test.dropna(axis=0, subset=adv_stats)

In [7]:
#set our 2 test-set aside
test_2018 = test[test['year']==2018]
test_2019 = test[test['year']==2019]

In [9]:
train_3 = pd.concat([train, test_2018])
train3 = train_3[(train_3['bpm']>= -0.6) & (train_3['pts']>= 3) & (train_3['ht_in']>= 72)] 

In [13]:
sample = [0.05, 0.1, 0.125, 0.15]

for i in sample:
    
    model =MLPClassifier()
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(train3[adv_stats], train3['drafted'])
    model.fit(X, y)
    y_pred = model.predict_proba(test_2019[adv_stats])

    predictions = [np.round(value[1]) for value in y_pred]

    #roc_auc = roc_auc_score(test_2019['drafted'], predictions)
    f1 = f1_score(test_2019['drafted'], predictions)
    recall = recall_score(test_2019['drafted'], predictions)
    precision = precision_score(test_2019['drafted'], predictions)
    print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

F1: 0.46268656716417905 recall: 0.6078431372549019 precision: 0.37349397590361444 predictions: 83.0
F1: 0.40789473684210525 recall: 0.6078431372549019 precision: 0.3069306930693069 predictions: 101.0
F1: 0.4819277108433735 recall: 0.39215686274509803 precision: 0.625 predictions: 32.0
F1: 0.4251968503937008 recall: 0.5294117647058824 precision: 0.35526315789473684 predictions: 76.0


In [25]:
prob = []

for i in range(0,len(test_2019)):
    prob.append(y_pred[i][1])

In [26]:
test_2019['pred_prob'] = prob
test_2019['draft_pred'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
adv_stats_res = ['player_name','GP', 'mp', 'pts', '2PA_pg', '2PM_pg', '3PA_pg', '3PM_pg', 'oreb', 'dreb', 'treb',
             'ast', 'stl', 'blk', 'ht_in', 'yr_cat', 'bpm', 'obpm', 'dbpm', 'FTA', 'FTM', 'ftr' ,'ATH', 'BBIQ', 'pick', 'drafted', 'draft_pred','pred_prob']


In [28]:
res_table = test_2019[adv_stats_res].sort_values('pred_prob', ascending=False)

In [29]:
res_res = res_table[(res_table['pred_prob']>= 0.5) | (res_table['pick']> 0)]

In [30]:
res_res.to_excel('results_mlp.xlsx', index=True)

In [49]:
model =MLPClassifier()
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(train3[adv_stats], train3['drafted'])
model.fit(X, y)
y_pred = model.predict_proba(test_2019[adv_stats])

predictions = [np.round(value[1]) for value in y_pred]

#roc_auc = roc_auc_score(test_2019['drafted'], predictions)
f1 = f1_score(test_2019['drafted'], predictions)
recall = recall_score(test_2019['drafted'], predictions)
precision = precision_score(test_2019['drafted'], predictions)
print('F1: {} recall: {} precision: {} predictions: {}'.format(f1, recall, precision, sum(predictions)))

prob = []
for i in range(0,len(test_2019)):
    prob.append(y_pred[i][1])

test_2019['pred_prob'] = prob
test_2019['draft_pred'] = predictions

res_table = test_2019[adv_stats_res].sort_values('pred_prob', ascending=False)
res_res = res_table[(res_table['pred_prob']>= 0.5) | (res_table['pick']> 0)]
res_res.to_excel('results_mlp_2019.xlsx', index=True)

F1: 0.5473684210526317 recall: 0.5098039215686274 precision: 0.5909090909090909 predictions: 44.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
