In [1]:
### Import all the necessary modules
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, roc_auc_score, recall_score, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
### Import Data
data = pd.read_csv("MergeDataPercentile.csv")
data = data[data['Pos'] != '0']

data.dropna(axis=0,inplace= True,subset = ['Per GameGP'])
hold_out = data[data['Highest Level Reached_x'] == '2017-18']
hold_out['NCAA Seasons\r\r\n(D-I)'] = hold_out.groupby('RealGM Summary Page')['NCAA Seasons\r\r\n(D-I)'].cumcount() + 1
keep_data = data[data['Highest Level Reached_x']!='2017-18']
keep_data = keep_data.drop_duplicates(subset =  ['RealGM Summary Page'], keep = 'last' )
keep_data['Highest Level Reached_x'] = keep_data['Highest Level Reached_x'].apply(lambda x: 'International' if x != 'NBA' else 'NBA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [3]:
## Which Features to keep
print(hold_out.columns.values)
feature_set_1 = ['Per GameMIN','Misc StatsOWS', 'Misc StatsDWS','Advanced StatsTS%', 'Advanced StatseFG%',
       'Advanced StatsORB%', 'Advanced StatsDRB%', 'Advanced StatsTRB%',
       'Advanced StatsAST%', 'Advanced StatsTOV%', 'Advanced StatsSTL%',
       'Advanced StatsBLK%', 'Advanced StatsUSG%','Advanced StatsPPR', 'Advanced StatsPPS',
       'Advanced StatsORtg', 'Advanced StatsDRtg', 'Advanced StatsPER','Pos','Wingspan','Height','Per Game3P%','NCAA Seasons\r\r\n(D-I)']
feature_set_2 = ['Per GameMIN','Misc StatsOWS','Misc StatsDWS','Pos','Per GameFG%', 'Per Game3PM',
        'Per Game3P%', 'Per GameFTM',
       'Per GameFT%', 'Per GameDEF', 'Per GameTRB',
       'Per GameAST', 'Per GameSTL', 'Per GameBLK',
       'Per GameTOV', 'Per GamePTS']
feature_set_3 = list(set(feature_set_1).union(set(feature_set_2)))
y_col = ["Highest Level Reached_x"]
X = keep_data[feature_set_1]
X['Pos'] = LabelEncoder().fit_transform(X['Pos'])
#X = pd.DataFrame(StandardScaler().fit_transform(X),columns= X.columns.values)
y = keep_data[y_col]
le = LabelEncoder()
y = le.fit_transform(y)

['RealGM Summary Page' 'Highest Level Reached_x' 'Season' 'School' 'League'
 'Conference' 'TeamID' 'Year' 'Year at School' 'Per GameGP' 'Per GameGS'
 'Per GameMIN' 'Per GameFGM' 'Per GameFGA' 'Per GameFG%' 'Per Game3PM'
 'Per Game3PA' 'Per Game3P%' 'Per GameFTM' 'Per GameFTA' 'Per GameFT%'
 'Per GameOFF' 'Per GameDEF' 'Per GameTRB' 'Per GameAST' 'Per GameSTL'
 'Per GameBLK' 'Per GamePF' 'Per GameTOV' 'Per GamePTS'
 'Season Total StatsGP' 'Season Total StatsGS' 'Season Total StatsMIN'
 'Season Total StatsFGM' 'Season Total StatsFGA' 'Season Total StatsFG%'
 'Season Total Stats3PM' 'Season Total Stats3PA' 'Season Total Stats3P%'
 'Season Total StatsFTM' 'Season Total StatsFTA' 'Season Total StatsFT%'
 'Season Total StatsOFF' 'Season Total StatsDEF' 'Season Total StatsTRB'
 'Season Total StatsAST' 'Season Total StatsSTL' 'Season Total StatsBLK'
 'Season Total StatsPF' 'Season Total StatsTOV' 'Season Total StatsPTS'
 'Misc StatsGP' 'Misc StatsDbl Dbl' 'Misc StatsTpl Dbl' 'Misc Stats40 Pts'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
  y = column_or_1d(y, warn=True)


In [4]:
## Encode the position, encode the response variable
#X_dummies = pd.get_dummies(X,columns=['Pos'])
#y_dummies = pd.get_dummies(y)


In [5]:
### Split into train, test, validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test,X_val,y_test,y_val = train_test_split(X_test, y_test, test_size=0.1, random_state=42)

In [6]:
### penalty parameters to search over


In [7]:
### how to find the best logistic classifier
best_F1 = 0
best_classifier = None
split_list = [30,50,60]
for split in split_list:
    
    classifier = RandomForestClassifier(class_weight= 'balanced', min_samples_split=split)
    classifier.fit(X_train,y_train)
    test_pred = classifier.predict(X_test)
    score = f1_score(y_test,test_pred)
    print(score)
    if score >= best_F1:
        best_classifier = classifier
        best_F1 = score
print('Found Best Classifier!')

        

0.508806262231
0.517123287671
0.518518518519
Found Best Classifier!


In [8]:
### Check for overfit
from ggplot import *
pred_val = best_classifier.predict(X_val)
test_f1 = f1_score(y_val,pred_val)
test_f1
print(classification_report(pred_val,y_val))
print(test_f1)
print(recall_score(pred_val,y_val))
print(classification_report(best_classifier.predict(X_train),y_train))
conf_mat = pd.DataFrame(confusion_matrix(pred_val,y_val),columns = ['Predicted Intl.','Predicted NBA'],index = ['Actual Intl.','Actual NBA'])
print(conf_mat)










prediction = best_classifier.predict_proba(X_train)[:,1]
fpr,tpr,_ = roc_curve(y_train,prediction)
auc_score = auc(fpr,tpr)
#conf_mat = pd.DataFrame(confusion_matrix(prediction,y_val),columns = ['Predicted Intl.','Predicted NBA'],index = ['Actual Intl.','Actual NBA'])
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', ymin='0', y='tpr')) +\
 geom_area(alpha=0.2) +\
 geom_line(aes(y='tpr')) +\
 ggtitle('Random Forest Classifier:' + "ROC Curve w/ AUC=%s" % str(auc_score))



You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


             precision    recall  f1-score   support

          0       0.87      0.94      0.90       158
          1       0.75      0.57      0.65        53

avg / total       0.84      0.84      0.84       211

0.645161290323
0.566037735849
             precision    recall  f1-score   support

          0       0.89      0.98      0.93      3882
          1       0.87      0.55      0.67      1027

avg / total       0.89      0.89      0.88      4909

              Predicted Intl.  Predicted NBA
Actual Intl.              148             10
Actual NBA                 23             30


<Figure size 1100x800 with 1 Axes>

<ggplot: (-9223371867698177669)>

In [9]:
## We seem good here, now lets predict on the class of 2017-2018

X_hold_out = hold_out[feature_set_1]
X_hold_out['Pos'] = LabelEncoder().fit_transform(X_hold_out['Pos'])

#y_hold_out = hold_out[y_col]
y_hold_out = best_classifier.predict(X_hold_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [10]:
big_df = pd.DataFrame()
coef_df = pd.DataFrame()

coefficients = best_classifier.feature_importances_
#print(coefficients)

features = X.columns.values
coef_df['Coefficients'] = coefficients
coef_df['Features'] = features
big_df = pd.concat([big_df,coef_df])
big_df.to_csv("rfc_coefficients.csv",index=False)

In [11]:
best_classifier.predict_proba(X_hold_out)

array([[ 0.22021215,  0.77978785],
       [ 0.65493733,  0.34506267],
       [ 0.63010049,  0.36989951],
       ..., 
       [ 0.54638978,  0.45361022],
       [ 0.6436667 ,  0.3563333 ],
       [ 0.4174988 ,  0.5825012 ]])

In [12]:
predictions = pd.DataFrame(data = y_hold_out,columns = ['prediction'])
predictions['Name'] = hold_out['Name'].values
predictions['Year'] = hold_out['Year'].values
predictions[['International','NBA']] = pd.DataFrame(best_classifier.predict_proba(X_hold_out))

In [13]:
predictions[predictions['prediction'] == 1].sort_values('NBA',ascending =False)

Unnamed: 0,prediction,Name,Year,International,NBA
311,1,Mike Daum,2018.0,0.029038,0.970962
456,1,Jock Landale,2017.0,0.054313,0.945687
1338,1,Jaren Jackson,2018.0,0.115937,0.884063
1294,1,Mohamed Bamba,2018.0,0.119534,0.880466
457,1,Jock Landale,2018.0,0.119954,0.880046
281,1,Gary Clark,2017.0,0.120562,0.879438
94,1,Reggie Lynch,2015.0,0.120870,0.879130
158,1,Kyle Washington,2017.0,0.129980,0.870020
1291,1,Marvin Bagley III,2018.0,0.132686,0.867314
229,1,Jacobi Boykins,2017.0,0.146601,0.853399


In [14]:
best_classifier.feature_importances_

array([ 0.05263589,  0.16458624,  0.22962407,  0.02469382,  0.02628696,
        0.01609981,  0.02566665,  0.01641635,  0.01777524,  0.01611141,
        0.01682579,  0.0215905 ,  0.03540273,  0.01821426,  0.039153  ,
        0.01513279,  0.0266555 ,  0.0995476 ,  0.0037743 ,  0.0264867 ,
        0.04000381,  0.02388572,  0.04343085])

In [15]:
predictions[predictions['Name'] == 'Trae Young']

Unnamed: 0,prediction,Name,Year,International,NBA
1417,1,Trae Young,2018.0,0.417499,0.582501


In [16]:
keep_data[feature_set_1].dtypes

Per GameMIN                float64
Misc StatsOWS              float64
Misc StatsDWS              float64
Advanced StatsTS%          float64
Advanced StatseFG%         float64
Advanced StatsORB%         float64
Advanced StatsDRB%         float64
Advanced StatsTRB%         float64
Advanced StatsAST%         float64
Advanced StatsTOV%         float64
Advanced StatsSTL%         float64
Advanced StatsBLK%         float64
Advanced StatsUSG%         float64
Advanced StatsPPR          float64
Advanced StatsPPS          float64
Advanced StatsORtg         float64
Advanced StatsDRtg         float64
Advanced StatsPER          float64
Pos                         object
Wingspan                   float64
Height                     float64
Per Game3P%                float64
NCAA Seasons\r\r\n(D-I)      int64
dtype: object

In [17]:

backfills = best_classifier.predict_proba(X)
backfilldata = pd.DataFrame(backfills,columns = ['Intl.','NBA'])
backfilldata['Result'] = y
backfilldata['Year'] = keep_data['Year'].values
backfilldata['Name'] = keep_data['Name'].value


In [18]:
backfilldata[backfilldata['Name'].isin(['Ben Simmons','Donovan Mitchell','Kyle Kuzma','Jayson Tatum','Malik Monk','Lauri Markkanen']) ].sort_values(by = 'NBA',ascending = False)

Unnamed: 0,Intl.,NBA,Result,Year,Name
2748,0.055506,0.944494,1,2016.0,Ben Simmons
2817,0.118965,0.881035,1,2017.0,Lauri Markkanen
2822,0.149779,0.850221,1,2017.0,Malik Monk
2842,0.27583,0.72417,1,2017.0,Jayson Tatum
2522,0.284837,0.715163,1,2017.0,Kyle Kuzma
2716,0.340671,0.659329,1,2017.0,Donovan Mitchell


In [26]:
predictions['Pos'] = hold_out['Pos'].values

In [28]:
predictions.head()
last_year = predictions.drop_duplicates(subset = ['Name'],keep = 'last')

In [41]:
last_year[last_year['NBA']>.5].groupby('Pos').apply(lambda x: x.sort_values('NBA',ascending = True).head(10))

Unnamed: 0_level_0,Unnamed: 1_level_0,prediction,Name,Year,International,NBA,Pos
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C,1409,1,Kaleb Wesson,2018.0,0.499521,0.500479,C
C,908,1,Doral Moore,2018.0,0.468284,0.531716,C
C,1042,1,Abdul Ado,2018.0,0.44611,0.55389,C
C,1318,1,Bruno Fernando,2018.0,0.436191,0.563809,C
C,986,1,Elijah Thomas,2018.0,0.421753,0.578247,C
C,1049,1,Udoka Azubuike,2018.0,0.413062,0.586938,C
C,1322,1,Luke Garza,2018.0,0.375785,0.624215,C
C,856,1,Jo Acuil,2018.0,0.360958,0.639042,C
C,1021,1,Mike Watkins,2018.0,0.301161,0.698839,C
C,764,1,Tyler Davis,2018.0,0.24368,0.75632,C


In [44]:
last_year[last_year['Name'].isin(['Donte DiVincenzo']) ]

Unnamed: 0,prediction,Name,Year,International,NBA,Pos
770,1,Donte DiVincenzo,2018.0,0.497975,0.502025,G
