In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import joblib

from utility_functions import *


%matplotlib inline

In [3]:
df = pd.read_csv('train_set_full.csv', index_col=0)

print(df.columns)

df = df.drop(columns=['STAR', 'DPOY_Rank', 'DPOY_First', 'DPOY_Pts Won', 'DPOY_Pts Max', 'DPOY_Share',
                      'DPOY', 'MIP_Rank', 'MIP_First', 'MIP_Pts Won', 'MIP_Pts Max',
                      'MIP_Share', 'MIP', 'MVP_First', 'MVP_Pts Won',
                      'MVP_Pts Max', 'ROTY_Rank', 'ROTY_First',
                      'ROTY_Pts Won', 'ROTY_Pts Max', 'ROTY_Share', 'ROTY', 'SMOTY_Rank',
                      'SMOTY_First', 'SMOTY_Pts Won', 'SMOTY_Pts Max', 'SMOTY_Share',
                      'SMOTY'], errors='ignore')

# df = trim_set(df)


Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'MVP_Rank', 'MVP_First', 'MVP_Pts Won', 'MVP_Pts Max',
       'MVP_Share'],
      dtype='object')


In [4]:
train_set = df.copy()
test_set = pd.read_csv(f'/Users/chaseallbright/Dropbox/NBA/Data/Test Sets/{datetime.now().strftime("%Y")}/{datetime.now().strftime("%Y%m%d")}_player_stats.csv', index_col=0)
test_set = trim_set(test_set)

x_train = df.drop(columns=['MVP_Rank', 'MVP_Share', ])
y_train = train_set['MVP_Share']



In [7]:
# cm = train_set.corr().apply(np.absolute)
# f = plt.figure(figsize=(19, 15))
# plt.matshow(cm.corr(), fignum=f.number)
# plt.xticks(range(df.select_dtypes(['number']).shape[1]), cm.select_dtypes(['number']).columns, fontsize=14, rotation=45)
# plt.yticks(range(df.select_dtypes(['number']).shape[1]), cm.select_dtypes(['number']).columns, fontsize=14)
# cb = plt.colorbar()
# cb.ax.tick_params(labelsize=14, )
# plt.title('Correlation Matrix', fontsize=16)

In [9]:
# feature_list = ['MP', 'PTS', 'TOV', 'AST', 'STL', 'TRB', 'BLK','WS','PER', 'TS%']
# feature_list = ['PTS','PER','TS%','WS','BPM','VORP','USG%']
feature_list = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA',
                'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
                'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
                'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
                'BPM', 'VORP']

x_train = train_set[feature_list]
x_test = test_set[feature_list]

In [10]:

num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', LinearRegression())
])
ppln_fitted = num_pipeline.fit(x_train, y_train, )

In [11]:
y_pred = ppln_fitted.predict(x_test)

test_set['pred'] = y_pred
test_set['pred_scaled'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))
test_set['pred_rank'] = test_set.pred.rank(ascending=False).astype("int")

In [12]:
test_set.sort_values('pred_rank', ascending=True).head(10)[
    ['Player', 'G', 'PTS', 'FG%', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'pred']]

Unnamed: 0,Player,G,PTS,FG%,AST,TRB,STL,BLK,TOV,pred
10,Giannis Antetokounmpo,40,31.8,0.536,5.2,12.2,0.8,0.8,4.1,0.15227
115,Luka Dončić,46,33.4,0.501,8.3,8.9,1.5,0.5,3.6,0.149575
230,Nikola Jokić,44,25.1,0.631,10.0,11.1,1.4,0.6,3.5,0.129417
132,Joel Embiid,37,33.6,0.534,4.2,10.0,1.1,1.7,3.7,0.094525
220,LeBron James,41,30.2,0.505,7.1,8.5,1.0,0.6,3.1,0.090541
427,Jayson Tatum,48,31.1,0.465,4.4,8.7,1.1,0.8,2.9,0.088314
374,Julius Randle,52,24.7,0.458,4.1,10.9,0.7,0.3,2.8,0.08523
317,Ja Morant,42,27.3,0.465,8.2,5.7,1.0,0.3,3.5,0.083577
396,Domantas Sabonis,47,18.5,0.61,7.1,12.4,0.7,0.5,3.0,0.075492
126,Kevin Durant,39,29.7,0.559,5.3,6.7,0.8,1.5,3.5,0.072543


In [13]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression,SelectKBest,SelectPercentile,chi2

# declare a two step pipeline, explicitly giving names to both steps.
pipe = Pipeline(

    [('feat_selection', SelectPercentile(percentile=60),),('scaler', MinMaxScaler()), ('reg', RandomForestRegressor())])


param_grid = [
    {
     # 'scaler':[MinMaxScaler()],
     'feat_selection__percentile':range(45,60,5),
     # 'reg': [RandomForestRegressor(),]

     },
]




# param_grid = {
#     'feat_selection':['SelectPercentile':{SelectPercentile(),}
#                       SelectKBest()],
#     'scaler': [MinMaxScaler(), StandardScaler(), ],
#     'reg': [SVR(), GradientBoostingRegressor(), LinearRegression(), ],
#     'feat_selection__percentile': [10,25,50,75,80],
#     'feat_selection__k': [5,10,15,20],
#     {'scaler': [MinMaxScaler()],
#      'feature_range': [(0, 1), (-1, 1)],
#      'knn__n_neighbors': range(1, 10)},
#
#     {'scaler': [StandardScaler()],
#      'knn__n_neighbors': range(1, 10)}
# }

grid = GridSearchCV(pipe, param_grid, cv=5, verbose=5, n_jobs=-1,return_train_score=True)
grid.fit(x_train, y_train)
# print(grid)
print(grid.best_params_)
print(grid.best_score_)
best = grid.best_params_


Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'feat_selection__percentile': 45}
0.5964063528518981


In [12]:
# ppln_fitted = pipe.fit(x_train, y_train, )

# save
ppln_fitted = joblib.load("models/RandomForrestRegressor.pkl",)

# load




y_pred = ppln_fitted.predict(x_test)

test_set['pred'] = y_pred
test_set['pred_scaled'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))
test_set['pred_rank'] = test_set.pred.rank(ascending=False).astype("int")

test_set.sort_values('pred_rank', ascending=True).head(10)[
          ['pred_rank','Player', 'G', 'PTS', 'FG%', 'AST', 'TRB', 'STL', 'pred']]

Unnamed: 0,pred_rank,Player,G,PTS,FG%,AST,TRB,STL,pred
237,1,Nikola Jokić,65,26.1,0.57,8.0,13.6,1.4,0.79067
9,2,Giannis Antetokounmpo,59,29.8,0.547,5.8,11.5,1.1,0.47461
137,3,Joel Embiid,58,29.8,0.487,4.3,11.3,1.1,0.32003
120,4,Luka Dončić,55,28.1,0.455,8.5,9.2,1.2,0.16113
439,5,Jayson Tatum,68,26.9,0.445,4.3,8.2,0.9,0.10094
161,6,Rudy Gobert,56,15.5,0.711,1.1,14.7,0.7,0.09898
224,7,LeBron James,53,29.8,0.522,6.2,8.2,1.3,0.07471
130,8,Kevin Durant,44,29.4,0.523,6.0,7.2,0.8,0.05418
505,9,Trae Young,65,27.9,0.455,9.5,3.9,1.0,0.03985
324,10,Ja Morant,56,27.6,0.493,6.7,5.7,1.2,0.02952


In [18]:
test_set.sort_values('pred_rank', ascending=True).head(10)[
          ['pred_rank','Player', 'PTS', 'TS%', 'AST', 'TRB','USG%','WS/48','BPM','VORP']]

Unnamed: 0,pred_rank,Player,PTS,TS%,AST,TRB,USG%,WS/48,BPM,VORP
237,1,Nikola Jokić,26.1,0.652,8.0,13.6,31.6,0.294,13.8,8.5
9,2,Giannis Antetokounmpo,29.8,0.63,5.8,11.5,34.9,0.291,11.1,6.5
137,3,Joel Embiid,29.8,0.609,4.3,11.3,37.1,0.255,9.4,5.6
120,4,Luka Dončić,28.1,0.564,8.5,9.2,37.2,0.149,8.1,5.0
439,5,Jayson Tatum,26.9,0.57,4.3,8.2,32.2,0.162,4.5,4.1
161,6,Rudy Gobert,15.5,0.733,1.1,14.7,16.8,0.273,5.1,3.2
224,7,LeBron James,29.8,0.617,6.2,8.2,32.0,0.179,7.6,4.7
130,8,Kevin Durant,29.4,0.633,6.0,7.2,31.2,0.195,6.9,3.6
505,9,Trae Young,27.9,0.595,9.5,3.9,34.6,0.167,4.7,3.9
324,10,Ja Morant,27.6,0.575,6.7,5.7,33.8,0.175,6.0,3.7
