In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from plotnine import *

# Cleaning NBA and NCAA data

In [2]:
df_draft_raw = pd.read_csv("../data/draft.csv", index_col=0)

In [3]:
df_draft = (df_draft_raw.rename(columns={'Lg': 'league', 'Rd': 'round', 
                                         'Pk': 'pick', 'Tm': 'team', 'Pos': 'position'})
              .assign(Player = lambda x: x.Player.str.split('\\').str.get(0)))
df_draft.columns = df_draft.columns.str.lower()

In [4]:
df_draft.head()

Unnamed: 0_level_0,year,league,round,pick,team,player,age,position,born,college
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2016,NBA,1,1,PHI,Ben Simmons,19.343,G-F,au,Louisiana State University
2,2016,NBA,1,2,LAL,Brandon Ingram,18.299,F,us,Duke University
3,2016,NBA,1,3,BOS,Jaylen Brown,19.247,F-G,us,University of California
4,2016,NBA,1,4,PHO,Dragan Bender,18.223,F,ba,
5,2016,NBA,1,5,MIN,Kris Dunn,22.103,G,us,Providence College


In [5]:
df_nba_raw = pd.read_csv("../data/nbarookiedata.csv", skiprows=1, index_col=0)

In [6]:
df_nba = (df_nba_raw
             .assign(Player = lambda x: x.Player.str.split('\\').str.get(0),
                     Season = lambda x: pd.to_numeric(x.Season.str.split('-').str.get(0)))
             [['Player', 'Tm', 'Season', 'PTS', 'TRB', 'AST', 'STL', 'BLK']]
             .rename(columns={'Player': 'player', 'Tm': 'team', 'Season': 'year', 
                              'PTS': 'point', 'TRB': 'rebound', 'AST': 'assist',
                              'STL': 'steal', 'BLK': 'block'}))

In [7]:
df_nba.head()

Unnamed: 0_level_0,player,team,year,point,rebound,assist,steal,block
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Alex Abrines,OKC,2016,6.0,1.3,0.6,0.5,0.1
2,Ron Baker,NYK,2016,4.1,1.9,2.1,0.7,0.2
3,Wade Baldwin,MEM,2016,3.2,1.4,1.8,0.5,0.2
4,Malik Beasley,DEN,2016,3.8,0.8,0.5,0.3,0.0
5,DeAndre' Bembry,ATL,2016,2.7,1.6,0.7,0.2,0.1


In [8]:
df_final = pd.merge(df_draft, df_nba, how='left', on=['player', 'team', 'year'])

In [9]:
# There are a lot of players drafted but don't appear in the NBA dataset, why?
df_final.head()

Unnamed: 0,year,league,round,pick,team,player,age,position,born,college,point,rebound,assist,steal,block
0,2016,NBA,1,1,PHI,Ben Simmons,19.343,G-F,au,Louisiana State University,,,,,
1,2016,NBA,1,2,LAL,Brandon Ingram,18.299,F,us,Duke University,9.4,4.0,2.1,0.6,0.5
2,2016,NBA,1,3,BOS,Jaylen Brown,19.247,F-G,us,University of California,6.6,2.8,0.8,0.4,0.2
3,2016,NBA,1,4,PHO,Dragan Bender,18.223,F,ba,,3.4,2.4,0.5,0.2,0.5
4,2016,NBA,1,5,MIN,Kris Dunn,22.103,G,us,Providence College,3.8,2.1,2.4,1.0,0.5


In [10]:
df_ncaa_raw = pd.read_csv('../data/ncaa2.csv', index_col=0)

In [11]:
df_ncaa_raw.columns

Index(['Name', 'Team', 'GP', 'Min', 'Pts', 'FG', 'FGA', 'FG%', '2Pt', '2PtA',
       '2P%', '3Pt', '3PtA', '3P%', 'FTM', 'FTA', 'FT%', 'Off', 'Def', 'TOT',
       'Asts', 'Stls', 'Blks', 'TOs', 'PFs', 'year', 'PTs/g', 'FGA/g',
       'Pts/Play', 'TS%', 'eFG%', 'FTA/FGA', '3PA/FGA', 'Ast/g', 'Ast/FGA',
       'A/TO', 'PPR', 'BK/g', 'STL/g', 'PF/g'],
      dtype='object')

In [12]:
df_ncaa = (df_ncaa_raw
              [['Name', 'Team', 'year', 'Pts', 'TOT', 'Asts', 'Stls', 'Blks']]
              .rename(columns={'Name': 'player', 'Team': 'college', 'year': 'col_year',
                               'Pts': 'col_point', 'TOT': 'col_rebound', 'Asts': 'col_assist',
                               'Stls': 'col_steal', 'Blks': 'col_block'}))

In [13]:
df_ncaa.head()

Unnamed: 0,player,college,col_year,col_point,col_rebound,col_assist,col_steal,col_block
0,Luis Rivas,Western Illinois,2002,27.9,10.7,0.0,0.0,0.5
1,Steve Reynolds,Western Michigan,2002,27.8,6.5,2.0,1.2,0.1
2,Michael Watson,UMKC,2002,24.9,3.6,3.2,2.1,0.2
3,Tyrone Hayes,Idaho,2002,34.3,12.3,0.0,2.5,2.5
4,Travis Robinson,Jacksonville,2002,27.4,7.5,1.1,1.9,0.3


In [14]:
df_final = pd.merge(df_final, df_ncaa, how='inner', on=['player'])

In [15]:
# Only keep the last year of college performance
df_final = (df_final.groupby('player')
        .apply(lambda df: df.sort_values(by='col_year', ascending=False).iloc[0, :]))

In [16]:
print("Number of unique players: {}".format(len(pd.unique(df_final.player))))

Number of unique players: 613


# Model building

In [17]:
df_final = df_final.dropna()
X = df_final[['round', 'pick', 'position',
              'col_point', 'col_rebound', 'col_assist', 'col_steal', 'col_block']]
X = pd.get_dummies(X, columns=['position'])
y = df_final['point']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Random Forest Regression

In [19]:
rf = RandomForestRegressor(criterion='mse', n_jobs=-1)

## Tuning and choosing the best hyperparams

In [20]:
gs = GridSearchCV(estimator=rf, 
                  param_grid=[{'n_estimators': [100, 1000, 1000],
                               'min_samples_leaf': [1, 10, 100]}],
                  cv=10, scoring='neg_mean_squared_error')
gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 1000, 1000], 'min_samples_leaf': [1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [21]:
print(gs.best_params_)
best_rf = gs.best_estimator_
best_rf.fit(X_train, y_train)

{'min_samples_leaf': 10, 'n_estimators': 1000}


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## Model performance evaluation

In [26]:
print("Min / Median / Max of Point: {} {} {}".format(np.min(df_final.point),
                                                   np.median(df_final.point),
                                                   np.max(df_final.point)))
print("Train MSE: {:.3f}; Test MSE: {:.3f}".format(mean_squared_error(y_train, best_rf.predict(X_train)),
                                                 mean_squared_error(y_test, best_rf.predict(X_test))))

Min / Median / Max of Point: 0.0 4.6 21.0
Train MSE: 6.798; Test MSE: 9.499


The Train MSE is lower than Test MSE substantially, so it seems like our model overfits the training data. Also the error is substantial given that the median point scored is only 4.6.

We will build better model, potentially including team statistics along with individual statistics.