In [104]:
import joblib
import json
import requests
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Raise pandas' display limits so long/wide frames are shown without truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [110]:
# Load the raw draft data directly from its data.world query URL and preview it.
DATA_URL = 'https://query.data.world/s/w6xlfk5uxmc7jii3altgf2aexzqw3e'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0.1,Unnamed: 0,Player,All_NBA,All.Star,Draft_Yr,Pk,Team,College,Yrs,Games,Minutes.Played,PTS,TRB,AST,FG_Percentage,TP_Percentage,FT_Percentage,Minutes.per.Game,Points.per.Game,TRB.per.game,Assits.per.Game,Win.Share,WS_per_game,BPM,VORP,Executive,Tenure,Exec_ID,Exec_draft_exp,attend_college,first_year,second_year,third_year,fourth_year,fifth_year
0,1,Robert Parish,2,9,1976,8,GSW,Centenary College of Louisiana,21,1611,45704,23334,14715,2180,0.537,0.0,0.721,28.4,14.5,9.1,1.4,147.0,0.154,1.6,41.5,Al Attles,3641 days 00:00:00.000000000,1,1,1,0,0,0,0,0
1,2,Sonny Parker,0,0,1976,17,GSW,Texas A&M University,6,452,10916,4471,1841,954,0.501,0.0,0.755,24.2,9.9,4.1,2.1,26.9,0.118,2.2,11.5,Al Attles,3641 days 00:00:00.000000000,1,1,1,0,0,0,0,0
2,3,Marshall Rogers,0,0,1976,34,GSW,University of Texas-Pan American,1,26,176,100,11,10,0.371,0.0,0.933,6.8,3.8,0.4,0.4,-0.2,-0.043,-9.9,-0.4,Al Attles,3641 days 00:00:00.000000000,1,1,1,0,0,0,0,0
3,4,Jeff Fosnes,0,0,1976,68,GSW,Vanderbilt University,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Al Attles,3641 days 00:00:00.000000000,1,1,1,0,0,0,0,0
4,5,Carl Bird,0,0,1976,86,GSW,University of California,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Al Attles,3641 days 00:00:00.000000000,1,1,1,0,0,0,0,0


In [111]:
# Normalise column names to lower snake_case: literal dots become underscores.
# regex=False is passed explicitly because '.' is a regex wildcard and the
# default of `regex` differs between pandas versions (True before 2.0, False
# from 2.0 on); being explicit gives the same result on every version.
df.columns = df.columns.str.replace('.', '_', regex=False).str.lower()
df.columns

Index(['unnamed: 0', 'player', 'all_nba', 'all_star', 'draft_yr', 'pk', 'team',
       'college', 'yrs', 'games', 'minutes_played', 'pts', 'trb', 'ast',
       'fg_percentage', 'tp_percentage', 'ft_percentage', 'minutes_per_game',
       'points_per_game', 'trb_per_game', 'assits_per_game', 'win_share',
       'ws_per_game', 'bpm', 'vorp', 'executive', 'tenure', 'exec_id',
       'exec_draft_exp', 'attend_college', 'first_year', 'second_year',
       'third_year', 'fourth_year', 'fifth_year'],
      dtype='object')

In [112]:
# Correct the misspelled column name that comes from the source data.
df = df.rename({'assits_per_game': 'assists_per_game'}, axis='columns')

In [113]:
# First drop: columns that are not used anywhere in the rest of the analysis.
cols = [
    'unnamed: 0',
    'executive', 'tenure', 'exec_id', 'exec_draft_exp',
    'first_year', 'second_year', 'third_year', 'fourth_year', 'fifth_year',
]
df = df.drop(labels=cols, axis='columns')
df.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,team,college,yrs,games,minutes_played,pts,trb,ast,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,win_share,ws_per_game,bpm,vorp,attend_college
0,Robert Parish,2,9,1976,8,GSW,Centenary College of Louisiana,21,1611,45704,23334,14715,2180,0.537,0.0,0.721,28.4,14.5,9.1,1.4,147.0,0.154,1.6,41.5,1
1,Sonny Parker,0,0,1976,17,GSW,Texas A&M University,6,452,10916,4471,1841,954,0.501,0.0,0.755,24.2,9.9,4.1,2.1,26.9,0.118,2.2,11.5,1
2,Marshall Rogers,0,0,1976,34,GSW,University of Texas-Pan American,1,26,176,100,11,10,0.371,0.0,0.933,6.8,3.8,0.4,0.4,-0.2,-0.043,-9.9,-0.4,1
3,Jeff Fosnes,0,0,1976,68,GSW,Vanderbilt University,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,Carl Bird,0,0,1976,86,GSW,University of California,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [114]:
# Snapshot the current frame to a local CSV file.
# NOTE(review): the row index is written as well (to_csv default), so reading
# the file back yields an extra unnamed column — consider index=False if no
# consumer of nba_data.csv relies on that column.
df.to_csv('nba_data.csv')

In [115]:
# Second drop: columns flagged as leaking the target, removed before modelling.
cols = [
    'team', 'college',
    'games', 'minutes_played',
    'pts', 'trb', 'ast',
    'win_share',
]
df = df.drop(labels=cols, axis='columns')
df.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Robert Parish,2,9,1976,8,21,0.537,0.0,0.721,28.4,14.5,9.1,1.4,0.154,1.6,41.5,1
1,Sonny Parker,0,0,1976,17,6,0.501,0.0,0.755,24.2,9.9,4.1,2.1,0.118,2.2,11.5,1
2,Marshall Rogers,0,0,1976,34,1,0.371,0.0,0.933,6.8,3.8,0.4,0.4,-0.043,-9.9,-0.4,1
3,Jeff Fosnes,0,0,1976,68,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,Carl Bird,0,0,1976,86,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [116]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
count,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0
mean,0.119667,0.175461,1989.792477,70.996213,3.266094,0.231377,0.109144,0.365042,9.32512,3.697122,1.629588,0.825398,0.030053,-1.425196,2.393285,0.925271
std,0.911941,1.193951,11.500696,58.287369,4.614195,0.227287,0.155779,0.365702,10.921036,4.962079,2.178307,1.356721,0.076857,3.365708,8.865625,0.262986
min,0.0,0.0,1976.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.264,-53.6,-8.3,0.0
25%,0.0,0.0,1981.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.5,-0.1,1.0
50%,0.0,0.0,1985.0,50.0,1.0,0.318,0.0,0.458,4.8,1.4,0.7,0.2,0.0,0.0,0.0,1.0
75%,0.0,0.0,1999.0,112.0,5.0,0.446,0.256,0.734,17.7,6.1,2.7,1.1,0.07,0.0,0.0,1.0
max,15.0,18.0,2015.0,228.0,21.0,1.0,1.0,1.0,41.1,30.1,13.1,11.2,1.442,19.6,108.6,1.0


In [117]:
# (rows, columns) before the rows with yrs == 0 are filtered out
df.shape

(3961, 17)

In [118]:
# Keep only the rows whose `yrs` value is non-zero.
df = df.loc[df['yrs'].ne(0)]

In [119]:
# (rows, columns) after the rows with yrs == 0 have been removed
df.shape

(2092, 17)

In [120]:
# Preview the filtered frame; the original row labels are kept, so the index has gaps.
df.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Robert Parish,2,9,1976,8,21,0.537,0.0,0.721,28.4,14.5,9.1,1.4,0.154,1.6,41.5,1
1,Sonny Parker,0,0,1976,17,6,0.501,0.0,0.755,24.2,9.9,4.1,2.1,0.118,2.2,11.5,1
2,Marshall Rogers,0,0,1976,34,1,0.371,0.0,0.933,6.8,3.8,0.4,0.4,-0.043,-9.9,-0.4,1
11,Rickey Green,0,0,1977,16,14,0.469,0.207,0.807,24.6,9.4,1.9,5.5,0.097,-1.0,5.8,1
12,Wesley Cox,0,0,1977,18,2,0.412,0.0,0.51,11.0,4.6,2.8,0.3,0.007,-6.9,-1.0,1


In [121]:
# Keep the player names in their own single-column frame.
player = df.loc[:, ['player']]

In [122]:
# Preview the saved player names; the `player` column is dropped from the model
# features below, so this frame is the place the names are kept for later use.
player.head()

Unnamed: 0,player
0,Robert Parish
1,Sonny Parker
2,Marshall Rogers
11,Rickey Green
12,Wesley Cox


# Split the data three ways. We train the model on players drafted before 1990, validate it on players drafted from 1990 through 1999, and test it on players drafted in 2000 or later.

In [123]:
# Chronological split on draft year: <1990 train, 1990-1999 validation, >=2000 test.
draft_year = df['draft_yr']
train = df.loc[draft_year.lt(1990)]
val = df.loc[draft_year.ge(1990) & draft_year.lt(2000)]
test = df.loc[draft_year.ge(2000)]

In [124]:
# Preview the training split (draft_yr < 1990).
train.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Robert Parish,2,9,1976,8,21,0.537,0.0,0.721,28.4,14.5,9.1,1.4,0.154,1.6,41.5,1
1,Sonny Parker,0,0,1976,17,6,0.501,0.0,0.755,24.2,9.9,4.1,2.1,0.118,2.2,11.5,1
2,Marshall Rogers,0,0,1976,34,1,0.371,0.0,0.933,6.8,3.8,0.4,0.4,-0.043,-9.9,-0.4,1
11,Rickey Green,0,0,1977,16,14,0.469,0.207,0.807,24.6,9.4,1.9,5.5,0.097,-1.0,5.8,1
12,Wesley Cox,0,0,1977,18,2,0.412,0.0,0.51,11.0,4.6,2.8,0.3,0.007,-6.9,-1.0,1


In [125]:
# Preview the validation split (1990 <= draft_yr < 2000).
val.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
109,Adonal Foyle,0,0,1997,8,12,0.477,0.0,0.499,17.8,4.1,4.7,0.5,0.069,-0.8,3.9,1
110,Marc Jackson,0,0,1997,37,7,0.446,0.226,0.814,19.6,8.4,4.3,0.8,0.082,-3.6,-3.1,1
115,Jerrod Mustaf,0,0,1990,17,4,0.449,0.0,0.648,10.6,4.0,2.5,0.6,0.043,-4.4,-1.2,1
116,Kendall Gill,0,0,1990,5,15,0.434,0.3,0.754,30.5,13.4,4.1,3.0,0.078,0.3,16.9,1
117,Steve Scheffler,0,0,1990,39,7,0.558,0.2,0.759,5.3,1.9,1.0,0.1,0.153,-2.1,0.0,1


In [126]:
# Preview the test split (draft_yr >= 2000).
test.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
123,J.R. Smith,0,0,2004,18,12,0.423,0.375,0.734,26.9,13.2,3.2,2.2,0.094,0.5,14.3,0
125,Chris Paul,8,9,2005,4,11,0.473,0.365,0.864,35.9,18.8,4.4,9.9,0.249,7.5,66.4,1
126,Brandon Bass,0,0,2005,33,11,0.496,0.2,0.829,22.4,8.9,4.7,0.8,0.123,-1.1,3.5,1
139,Emeka Okafor,0,0,2004,2,9,0.512,0.0,0.584,31.7,12.3,9.9,0.9,0.114,-0.4,7.5,1
140,Bernard Robinson,0,0,2004,45,3,0.417,0.161,0.786,14.9,4.5,2.5,1.0,0.049,-2.0,0.0,1


# Separate the target (`yrs`) from the features; the `player` name column is dropped as well

In [127]:
# Features are every column except the target (`yrs`) and the player name;
# the target is the `yrs` column of the same split.
non_features = ['yrs', 'player']
X_train, y_train = train.drop(columns=non_features), train['yrs']
X_val, y_val = val.drop(columns=non_features), val['yrs']
X_test, y_test = test.drop(columns=non_features), test['yrs']

In [128]:
# Sanity check: each feature matrix must have as many rows as its target vector.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((820, 15), (820,), (478, 15), (478,), (794, 15), (794,))

# Get baseline linear regression model

In [87]:
# Baseline: a plain linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
# Baseline predictions for the validation split.
y_pred = model.predict(X=X_val)

In [82]:
# Mean absolute error of the baseline on the validation split.
score = mean_absolute_error(y_true=y_val, y_pred=y_pred)
score

2.293549216961948

In [83]:
# Baseline predictions for the test split (replaces the validation predictions in y_pred).
y_pred = model.predict(X=X_test)

In [84]:
# Mean absolute error of the baseline on the test split.
score_test = mean_absolute_error(y_true=y_test, y_pred=y_pred)
score_test

3.5748824192283215

# Using random forest regressor

In [129]:
# Random forest with 1000 trees; the fixed random_state makes the fit repeatable.
model1 = RandomForestRegressor(random_state=42, n_estimators=1000)
model1.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [130]:
# Evaluate the random forest: predict both hold-out splits, then report the
# mean absolute error for each.
y_pred1 = model1.predict(X_val)
y_pred2 = model1.predict(X_test)
score1 = mean_absolute_error(y_val, y_pred1)
score2 = mean_absolute_error(y_test, y_pred2)
print('Validation score:', score1)
print('Test score:', score2)

Validation score: 1.4806171548117153
Test score: 1.9177682619647354


# Pickle the model

In [131]:
# Persist the fitted random forest to the file 'nba_model' using joblib.
joblib.dump(value=model1, filename='nba_model')

['nba_model']

# Testing the API

In [132]:
# Endpoint of the prediction API; the requests below assume a server is
# already listening on localhost port 5000.
url = 'http://localhost:5000/api'

In [133]:
# Request payload: one player's feature values keyed by the training column
# names (the target `yrs` is left out). The values are those of the Robert
# Parish row shown in the frame previews above.
payload = {
    'all_nba': 2,
    'all_star': 9,
    'draft_yr': 1976,
    'pk': 8,
    'fg_percentage': 0.537,
    'tp_percentage': 0,
    'ft_percentage': 0.721,
    'minutes_per_game': 28.4,
    'points_per_game': 14.5,
    'trb_per_game': 9.1,
    'assists_per_game': 1.4,
    'ws_per_game': 0.154,
    'bpm': 1.6,
    'vorp': 41.5,
    'attend_college': 1,
}
data = json.dumps(payload)

In [134]:
# Show the serialized JSON string exactly as it will be sent to the API.
print(data)

{"all_nba": 2, "all_star": 9, "draft_yr": 1976, "pk": 8, "fg_percentage": 0.537, "tp_percentage": 0, "ft_percentage": 0.721, "minutes_per_game": 28.4, "points_per_game": 14.5, "trb_per_game": 9.1, "assits_per_game": 1.4, "ws_per_game": 0.154, "bpm": 1.6, "vorp": 41.5, "attend_college": 1}


In [135]:
# POST the JSON string to the prediction endpoint.
# - Content-Type is set explicitly: a str passed as `data` is sent as-is, with
#   nothing telling the server that the body is JSON.
# - timeout bounds the wait; without one, requests can block indefinitely when
#   the server never answers.
send = requests.post(
    url,
    data=data,
    headers={'Content-Type': 'application/json'},
    timeout=10,
)

In [136]:
# Decode the JSON body of the API response.
send.json()

{'results': {'y_pred': 18}}