In [1]:
import joblib
import json
import requests
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Raise pandas' display limits so long/wide frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [3]:
# Read the player table from ../data and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/nba_data.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,player,all_nba,all_star,draft_yr,pk,team,college,yrs,games,minutes_played,pts,trb,ast,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,win_share,ws_per_game,bpm,vorp,attend_college
0,0,Deandre Ayton,0,0,2018,1,PHO,Arizona,1,71,2183,1159,729,125,0.585,0.0,0.746,30.7,16.3,10.3,1.8,5.8,0.128,0.2,1.2,True
1,1,Marvin Bagley,0,0,2018,2,SAC,Duke,1,62,1567,923,471,62,0.504,0.313,0.691,25.3,14.9,7.6,1.0,3.6,0.11,-1.8,0.1,True
2,2,Luka Doncic,0,0,2018,3,ATL,No college,1,72,2318,1526,563,429,0.427,0.327,0.713,32.2,21.2,7.8,6.0,4.9,0.101,4.1,3.6,False
3,3,Jaren Jackson,0,0,2018,4,MEM,Michigan State,1,58,1515,798,272,64,0.506,0.359,0.766,26.1,13.8,4.7,1.1,3.3,0.105,0.1,0.8,True
4,4,Trae Young,0,0,2018,5,DAL,Oklahoma,1,81,2503,1549,301,653,0.418,0.324,0.829,30.9,19.1,3.7,8.1,3.3,0.062,-1.1,0.6,True


In [6]:
# First drop: remove the leftover 'Unnamed: 0' column (looks like a row index that
# was saved into the CSV — TODO confirm).
# errors='ignore' makes the cell idempotent: `df` is rebound here, so running the
# cell a second time would otherwise raise KeyError because the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,team,college,yrs,games,minutes_played,pts,trb,ast,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,win_share,ws_per_game,bpm,vorp,attend_college
0,Deandre Ayton,0,0,2018,1,PHO,Arizona,1,71,2183,1159,729,125,0.585,0.0,0.746,30.7,16.3,10.3,1.8,5.8,0.128,0.2,1.2,True
1,Marvin Bagley,0,0,2018,2,SAC,Duke,1,62,1567,923,471,62,0.504,0.313,0.691,25.3,14.9,7.6,1.0,3.6,0.11,-1.8,0.1,True
2,Luka Doncic,0,0,2018,3,ATL,No college,1,72,2318,1526,563,429,0.427,0.327,0.713,32.2,21.2,7.8,6.0,4.9,0.101,4.1,3.6,False
3,Jaren Jackson,0,0,2018,4,MEM,Michigan State,1,58,1515,798,272,64,0.506,0.359,0.766,26.1,13.8,4.7,1.1,3.3,0.105,0.1,0.8,True
4,Trae Young,0,0,2018,5,DAL,Oklahoma,1,81,2503,1549,301,653,0.418,0.324,0.829,30.9,19.1,3.7,8.1,3.3,0.062,-1.1,0.6,True


In [7]:
# Save the frame without its row index. Writing the index adds an unnamed first
# column, which pandas reads back as 'Unnamed: 0' the next time the file is loaded.
df.to_csv('nba_data.csv', index=False)

In [8]:
# Second drop: dropping columns with leakage
# NOTE(review): all_nba, all_star and vorp are kept although they look like career
# totals that also grow with `yrs` — confirm they are acceptable as features.
cols = ['team', 'college', 'games', 'minutes_played', 'pts', 'trb', 'ast', 'win_share']
# errors='ignore' keeps the cell re-runnable: `df` is rebound, so on a second
# execution the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=cols, errors='ignore')
df.head()

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Deandre Ayton,0,0,2018,1,1,0.585,0.0,0.746,30.7,16.3,10.3,1.8,0.128,0.2,1.2,True
1,Marvin Bagley,0,0,2018,2,1,0.504,0.313,0.691,25.3,14.9,7.6,1.0,0.11,-1.8,0.1,True
2,Luka Doncic,0,0,2018,3,1,0.427,0.327,0.713,32.2,21.2,7.8,6.0,0.101,4.1,3.6,False
3,Jaren Jackson,0,0,2018,4,1,0.506,0.359,0.766,26.1,13.8,4.7,1.1,0.105,0.1,0.8,True
4,Trae Young,0,0,2018,5,1,0.418,0.324,0.829,30.9,19.1,3.7,8.1,0.062,-1.1,0.6,True


In [9]:
# Summary statistics for the numeric columns. Worth noting in the output: the 25th
# percentile of `yrs` is 0, i.e. at least a quarter of the rows have yrs == 0.
df.describe()

Unnamed: 0,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp
count,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0,4165.0
mean,0.12413,0.233373,1990.906122,69.540216,3.4,0.238919,0.116188,0.378026,9.626483,3.847947,1.691597,0.858944,0.031019,-1.462257,2.495486
std,0.930178,1.270099,12.541098,57.780823,4.6945,0.22701,0.157755,0.365774,10.918073,5.03811,2.198878,1.368999,0.076914,3.396649,9.074733
min,0.0,0.0,1976.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.264,-53.6,-8.3
25%,0.0,0.0,1981.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.5,-0.1
50%,0.0,0.0,1986.0,49.0,1.0,0.34,0.0,0.5,5.7,1.8,0.9,0.2,0.0,0.0,0.0
75%,0.0,0.0,2001.0,109.0,6.0,0.448,0.28,0.74,18.1,6.3,2.8,1.2,0.073,0.0,0.0
max,15.0,18.0,2018.0,228.0,21.0,1.0,1.0,1.0,41.1,30.1,13.7,11.2,1.442,19.6,129.8


In [10]:
# Show (rows, columns) before removing the rows with yrs == 0, for comparison
# with the shape after the cut.
df.shape

(4165, 17)

In [11]:
# Keep only the rows whose `yrs` value is non-zero.
df = df.loc[df['yrs'].ne(0)]
        

In [12]:
# Show (rows, columns) after the yrs != 0 filter; only the row count should change.
df.shape

(2272, 17)

In [13]:
# Preview the filtered frame.
df.head(n=5)

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Deandre Ayton,0,0,2018,1,1,0.585,0.0,0.746,30.7,16.3,10.3,1.8,0.128,0.2,1.2,True
1,Marvin Bagley,0,0,2018,2,1,0.504,0.313,0.691,25.3,14.9,7.6,1.0,0.11,-1.8,0.1,True
2,Luka Doncic,0,0,2018,3,1,0.427,0.327,0.713,32.2,21.2,7.8,6.0,0.101,4.1,3.6,False
3,Jaren Jackson,0,0,2018,4,1,0.506,0.359,0.766,26.1,13.8,4.7,1.1,0.105,0.1,0.8,True
4,Trae Young,0,0,2018,5,1,0.418,0.324,0.829,30.9,19.1,3.7,8.1,0.062,-1.1,0.6,True


In [14]:
# Keep the player names as a one-column DataFrame (same row index as `df`).
player = df.loc[:, ['player']]

In [15]:
# Player names are set aside here for later use; preview them.
player.head(n=5)

Unnamed: 0,player
0,Deandre Ayton
1,Marvin Bagley
2,Luka Doncic
3,Jaren Jackson
4,Trae Young


# Split the data three ways: train on players drafted before 1990, validate on players drafted between 1990 and 1999, and test on players drafted in 2000 or later.

In [16]:
# Chronological split on draft year: <1990 train, 1990-1999 validation, >=2000 test.
# NOTE(review): recent draftees in `test` may still be active, so their `yrs`
# could be incomplete — confirm this is acceptable for the test score.
draft_year = df['draft_yr']
train = df[draft_year.lt(1990)]
val = df[draft_year.ge(1990) & draft_year.lt(2000)]
test = df[draft_year.ge(2000)]

In [17]:
# Preview the training split (draft_yr < 1990).
train.head(n=5)

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
1688,Pervis Ellison,0,0,1989,1,11,0.51,0.05,0.689,24.5,9.5,6.7,1.5,0.09,0.5,7.4,True
1689,Danny Ferry,0,0,1989,2,13,0.446,0.393,0.84,19.8,7.0,2.8,1.3,0.092,-0.9,5.0,True
1690,Sean Elliott,0,2,1989,3,12,0.465,0.375,0.799,33.0,14.2,4.3,2.6,0.109,0.6,16.1,True
1691,Glen Rice,2,3,1989,4,15,0.456,0.4,0.846,35.0,18.3,4.4,2.1,0.122,0.5,22.1,True
1692,J.R. Reid,0,0,1989,5,11,0.472,0.135,0.716,22.9,8.5,5.0,1.0,0.07,-2.0,-0.2,True


In [18]:
# Preview the validation split (1990 <= draft_yr < 2000).
val.head(n=5)

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
1129,Elton Brand,1,2,1999,1,17,0.5,0.095,0.736,33.0,15.9,8.5,2.1,0.151,3.1,45.2,True
1130,Steve Francis,0,3,1999,2,9,0.429,0.341,0.797,37.6,18.1,5.6,6.0,0.12,3.2,28.2,True
1131,Baron Davis,1,2,1999,3,13,0.409,0.32,0.711,34.2,16.1,3.8,7.2,0.106,2.7,34.1,True
1132,Lamar Odom,0,0,1999,4,14,0.463,0.312,0.693,33.4,13.3,8.4,3.7,0.115,2.5,36.3,True
1133,Jonathan Bender,0,0,1999,5,8,0.417,0.34,0.763,14.7,5.5,2.2,0.6,0.047,-3.5,-1.5,False


In [19]:
# Preview the test split (draft_yr >= 2000).
test.head(n=5)

Unnamed: 0,player,all_nba,all_star,draft_yr,pk,yrs,fg_percentage,tp_percentage,ft_percentage,minutes_per_game,points_per_game,trb_per_game,assists_per_game,ws_per_game,bpm,vorp,attend_college
0,Deandre Ayton,0,0,2018,1,1,0.585,0.0,0.746,30.7,16.3,10.3,1.8,0.128,0.2,1.2,True
1,Marvin Bagley,0,0,2018,2,1,0.504,0.313,0.691,25.3,14.9,7.6,1.0,0.11,-1.8,0.1,True
2,Luka Doncic,0,0,2018,3,1,0.427,0.327,0.713,32.2,21.2,7.8,6.0,0.101,4.1,3.6,False
3,Jaren Jackson,0,0,2018,4,1,0.506,0.359,0.766,26.1,13.8,4.7,1.1,0.105,0.1,0.8,True
4,Trae Young,0,0,2018,5,1,0.418,0.324,0.829,30.9,19.1,3.7,8.1,0.062,-1.1,0.6,True


# Separate the target (`yrs`) from the features and drop the `player` column

In [20]:
# Features are every column except the target (`yrs`) and the `player` name;
# the target is the `yrs` column of the same split.
X_train, y_train = train.drop(columns=['yrs', 'player']), train['yrs']
X_val, y_val = val.drop(columns=['yrs', 'player']), val['yrs']
X_test, y_test = test.drop(columns=['yrs', 'player']), test['yrs']

In [21]:
# Shapes of each feature/target pair, in train, validation, test order.
tuple(part.shape for part in (X_train, y_train, X_val, y_val, X_test, y_test))

((828, 15), (828,), (478, 15), (478,), (966, 15), (966,))

# Get baseline linear regression model

In [22]:
# Baseline: ordinary least squares fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
# Baseline predictions for the validation split.
y_pred = model.predict(X=X_val)

In [24]:
# Validation MAE of the linear baseline.
# Predictions are computed here instead of read from the shared `y_pred`
# variable: `y_pred` is rebound to the test-set predictions further down, so this
# cell would otherwise depend on the order in which the cells were last run.
score = mean_absolute_error(y_val, model.predict(X_val))
score

2.3253809688483194

In [25]:
# Baseline predictions for the test split (rebinds `y_pred`).
y_pred = model.predict(X=X_test)

In [26]:
# Test MAE of the linear baseline.
# Predictions are computed here instead of read from the shared `y_pred`
# variable, which also holds the validation predictions at an earlier point in
# the notebook; this keeps the cell correct regardless of execution order.
score_test = mean_absolute_error(y_test, model.predict(X_test))
score_test

3.4739595166667057

# Using random forest regressor

In [27]:
# Random forest with 1000 trees; the fixed random_state makes the fit repeatable.
model1 = RandomForestRegressor(random_state=42, n_estimators=1000)
model1.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [28]:
# Predict on both held-out splits first, then score and report each with MAE.
y_pred1 = model1.predict(X_val)
y_pred2 = model1.predict(X_test)

score1 = mean_absolute_error(y_val, y_pred1)
score2 = mean_absolute_error(y_test, y_pred2)

print('Validation score:', score1)
print('Test score:', score2)

Validation score: 1.5035857740585772
Test score: 1.8228343685300208


# Pickle The Model

In [29]:
# Persist the fitted random forest to the file 'nba_model' in the working directory.
joblib.dump(value=model1, filename='nba_model')

['nba_model']

# Testing the API

In [30]:
# Endpoint of the prediction API that the cells below send a sample request to.
url = 'https://nba-project.herokuapp.com/api'

In [31]:
# Sample request body: one player's values for the 15 feature columns the model
# was trained on, serialized to a JSON string.
payload = {
    'all_nba': 2,
    'all_star': 9,
    'draft_yr': 1976,
    'pk': 8,
    'fg_percentage': 0.537,
    'tp_percentage': 0,
    'ft_percentage': 0.721,
    'minutes_per_game': 28.4,
    'points_per_game': 14.5,
    'trb_per_game': 9.1,
    'assists_per_game': 1.4,
    'ws_per_game': 0.154,
    'bpm': 1.6,
    'vorp': 41.5,
    'attend_college': 1,
}
data = json.dumps(payload)

In [32]:
# Show the exact JSON string that will be sent as the request body.
print(data)

{"all_nba": 2, "all_star": 9, "draft_yr": 1976, "pk": 8, "fg_percentage": 0.537, "tp_percentage": 0, "ft_percentage": 0.721, "minutes_per_game": 28.4, "points_per_game": 14.5, "trb_per_game": 9.1, "assists_per_game": 1.4, "ws_per_game": 0.154, "bpm": 1.6, "vorp": 41.5, "attend_college": 1}


In [33]:
# POST the JSON string to the API. requests has no default timeout, so without
# one this cell can hang indefinitely if the service does not respond.
send = requests.post(url, data, timeout=30)
# Fail here on a 4xx/5xx response instead of later with a confusing JSON decode
# error when the body is not the expected payload.
send.raise_for_status()

In [34]:
# Decode the response body as JSON and display it.
send.json()

{'results': {'y_pred': 18}}