In [None]:
import pandas as pd
import numpy as np
import io
import boto3

In [2]:
# prepare list of files for download from s3
files = {'boxscores_2008-09.csv',
         'boxscores_2009-10.csv',
         'boxscores_2010-11.csv',
         'boxscores_2011-12.csv',
         'boxscores_2012-13.csv', 
         'boxscores_2013-14.csv', 
         'boxscores_2014-15.csv', 
         'boxscores_2015-16.csv', 
         'boxscores_2016-17.csv', 
         'boxscores_2017-18.csv', 
         'boxscores_2018-19.csv', 
         'boxscores_2019-20.csv'}

In [None]:
# download data from s3 
bucket = 'nbapc'
s3 = boto3.client('s3')

data = []
for file in files:
    s3_file_key = 'crawled_data/boxscores/' + file
    obj = s3.get_object(Bucket=bucket, Key=s3_file_key)
    df_single = pd.read_csv(obj['Body'], sep=';',encoding='utf8')
    data.append(df_single)
df = pd.concat(data)

In [6]:
# FEATURE ENGINEERING
# 1. GET BOOLEAN HOME GAME
df['HOME_GAME'] = np.where(df['MATCH_UP'].str.contains('vs.'),1,0)
# 2. GET STRING OPPONENT 
df['OPPONENT'] = np.where(df['MATCH_UP'].str.contains('vs.'), df['MATCH_UP'].map(lambda x: x.split('vs.')[-1]), df['MATCH_UP'].map(lambda x: x.split('@')[-1]))
# 3. GET WIN AS BOOLEAN
df['W'] = np.where(df['W_L']=="W",1,0)
# 4. KEEP ONLY COLUMNS
df = df.filter(['Player','Team', 'GAME_DATE', 'MIN', 'PTS', 'HOME_GAME', 'W'])

In [7]:
# Select a player
player = 'LeBron James'
df = df[df['Player']==player]
# Sort by date
df['GAME_DATE'] =pd.to_datetime(df.GAME_DATE)
df.sort_values(by='GAME_DATE')
df.head()

Unnamed: 0,Player,Team,GAME_DATE,MIN,PTS,HOME_GAME,W
517,LeBron James,CLE,2009-04-13,39,37,0,1
627,LeBron James,CLE,2009-04-12,30,29,1,1
889,LeBron James,CLE,2009-04-10,37,27,0,1
1257,LeBron James,CLE,2009-04-08,31,21,1,1
1730,LeBron James,CLE,2009-04-05,37,38,1,1


In [8]:
# Engineering, introduce L5 games feature
df['min_G-1'] = df.MIN.shift(1)
df['min_G-2'] = df.MIN.shift(2)
df['min_G-3'] = df.MIN.shift(3)
df['min_G-4'] = df.MIN.shift(4)
df['min_G-5'] = df.MIN.shift(5)
df['pts_G-1'] = df.PTS.shift(1)
df['pts_G-2'] = df.PTS.shift(2)
df['pts_G-3'] = df.PTS.shift(3)
df['pts_G-4'] = df.PTS.shift(4)
df['pts_G-5'] = df.PTS.shift(5)
df['home_G-1'] = df.HOME_GAME.shift(1)
df['home_G-2'] = df.HOME_GAME.shift(2)
df['home_G-3'] = df.HOME_GAME.shift(3)
df['home_G-4'] = df.HOME_GAME.shift(4)
df['home_G-5'] = df.HOME_GAME.shift(5)

In [9]:
df = df.drop(['Player', 'Team', 'GAME_DATE', 'MIN', 'W'], axis=1).dropna()

In [10]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

X = df.drop('PTS', axis=1)
y = df['PTS']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 5)
knnreg = KNeighborsRegressor()

grid_values = {'n_neighbors': [1,5,6,7,8,9,10]}

# default metric to optimize over grid parameters: neg_mean_squared_error
knnreg = GridSearchCV(knnreg, param_grid = grid_values, scoring='neg_mean_absolute_error')
knnreg.fit(X_train, y_train)

print('Grid best parameter (max. neg_mean_absolute_error): ', knnreg.best_params_)
print('Grid best score (neg_mean_absolute_error): ', knnreg.best_score_)

Grid best parameter (max. neg_mean_absolute_error):  {'n_neighbors': 10}
Grid best score (neg_mean_absolute_error):  -6.080744820351429


In [11]:
# knnreg.predict([[1,35.0,28.0,43.0,37.0,35.0,20.0,23.0,39.0,21.0,30.0,1.0,1.0,0.0,0.0,0.0]])

array([27.2])

In [12]:
# from joblib import dump, load
# dump(knnreg, 'knnreg.joblib')
# clf = load('filename.joblib')

['knnreg.joblib']

In [13]:
import pickle

# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open("knnreg.pkl", 'wb') as file:
    pickle.dump(knnreg, file)

In [1]:
# from platform import python_version
# print(python_version())

3.6.9
