# This is a playground for performing experiments before implementing them in the main code using pyspark for quick access

In [402]:
import pandas as pd
import glob

csv_path = "./NBA shot log 16-17-regular season/Shot data/"

# read all csvs in the directory
df = pd.concat([pd.read_csv(f) for f in glob.glob(csv_path + "*.csv")], ignore_index=True)

# Print the number of rows and columns
print(df.shape)

(210072, 16)


In [403]:
# print the number of shot types in the dataset that have a frequency of 1000 or more   
print(df['shot type'].value_counts()[df['shot type'].value_counts() >= 1000])
# print their count
print(df['shot type'].value_counts()[df['shot type'].value_counts() >= 1000].count())
# print their sum
print(df['shot type'].value_counts()[df['shot type'].value_counts() >= 1000].sum())

Jump Shot                     94078
Layup                         15826
Pullup Jump Shot              14671
Driving Layup                 13433
Floating Jump Shot             4522
Step Back Jump Shot            4454
Hook Shot                      4279
Tip Layup Shot                 3957
Cutting Layup Shot             3800
Running Layup                  3587
Turnaround Jump Shot           3245
Driving Floating Jump Shot     3199
Fadeaway Jumper                2780
Dunk                           2755
Putback Layup                  2263
Driving Finger Roll Layup      2261
Cutting Dunk Shot              2118
Reverse Layup                  1849
Turnaround Hook Shot           1828
Running Jump Shot              1824
Driving Reverse Layup          1688
Jump Bank Shot                 1467
Alley Oop Dunk                 1440
Driving Dunk                   1411
Driving Hook Shot              1296
Running Dunk                   1278
Name: shot type, dtype: int64
26
195309


In [404]:
# get the 10 most frequent shot types
shot_type = df['shot type'].value_counts().head(10)

# only keep shot types that are in the top 10
#df = df[df['shot type'].isin(shot_type.index)]

# encode the shot type as an integer
#df['shot type'] = df['shot type'].astype('category').cat.codes

# print the number of rows and columns
print(df.shape)

(162607, 16)


In [None]:
# 
def normalize_timer():
    for f in glob.glob(csv_path + "*.csv"):
        temp = pd.read_csv(f) 

        temp['time'] = temp['time'].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
        temp['time'] = temp['time'] + (temp['quarter'] - 1) * 720
        # divide time by 2880 to normalize it
        temp['time'] = temp['time'] / 2880

        # save csv
        temp.to_csv(f, index=False)

In [None]:
# read player data
df_player = pd.read_csv("./NBA shot log 16-17-regular season/Player Regular 16-17 Stats.csv")



print(df_player.columns)
# remove the columns that are not needed [#Date/Time of Update: 2017-05-09 4:34:01 PM, #Player ID, #Jersey Num, #Birth Date, #Birth City, #Birth Couuntry, #Team ID
#                                         #Team Abbr, #Team City, #Team Name]
df_player = df_player.drop(['#Date/Time of Update: 2017-05-09 4:34:01 PM', '#Player ID', '#Jersey Num', '#Birth City', '#Birth Country', '#Team ID', '#Team Abbr', '#Team City', '#Team Name', '#Position'], axis=1)
print(df_player.columns)

# combine First Name and Last Name to create a new column called Player Name
df_player['shoot player'] = df_player['#FirstName'] + " " + df_player['#LastName']
df_player = df_player.drop(['#FirstName', '#LastName'], axis=1)
print(df_player.columns)

# connect df_player and df[0]
df_merged = pd.merge(df, df_player, on='shoot player', how='inner')
df_merged.to_csv("output/Standard/Single File/merged.csv", index=False)
print(df_merged.columns)

Index(['#Date/Time of Update: 2017-05-09 4:34:01 PM', '#Player ID',
       '#LastName', '#FirstName', '#Jersey Num', '#Position', '#Height',
       '#Weight', '#Birth Date', '#Age', '#Birth City', '#Birth Country',
       '#Rookie', '#Team ID', '#Team Abbr', '#Team City', '#Team Name',
       '#GamesPlayed', '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade',
       '#FtAtt', '#FtMade'],
      dtype='object')
Index(['#LastName', '#FirstName', '#Height', '#Weight', '#Birth Date', '#Age',
       '#Rookie', '#GamesPlayed', '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt',
       '#Fg3PtMade', '#FtAtt', '#FtMade'],
      dtype='object')
Index(['#Height', '#Weight', '#Birth Date', '#Age', '#Rookie', '#GamesPlayed',
       '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt',
       '#FtMade', 'shoot player'],
      dtype='object')
Index(['self previous shot', 'player position', 'home game', 'location x',
       'opponent previous shot', 'home team', 'shot type', 'points',
       'away tea

In [None]:
df_merged_test = df_merged
# print unique values in self previous shot
columns_to_print = ['self previous shot', 'player position', 'home game', 'opponent previous shot', 'home team', 'points', 'time from last shot', 'quarter', 'current shot outcome', '#Position',
       '#Height', '#Weight', '#Age', '#Rookie', '#GamesPlayed', '#Fg2PtAtt',
       '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt', '#FtMade']



# Map poitns from [2, 3] to [0, 1]
df_merged_test['points'] = df_merged_test['points'].map({2: 0, 3: 1})

# for players with points = 2 add an extra column called accuracy for 2 pointers adn for players with points = 3 add an extra column called accuracy for 3 pointers
df_merged_test['accuracy'] = 0
df_merged_test['accuracy'][df_merged_test['points'] == 0] = df_merged_test['#Fg2PtMade'] / df_merged_test['#Fg2PtAtt']
df_merged_test['accuracy'][df_merged_test['points'] == 1] = df_merged_test['#Fg3PtMade'] / df_merged_test['#Fg3PtAtt']

# Drop players with 0 attempts in Fg2PtAtt, Fg3PtAtt, FtAtt
df_merged_test = df_merged_test[df_merged_test['#Fg2PtAtt'] != 0]
df_merged_test = df_merged_test[df_merged_test['#Fg3PtAtt'] != 0]
df_merged_test = df_merged_test[df_merged_test['#FtAtt'] != 0]

# Drop Fg2PtAtt, Fg2PtMade, Fg3PtAtt, Fg3PtMade, FtAtt, FtMade
df_merged_test = df_merged_test.drop(['#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt', '#FtMade'], axis=1)
# Drop irrelevant columns
df_merged_test = df_merged_test.drop(['date','#Birth Date', 'away team', 'home team', 'shoot player', 'quarter', 'shot type'], axis=1)

# Convert height from feet-inches to inches
# height map
height_map = {'5\'4\"': 64, '5\'9\"': 69, '5\'10\"': 70, '5\'11\"': 71,
       '6\'0\"': 72, '6\'1\"': 73, '6\'2\"': 74, '6\'3\"': 75, '6\'4\"': 76, '6\'5\"': 77, '6\'6\"': 78, '6\'7\"': 79, '6\'8\"': 80,
       '6\'9\"': 81, '6\'10\"': 82, '6\'11\"': 83, '7\'0\"': 84, '7\'1\"': 85, '7\'2\"': 86, '7\'3\"': 87}
# convert height to inches using height_map
df_merged_test['#Height'] = df_merged_test['#Height'].map(height_map)
# replace height nans with the mean
df_merged_test['#Height'] = df_merged_test['#Height'].fillna(round(df_merged_test['#Height'].mean()))
# Normalize height subtracting the min and dividing by the range
df_merged_test['#Height'] = (df_merged_test['#Height'] - df_merged_test['#Height'].min()) / (df_merged_test['#Height'].max() - df_merged_test['#Height'].min())

# Fill age nans with the mean
df_merged_test['#Age'] = df_merged_test['#Age'].fillna(round(df_merged_test['#Age'].mean()))
# Normalize age using z-score
df_merged_test['#Age'] = (df_merged_test['#Age'] - df_merged_test['#Age'].mean()) / df_merged_test['#Age'].std()
# Fill weight nans with the mean
df_merged_test['#Weight'] = df_merged_test['#Weight'].fillna(round(df_merged_test['#Weight'].mean()))
# Normalize weight using z-score
df_merged_test['#Weight'] = (df_merged_test['#Weight'] - df_merged_test['#Weight'].mean()) / df_merged_test['#Weight'].std()

# Convert rookie to 0 and 1
df_merged_test['#Rookie'] = df_merged_test['#Rookie'].map({'N': 0, 'Y': 1})

# drop nans from rows with location x as null
# reason, very few they represent less than 0.1% of our dataset and have a null y location as well
df_merged_test = df_merged_test.dropna(subset=['location x']) 
# change the columns with location x > 470 to 940 - location x
df_merged_test.loc[df_merged_test['location x'] > 470, 'location x'] = 940 - df_merged_test['location x']

# normalize location x and location y
df_merged_test['location x'] = df_merged_test['location x'] / 470 # half court width
df_merged_test['location y'] = df_merged_test['location y'] / 500 # court length

# normalize games played z-score
df_merged_test['#GamesPlayed'] = (df_merged_test['#GamesPlayed'] - df_merged_test['#GamesPlayed'].mean()) / df_merged_test['#GamesPlayed'].std() 


# Map player position from ['SF' 'C' 'SG' 'PG' 'PF' 'G' 'F'] to [0, 1, 2, 3, 4 ,5, 6]
df_merged_test['player position'] = df_merged_test['player position'].map({'SF': 0, 'C': 1, 'SG': 2, 'PG': 3, 'PF': 4, 'G': 5, 'F': 6})

# Drop nulls in self previous shot and opponent previous shot
df_merged_test = df_merged_test.dropna(subset=['self previous shot', 'opponent previous shot'])

# Map current shot outcome from ['MISSED' 'BLOCKED' 'SCORED'] to [0, 0, 1]
df_merged_test['self previous shot'] = df_merged_test['self previous shot'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})
df_merged_test['opponent previous shot'] = df_merged_test['opponent previous shot'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})
df_merged_test['current shot outcome'] = df_merged_test['current shot outcome'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})
# for home game map home/away to 1/0
df_merged_test['home game'] = df_merged_test['home game'].map({'Yes': 1, 'No': 0})
df_merged_test = df_merged_test.drop(['home game'], axis=1)
# drop previous shots
df_merged_test = df_merged_test.drop(['self previous shot', 'opponent previous shot'], axis=1)

df_merged_test.to_csv("output/Standard/Single File/merged_test.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test['accuracy'][df_merged_test['points'] == 0] = df_merged_test['#Fg2PtMade'] / df_merged_test['#Fg2PtAtt']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test['accuracy'][df_merged_test['points'] == 1] = df_merged_test['#Fg3PtMade'] / df_merged_test['#Fg3PtAtt']


In [None]:
# print nulls per column in df_player
df_merged_test['time from last shot'] = df_merged_test['time from last shot'].fillna(round(df_merged_test['time from last shot'].mean()))

# drop time from last shot
df_merged_test = df_merged_test.drop(['time from last shot'], axis=1)

# print first 10 rows with time from last shot as nan
#print(df[df['time from last shot'].isnull()].head(10))
corr = df_merged_test.corr()
print(df_merged_test.columns)
# print correlation with "curent shot outcome"
print(corr['current shot outcome'].sort_values(ascending=False))

Index(['player position', 'location x', 'points', 'location y', 'time',
       'current shot outcome', '#Height', '#Weight', '#Age', '#Rookie',
       '#GamesPlayed', 'accuracy'],
      dtype='object')
current shot outcome    1.000000
accuracy                0.112709
#Weight                 0.016465
#GamesPlayed            0.016219
#Age                    0.011761
#Height                 0.008771
location y              0.002932
player position        -0.012777
#Rookie                -0.019308
time                   -0.023336
points                 -0.084325
location x             -0.099415
Name: current shot outcome, dtype: float64


In [None]:


# train a model to predict current shot outcome
# logistic regression
# Split data into train and test
# display first 5 rows of df_merged_test
print(df_merged_test.head(5))
X = df_merged_test.drop(['current shot outcome'], axis=1)
y = df_merged_test['current shot outcome']
import sklearn.model_selection as model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, r2_score


# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("MSE:", mean_squared_error(y_test, y_pred))
# Print accuracy
print("Accuracy:", model.score(X_test, y_test))
# display r2 score
print("R2 Score:", r2_score(y_test, y_pred))

   player position  location x  points  location y      time  \
1                0    0.593617       1       0.260  0.066319   
2                0    0.123404       0       0.550  0.455903   
3                0    0.153191       1       0.950  0.521528   
4                0    0.529787       1       0.200  0.600694   
5                0    0.529787       0       0.362  0.635069   

   current shot outcome   #Height   #Weight      #Age  #Rookie  #GamesPlayed  \
1                     0  0.565217 -0.602271  0.017925        0      0.316118   
2                     0  0.565217 -0.602271  0.017925        0      0.316118   
3                     0  0.565217 -0.602271  0.017925        0      0.316118   
4                     1  0.565217 -0.602271  0.017925        0      0.316118   
5                     0  0.565217 -0.602271  0.017925        0      0.316118   

   accuracy  
1  0.345865  
2  0.445175  
3  0.345865  
4  0.345865  
5  0.445175  
MSE: 0.40306398700512786
Accuracy: 0.5969360129948

In [None]:
# LightGBM

import lightgbm as lgb

# Create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# predict using lightgbm

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'num_threads': 4
}

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

#print('Saving model...')

# save model to file

#gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

# display accuracy gbm
print("Accuracy:", accuracy_score(y_test, y_pred.round()))
# print r2 score
print("R2 score:", r2_score(y_test, y_pred.round()))

Starting training...
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.240229	valid_0's l1: 0.481638
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 0.239534	valid_0's l1: 0.480895
[3]	valid_0's l2: 0.23891	valid_0's l1: 0.480192
[4]	valid_0's l2: 0.238341	valid_0's l1: 0.47952
[5]	valid_0's l2: 0.23787	valid_0's l1: 0.47894
[6]	valid_0's l2: 0.237378	valid_0's l1: 0.478308
[7]	valid_0's l2: 0.236999	valid_0's l1: 0.477791
[8]	valid_0's l2: 0.2366	valid_0's l1: 0.477226
[9]	valid_0's l2: 0.236231	valid_0's l1: 0.476678
[10]	valid_0's l2: 0.235899	valid_0's l1: 0.476164
[11]	valid_0's l2: 0.235632	valid_0's l1: 0.475726
[12]	valid_0's l2: 0.235383	valid_0's l1: 0.475295
[13]	valid_0's l2: 0.235159	valid_0's l1: 0.474886
[14]	valid_0's l2: 0.23493	valid_0's l1: 0.474447




[15]	valid_0's l2: 0.234719	valid_0's l1: 0.474027
[16]	valid_0's l2: 0.234548	valid_0's l1: 0.473688
[17]	valid_0's l2: 0.234381	valid_0's l1: 0.473341
[18]	valid_0's l2: 0.234213	valid_0's l1: 0.472981
[19]	valid_0's l2: 0.234062	valid_0's l1: 0.47263
[20]	valid_0's l2: 0.233906	valid_0's l1: 0.472281
Did not meet early stopping. Best iteration is:
[20]	valid_0's l2: 0.233906	valid_0's l1: 0.472281
Starting predicting...
The rmse of prediction is: 0.48363800904937687
Feature importances: [9, 168, 17, 150, 52, 16, 26, 23, 2, 10, 127]
Accuracy: 0.6183074816065229
R2 score: -0.5847807799019709


In [None]:
# use xgboost
import xgboost as xgb

# specify parameters via map

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'silent': 1,
    'seed': 42,
    'tree_method': 'auto'
}

num_round = 50
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10)

# make prediction
preds = bst.predict(dtest)

# display accuracy xgboost
# print rmse
print("RMSE:", mean_squared_error(y_test, preds) ** 0.5)
print("Accuracy:", accuracy_score(y_test, preds.round()))
print ("R2 score:", r2_score(y_test, preds.round()))

Parameters: { "silent" } are not used.

[0]	eval-rmse:0.49709	train-rmse:0.49701
[1]	eval-rmse:0.49446	train-rmse:0.49432
[2]	eval-rmse:0.49255	train-rmse:0.49231
[3]	eval-rmse:0.49081	train-rmse:0.49045
[4]	eval-rmse:0.48943	train-rmse:0.48896




[5]	eval-rmse:0.48818	train-rmse:0.48763
[6]	eval-rmse:0.48734	train-rmse:0.48672
[7]	eval-rmse:0.48659	train-rmse:0.48589
[8]	eval-rmse:0.48584	train-rmse:0.48506
[9]	eval-rmse:0.48538	train-rmse:0.48458
[10]	eval-rmse:0.48487	train-rmse:0.48396
[11]	eval-rmse:0.48448	train-rmse:0.48346
[12]	eval-rmse:0.48408	train-rmse:0.48299
[13]	eval-rmse:0.48383	train-rmse:0.48262
[14]	eval-rmse:0.48362	train-rmse:0.48233
[15]	eval-rmse:0.48342	train-rmse:0.48209
[16]	eval-rmse:0.48330	train-rmse:0.48189
[17]	eval-rmse:0.48314	train-rmse:0.48166
[18]	eval-rmse:0.48305	train-rmse:0.48149
[19]	eval-rmse:0.48290	train-rmse:0.48127
[20]	eval-rmse:0.48280	train-rmse:0.48111
[21]	eval-rmse:0.48272	train-rmse:0.48096
[22]	eval-rmse:0.48266	train-rmse:0.48081
[23]	eval-rmse:0.48258	train-rmse:0.48064
[24]	eval-rmse:0.48255	train-rmse:0.48050
[25]	eval-rmse:0.48249	train-rmse:0.48039
[26]	eval-rmse:0.48245	train-rmse:0.48026
[27]	eval-rmse:0.48244	train-rmse:0.48014
[28]	eval-rmse:0.48242	train-rmse:0.480