In [69]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

### Loading the data 

In [59]:
data = pd.read_csv('Data/data.csv')
pd.set_option('display.max_columns', None)
data.shape

(119011, 34)

In [60]:
data.tail()

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,PLAYER_NAME,PLAYER_ID,PLAYER_HEIGHT,DEFENDER_HEIGHT,PLAYER_PTS,PLAYER_FG%,PLAYER_3P%,PLAYER_FT%,PLAYER_AST,PLAYER_WEIGHT,DEFENDER_OREB,DEFENDER_DREB,DEFENDER_STL,DEFENDER_BLK,DEFENDER_WEIGHT
119006,21400175,"NOV 21, 2014 - MIL @ TOR",A,L,-42,3,4,8:55,15.2,0,0.8,22.9,3,missed,BRUNO CABOCLO,203998,10.6,0,0,JARED DUDLEY,201162,197.5,202.5,7.2,46.8,38.5,71.6,1.8,101.25,0.1,0.1,0.0,0.1,98.1
119007,21400003,"OCT 28, 2014 - HOU @ LAL",A,W,18,3,3,1:35,3.3,1,2.4,6.7,2,made,JULIUS RANDLE,203944,5.3,1,2,KOSTAS PAPANIKOLAOU,203123,200.0,202.5,4.2,35.0,29.2,72.2,2.0,101.25,0.0,0.0,0.0,0.0,112.5
119008,21400003,"OCT 28, 2014 - HOU @ LAL",A,W,18,2,2,10:10,24.0,0,0.3,4.1,2,made,JULIUS RANDLE,203944,3.2,1,2,DWIGHT HOWARD,2730,207.5,202.5,15.8,59.3,50.0,52.8,1.2,119.25,0.0,0.0,0.0,0.0,112.5
119009,21400744,"FEB 05, 2015 - SAC vs. DAL",H,L,-23,9,4,3:21,13.3,5,5.4,21.9,2,missed,RICKY LEDO,203495,5.0,0,0,NIK STAUSKAS,203917,195.0,201.0,4.4,36.5,32.2,85.9,0.9,92.25,0.9,1.2,0.4,0.1,88.0
119010,21400498,"JAN 03, 2015 - MIN vs. UTA",H,L,-12,7,4,0:26,2.6,1,0.7,4.3,2,made,TOURE' MURRY,203315,2.8,1,2,ANTHONY BENNETT,203461,200.0,192.5,5.2,42.1,30.4,64.1,0.8,110.25,0.2,0.0,0.2,0.0,87.75


In [61]:
data.columns.values

array(['GAME_ID', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN',
       'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_CLOCK', 'DRIBBLES',
       'TOUCH_TIME', 'SHOT_DIST', 'PTS_TYPE', 'SHOT_RESULT',
       'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID', 'CLOSE_DEF_DIST',
       'FGM', 'PTS', 'PLAYER_NAME', 'PLAYER_ID', 'PLAYER_HEIGHT',
       'DEFENDER_HEIGHT', 'PLAYER_PTS', 'PLAYER_FG%', 'PLAYER_3P%',
       'PLAYER_FT%', 'PLAYER_AST', 'PLAYER_WEIGHT', 'DEFENDER_OREB',
       'DEFENDER_DREB', 'DEFENDER_STL', 'DEFENDER_BLK', 'DEFENDER_WEIGHT'],
      dtype=object)

### Manipulating the data

In [62]:
# separate data into matrix of predictor variables and vector of target variables
names = ['LOCATION', 'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_CLOCK',
         'DRIBBLES', 'SHOT_DIST', 'PTS_TYPE', 'CLOSE_DEF_DIST', 'FGM', 'PTS',
         'PLAYER_HEIGHT','DEFENDER_HEIGHT', 'PLAYER_PTS', 'PLAYER_FG%', 'PLAYER_3P%',
          'PLAYER_FT%','DEFENDER_STL', 'DEFENDER_BLK', 'SHOT_RESULT']

# converting data to usable numerical data
# todo: move this to preprocessing 
def convert_time(time_string):
    time = time_string.split(':')
    return float(time[0])*60 + float(time[1])

# convert game_clock to seconds
data['GAME_CLOCK'] = data['GAME_CLOCK'].apply(lambda x: convert_time(x))

# convert shot_result to either 0 or 1
data['SHOT_RESULT'] = (data['SHOT_RESULT'] == 'made').astype(int)
# convert win to either 0 

# encode categorical data 
data_encoded = pd.get_dummies(data[names])

# split into train and test set with roughly 75-25 split
train, test = train_test_split(data_encoded, test_size=0.25)
Xtrain = train.drop(['SHOT_RESULT'], axis=1)
ytrain = train['SHOT_RESULT']
Xtest = test.drop(['SHOT_RESULT'], axis=1)
ytest = test['SHOT_RESULT']

### Setting the baseline

First baseline we consider is to predict every shot within 5 feet of the basket as a make and anything else as a miss. 

In [63]:
yhat = (Xtest['SHOT_DIST'] <= 5).astype(int)
acc = np.mean(yhat == ytest)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 0.606460


A more complex baseline we can try is to use k-nn algorithm. First, we need to normalize the data so that no one parameter exerts more influence on the result than others. 

In [71]:
# without normalizing data 
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(Xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [74]:
yhat_ts = neigh.predict(Xtest)

In [73]:
ytest

92690     0
38460     0
76087     1
18083     0
77878     1
         ..
108621    0
90672     1
61232     0
116342    0
23770     1
Name: SHOT_RESULT, Length: 29753, dtype: int64

In [75]:
acc = np.mean(ytest == yhat_ts)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 0.630121


In [76]:
# with normaliziation 
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_norm = scaler.transform(Xtrain)
scaler.fit(Xtest)
Xtest_norm = scaler.transform(Xtest)

In [77]:
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(Xtrain_norm,ytrain)
yhat_ts_norm = neigh.predict(Xtest_norm)

In [78]:
acc = np.mean(ytest == yhat_ts_norm)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 1.000000


In [79]:
Xtest_norm

array([[ 1.17997593,  1.34156716,  1.50316474, ..., -0.86539633,
        -1.00772655,  1.00772655],
       [-0.09789587,  1.34156716,  0.48305355, ...,  1.35858441,
         0.99233269, -0.99233269],
       [-1.16278903, -1.28838415, -0.50734567, ...,  1.17325268,
        -1.00772655,  1.00772655],
       ...,
       [ 0.3280614 ,  0.46491672,  0.57714147, ...,  0.06126231,
        -1.00772655,  1.00772655],
       [ 3.09678362,  1.34156716,  0.1562218 , ..., -0.86539633,
        -1.00772655,  1.00772655],
       [-0.52385313, -0.41173371,  1.23575695, ..., -0.86539633,
        -1.00772655,  1.00772655]])