# This is a playground for performing experiments before implementing them in the main code using pyspark for quick access

In [233]:
import pandas as pd
import glob

csv_path = "./NBA shot log 16-17-regular season/Shot data/"

# read all csvs in the directory
df = pd.concat([pd.read_csv(f) for f in glob.glob(csv_path + "*.csv")], ignore_index=True)

# Print the number of rows and columns
print(df.shape)

(210072, 16)


In [234]:
# read player data
df_player = pd.read_csv("./NBA shot log 16-17-regular season/Player Regular 16-17 Stats.csv")



print(df_player.columns)
# remove the columns that are not needed [#Date/Time of Update: 2017-05-09 4:34:01 PM, #Player ID, #Jersey Num, #Birth Date, #Birth City, #Birth Couuntry, #Team ID
#                                         #Team Abbr, #Team City, #Team Name]
df_player = df_player.drop(['#Date/Time of Update: 2017-05-09 4:34:01 PM', '#Player ID', '#Jersey Num', '#Birth City', '#Birth Country', '#Team ID', '#Team Abbr', '#Team City', '#Team Name', '#Position'], axis=1)
print(df_player.columns)

# combine First Name and Last Name to create a new column called Player Name
df_player['shoot player'] = df_player['#FirstName'] + " " + df_player['#LastName']
df_player = df_player.drop(['#FirstName', '#LastName'], axis=1)
print(df_player.columns)

# connect df_player and df[0]
df_merged = pd.merge(df, df_player, on='shoot player', how='inner')
df_merged.to_csv("output/Standard/Single File/merged.csv", index=False)
print(df_merged.columns)

Index(['#Date/Time of Update: 2017-05-09 4:34:01 PM', '#Player ID',
       '#LastName', '#FirstName', '#Jersey Num', '#Position', '#Height',
       '#Weight', '#Birth Date', '#Age', '#Birth City', '#Birth Country',
       '#Rookie', '#Team ID', '#Team Abbr', '#Team City', '#Team Name',
       '#GamesPlayed', '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade',
       '#FtAtt', '#FtMade'],
      dtype='object')
Index(['#LastName', '#FirstName', '#Height', '#Weight', '#Birth Date', '#Age',
       '#Rookie', '#GamesPlayed', '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt',
       '#Fg3PtMade', '#FtAtt', '#FtMade'],
      dtype='object')
Index(['#Height', '#Weight', '#Birth Date', '#Age', '#Rookie', '#GamesPlayed',
       '#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt',
       '#FtMade', 'shoot player'],
      dtype='object')
Index(['self previous shot', 'player position', 'home game', 'location x',
       'opponent previous shot', 'home team', 'shot type', 'points',
       'away tea

In [235]:
df_merged_test = df_merged
# print unique values in self previous shot
columns_to_print = ['self previous shot', 'player position', 'home game', 'opponent previous shot', 'home team', 'points', 'time from last shot', 'quarter', 'current shot outcome', '#Position',
       '#Height', '#Weight', '#Age', '#Rookie', '#GamesPlayed', '#Fg2PtAtt',
       '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt', '#FtMade']



# Map poitns from [2, 3] to [0, 1]
df_merged_test['points'] = df_merged_test['points'].map({2: 0, 3: 1})

# for players with points = 2 add an extra column called accuracy for 2 pointers adn for players with points = 3 add an extra column called accuracy for 3 pointers
df_merged_test['accuracy'] = 0
df_merged_test['accuracy'][df_merged_test['points'] == 0] = df_merged_test['#Fg2PtMade'] / df_merged_test['#Fg2PtAtt']
df_merged_test['accuracy'][df_merged_test['points'] == 1] = df_merged_test['#Fg3PtMade'] / df_merged_test['#Fg3PtAtt']

# Drop players with 0 attempts in Fg2PtAtt, Fg3PtAtt, FtAtt
df_merged_test = df_merged_test[df_merged_test['#Fg2PtAtt'] != 0]
df_merged_test = df_merged_test[df_merged_test['#Fg3PtAtt'] != 0]
df_merged_test = df_merged_test[df_merged_test['#FtAtt'] != 0]

# Drop Fg2PtAtt, Fg2PtMade, Fg3PtAtt, Fg3PtMade, FtAtt, FtMade
df_merged_test = df_merged_test.drop(['#Fg2PtAtt', '#Fg2PtMade', '#Fg3PtAtt', '#Fg3PtMade', '#FtAtt', '#FtMade'], axis=1)
# Drop irrelevant columns
df_merged_test = df_merged_test.drop(['date','#Birth Date', 'away team', 'home team', 'shoot player'], axis=1)

# Convert height from feet-inches to inches
# height map
height_map = {'5\'4\"': 64, '5\'9\"': 69, '5\'10\"': 70, '5\'11\"': 71,
       '6\'0\"': 72, '6\'1\"': 73, '6\'2\"': 74, '6\'3\"': 75, '6\'4\"': 76, '6\'5\"': 77, '6\'6\"': 78, '6\'7\"': 79, '6\'8\"': 80,
       '6\'9\"': 81, '6\'10\"': 82, '6\'11\"': 83, '7\'0\"': 84, '7\'1\"': 85, '7\'2\"': 86, '7\'3\"': 87}
# convert height to inches using height_map
df_merged_test['#Height'] = df_merged_test['#Height'].map(height_map)
# replace height nans with the mean
df_merged_test['#Height'] = df_merged_test['#Height'].fillna(round(df_merged_test['#Height'].mean()))
# Normalize height subtracting the min and dividing by the range
df_merged_test['#Height'] = (df_merged_test['#Height'] - df_merged_test['#Height'].min()) / (df_merged_test['#Height'].max() - df_merged_test['#Height'].min())

# Fill age nans with the mean
df_merged_test['#Age'] = df_merged_test['#Age'].fillna(round(df_merged_test['#Age'].mean()))
# Normalize age using z-score
df_merged_test['#Age'] = (df_merged_test['#Age'] - df_merged_test['#Age'].mean()) / df_merged_test['#Age'].std()
# Fill weight nans with the mean
df_merged_test['#Weight'] = df_merged_test['#Weight'].fillna(round(df_merged_test['#Weight'].mean()))
# Normalize weight using z-score
df_merged_test['#Weight'] = (df_merged_test['#Weight'] - df_merged_test['#Weight'].mean()) / df_merged_test['#Weight'].std()

# Convert rookie to 0 and 1
df_merged_test['#Rookie'] = df_merged_test['#Rookie'].map({'N': 0, 'Y': 1})

# drop nans from rows with location x as null
# reason, very few they represent less than 0.1% of our dataset and have a null y location as well
df_merged_test = df_merged_test.dropna(subset=['location x']) 
# change the columns with location x > 470 to 940 - location x
df_merged_test.loc[df_merged_test['location x'] > 470, 'location x'] = 940 - df_merged_test['location x']

# normalize location x and location y
df_merged_test['location x'] = df_merged_test['location x'] / 470 # half court width
df_merged_test['location y'] = df_merged_test['location y'] / 500 # court length

# get time from time table in the form(MM:SS) and convert to seconds and get quarter and multiply by 720
df_merged_test['time'] = df_merged_test['time'].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
df_merged_test['time'] = df_merged_test['time'] + (df_merged_test['quarter'] - 1) * 720

# divide time by 2880 to normalize it
df_merged_test['time'] = df_merged_test['time'] / 2880

# drop quarter column
#df_merged_test = df_merged_test.drop(['quarter'], axis=1)


# Map player position from ['SF' 'C' 'SG' 'PG' 'PF' 'G' 'F'] to [0, 1, 2, 3, 4 ,5, 6]
df_merged_test['player position'] = df_merged_test['player position'].map({'SF': 0, 'C': 1, 'SG': 2, 'PG': 3, 'PF': 4, 'G': 5, 'F': 6})

# Drop nulls in self previous shot and opponent previous shot
df_merged_test = df_merged_test.dropna(subset=['self previous shot', 'opponent previous shot'])

# Map current shot outcome from ['MISSED' 'BLOCKED' 'SCORED'] to [0, 0, 1]
df_merged_test['self previous shot'] = df_merged_test['self previous shot'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})
df_merged_test['opponent previous shot'] = df_merged_test['opponent previous shot'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})
df_merged_test['current shot outcome'] = df_merged_test['current shot outcome'].map({'MISSED': 0, 'BLOCKED': 0, 'SCORED': 1})

df_merged_test.to_csv("output/Standard/Single File/merged_test.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test['accuracy'][df_merged_test['points'] == 0] = df_merged_test['#Fg2PtMade'] / df_merged_test['#Fg2PtAtt']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test['accuracy'][df_merged_test['points'] == 1] = df_merged_test['#Fg3PtMade'] / df_merged_test['#Fg3PtAtt']


In [236]:
for column in ['self previous shot', 'opponent previous shot']:
    print(column + ":", df_merged_test[column].unique())

# print nulls per column in df_player
df_merged_test['time from last shot'] = df_merged_test['time from last shot'].fillna(round(df_merged_test['time from last shot'].mean()))


# print first 10 rows with time from last shot as nan
#print(df[df['time from last shot'].isnull()].head(10))
corr = df_merged_test.corr()
print(df_merged_test.columns)
# print correlation with "curent shot outcome"
print(corr['current shot outcome'].sort_values(ascending=False))

self previous shot: [0 1]
opponent previous shot: [1 0]
Index(['self previous shot', 'player position', 'home game', 'location x',
       'opponent previous shot', 'shot type', 'points', 'location y', 'time',
       'time from last shot', 'quarter', 'current shot outcome', '#Height',
       '#Weight', '#Age', '#Rookie', '#GamesPlayed', 'accuracy'],
      dtype='object')
current shot outcome      1.000000
accuracy                  0.172877
#Weight                   0.047568
#Height                   0.045343
#GamesPlayed              0.019273
location y                0.003124
#Age                     -0.004877
self previous shot       -0.008196
opponent previous shot   -0.014190
player position          -0.015410
quarter                  -0.016007
time                     -0.019987
#Rookie                  -0.020353
time from last shot      -0.040114
points                   -0.135422
location x               -0.159820
Name: current shot outcome, dtype: float64


In [237]:
# train a model to predict current shot outcome