In [1]:
# Load Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

import seaborn as sns
from sklearn.cluster import KMeans

In [2]:
# Concatenating PBP scoring files
pbp_sc_files = []
for i in np.arange(1,22):
    name = 'pbpstats_export-{}.csv'.format(i)
    file = pd.read_csv(name)
    file['Season'] = 2022-i
    pbp_sc_files.append(file)
pbp_sc = pd.concat(pbp_sc_files)

# Final PBP scoring csv
pbp_sc.to_csv('pbp_sc.csv')

In [3]:
# Concatenating PBP assist files
pbp_as_files = []
for i in np.arange(-42,-21):
    name = 'pbpstats_export-{}.csv'.format(-i)
    file = pd.read_csv(name)
    file['Season'] = 2000-21-i
    pbp_as_files.append(file)    
pbp_as = pd.concat(pbp_as_files)

# Final PBP assist csv
pbp_as.to_csv('pbp_as.csv')

In [4]:
# Reading in csv files
pbp_score = pd.read_csv('pbp_sc.csv')
pbp_assist = pd.read_csv('pbp_as.csv')
seasonStats1950to2017 = pd.read_csv('Season_Stats_1950to2017.csv')
seasonStats2017to2019 = pd.read_csv('Season_Stats_2017to2019.csv')
seasonStats2019to2021 = pd.read_csv('Season_Stats_2019to2021.csv')

In [5]:
# Drop data from before 2000-2001 season
seasonStats2000to2017 = seasonStats1950to2017[seasonStats1950to2017['Year'] >= 2001]
# Reset index to default so idx doesn't start at some random value
seasonStats2000to2017 = seasonStats2000to2017.reset_index()
# Drop some irrelevant columns
seasonStats2000to2017 = seasonStats2000to2017.drop(['index', 'Unnamed: 0', 'blanl', 'blank2', 'Pos'], axis = 1)
# Change the datatype of year (= season) from float64 to int
seasonStats2000to2017 = seasonStats2000to2017.astype({'Year': int})

In [6]:
# Add missing columns into 2019 to 2021 data
seasonStats2019to2021['FG%'] = round(seasonStats2019to2021['FG']/seasonStats2019to2021['FGA'], 3)
seasonStats2019to2021['2P%'] = round(seasonStats2019to2021['2P']/seasonStats2019to2021['2PA'],3)
seasonStats2019to2021['3P%'] = round(seasonStats2019to2021['3P']/seasonStats2019to2021['3PA'],3)
seasonStats2019to2021['FT%'] = round(seasonStats2019to2021['FT']/seasonStats2019to2021['FTA'],3)
seasonStats2019to2021['eFG%'] = round(((seasonStats2019to2021['FG'] + (0.5*seasonStats2019to2021['3P']))/
                                 seasonStats2019to2021['FGA']),3)
seasonStats2019to2021['TS%'] = round(((seasonStats2019to2021['PTS'])/
                                (2*(seasonStats2019to2021['FGA'])+(0.44*seasonStats2019to2021['FTA']))),3)

In [7]:
# Drop extraneous columns from 2017 to 2019 and 2019 to 2021 data
seasonStats2017to2019 = seasonStats2017to2019.drop(['ORtg','DRtg', 'Lg'], axis = 1)
seasonStats2019to2021 = seasonStats2019to2021.drop(['ORtg','DRtg', 'Lg'], axis = 1)

In [8]:
# Drop all rows with NaNs from 2017 to 2019 data. There are rows of NaNs at the end of the 2017 to 2019 dataframe.
seasonStats2017to2019 = seasonStats2017to2019.dropna(how = 'all')

In [9]:
# Renaming and reordering columns across the 3 dfs
seasonStats2000to2017 = seasonStats2000to2017.rename(columns = {'Year': 'Season'})
cols = list(seasonStats2000to2017.columns)
seasonStats2017to2019 = seasonStats2017to2019[cols]
seasonStats2019to2021 = seasonStats2019to2021[cols]

In [10]:
# Converting Season column values to be same across the 3 dfs
for i in np.arange(len(seasonStats2017to2019)):
    seasonStats2017to2019.loc[i,'Season'] = int(seasonStats2017to2019.loc[i,'Season'][-2:])+2000

for i in np.arange(len(seasonStats2019to2021)):
    seasonStats2019to2021.loc[i,'Season'] = int(seasonStats2019to2021.loc[i,'Season'][-2:])+2000

In [11]:
# Combining the 3 df into a single stathead df
seasonStats = [seasonStats2000to2017,seasonStats2017to2019,seasonStats2019to2021]
nba = pd.concat(seasonStats)

In [12]:
# Cleaning Team Names
nba = nba.rename(columns = {'tm': 'team'})
nba.columns = [header.lower() for header in nba.columns]

# Changing team names to modern abbreviations
nba = nba.replace({'team':{
    'CHO': 'CHA',
    'CHH': 'CHA',
    'BRK': 'BKN',
    'NJN': 'BKN',
    'VAN': 'MEM',
    'PHO': 'PHX',
    'NOH': 'NOP',
    'NOK': 'NOP',
    'SEA': 'OKC',
}})

In [13]:
## Addressing player names with accents
names = nba.player.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
nba.player = names

In [14]:
# Removing duplicate rows with players playing for multiple teams in one season
# Keeping row of aggregate stats for that player for that season

nba = nba.drop_duplicates(subset = ['player','season'])

In [15]:
# 70% mins cutoff
mins70 = list(nba.groupby('season').mp.quantile(q=0.3))
all_bl = [False]*len(nba)
year = 2001
for i in np.arange(len(mins70)):
    bl = np.logical_and(nba.season==year , nba.mp > mins70[i])
    all_bl = np.logical_or(all_bl, bl)
    year += 1

In [16]:
# Assigning final rows of df (NEED TO ADD PBP though)
nba_filtered = nba.loc[all_bl]

In [17]:
# Change datatypes of certain columns to match their true type
nba_filtered = nba_filtered.astype({'season': int, 'age': int, 'g': int, 'gs': int, 'mp': int, 'fg': int, 'fga': int,
                                  '3p': int, '3pa': int, '2p': int, '2pa': int, 'ft': int, 'fta': int, 'orb': int,
                                  'drb': int, 'trb': int, 'ast': int, 'stl': int, 'blk': int, 'tov': int, 'pf': int,
                                  'pts': int})
nba_filtered.head()

Unnamed: 0,season,player,age,tm,g,gs,mp,per,ts%,3par,...,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts
2,2001,Shareef Abdur-Rahim,24,VAN,81,81,3241,19.1,0.549,0.05,...,0.834,175,560,735,250,90,77,231,238,1663
4,2001,Courtney Alexander,23,TOT,65,24,1382,11.6,0.484,0.08,...,0.82,42,101,143,62,45,5,75,139,618
7,2001,Ray Allen,25,MIL,82,82,3129,22.9,0.61,0.357,...,0.888,101,327,428,374,124,20,204,192,1806
9,2001,John Amaechi,30,ORL,82,36,1710,8.7,0.455,0.012,...,0.631,77,191,268,74,28,29,124,175,650
10,2001,Derek Anderson,26,SAS,82,82,2859,17.1,0.542,0.255,...,0.851,75,288,363,301,120,14,165,188,1269


In [18]:
# Need to make decision about which cols to keep
#list(nba_filtered.columns)

In [19]:
# Drop some irrelevant, unnecessary, and/or redundant columns from PBP data
pbp_score = pbp_score.drop(['Unnamed: 0', 'TeamAbbreviation', 'GamesPlayed', 'Minutes', 'Points','FG2M', 'FG2A',
                            'Fg2Pct', 'FG3M', 'FG3A', 'Fg3Pct', 'FG3APct','EfgPct', 'TsPct','Usage'], axis = 1)
pbp_assist = pbp_assist.drop(['Unnamed: 0', 'TeamAbbreviation', 'GamesPlayed', 'Minutes', 'Assists'], axis = 1)

In [20]:
# Tidying up columns in PBP data
pbp_score.columns = [header.lower() for header in pbp_score.columns]
pbp_assist.columns = [header.lower() for header in pbp_assist.columns]

In [38]:
# Rename name to score for pbp
pbp_score = pbp_score.rename(columns = {'name' : 'player'})
pbp_assist = pbp_assist.rename(columns = {'name' : 'player'})

# Merge 2 pbp files
pbp = pbp_score.merge(pbp_assist)

In [47]:
test = nba_filtered.merge(pbp,how = 'inner', on = ['player', 'season'])
test ## MAY WORK
list(test.columns)
len(list(test.columns))
len(nba_filtered.columns)
test

Unnamed: 0,season,player,age,tm,g,gs,mp,per,ts%,3par,...,fg3ablocked,fg3apctblocked,assistpoints,twoptassists,threeptassists,atrimassists,shortmidrangeassists,longmidrangeassists,corner3assists,arc3assists
0,2001,Shareef Abdur-Rahim,24,VAN,81,81,3241,19.1,0.549,0.050,...,1,0.015625,556,194,56,58,25,111,9,47
1,2001,Courtney Alexander,23,TOT,65,24,1382,11.6,0.484,0.080,...,0,0.000000,131,55,7,19,6,30,2,5
2,2001,Ray Allen,25,MIL,82,82,3129,22.9,0.610,0.357,...,2,0.004283,846,276,98,101,78,97,36,62
3,2001,John Amaechi,30,ORL,82,36,1710,8.7,0.455,0.012,...,0,0.000000,171,51,23,31,6,14,3,20
4,2001,Derek Anderson,26,SAS,82,82,2859,17.1,0.542,0.255,...,2,0.007905,674,229,72,115,51,63,33,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6629,2021,Delon Wright,28,TOT,63,39,1748,16.3,0.591,0.332,...,1,0.005814,703,131,147,83,36,12,44,103
6630,2021,Thaddeus Young,32,CHI,68,23,1652,20.3,0.599,0.068,...,0,0.000000,685,188,103,124,36,28,37,66
6631,2021,Trae Young,22,ATL,63,63,2125,23.0,0.647,0.357,...,2,0.005038,1403,379,215,289,67,23,70,145
6632,2021,Cody Zeller,28,CHO,48,21,1005,18.2,0.644,0.086,...,0,0.000000,212,46,40,26,9,11,15,25
