In [1]:
## Load Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

import seaborn as sns
from sklearn.cluster import KMeans

In [2]:
## Concatenating PBP scoring files
pbp_sc_files = []
for i in np.arange(1,22):
    name = 'pbpstats_export-{}.csv'.format(i)
    file = pd.read_csv(name)
    file['Season'] = 2022-i
    pbp_sc_files.append(file)
pbp_sc = pd.concat(pbp_sc_files)

## Final PBP scoring csv
pbp_sc.to_csv('pbp_sc.csv')

In [3]:
## Concatenating PBP assist files
pbp_as_files = []
for i in np.arange(-42,-21):
    name = 'pbpstats_export-{}.csv'.format(-i)
    file = pd.read_csv(name)
    file['Season'] = 2000-21-i
    pbp_as_files.append(file)    
pbp_as = pd.concat(pbp_as_files)

## Final PBP assist csv
pbp_as.to_csv('pbp_as.csv')

In [4]:
## Reading in csv files
pbp_score = pd.read_csv('pbp_sc.csv')
pbp_assist = pd.read_csv('pbp_as.csv')
seasonStats1950to2017 = pd.read_csv('Season_Stats_1950to2017.csv')
seasonStats2017to2019 = pd.read_csv('Season_Stats_2017to2019.csv')
seasonStats2019to2021 = pd.read_csv('Season_Stats_2019to2021.csv')

In [5]:
## Drop data from before 2000-2001 season
seasonStats2000to2017 = seasonStats1950to2017[seasonStats1950to2017['Year'] >= 2001]
# Reset index to default so idx doesn't start at some random value
seasonStats2000to2017 = seasonStats2000to2017.reset_index()
# Drop some irrelevant columns
seasonStats2000to2017 = seasonStats2000to2017.drop(['index', 'Unnamed: 0', 'blanl', 'blank2', 'Pos'], axis = 1)
# Change the datatype of year (= season) from float64 to int
seasonStats2000to2017 = seasonStats2000to2017.astype({'Year': int})

In [6]:
# Add missing columns into 2019 to 2021 data
seasonStats2019to2021['FG%'] = round(seasonStats2019to2021['FG']/seasonStats2019to2021['FGA'], 3)
seasonStats2019to2021['2P%'] = round(seasonStats2019to2021['2P']/seasonStats2019to2021['2PA'],3)
seasonStats2019to2021['3P%'] = round(seasonStats2019to2021['3P']/seasonStats2019to2021['3PA'],3)
seasonStats2019to2021['FT%'] = round(seasonStats2019to2021['FT']/seasonStats2019to2021['FTA'],3)
seasonStats2019to2021['eFG%'] = round(((seasonStats2019to2021['FG'] + (0.5*seasonStats2019to2021['3P']))/
                                 seasonStats2019to2021['FGA']),3)
seasonStats2019to2021['TS%'] = round(((seasonStats2019to2021['PTS'])/
                                (2*(seasonStats2019to2021['FGA'])+(0.44*seasonStats2019to2021['FTA']))),3)

In [7]:
# Drop extraneous columns from 2017 to 2019 and 2019 to 2021 data
seasonStats2017to2019 = seasonStats2017to2019.drop(['ORtg','DRtg', 'Lg'], axis = 1)
seasonStats2019to2021 = seasonStats2019to2021.drop(['ORtg','DRtg', 'Lg'], axis = 1)

In [8]:
# Drop all rows with NaNs from 2017 to 2019 data. There are rows at the end of the 2017 to 2019 dataframe.
seasonStats2017to2019 = seasonStats2017to2019.dropna(how = 'all')

In [11]:
## Renaming and reordering columns across the 3 dfs
seasonStats2000to2017 = seasonStats2000to2017.rename(columns = {'Year': 'Season'})
cols = list(seasonStats2000to2017.columns)
seasonStats2017to2019 = seasonStats2017to2019[cols]
seasonStats2019to2021 = seasonStats2019to2021[cols]

In [12]:
## Converting Season column values to be same across the 3 dfs
for i in np.arange(len(seasonStats2017to2019)):
    seasonStats2017to2019.loc[i,'Season'] = int(seasonStats2017to2019.loc[i,'Season'][-2:])+2000

for i in np.arange(len(seasonStats2019to2021)):
    seasonStats2019to2021.loc[i,'Season'] = int(seasonStats2019to2021.loc[i,'Season'][-2:])+2000

In [14]:
## Combining the 3 df into a single stathead df
seasonStats = [seasonStats2000to2017,seasonStats2017to2019,seasonStats2019to2021]
nba = pd.concat(seasonStats)

In [21]:
## Cleaning Team Names
nba = nba.rename(columns = {'tm': 'team'})
nba.columns = [header.lower() for header in nba.columns]

## Changing team names to modern abbreviations
nba = nba.replace({'team':{
    'CHO': 'CHA',
    'CHH': 'CHA',
    'BRK': 'BKN',
    'NJN': 'BKN',
    'VAN': 'MEM',
    'PHO': 'PHX',
    'NOH': 'NOP',
    'NOK': 'NOP'
}})

## DROP SEATTLE SONICS???

In [16]:
## NEED TO ADDRESS NAMES WITH ACCENTS

nba.loc[nba.player=="Dario Šarić"]

Unnamed: 0,season,player,age,tm,g,gs,mp,per,ts%,3par,...,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts
440,2018,Dario Šarić,23.0,PHI,78.0,73.0,2310.0,15.8,0.582,0.448,...,0.86,154.0,366.0,520.0,202.0,51.0,20.0,148.0,160.0,1141.0
977,2019,Dario Šarić,24.0,TOT,81.0,41.0,2023.0,13.1,0.563,0.464,...,0.88,126.0,331.0,457.0,127.0,45.0,9.0,97.0,182.0,858.0
438,2020,Dario Šarić,25.0,PHO,66.0,51.0,1632.0,14.2,0.622,0.437,...,0.844,100.0,306.0,406.0,123.0,37.0,16.0,88.0,156.0,704.0
975,2021,Dario Šarić,26.0,PHO,50.0,4.0,871.0,14.3,0.601,0.395,...,0.848,46.0,144.0,190.0,65.0,30.0,4.0,57.0,95.0,437.0


In [17]:
## 70% mins cutoff

mins70 = list(nba.groupby('season').mp.quantile(q=0.3))
all_bl = [False]*len(nba)
year = 2001
for i in np.arange(len(mins70)):
    bl = np.logical_and(nba.season==year , nba.mp > mins70[i])
    all_bl = np.logical_or(all_bl, bl)
    year += 1

In [18]:
## Assigning Final df (NEED TO ADD PBP though)
stats = nba.loc[all_bl]
stats

Unnamed: 0,season,player,age,tm,g,gs,mp,per,ts%,3par,...,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,2001,Mahmoud Abdul-Rauf,31.0,VAN,41.0,0.0,486.0,16.7,0.514,0.057,...,0.759,5.0,20.0,25.0,76.0,9.0,1.0,26.0,50.0,266.0
1,2001,Tariq Abdul-Wahad,26.0,DEN,29.0,12.0,420.0,5.8,0.438,0.090,...,0.583,14.0,45.0,59.0,22.0,14.0,13.0,34.0,54.0,111.0
2,2001,Shareef Abdur-Rahim,24.0,VAN,81.0,81.0,3241.0,19.1,0.549,0.050,...,0.834,175.0,560.0,735.0,250.0,90.0,77.0,231.0,238.0,1663.0
4,2001,Courtney Alexander,23.0,TOT,65.0,24.0,1382.0,11.6,0.484,0.080,...,0.820,42.0,101.0,143.0,62.0,45.0,5.0,75.0,139.0,618.0
5,2001,Courtney Alexander,23.0,DAL,38.0,6.0,472.0,6.8,0.404,0.056,...,0.733,20.0,43.0,63.0,21.0,16.0,3.0,21.0,76.0,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,2021,Delon Wright,28.0,TOT,63.0,39.0,1748.0,16.3,0.591,0.332,...,0.802,65.0,204.0,269.0,278.0,101.0,30.0,83.0,75.0,645.0
1065,2021,Thaddeus Young,32.0,CHI,68.0,23.0,1652.0,20.3,0.599,0.068,...,0.628,168.0,255.0,423.0,291.0,74.0,40.0,137.0,152.0,823.0
1066,2021,Trae Young,22.0,ATL,63.0,63.0,2125.0,23.0,0.647,0.357,...,0.886,38.0,207.0,245.0,594.0,53.0,12.0,261.0,111.0,1594.0
1067,2021,Cody Zeller,28.0,CHO,48.0,21.0,1005.0,18.2,0.644,0.086,...,0.714,119.0,209.0,328.0,86.0,27.0,17.0,51.0,121.0,451.0


In [24]:
## Make decision about which cols to keep
list(stats.columns)

['season',
 'player',
 'age',
 'tm',
 'g',
 'gs',
 'mp',
 'per',
 'ts%',
 '3par',
 'ftr',
 'orb%',
 'drb%',
 'trb%',
 'ast%',
 'stl%',
 'blk%',
 'tov%',
 'usg%',
 'ows',
 'dws',
 'ws',
 'ws/48',
 'obpm',
 'dbpm',
 'bpm',
 'vorp',
 'fg',
 'fga',
 'fg%',
 '3p',
 '3pa',
 '3p%',
 '2p',
 '2pa',
 '2p%',
 'efg%',
 'ft',
 'fta',
 'ft%',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts']