# Data Collection and Preprocessing

**Importing Libraries**

In [66]:
from nba_api.stats.endpoints import leaguedashteamstats
import pandas as pd
from sklearn.preprocessing import StandardScaler

**Loading in Data (Looking at one year of data looks like)**

In [3]:
data =  leaguedashteamstats.LeagueDashTeamStats(season='2023-24',measure_type_detailed_defense='Advanced')#Focusing more on advanced stats
datadf = data.get_data_frames()[0]
datadf.head()


Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,...,AST_TO_RANK,AST_RATIO_RANK,OREB_PCT_RANK,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,EFG_PCT_RANK,TS_PCT_RANK,PACE_RANK,PIE_RANK
0,1610612737,Atlanta Hawks,82,36,46,0.439,3971.0,114.0,116.4,116.7,...,18,21,5,16,12,13,17,18,6,22
1,1610612738,Boston Celtics,82,64,18,0.78,3966.0,120.2,122.2,109.0,...,3,13,14,7,5,1,2,1,19,1
2,1610612751,Brooklyn Nets,82,32,50,0.39,3961.0,110.4,112.4,113.1,...,19,19,12,19,20,14,25,27,25,21
3,1610612766,Charlotte Hornets,82,21,61,0.256,3946.0,107.1,108.6,117.0,...,25,22,27,22,29,21,26,28,22,29
4,1610612741,Chicago Bulls,82,39,43,0.476,3996.0,112.4,114.0,113.2,...,13,24,11,17,14,3,22,21,28,20


**Creating Dataframe of Team Data from the last 5 years(Not including the two COVID shortened seasons since the difference in games played will mess up season wins predictor models)**

In [52]:
seasons = ['2023-24','2022-23','2021-22','2018-19','2017-18']

dfs = []
for season in seasons:
    stats = leaguedashteamstats.LeagueDashTeamStats(season=season,measure_type_detailed_defense='Advanced')
    statsdf = stats.get_data_frames()[0]
    statsdf.insert(loc = 0 , column = 'SEASON', value=season)
    dfs.append(statsdf)

NBA_df = pd.concat(dfs, ignore_index= True)
NBA_df

Unnamed: 0,SEASON,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,E_OFF_RATING,OFF_RATING,...,AST_TO_RANK,AST_RATIO_RANK,OREB_PCT_RANK,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,EFG_PCT_RANK,TS_PCT_RANK,PACE_RANK,PIE_RANK
0,2023-24,1610612737,Atlanta Hawks,82,36,46,0.439,3971.0,114.0,116.4,...,18,21,5,16,12,13,17,18,6,22
1,2023-24,1610612738,Boston Celtics,82,64,18,0.780,3966.0,120.2,122.2,...,3,13,14,7,5,1,2,1,19,1
2,2023-24,1610612751,Brooklyn Nets,82,32,50,0.390,3961.0,110.4,112.4,...,19,19,12,19,20,14,25,27,25,21
3,2023-24,1610612766,Charlotte Hornets,82,21,61,0.256,3946.0,107.1,108.6,...,25,22,27,22,29,21,26,28,22,29
4,2023-24,1610612741,Chicago Bulls,82,39,43,0.476,3996.0,112.4,114.0,...,13,24,11,17,14,3,22,21,28,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,2017-18,1610612758,Sacramento Kings,82,27,55,0.329,3951.0,101.1,103.1,...,20,23,17,15,27,14,27,30,30,29
146,2017-18,1610612759,San Antonio Spurs,82,47,35,0.573,3946.0,105.5,107.1,...,6,13,9,8,8,6,26,25,28,6
147,2017-18,1610612761,Toronto Raptors,82,59,23,0.720,3966.0,111.0,113.0,...,4,6,12,21,15,4,5,4,14,3
148,2017-18,1610612762,Utah Jazz,82,48,34,0.585,3951.0,106.2,107.4,...,24,17,21,3,7,24,9,11,25,4


**Taking Out Irrelevant Metrics that do not make sense to use for wins predictions**

In [50]:
NBA_df.columns #Looking at all the metrics in this dataframe to see which ones are irrelevant and not needed

Index(['SEASON', 'TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN',
       'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
       'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO', 'AST_RATIO',
       'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT',
       'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK', 'W_RANK',
       'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
       'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
       'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
       'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
       'PIE_RANK'],
      dtype='object')

In [53]:
NBA_df = NBA_df.drop(['SEASON','TEAM_ID','TEAM_NAME','GP','L','W_PCT','MIN'], axis = 1) #Getting rid of columns not needed or interfere with wins predictions

print("Number of Columns Left: ",len(NBA_df.columns)) #Amount of metrics left

NBA_df

Number of Columns Left:  40


Unnamed: 0,W,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TO,AST_RATIO,...,AST_TO_RANK,AST_RATIO_RANK,OREB_PCT_RANK,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,EFG_PCT_RANK,TS_PCT_RANK,PACE_RANK,PIE_RANK
0,36,114.0,116.4,116.7,118.4,-2.6,-2.0,0.618,1.96,18.4,...,18,21,5,16,12,13,17,18,6,22
1,64,120.2,122.2,109.0,110.6,11.2,11.7,0.613,2.25,19.3,...,3,13,14,7,5,1,2,1,19,1
2,32,110.4,112.4,113.1,115.4,-2.7,-2.9,0.630,1.95,18.6,...,19,19,12,19,20,14,25,27,25,21
3,21,107.1,108.6,117.0,119.2,-9.9,-10.6,0.620,1.80,18.4,...,25,22,27,22,29,21,26,28,22,29
4,39,112.4,114.0,113.2,115.7,-0.8,-1.7,0.594,2.04,18.2,...,13,24,11,17,14,3,22,21,28,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,27,101.1,103.1,109.0,110.4,-7.9,-7.3,0.556,1.57,16.6,...,20,23,17,15,27,14,27,30,30,29
146,47,105.5,107.1,102.4,104.1,3.1,3.0,0.583,1.73,17.3,...,6,13,9,8,8,6,26,25,28,6
147,59,111.0,113.0,103.4,105.3,7.6,7.7,0.590,1.82,17.9,...,4,6,12,21,15,4,5,4,14,3
148,48,106.2,107.4,101.6,103.0,4.6,4.4,0.586,1.53,17.1,...,24,17,21,3,7,24,9,11,25,4


**Checking for any empty values**

In [59]:
sum(NBA_df.isnull().sum()) #Finds and counts up anywhere there is an empty value, it appears there are no empty values

0

**Notable NBA team averages over past 5 non-COVID seasons**

In [38]:
avg = NBA_df.mean()
print('NBA team averages the past 5 non-COVID seasons')
pd.DataFrame(avg).iloc[6:26,:]

NBA team averages the past 5 non-COVID seasons


  avg = NBA_df.mean()


Unnamed: 0,0
E_OFF_RATING,109.604
OFF_RATING,111.513333
E_DEF_RATING,109.592667
DEF_RATING,111.503333
E_NET_RATING,0.009333
NET_RATING,0.012667
AST_PCT,0.605213
AST_TO,1.792067
AST_RATIO,18.003333
OREB_PCT,0.275313


**Establishing Input and Target Variables**

In [63]:
X = NBA_df.drop('W', axis = 1) #Basically the dataframe with all the metrics except wins(the target variable)
y = NBA_df['W']


0      36
1      64
2      32
3      21
4      39
       ..
145    27
146    47
147    59
148    48
149    43
Name: W, Length: 150, dtype: int64

**Standardizing Input Features**

In [76]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[ 1.10965351,  1.23004066,  1.9596556 , ...,  0.28883564,
        -1.0968174 ,  0.75097267],
       [ 2.67467893,  2.68997977, -0.16341186, ..., -1.67524673,
         0.40620012, -1.67524673],
       [ 0.20092907,  0.22318609,  0.96705263, ...,  1.32864396,
         1.09990051,  0.63543842],
       ...,
       [ 0.35238314,  0.37421428, -1.70746092, ..., -1.32864396,
        -0.17188354, -1.44417822],
       [-0.85924944, -1.03538211, -2.20376241, ..., -0.51990416,
         1.09990051, -1.32864396],
       [-0.68255302, -0.85918256, -0.93543639, ..., -0.4043699 ,
         0.05934992, -0.28883564]])