# NBA Hall of Fame Workbook

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 100

## Data Import

In [64]:
nba = pd.read_csv("data/Seasons_Stats.csv", index_col = 0)

In [115]:
nba.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950,Curly Armstrong,G-F,31,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0
1,1950,Cliff Barker,SG,29,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0
2,1950,Leo Barnhorst,SF,25,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0
3,1950,Ed Bartels,F,24,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0
4,1950,Ed Bartels,F,24,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0


In [66]:
nba.shape

(24691, 52)

In [67]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24691 entries, 0 to 24690
Data columns (total 52 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    24624 non-null  float64
 1   Player  24624 non-null  object 
 2   Pos     24624 non-null  object 
 3   Age     24616 non-null  float64
 4   Tm      24624 non-null  object 
 5   G       24624 non-null  float64
 6   GS      18233 non-null  float64
 7   MP      24138 non-null  float64
 8   PER     24101 non-null  float64
 9   TS%     24538 non-null  float64
 10  3PAr    18839 non-null  float64
 11  FTr     24525 non-null  float64
 12  ORB%    20792 non-null  float64
 13  DRB%    20792 non-null  float64
 14  TRB%    21571 non-null  float64
 15  AST%    22555 non-null  float64
 16  STL%    20792 non-null  float64
 17  BLK%    20792 non-null  float64
 18  TOV%    19582 non-null  float64
 19  USG%    19640 non-null  float64
 20  blanl   0 non-null      float64
 21  OWS     24585 non-null  float64
 22

In [68]:
nba.Tm.value_counts()

TOT    2123
NYK    1043
BOS     998
DET     917
PHI     871
       ... 
DNN      15
CAP      14
STB      14
CHS      13
CHP      12
Name: Tm, Length: 69, dtype: int64

In [69]:
nba.isnull().sum()

Year         67
Player       67
Pos          67
Age          75
Tm           67
G            67
GS         6458
MP          553
PER         590
TS%         153
3PAr       5852
FTr         166
ORB%       3899
DRB%       3899
TRB%       3120
AST%       2136
STL%       3899
BLK%       3899
TOV%       5109
USG%       5051
blanl     24691
OWS         106
DWS         106
WS          106
WS/48       590
blank2    24691
OBPM       3894
DBPM       3894
BPM        3894
VORP       3894
FG           67
FGA          67
FG%         166
3P         5764
3PA        5764
3P%        9275
2P           67
2PA          67
2P%         195
eFG%        166
FT           67
FTA          67
FT%         925
ORB        3894
DRB        3894
TRB         379
AST          67
STL        3894
BLK        3894
TOV        5046
PF           67
PTS          67
dtype: int64

## Data Cleaning

### Cleaning the NBA Dataframe

In [73]:
nba['Year'].value_counts(ascending = True)

1961.0    104
1956.0    106
1957.0    107
1955.0    109
1959.0    110
         ... 
2017.0    595
2008.0    595
2014.0    611
2011.0    625
2015.0    651
Name: Year, Length: 68, dtype: int64

In [100]:
nba['Year'] = np.where((nba['Year'].isna()), 1950, nba['Year'])
nba['Age'] = np.where((nba['Age'].isna()), nba['Age'].mean(), nba['Age'])

In [101]:
nba['Year'].isna().sum()

0

In [102]:
nba['Age'].isna().sum()

0

In [112]:
integer_types = ['Year', 'Age']

nba[integer_types] = nba[integer_types].astype(int)
nba.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950,Curly Armstrong,G-F,31,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0
1,1950,Cliff Barker,SG,29,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0
2,1950,Leo Barnhorst,SF,25,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0
3,1950,Ed Bartels,F,24,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0
4,1950,Ed Bartels,F,24,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0


In [113]:
nba.dtypes

Year        int32
Player     object
Pos        object
Age         int32
Tm         object
G         float64
GS        float64
MP        float64
PER       float64
TS%       float64
3PAr      float64
FTr       float64
ORB%      float64
DRB%      float64
TRB%      float64
AST%      float64
STL%      float64
BLK%      float64
TOV%      float64
USG%      float64
blanl     float64
OWS       float64
DWS       float64
WS        float64
WS/48     float64
blank2    float64
OBPM      float64
DBPM      float64
BPM       float64
VORP      float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

### Reading in Separate Dataframe to Capture NBA Champions

In [130]:
champs = pd.read_csv("data/champions.csv")

In [131]:
champs.head()

Unnamed: 0,Year,League,Champion,Tm,Runner_Up,Runner_Up_Code
0,2020,NBA,Los Angeles Lakers,LAL,Miami Heat,LAL
1,2019,NBA,Toronto Raptors,TOR,Golden State Warriors,TOR
2,2018,NBA,Golden State Warriors,GSW,Cleveland Cavaliers,GSW
3,2017,NBA,Golden State Warriors,GSW,Cleveland Cavaliers,GSW
4,2016,NBA,Cleveland Cavaliers,CLE,Golden State Warriors,CLE


In [132]:
champs.shape

(80, 6)

### Removing Total Index Values

In [135]:
nba = nba[nba.Tm != 'TOT']

### Adding Indexes for Comparing Datasets

In [157]:
nba["id"] = nba.index + 1
champs["id"] = champs.index + 1

### Group Dataframe by Player Name

In [123]:
nba_total = nba.groupby('Player', dropna = False)[['PTS', 'AST', 'TOV', 'ORB', 'DRB','TRB', 'BLK','STL', 'G', 'GS','MP','FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'PF']].sum()
nba_total.head()

Unnamed: 0_level_0,PTS,AST,TOV,ORB,DRB,TRB,BLK,STL,G,GS,MP,FG,FGA,3P,3PA,FT,FTA,PF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A.C. Green,12928.0,1469.0,1508.0,3576.0,6553.0,10129.0,562.0,1103.0,1361.0,905.0,39044.0,4778.0,9686.0,125.0,509.0,3247.0,4447.0,2581.0
A.J. Bramlett,8.0,0.0,3.0,12.0,10.0,22.0,0.0,1.0,8.0,0.0,61.0,4.0,21.0,0.0,0.0,0.0,0.0,13.0
A.J. English,1502.0,320.0,203.0,140.0,175.0,315.0,24.0,57.0,151.0,18.0,3108.0,617.0,1418.0,9.0,65.0,259.0,333.0,287.0
A.J. Guyton,442.0,147.0,62.0,22.0,58.0,80.0,12.0,20.0,80.0,14.0,1246.0,166.0,440.0,73.0,193.0,37.0,45.0,58.0
A.J. Hammons,48.0,4.0,10.0,8.0,28.0,36.0,13.0,1.0,22.0,0.0,163.0,17.0,42.0,5.0,10.0,9.0,20.0,21.0


In [170]:
nba_1 = nba[['Year', 'Tm', 'id']]
champs_1 = champs[['Year', 'Tm', 'id']]

nba_1.set_index('id', inplace = True)
champs_1.set_index('id', inplace = True)

In [177]:
integer_types = ['Year']

champs_1[integer_types] = champs_1[integer_types].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [178]:
nba_1.dtypes

Year     int32
Tm      object
dtype: object

In [179]:
champs_1.dtypes

Year     int32
Tm      object
dtype: object

#### Numpy Select for Adding Championship Teams

In [180]:
nba['won_chip'] = np.where((nba_1['Year'] == champs_1['Year']) & (nba_2['Tm'] == champs_2['Tm']), 1, 0)

ValueError: Can only compare identically-labeled Series objects