# Create classifier to predict probability of NBA players of being in/being selected in the future to the hall of fame
### in the process create a robust basketball statistics database that, while not all used for this task, can be used for later analysis in the future 

# SCRAPE / LOAD IN AND CLEAN ALL DATA

In [1]:
# Import all necessary libraries and remove row and column restrictions

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import pickle 
import os 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Read in dataset with all individual player statistics from 1950 - 2017

stats = pd.read_csv('/Users/cookedkaledev/Downloads/nba-players-stats/Seasons_Stats.csv')

In [3]:
# Check the head to make sure all data was loaded in correctly 

stats.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0


In [4]:
# Drop all blank columns to make the data a little cleaner 

stats = stats.drop(['Unnamed: 0','blanl','blank2'], axis = 1)

In [5]:
# Check tail to make sure columns were dropped and the back end of the dataset was loaded in properly as well 

stats.tail()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,0.442,8.6,17.3,12.9,9.1,1.8,3.0,10.9,15.5,3.4,2.2,5.6,0.157,-0.2,2.3,2.1,1.8,253.0,443.0,0.571,0.0,1.0,0.0,253.0,442.0,0.572,0.571,133.0,196.0,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,0.247,9.2,17.0,13.2,12.2,0.7,3.3,10.2,16.5,0.5,0.6,1.0,0.094,-3.2,0.8,-2.5,-0.1,78.0,158.0,0.494,0.0,1.0,0.0,78.0,157.0,0.497,0.494,22.0,39.0,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.0,0.161,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,-0.1,0.1,0.0,-0.005,-7.8,0.4,-7.3,-0.1,10.0,31.0,0.323,0.0,0.0,,10.0,31.0,0.323,0.323,3.0,5.0,0.6,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,-0.3,0.8,0.5,0.03,-3.6,-0.1,-3.7,-0.4,88.0,221.0,0.398,33.0,99.0,0.333,55.0,122.0,0.451,0.473,31.0,40.0,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0
24690,2017.0,Ivica Zubac,C,19.0,LAL,38.0,11.0,609.0,17.0,0.547,0.013,0.206,7.1,21.9,14.3,8.1,1.1,4.4,10.4,20.3,0.6,0.5,1.1,0.086,-2.7,0.3,-2.5,-0.1,126.0,238.0,0.529,0.0,3.0,0.0,126.0,235.0,0.536,0.529,32.0,49.0,0.653,41.0,118.0,159.0,30.0,14.0,33.0,30.0,66.0,284.0


In [6]:
# Get the response for wikipedias list of hall of fame players 

response_hof = requests.get("https://en.wikipedia.org/wiki/List_of_players_in_the_Naismith_Memorial_Basketball_Hall_of_Fame")

print(response_hof.status_code)

200


In [7]:
# Create a BeautifulSoup object to parse the html code from the website 

soup_hof = BeautifulSoup(response_hof.content , 'html.parser')

In [8]:
# Loop through the aspects of the html containing the data we're interested in and read it into a dataframe 

df_hof = pd.DataFrame(columns = ['Year', 'playerName', 'Position', 'Summary'])
table_branches = soup_hof.find_all('table')[0].find_all('tr')
for i in range(1, len(table_branches)):
    df_hof = df_hof.append({'Year' : (table_branches[i].find_all('td')[0].text),
                    'playerName' : (table_branches[i].find_all('td')[1].text),
                    'Position' : (table_branches[i].find_all('td')[2].text),
                    'Summary' : (table_branches[i].find_all('td')[3].text)}, ignore_index = True)

# Check head to make sure data was scraped and read in properly     
    
df_hof.head()

Unnamed: 0,Year,playerName,Position,Summary
0,1959,Chuck Hyatt,G,"National championship (Pittsburgh, 1928, 1930)..."
1,1959,Hank Luisetti,F,3 Pacific Coast Conference championships (Stan...
2,1959,George Mikan,C,"All-America (DePaul, 1944–45); All-NBA First-T..."
3,1959,John Schommer,G,"Big Ten Championships (Chicago, 1907–09); All-..."
4,1960,Vic Hanson,G,"Helms Foundation Championship (Syracuse, 1926)..."


In [9]:
# To avoid a later discrepancy with Basketball Reference's datasets, change Nate Archibald's name to the more commonly used Tiny

df_hof['playerName'] = df_hof['playerName'].replace('Nate Archibald', 'Tiny Archibald')

In [10]:
# Create a list of all team abbreviations used in Basketball Refernce URLs to loop through and read in all team data 

scrape_list = ['ATL', 'DET', 'TOR', 'BOS', 'PHI', 'NJN', 'NYK', 'DEN', 'UTA', 'OKC', 'POR', 'MIN', 'MIL',
              'IND', 'CHI', 'CLE', 'LAL', 'LAC', 'SAC', 'PHO', 'GSW', 'MIA', 'ORL', 'WAS', 'CHA',
              'HOU', 'DAL', 'MEM', 'NOH', 'SAS']

In [11]:
# Get the response for Atlanta's web page so that some basic operations can be completed before looping through all teams 

response_team = requests.get('https://www.basketball-reference.com/teams/ATL/stats_basic_totals.html')

print(response_team.status_code)

200


In [12]:
# Create a BeautifulSoup object for the web page 

soup_team = BeautifulSoup(response_team.content, 'html.parser')

In [13]:
""" 
create empty dataframe using columns from website using initial parser, then looping through 
each team using scrape list and the common URL aspects that each share
read all data from each team creating tempory dataframe and appending each one to our initial empty dataframe

td_gardens variable is just storing all of the td tags from the HTML which contain the data we're interested in
the name is just a reference to the famous Boston Celtics arena

Some of the aspects of the scraping may look confusing without further context or slightly hacked together
essentially after trial and error and examining the site's original HTML this code will give us our 
needed information 

"""
teams_table = soup_team.find_all('table')
cols = teams_table[0].find_all('thead')[0].find_all('th')
td_gardens = teams_table[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')
df_teams = pd.DataFrame(columns = [cols[i].text for i in range(len(cols)) if cols[i].text != "\xa0"])
for team in scrape_list:
    response_temp = requests.get('https://www.basketball-reference.com/teams/{}/stats_basic_totals.html'.format(team))
    soup_temp = BeautifulSoup(response_temp.content, 'html.parser')
    teams_table_temp = soup_temp.find_all('table')
    td_gardens_temp = teams_table_temp[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')
    teams_df_temp = pd.DataFrame(index = range(len(td_gardens_temp)) , columns = [cols[i].text for i in range(len(cols)) if cols[i].text != "\xa0"])
    for i in range(len(td_gardens_temp)):
        teams_df_temp.iloc[i,0] = td_gardens_temp[i].find_all('a')[0].text
        count = 0
        for j in range(33):
            if "foo" not in str(td_gardens_temp[i].find_all('td')[j]):
                teams_df_temp.iloc[i,j+1-count] = td_gardens_temp[i].find_all('td')[j].text
            else:
                count += 1
        
    df_teams = df_teams.append(teams_df_temp, ignore_index = True)

# check head to make sure everything was scraped and read in properly     
    
df_teams.head()

Unnamed: 0,Season,Lg,Tm,W,L,Finish,Age,Ht.,Wt.,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2019-20,NBA,ATL,20,47,5,24.1,6-6,214,67,16280,2723,6067,0.449,805,2416,0.333,1918,3651,0.525,1237,1566,0.79,661,2237,2898,1605,523,341,1086,1548,7488
1,2018-19,NBA,ATL,29,53,5,25.1,6-7,215,82,19855,3392,7524,0.451,1067,3034,0.352,2325,4490,0.518,1443,1918,0.752,955,2825,3780,2118,675,419,1397,1932,9294
2,2017-18,NBA,ATL,24,58,5,25.4,6-6,212,82,19705,3130,7015,0.446,917,2544,0.36,2213,4471,0.495,1298,1654,0.785,743,2693,3436,1946,638,348,1276,1606,8475
3,2016-17,NBA,ATL,43,39,2,27.9,6-6,219,82,19880,3123,6918,0.451,729,2137,0.341,2394,4781,0.501,1484,2039,0.728,842,2793,3635,1938,672,397,1294,1491,8459
4,2015-16,NBA,ATL,48,34,2,28.2,6-6,217,82,19830,3168,6923,0.458,815,2326,0.35,2353,4597,0.512,1282,1638,0.783,679,2772,3451,2100,747,486,1226,1570,8433


In [14]:
# recast all columns that we need as numeric to float to be able to later perform aggregate functions 

df_teams['PTS'] = df_teams['PTS'].astype(float)
df_teams['PF'] = df_teams['PF'].astype(float)
df_teams['G'] = df_teams['G'].astype(float)
df_teams['MP'] = pd.to_numeric(df_teams['MP'])
df_teams['FG'] = df_teams['FG'].astype(float)
df_teams['FGA'] = df_teams['FGA'].astype(float)
df_teams['3P'] = pd.to_numeric(df_teams['3P'])
df_teams['3PA'] = pd.to_numeric(df_teams['3PA'])
df_teams['2P'] = df_teams['2P'].astype(float)
df_teams['2PA'] = df_teams['2PA'].astype(float)
df_teams['FT'] = df_teams['FT'].astype(float)
df_teams['FTA'] = df_teams['FTA'].astype(float)
df_teams['ORB'] = pd.to_numeric(df_teams['ORB'])
df_teams['DRB'] = pd.to_numeric(df_teams['DRB'])
df_teams['TRB'] = pd.to_numeric(df_teams['TRB'])
df_teams['AST'] = df_teams['AST'].astype(float)
df_teams['STL'] = pd.to_numeric(df_teams['STL'])
df_teams['BLK'] = pd.to_numeric(df_teams['BLK'])
df_teams['TOV'] = pd.to_numeric(df_teams['TOV'])

In [15]:
# check info to make sure all type casting worked properly 

df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1543 entries, 0 to 1542
Data columns (total 32 columns):
Season    1543 non-null object
Lg        1543 non-null object
Tm        1543 non-null object
W         1543 non-null object
L         1543 non-null object
Finish    1543 non-null object
Age       1543 non-null object
Ht.       1543 non-null object
Wt.       1543 non-null object
G         1543 non-null float64
MP        1408 non-null float64
FG        1543 non-null float64
FGA       1543 non-null float64
FG%       1543 non-null object
3P        1170 non-null float64
3PA       1170 non-null float64
3P%       1543 non-null object
2P        1543 non-null float64
2PA       1543 non-null float64
2P%       1543 non-null object
FT        1543 non-null float64
FTA       1543 non-null float64
FT%       1543 non-null object
ORB       1285 non-null float64
DRB       1285 non-null float64
TRB       1523 non-null float64
AST       1543 non-null float64
STL       1265 non-null float64
BLK       

In [16]:
# Read in all opponent data that will be needed for later advanced metric calculations, same procedure as with the team data scraping 

response_opps = requests.get('https://www.basketball-reference.com/teams/TOR/opp_stats_basic_totals.html')

print(response_opps.status_code)

200


In [17]:
soup_opps = BeautifulSoup(response_opps.content, 'html.parser')

In [18]:
opps_table = soup_opps.find_all('table')
cols_opps = opps_table[0].find_all('thead')[0].find_all('th')
td_gardens_opps = opps_table[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')
opps_df = pd.DataFrame(columns = [cols_opps[i].text for i in range(len(cols_opps)) if cols_opps[i].text != "\xa0"])
for team in scrape_list:
    response_temp = requests.get('https://www.basketball-reference.com/teams/{}/opp_stats_basic_totals.html'.format(team))
    soup_temp = BeautifulSoup(response_temp.content, 'html.parser')
    teams_table_temp = soup_temp.find_all('table')
    td_gardens_temp = teams_table_temp[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')
    teams_df_temp = pd.DataFrame(index = range(len(td_gardens_temp)) , columns = [cols_opps[i].text for i in range(len(cols_opps)) if cols_opps[i].text != "\xa0"])
    for i in range(len(td_gardens_temp)):
        teams_df_temp.iloc[i,0] = td_gardens_temp[i].find_all('a')[0].text
        count = 0
        for j in range(29):
            if "foo" not in str(td_gardens_temp[i].find_all('td')[j]):
                teams_df_temp.iloc[i,j+1-count] = td_gardens_temp[i].find_all('td')[j].text
            else:
                count += 1
        
    opps_df = opps_df.append(teams_df_temp, ignore_index = True)

In [19]:
opps_df.head()

Unnamed: 0,Season,Lg,Tm,W,L,Finish,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2019-20,NBA,ATL,20,47,5,67,16280,2903,6078,0.478,797,2194,0.363,2106,3884,0.542,1419,1839,0.772,748,2402,3150,1674,600,428,1006,1404,8022
1,2018-19,NBA,ATL,29,53,5,82,19855,3508,7417,0.473,1013,2820,0.359,2495,4597,0.543,1759,2321,0.758,871,2904,3775,2202,810,448,1234,1817,9788
2,2017-18,NBA,ATL,24,58,5,82,19705,3336,7107,0.469,949,2519,0.377,2387,4588,0.52,1301,1686,0.772,841,2785,3626,2137,710,453,1238,1661,8922
3,2016-17,NBA,ATL,43,39,2,82,19880,3178,7155,0.444,893,2502,0.357,2285,4653,0.491,1280,1658,0.772,876,2722,3598,1919,738,424,1262,1768,8529
4,2015-16,NBA,ATL,48,34,2,82,19830,3046,7058,0.432,679,2008,0.338,2367,5050,0.469,1366,1810,0.755,944,2869,3813,1808,706,411,1324,1500,8137


In [20]:
opps_df['PTS'] = opps_df['PTS'].astype(float)
opps_df['PF'] = pd.to_numeric(opps_df['PF'])
opps_df['G'] = opps_df['G'].astype(float)
opps_df['MP'] = pd.to_numeric(opps_df['MP'])
opps_df['FG'] = pd.to_numeric(opps_df['FG'])
opps_df['FGA'] = pd.to_numeric(opps_df['FGA'])
opps_df['3P'] = pd.to_numeric(opps_df['3P'])
opps_df['3PA'] = pd.to_numeric(opps_df['3PA'])
opps_df['2P'] = pd.to_numeric(opps_df['2P'])
opps_df['2PA'] = pd.to_numeric(opps_df['2PA'])
opps_df['FT'] = pd.to_numeric(opps_df['FT'])
opps_df['FTA'] = pd.to_numeric(opps_df['FTA'])
opps_df['ORB'] = pd.to_numeric(opps_df['ORB'])
opps_df['DRB'] = pd.to_numeric(opps_df['DRB'])
opps_df['TRB'] = pd.to_numeric(opps_df['TRB'])
opps_df['AST'] = pd.to_numeric(opps_df['AST'])
opps_df['STL'] = pd.to_numeric(opps_df['STL'])
opps_df['BLK'] = pd.to_numeric(opps_df['BLK'])
opps_df['TOV'] = pd.to_numeric(opps_df['TOV'])

In [21]:
opps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1543 entries, 0 to 1542
Data columns (total 29 columns):
Season    1543 non-null object
Lg        1543 non-null object
Tm        1543 non-null object
W         1543 non-null object
L         1543 non-null object
Finish    1543 non-null object
G         1543 non-null float64
MP        1408 non-null float64
FG        1340 non-null float64
FGA       1340 non-null float64
FG%       1543 non-null object
3P        1170 non-null float64
3PA       1170 non-null float64
3P%       1543 non-null object
2P        1340 non-null float64
2PA       1340 non-null float64
2P%       1543 non-null object
FT        1340 non-null float64
FTA       1340 non-null float64
FT%       1543 non-null object
ORB       1273 non-null float64
DRB       1273 non-null float64
TRB       1340 non-null float64
AST       1340 non-null float64
STL       1265 non-null float64
BLK       1265 non-null float64
TOV       1289 non-null float64
PF        1340 non-null float64
PTS    

In [22]:
# For a later merging we will need full team names, create list that can be mapped to the abbreviations in the current dataframe 

team_abv = df_teams['Tm'].unique()

team_full = np.array(['Atlanta Hawks', 'St. Louis Hawks', 'Milwaukee Hawks', 'Tri-Cities Blackhawks', 'Detroit Pistons',
                     'Fort Wayne Pistons', 'Toronto Raptors', 'Boston Celtics', 'Philadelphia 76ers', 'Syracuse Nationals',
                     'Brooklyn Nets', 'New Jersey Nets', 'New York Nets', 'New York Nets', 'New Jersey Americans',
                     'New York Knicks', 'Denver Nuggets', 'Denver Nuggets', 'Denver Rockets', 'Utah Jazz',
                     'New Orleans Jazz', 'Oklahoma City Thunder', 'Seattle SuperSonics', 'Portland Trail Blazers',
                     'Minnesota Timberwolves', 'Milwaukee Bucks', 'Indiana Pacers', 'Indiana Pacers', 'Chicago Bulls',
                     'Cleveland Cavaliers', 'Los Angeles Lakers', 'Minneapolis Lakers', 'Los Angeles Clippers',
                     'San Diego Clippers', 'Buffalo Braves', 'Sacramento Kings', 'Kansas City Kings', 
                     'Kansas City-Omaha Kings', 'Cincinnati Royals', 'Rochester Royals', 'Phoenix Suns', 'Golden State Warriors',
                     'San Francisco Warriors', 'Philadelphia Warriors', 'Miami Heat', 'Orlando Magic', 'Washington Wizards',
                     'Washington Bullets', 'Capital Bullets', 'Baltimore Bullets', 'Chicago Zephyrs', 'Chicago Packers',
                     'Charlotte Hornets', 'Charlotte Bobcats', 'Charlotte Hornets', 'Houston Rockets', 'San Diego Rockets',
                     'Dallas Mavericks', 'Memphis Grizzlies', 'Vancouver Grizzlies', 'New Orleans Pelicans', 'New Orleans Hornets',
                     'NO/Ok. City Hornets', 'San Antonio Spurs', 'San Antonio Spurs', 'Dallas Chaparrals', 'Texas Chaparrals'])

team_dict = [*zip(team_abv, team_full)]
team_dict = dict(team_dict)
df_teams['TmFull'] = df_teams['Tm'].map(team_dict)

In [23]:
# For dataframes using season format containing 2 year range, create singular year column that can more easily be used for comparisons 

df_teams['Year'] = [df_teams['Season'].iloc[i][0:2] + df_teams['Season'].iloc[i][5:7] for i in range(len(df_teams))]

df_teams['Year'] = df_teams['Year'].astype(float)

opps_df['Year'] = [opps_df['Season'].iloc[i][0:2] + opps_df['Season'].iloc[i][5:7] for i in range(len(opps_df))]

opps_df['Year'] = opps_df['Year'].astype(float)

# Merge player stats and team stats into one dataframe based on the player 

stats_complete = stats.merge(right = df_teams, on = ['Tm', 'Year'])

In [24]:
# remove all asterisk characters from player column to make all future calls and merges easier 

stats_complete['Player'] = stats_complete['Player'].str.replace('*','')

In [25]:
# Call head to make sure all operations above carried out correctly 

stats_complete.head()

Unnamed: 0,Year,Player,Pos,Age_x,Tm,G_x,GS,MP_x,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG_x,FGA_x,FG%_x,3P_x,3PA_x,3P%_x,2P_x,2PA_x,2P%_x,eFG%,FT_x,FTA_x,FT%_x,ORB_x,DRB_x,TRB_x,AST_x,STL_x,BLK_x,TOV_x,PF_x,PTS_x,Season,Lg,W,L,Finish,Age_y,Ht.,Wt.,G_y,MP_y,FG_y,FGA_y,FG%_y,3P_y,3PA_y,3P%_y,2P_y,2PA_y,2P%_y,FT_y,FTA_y,FT%_y,ORB_y,DRB_y,TRB_y,AST_y,STL_y,BLK_y,TOV_y,PF_y,PTS_y,TmFull
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,-0.1,3.6,3.5,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons
1,1950.0,Charlie Black,F-C,28.0,FTW,36.0,,,,0.362,,0.48,,,,,,,,,-0.7,2.2,1.5,,,,,,125.0,435.0,0.287,,,,125.0,435.0,0.287,0.287,132.0,209.0,0.632,,,,75.0,,,,140.0,382.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons
2,1950.0,Bob Carpenter,F-C,32.0,FTW,66.0,,,,0.421,,0.415,,,,,,,,,2.2,2.8,5.0,,,,,,212.0,617.0,0.344,,,,212.0,617.0,0.344,0.344,190.0,256.0,0.742,,,,92.0,,,,168.0,614.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons
3,1950.0,Bob Harris,F-C,22.0,FTW,62.0,,,,0.423,,0.48,,,,,,,,,2.1,3.0,5.0,,,,,,168.0,465.0,0.361,,,,168.0,465.0,0.361,0.361,140.0,223.0,0.628,,,,129.0,,,,190.0,476.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons
4,1950.0,Bill Henry,C,25.0,FTW,44.0,,,,0.405,,0.598,,,,,,,,,0.6,1.3,1.9,,,,,,65.0,209.0,0.311,,,,65.0,209.0,0.311,0.311,84.0,125.0,0.672,,,,39.0,,,,99.0,214.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons


In [26]:
# merge our newly created dataframe with the needed columns from the opponent statistics dataframe 

stats_complete = stats_complete.merge(right = opps_df[['Tm','Year','DRB','ORB','TRB','FGA','3PA','TOV','FTA']], 
                                      how = 'left', on = ['Tm', 'Year'])

In [27]:
# double check the merge worked correctly 

stats_complete.head()

Unnamed: 0,Year,Player,Pos,Age_x,Tm,G_x,GS,MP_x,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG_x,FGA_x,FG%_x,3P_x,3PA_x,3P%_x,2P_x,2PA_x,2P%_x,eFG%,FT_x,FTA_x,FT%_x,ORB_x,DRB_x,TRB_x,AST_x,STL_x,BLK_x,TOV_x,PF_x,PTS_x,Season,Lg,W,L,Finish,Age_y,Ht.,Wt.,G_y,MP_y,FG_y,FGA_y,FG%_y,3P_y,3PA_y,3P%_y,2P_y,2PA_y,2P%_y,FT_y,FTA_y,FT%_y,ORB_y,DRB_y,TRB_y,AST_y,STL_y,BLK_y,TOV_y,PF_y,PTS_y,TmFull,DRB,ORB,TRB,FGA,3PA,TOV,FTA
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,-0.1,3.6,3.5,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,
1,1950.0,Charlie Black,F-C,28.0,FTW,36.0,,,,0.362,,0.48,,,,,,,,,-0.7,2.2,1.5,,,,,,125.0,435.0,0.287,,,,125.0,435.0,0.287,0.287,132.0,209.0,0.632,,,,75.0,,,,140.0,382.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,
2,1950.0,Bob Carpenter,F-C,32.0,FTW,66.0,,,,0.421,,0.415,,,,,,,,,2.2,2.8,5.0,,,,,,212.0,617.0,0.344,,,,212.0,617.0,0.344,0.344,190.0,256.0,0.742,,,,92.0,,,,168.0,614.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,
3,1950.0,Bob Harris,F-C,22.0,FTW,62.0,,,,0.423,,0.48,,,,,,,,,2.1,3.0,5.0,,,,,,168.0,465.0,0.361,,,,168.0,465.0,0.361,0.361,140.0,223.0,0.628,,,,129.0,,,,190.0,476.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,
4,1950.0,Bill Henry,C,25.0,FTW,44.0,,,,0.405,,0.598,,,,,,,,,0.6,1.3,1.9,,,,,,65.0,209.0,0.311,,,,65.0,209.0,0.311,0.311,84.0,125.0,0.672,,,,39.0,,,,99.0,214.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,


In [28]:
# rename all duplicate columns to represent whether it represents a player, team, or opponent's statistics 

stats_complete = stats_complete.rename(columns = {'Age_x' : 'Age_Player' , 'G_x' : 'G_Player', 'MP_x' : 'MP_Player', 
                                                  'FG_x' : 'FG_Player', 'FGA_x' : 'FGA_Player', 'FG%_x' : 'FG%_Player',
                                                  '3P_x' : '3P_Player', '3PA_x' : '3PA_Player', '3P%_x' : '3P%_Player',
                                                  '2P_x' : '2P_Player', '2PA_x' : '2PA_Player', '2P%_x' : '2P%_Player',
                                                  'FT_x' : 'FT_Player', 'FTA_x' : 'FTA_Player', 'FT%_x' : 'FT%_Player',
                                                  'ORB_x' : 'ORB_Player', 'DRB_x' : 'DRB_Player', 'TRB_x' : 'TRB_Player',
                                                  'AST_x' : 'AST_Player', 'STL_x' : 'STL_Player', 'BLK_x' : 'BLK_Player',
                                                  'TOV_x' : 'TOV_Player', 'PF_x' : 'PF_Player', 'PTS_x' : 'PTS_Player',
                                                  'Age_y' : 'Age_Team' , 'G_y' : 'G_Team', 'MP_y' : 'MP_Team', 
                                                  'FG_y' : 'FG_Team', 'FGA_y' : 'FGA_Team', 'FG%_y' : 'FG%_Team',
                                                  '3P_y' : '3P_Team', '3PA_y' : '3PA_Team', '3P%_y' : '3P%_Team',
                                                  '2P_y' : '2P_Team', '2PA_y' : '2PA_Team', '2P%_y' : '2P%_Team',
                                                  'FT_y' : 'FT_Team', 'FTA_y' : 'FTA_Team', 'FT%_y' : 'FT%_Team',
                                                  'ORB_y' : 'ORB_Team', 'DRB_y' : 'DRB_Team', 'TRB_y' : 'TRB_Team',
                                                  'AST_y' : 'AST_Team', 'STL_y' : 'STL_Team', 'BLK_y' : 'BLK_Team',
                                                  'TOV_y' : 'TOV_Team', 'PF_y' : 'PF_Team', 'PTS_y' : 'PTS_Team',
                                                  'DRB' : 'DRB_opp', 'ORB' : 'ORB_opp', 'TRB' : 'TRB_opp', 'FGA' : 'FGA_opp',
                                                  '3PA' : '3PA_opp', 'TOV' : 'TOV_opp', 'FTA' : 'FTA_opp'})

In [29]:
# Examine the info and see that even after all merges the memory footprint is still relatively low under 15 mbs
# Any null values are due to era specific availability for some statitistics such as offensive rebounds 

stats_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21764 entries, 0 to 21763
Data columns (total 89 columns):
Year          21764 non-null float64
Player        21764 non-null object
Pos           21764 non-null object
Age_Player    21757 non-null float64
Tm            21764 non-null object
G_Player      21764 non-null float64
GS            16190 non-null float64
MP_Player     21536 non-null float64
PER           21531 non-null float64
TS%           21679 non-null float64
3PAr          16737 non-null float64
FTr           21667 non-null float64
ORB%          18535 non-null float64
DRB%          18535 non-null float64
TRB%          19249 non-null float64
AST%          20160 non-null float64
STL%          18535 non-null float64
BLK%          18535 non-null float64
TOV%          17404 non-null float64
USG%          17461 non-null float64
OWS           21764 non-null float64
DWS           21764 non-null float64
WS            21764 non-null float64
WS/48         21531 non-null float64
OBPM  

In [30]:
# We're going to start scraping in player award stats from Basketball Reference starting with all star appearances 

response_allstars = requests.get('https://en.wikipedia.org/wiki/List_of_NBA_All-Stars')

print(response_allstars.status_code)

200


In [31]:
soup_allstars = BeautifulSoup(response_allstars.content, 'html.parser')

In [32]:
allstar_table = soup_allstars.find_all('table')[1].find_all('tbody')[0]

In [33]:
df_allstars = pd.DataFrame(index = range(len(allstar_table.find_all('tr')) - 1), columns = ['Player', 'AllStarSelections'])

In [34]:
for i in range(len(allstar_table.find_all('tr')) - 1):
    df_allstars['Player'].iloc[i] = allstar_table.find_all('tr')[i+1].find_all('td')[0].find_all('a')[0].text
    df_allstars['AllStarSelections'].iloc[i] = allstar_table.find_all('tr')[i+1].find_all('td')[1].text

In [35]:
df_allstars.head()

Unnamed: 0,Player,AllStarSelections
0,Kareem Abdul-Jabbar,19
1,Kobe Bryant,18
2,LeBron James,16
3,Tim Duncan,15
4,Kevin Garnett,15


In [36]:
"""
 Merging award dataframes that contain career count values rather than binary yes or no values that can later be summed
 Could have more easily been done after other aggregate functions had already been performed, saving a step
 MVPs were loaded in later and merged in this way, moral of the story: "lesson learned"
"""

stats_complete = stats_complete.merge(right = df_allstars, how = 'left', on = 'Player')

In [37]:
# fill na values (no all star appearances) with zeroes 

stats_complete['AllStarSelections'] = stats_complete['AllStarSelections'].fillna(0)

In [38]:
# read in championships/playoff data , this time using a more binary approach that can later be summed when grouping by players 

response_champs = requests.get('https://www.basketball-reference.com/playoffs/')

print(response_champs.status_code)

200


In [39]:
soup_champs = BeautifulSoup(response_champs.content, 'html.parser')

In [40]:
champs_tr_tags = soup_champs.find_all('table')[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')

In [41]:
df_Championships = pd.DataFrame(index = range(len(champs_tr_tags) - 1), columns = ['Year', 
                                'Champion', 'Runner-up', 'Finals MVP', 'Points', 'Rebounds',
                                'Assists', 'Win Shares'])

In [42]:
for i in range(len(champs_tr_tags) - 1):
    df_Championships['Year'].iloc[i] = champs_tr_tags[i + 1].find_all('th')[0].text
    df_Championships['Champion'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[1].text
    df_Championships['Runner-up'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[2].text
    df_Championships['Finals MVP'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[3].text
    df_Championships['Points'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[4].text
    df_Championships['Rebounds'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[5].text
    df_Championships['Assists'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[6].text
    df_Championships['Win Shares'].iloc[i] = champs_tr_tags[i + 1].find_all('td')[7].text

In [43]:
df_Championships['Year'] = df_Championships['Year'].astype(int)

In [44]:
stats_complete = stats_complete.merge(right = df_Championships, how = 'left', on = 'Year')

In [45]:
stats_complete['Championships'] = stats_complete.apply(lambda x: 1 if x.TmFull == x.Champion else 0, axis = 1)
stats_complete['Runner-Ups'] = stats_complete.apply(lambda x: 1 if x.TmFull == x['Runner-up'] else 0, axis = 1)

stats_complete['Championships'] = stats_complete['Championships'].fillna(0)
stats_complete['Runner-Ups'] = stats_complete['Runner-Ups'].fillna(0)

In [46]:
# Read in rookie of the year data 

response_roy = requests.get('https://www.basketball-reference.com/awards/roy.html')

print(response_roy.status_code)

200


In [47]:
soup_roy = BeautifulSoup(response_roy.content, 'html.parser')

In [48]:
roy_tr_tags = soup_roy.find_all('table')[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')

In [49]:
df_roy = pd.DataFrame(index = range(len(roy_tr_tags)), columns = ['Season', 'Player', 'Year'])

In [50]:
for i in range(len(df_roy)):
    df_roy['Season'].iloc[i] = roy_tr_tags[i].find_all('th')[0].text
    df_roy['Player'].iloc[i] = roy_tr_tags[i].find_all('td')[1].text
    df_roy['Year'].iloc[i] = roy_tr_tags[i].find_all('th')[0].text[0:2] + roy_tr_tags[i].find_all('th')[0].text[5:7]

In [51]:
df_roy['Year'] = df_roy['Year'].astype(int)

In [52]:
stats_complete['ROY'] = stats_complete.apply(lambda x: 1 if 
                                             x.Year in df_roy[df_roy['Player'] == x.Player]['Year'].values else 0, axis = 1)

In [53]:
# Read in defensive player of the year data 

response_dpoy = requests.get('https://www.basketball-reference.com/awards/dpoy.html')

print(response_dpoy.status_code)

200


In [54]:
soup_dpoy = BeautifulSoup(response_dpoy.content, 'html.parser')

In [55]:
dpoy_tags = soup_dpoy.find_all('tbody')[1].find_all('tr')

In [56]:
df_dpoy = pd.DataFrame(index = range(len(dpoy_tags)), columns = ['Player', 'dpoy_count'])

In [57]:
for i in range(len(dpoy_tags)):
    df_dpoy['Player'].iloc[i] = dpoy_tags[i].find_all('th')[0].text
    df_dpoy['dpoy_count'].iloc[i] = dpoy_tags[i].find_all('td')[1].text

In [58]:
stats_complete = stats_complete.merge(right = df_dpoy, how = 'left', on = 'Player')

In [59]:
stats_complete['dpoy_count'] = stats_complete['dpoy_count'].fillna(0)

In [60]:
# Read in sixth man of the year data 

response_six = requests.get('https://www.basketball-reference.com/awards/smoy.html')

print(response_six.status_code)

200


In [61]:
soup_six = BeautifulSoup(response_six.content, 'html.parser')

In [62]:
six_tr_tags = soup_six.find_all('table')[0].find_all('tbody')[0].find_all('tr')

In [63]:
df_six_man = pd.DataFrame(index = range(len(six_tr_tags)), columns = ['Season', 'Player', 'Year'])

In [64]:
for i in range(len(df_six_man)):
    df_six_man['Season'].iloc[i] = six_tr_tags[i].find_all('th')[0].text
    df_six_man['Player'].iloc[i] = six_tr_tags[i].find_all('td')[1].text
    df_six_man['Year'].iloc[i] = six_tr_tags[i].find_all('th')[0].text[0:2] + roy_tr_tags[i].find_all('th')[0].text[5:7]

In [65]:
stats_complete['6Man'] = stats_complete.apply(lambda x: 1 if 
                                             x.Year in df_six_man[df_six_man['Player'] == x.Player]['Year'].values else 0, axis = 1)

In [66]:
# read in most improved player data 

response_mip = requests.get('https://www.basketball-reference.com/awards/mip.html')

print(response_mip.status_code)

200


In [67]:
soup_mip = BeautifulSoup(response_mip.content, 'html.parser')

In [68]:
mip_tags = soup_mip.find_all('tbody')[1].find_all('tr')

In [69]:
df_mip = pd.DataFrame(index = range(len(mip_tags)), columns = ['Player', 'mip_count'])

In [70]:
for i in range(len(mip_tags)):
    df_mip['Player'].iloc[i] = mip_tags[i].find_all('th')[0].text
    df_mip['mip_count'].iloc[i] = mip_tags[i].find_all('td')[1].text

In [71]:
stats_complete = stats_complete.merge(right = df_mip, how = 'left', on = 'Player')

stats_complete['mip_count'] = stats_complete['mip_count'].fillna(0)

In [72]:
# read in data about players who have won ABA finals MVP(s)

response_abafinmvp = requests.get('https://www.basketball-reference.com/awards/playoffs_mvp.html')

print(response_abafinmvp.status_code)

200


In [73]:
soup_abafinmvp = BeautifulSoup(response_abafinmvp.content, 'html.parser')

In [74]:
abafinmvp_tags = soup_abafinmvp.find_all('tbody')[1].find_all('tr')

In [75]:
df_aba_finals_mvps = pd.DataFrame(index = range(len(abafinmvp_tags)), columns = ['Player', 'abafinmvp_count'])

In [76]:
for i in range(len(abafinmvp_tags)):
    df_aba_finals_mvps['Player'].iloc[i] = abafinmvp_tags[i].find_all('th')[0].text
    df_aba_finals_mvps['abafinmvp_count'].iloc[i] = abafinmvp_tags[i].find_all('td')[1].text

In [77]:
stats_complete = stats_complete.merge(right = df_aba_finals_mvps, how = 'left', on = 'Player')

stats_complete['abafinmvp_count'] = stats_complete['abafinmvp_count'].fillna(0)

In [78]:
# Read in data about players who have won all star game MVPs 

response_allstar_mvp = requests.get('https://www.basketball-reference.com/awards/all_star_mvp.html')

print(response_allstar_mvp.status_code)

200


In [79]:
soup_allstar_mvp = BeautifulSoup(response_allstar_mvp.content, 'html.parser')

In [80]:
all_star_mvp_tags = soup_allstar_mvp.find_all('tbody')[2].find_all('tr')

In [81]:
df_allstar_mvps = pd.DataFrame(index = range(len(all_star_mvp_tags)), columns = ['Player', 'Lg_all_star_mvp', 'all_star_mvp_count'])

In [82]:
for i in range(len(all_star_mvp_tags)):
    df_allstar_mvps['Player'].iloc[i] = all_star_mvp_tags[i].find_all('th')[0].text
    df_allstar_mvps['Lg_all_star_mvp'].iloc[i] = all_star_mvp_tags[i].find_all('td')[0].text
    df_allstar_mvps['all_star_mvp_count'].iloc[i] = all_star_mvp_tags[i].find_all('td')[1].text

In [83]:
stats_complete = stats_complete.merge(right = df_allstar_mvps, how = 'left', on = 'Player')

stats_complete['all_star_mvp_count'] = stats_complete['all_star_mvp_count'].fillna(0)

In [84]:
# read in data about players who were named to an all-nba team(s)

response_allnba = requests.get('https://www.basketball-reference.com/awards/all_league.html')

print(response_allnba.status_code)

200


In [85]:
soup_allnba = BeautifulSoup(response_allnba.content, 'html.parser')

In [86]:
allnba_tags = soup_allnba.find_all('table')[0].find_all('tbody')[0].find_all('tr', class_=lambda x: x != 'thead')

In [87]:
df_allnba = pd.DataFrame(index = range(len(allnba_tags)), columns = ['Season', 'Lg_all_lg', 'Team', 
                                                                    'C1', 'F1', 'F2', 'G1', 'G2'])

In [88]:
for i in range(len(allnba_tags)):
    df_allnba["Season"].iloc[i] = allnba_tags[i].find_all('th')[0].text
    df_allnba["Lg_all_lg"].iloc[i] = allnba_tags[i].find_all('td')[0].text
    df_allnba["Team"].iloc[i] = allnba_tags[i].find_all('td')[1].text
    df_allnba["C1"].iloc[i] = allnba_tags[i].find_all('td')[2].text
    df_allnba["F1"].iloc[i] = allnba_tags[i].find_all('td')[3].text
    df_allnba["F2"].iloc[i] = allnba_tags[i].find_all('td')[4].text
    df_allnba["G1"].iloc[i] = allnba_tags[i].find_all('td')[5].text
    df_allnba["G2"].iloc[i] = allnba_tags[i].find_all('td')[6].text

In [89]:
df_allnba['Year'] = df_allnba['Season'].str[0:2] + df_allnba['Season'].str[5:7]

In [90]:
df_allnba['Year'] = df_allnba['Year'].astype(int)

In [91]:
df_allnba['C1'] = df_allnba['C1'].str[:-2]
df_allnba['F1'] = df_allnba['F1'].str[:-2]
df_allnba['F2'] = df_allnba['F2'].str[:-2]
df_allnba['G1'] = df_allnba['G1'].str[:-2]
df_allnba['G2'] = df_allnba['G2'].str[:-2]

In [92]:
stats_complete['1st Team All NBA'] = stats_complete.apply(lambda x: 1 if x.Player in 
                                                          df_allnba[(df_allnba['Year'] == x.Year) &
                                                                    (df_allnba['Team'] == '1st')].values else 0, axis = 1)
stats_complete['2nd Team All NBA'] = stats_complete.apply(lambda x: 1 if x.Player in 
                                                          df_allnba[(df_allnba['Year'] == x.Year) &
                                                                    (df_allnba['Team'] == '2nd')].values else 0, axis = 1)
stats_complete['3rd Team All NBA'] = stats_complete.apply(lambda x: 1 if x.Player in 
                                                          df_allnba[(df_allnba['Year'] == x.Year) &
                                                                    (df_allnba['Team'] == '3rd')].values else 0, axis = 1)

In [93]:
stats_complete['Finals MVP'] = stats_complete['Finals MVP'].replace(r'^\s*$', 'nan', regex=True)

In [94]:
# read in data about players who have won a regular season MVP(s)

response_season_mvp = requests.get('https://www.basketball-reference.com/awards/mvp.html')

print(response_season_mvp.status_code)

200


In [95]:
soup_season_mvp = BeautifulSoup(response_season_mvp.content, 'html.parser')

In [96]:
season_mvp_tags = soup_season_mvp.find_all('tbody')[2].find_all('tr')

In [97]:
df_season_mvps = pd.DataFrame(index = range(len(season_mvp_tags)), columns = ['Player', 'MVP_league', 'season_mvp_count'])

In [98]:
for i in range(len(season_mvp_tags)):
    df_season_mvps['Player'].iloc[i] = season_mvp_tags[i].find_all('th')[0].text
    df_season_mvps['MVP_league'].iloc[i] = season_mvp_tags[i].find_all('td')[0].text
    df_season_mvps['season_mvp_count'].iloc[i] = season_mvp_tags[i].find_all('td')[1].text

In [99]:
df_season_mvps

Unnamed: 0,Player,MVP_league,season_mvp_count
0,Kareem Abdul-Jabbar,NBA,6
1,Michael Jordan,NBA,5
2,Bill Russell,NBA,5
3,Wilt Chamberlain,NBA,4
4,LeBron James,NBA,4
5,Larry Bird,NBA,3
6,Julius Erving,ABA,3
7,Magic Johnson,NBA,3
8,Moses Malone,NBA,3
9,Stephen Curry,NBA,2


In [100]:
# remove white space from columns so that dataframes can be formatted properly 

stats_complete['Points'] = stats_complete['Points'].replace(r'^\s*$', 'nan', regex=True)
stats_complete['Rebounds'] = stats_complete['Rebounds'].replace(r'^\s*$', 'nan', regex=True)
stats_complete['Assists'] = stats_complete['Assists'].replace(r'^\s*$', 'nan', regex=True)
stats_complete['Win Shares'] = stats_complete['Win Shares'].replace(r'^\s*$', 'nan', regex=True)

In [101]:
stats_complete.head(15)

Unnamed: 0,Year,Player,Pos,Age_Player,Tm,G_Player,GS,MP_Player,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG_Player,FGA_Player,FG%_Player,3P_Player,3PA_Player,3P%_Player,2P_Player,2PA_Player,2P%_Player,eFG%,FT_Player,FTA_Player,FT%_Player,ORB_Player,DRB_Player,TRB_Player,AST_Player,STL_Player,BLK_Player,TOV_Player,PF_Player,PTS_Player,Season,Lg,W,L,Finish,Age_Team,Ht.,Wt.,G_Team,MP_Team,FG_Team,FGA_Team,FG%_Team,3P_Team,3PA_Team,3P%_Team,2P_Team,2PA_Team,2P%_Team,FT_Team,FTA_Team,FT%_Team,ORB_Team,DRB_Team,TRB_Team,AST_Team,STL_Team,BLK_Team,TOV_Team,PF_Team,PTS_Team,TmFull,DRB_opp,ORB_opp,TRB_opp,FGA_opp,3PA_opp,TOV_opp,FTA_opp,AllStarSelections,Champion,Runner-up,Finals MVP,Points,Rebounds,Assists,Win Shares,Championships,Runner-Ups,ROY,dpoy_count,6Man,mip_count,abafinmvp_count,Lg_all_star_mvp,all_star_mvp_count,1st Team All NBA,2nd Team All NBA,3rd Team All NBA
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,-0.1,3.6,3.5,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
1,1950.0,Charlie Black,F-C,28.0,FTW,36.0,,,,0.362,,0.48,,,,,,,,,-0.7,2.2,1.5,,,,,,125.0,435.0,0.287,,,,125.0,435.0,0.287,0.287,132.0,209.0,0.632,,,,75.0,,,,140.0,382.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
2,1950.0,Bob Carpenter,F-C,32.0,FTW,66.0,,,,0.421,,0.415,,,,,,,,,2.2,2.8,5.0,,,,,,212.0,617.0,0.344,,,,212.0,617.0,0.344,0.344,190.0,256.0,0.742,,,,92.0,,,,168.0,614.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
3,1950.0,Bob Harris,F-C,22.0,FTW,62.0,,,,0.423,,0.48,,,,,,,,,2.1,3.0,5.0,,,,,,168.0,465.0,0.361,,,,168.0,465.0,0.361,0.361,140.0,223.0,0.628,,,,129.0,,,,190.0,476.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
4,1950.0,Bill Henry,C,25.0,FTW,44.0,,,,0.405,,0.598,,,,,,,,,0.6,1.3,1.9,,,,,,65.0,209.0,0.311,,,,65.0,209.0,0.311,0.311,84.0,125.0,0.672,,,,39.0,,,,99.0,214.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
5,1950.0,Ralph Johnson,PG,28.0,FTW,32.0,,,,0.339,,0.13,,,,,,,,,-1.2,1.7,0.5,,,,,,110.0,353.0,0.312,,,,110.0,353.0,0.312,0.312,33.0,46.0,0.717,,,,67.0,,,,95.0,253.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
6,1950.0,Jack Kerris,PF,25.0,FTW,64.0,,,,0.405,,0.545,,,,,,,,,1.3,2.6,3.9,,,,,,149.0,455.0,0.327,,,,149.0,455.0,0.327,0.327,159.0,248.0,0.641,,,,110.0,,,,162.0,457.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
7,1950.0,Leo Klier,F-G,26.0,FTW,66.0,,,,0.379,,0.368,,,,,,,,,0.1,2.9,3.0,,,,,,157.0,516.0,0.304,,,,157.0,516.0,0.304,0.304,141.0,190.0,0.742,,,,121.0,,,,177.0,455.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
8,1950.0,Duane Klueh,G,24.0,FTW,19.0,,,,0.506,,0.616,,,,,,,,,1.4,0.6,2.1,,,,,,49.0,112.0,0.438,,,,49.0,112.0,0.438,0.438,46.0,69.0,0.667,,,,28.0,,,,38.0,144.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0
9,1950.0,John Mahnken,C,27.0,FTW,2.0,,,,0.376,,0.375,,,,,,,,,0.0,0.1,0.1,,,,,,3.0,8.0,0.375,,,,3.0,8.0,0.375,0.375,1.0,3.0,0.333,,,,2.0,,,,8.0,7.0,1949-50,NBA,40,28,3,,,,68.0,,1878.0,5901.0,0.318,,,,1878.0,5901.0,0.318,1634.0,2331.0,0.701,,,,1364.0,,,,2065.0,5390.0,Fort Wayne Pistons,,,,,,,,0,Minneapolis Lakers,Syracuse Nationals,,,G. Mikan (376),,J. Pollard (56),0,0,0,0,0,0,0,,0,0,0,0


In [102]:
stats_complete.tail(15)

Unnamed: 0,Year,Player,Pos,Age_Player,Tm,G_Player,GS,MP_Player,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG_Player,FGA_Player,FG%_Player,3P_Player,3PA_Player,3P%_Player,2P_Player,2PA_Player,2P%_Player,eFG%,FT_Player,FTA_Player,FT%_Player,ORB_Player,DRB_Player,TRB_Player,AST_Player,STL_Player,BLK_Player,TOV_Player,PF_Player,PTS_Player,Season,Lg,W,L,Finish,Age_Team,Ht.,Wt.,G_Team,MP_Team,FG_Team,FGA_Team,FG%_Team,3P_Team,3PA_Team,3P%_Team,2P_Team,2PA_Team,2P%_Team,FT_Team,FTA_Team,FT%_Team,ORB_Team,DRB_Team,TRB_Team,AST_Team,STL_Team,BLK_Team,TOV_Team,PF_Team,PTS_Team,TmFull,DRB_opp,ORB_opp,TRB_opp,FGA_opp,3PA_opp,TOV_opp,FTA_opp,AllStarSelections,Champion,Runner-up,Finals MVP,Points,Rebounds,Assists,Win Shares,Championships,Runner-Ups,ROY,dpoy_count,6Man,mip_count,abafinmvp_count,Lg_all_star_mvp,all_star_mvp_count,1st Team All NBA,2nd Team All NBA,3rd Team All NBA
23754,2017.0,DeMar DeRozan,SG,27.0,TOR,74.0,74.0,2620.0,24.0,0.552,0.08,0.419,3.0,13.6,8.3,20.6,1.5,0.4,9.0,34.3,6.7,2.4,9.0,0.166,2.4,-1.5,0.9,1.9,721.0,1545.0,0.467,33.0,124.0,0.266,688.0,1421.0,0.484,0.477,545.0,647.0,0.842,70.0,316.0,386.0,290.0,78.0,13.0,180.0,134.0,2020.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,4,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,1
23755,2017.0,Serge Ibaka,PF,27.0,TOR,23.0,23.0,712.0,13.8,0.556,0.369,0.122,4.6,20.1,12.4,3.3,0.5,4.0,11.7,20.9,0.4,0.8,1.3,0.085,-1.5,-0.5,-2.1,0.0,128.0,279.0,0.459,41.0,103.0,0.398,87.0,176.0,0.494,0.532,30.0,34.0,0.882,29.0,127.0,156.0,15.0,7.0,33.0,39.0,76.0,327.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23756,2017.0,Cory Joseph,SG,25.0,TOR,80.0,22.0,2003.0,13.2,0.518,0.204,0.185,2.9,10.4,6.6,20.0,1.7,0.6,13.1,18.4,1.7,1.7,3.4,0.082,-0.4,-0.4,-0.8,0.6,299.0,661.0,0.452,48.0,135.0,0.356,251.0,526.0,0.477,0.489,94.0,122.0,0.77,51.0,184.0,235.0,265.0,66.0,13.0,108.0,140.0,740.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23757,2017.0,Kyle Lowry,PG,30.0,TOR,60.0,60.0,2244.0,22.9,0.623,0.51,0.398,2.4,12.1,7.3,29.9,2.0,0.7,13.8,24.9,7.8,2.3,10.1,0.216,7.1,-0.4,6.7,4.9,426.0,918.0,0.464,193.0,468.0,0.412,233.0,450.0,0.518,0.569,299.0,365.0,0.819,48.0,240.0,288.0,417.0,88.0,19.0,173.0,170.0,1344.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,6,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23758,2017.0,Lucas Nogueira,C,24.0,TOR,57.0,6.0,1088.0,15.5,0.682,0.077,0.429,8.5,16.7,12.6,5.4,2.4,7.2,19.9,9.5,1.7,2.0,3.7,0.164,-0.1,5.6,5.4,2.0,103.0,156.0,0.66,3.0,12.0,0.25,100.0,144.0,0.694,0.67,44.0,67.0,0.657,82.0,161.0,243.0,42.0,52.0,90.0,46.0,137.0,253.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23759,2017.0,Patrick Patterson,PF,27.0,TOR,65.0,8.0,1599.0,10.8,0.542,0.659,0.156,4.4,16.2,10.3,6.6,1.3,1.2,8.5,12.5,1.7,1.6,3.3,0.1,0.5,0.4,0.9,1.2,154.0,384.0,0.401,94.0,253.0,0.372,60.0,131.0,0.458,0.523,43.0,60.0,0.717,62.0,229.0,291.0,76.0,40.0,23.0,38.0,120.0,445.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23760,2017.0,Jakob Poeltl,C,21.0,TOR,54.0,4.0,626.0,12.2,0.589,0.0,0.496,14.1,15.7,14.9,2.7,1.4,2.8,17.2,12.1,0.9,0.7,1.6,0.125,-1.2,0.5,-0.7,0.2,67.0,115.0,0.583,0.0,0.0,,67.0,115.0,0.583,0.583,31.0,57.0,0.544,78.0,87.0,165.0,12.0,17.0,20.0,29.0,113.0,165.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23761,2017.0,Norman Powell,SG,23.0,TOR,76.0,18.0,1368.0,14.0,0.552,0.342,0.314,2.2,11.7,6.9,9.3,1.9,0.9,10.8,21.1,1.4,1.4,2.8,0.097,-0.4,-1.2,-1.5,0.2,227.0,506.0,0.449,56.0,173.0,0.324,171.0,333.0,0.514,0.504,126.0,159.0,0.792,26.0,142.0,168.0,82.0,51.0,14.0,70.0,127.0,636.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23762,2017.0,Terrence Ross,SF,25.0,TOR,54.0,0.0,1207.0,14.9,0.558,0.534,0.104,1.3,11.7,6.5,5.9,2.3,1.4,6.2,19.8,1.6,1.4,2.9,0.116,1.2,-0.8,0.4,0.7,211.0,479.0,0.441,96.0,256.0,0.375,115.0,223.0,0.516,0.541,41.0,50.0,0.82,14.0,125.0,139.0,45.0,54.0,20.0,33.0,82.0,559.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0
23763,2017.0,Pascal Siakam,PF,22.0,TOR,55.0,38.0,859.0,11.5,0.523,0.034,0.156,8.4,15.9,12.2,2.9,1.5,4.5,13.1,13.1,0.4,1.2,1.6,0.089,-3.0,1.8,-1.1,0.2,103.0,205.0,0.502,1.0,7.0,0.143,102.0,198.0,0.515,0.505,22.0,32.0,0.688,64.0,121.0,185.0,17.0,26.0,45.0,33.0,109.0,229.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,1,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,1,0,,0,0,0,0


In [103]:
# Collect counts of players who have won finals MVP or led the playoffs for a certain year
# since only first initial and last name were used for this data, string manipulation needed to be used to match players 

stats_complete['Finals MVPs'] = stats_complete.apply(lambda x: 1 if (x.Player[0] == x['Finals MVP'][0]) 
                                                    & (x.Player.split(' ')[1] in x['Finals MVP'])
                                                    & (x.Champion == x.TmFull) else 0, axis = 1)

In [104]:
stats_complete['Playoff Leading PTS'] = stats_complete.apply(lambda x: 1 if (x.Player[0] == x['Points'][0]) 
                                                    & (x.Player.split(' ')[1] in x['Points']) else 0, axis = 1)

In [105]:
stats_complete['Playoff Leading RBS'] = stats_complete.apply(lambda x: 1 if (x.Player[0] == x['Rebounds'][0]) 
                                                    & (x.Player.split(' ')[1] in x['Rebounds']) else 0, axis = 1)

In [106]:
stats_complete['Playoff Leading ASTS'] = stats_complete.apply(lambda x: 1 if (x.Player[0] == x['Assists'][0]) 
                                                    & (x.Player.split(' ')[1] in x['Assists']) else 0, axis = 1)

In [107]:
stats_complete['Playoff Leading WS'] = stats_complete.apply(lambda x: 1 if (x.Player[0] == x['Win Shares'][0]) 
                                                    & (x.Player.split(' ')[1] in x['Win Shares']) else 0, axis = 1)

In [108]:
stats_complete.tail(15)

Unnamed: 0,Year,Player,Pos,Age_Player,Tm,G_Player,GS,MP_Player,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG_Player,FGA_Player,FG%_Player,3P_Player,3PA_Player,3P%_Player,2P_Player,2PA_Player,2P%_Player,eFG%,FT_Player,FTA_Player,FT%_Player,ORB_Player,DRB_Player,TRB_Player,AST_Player,STL_Player,BLK_Player,TOV_Player,PF_Player,PTS_Player,Season,Lg,W,L,Finish,Age_Team,Ht.,Wt.,G_Team,MP_Team,FG_Team,FGA_Team,FG%_Team,3P_Team,3PA_Team,3P%_Team,2P_Team,2PA_Team,2P%_Team,FT_Team,FTA_Team,FT%_Team,ORB_Team,DRB_Team,TRB_Team,AST_Team,STL_Team,BLK_Team,TOV_Team,PF_Team,PTS_Team,TmFull,DRB_opp,ORB_opp,TRB_opp,FGA_opp,3PA_opp,TOV_opp,FTA_opp,AllStarSelections,Champion,Runner-up,Finals MVP,Points,Rebounds,Assists,Win Shares,Championships,Runner-Ups,ROY,dpoy_count,6Man,mip_count,abafinmvp_count,Lg_all_star_mvp,all_star_mvp_count,1st Team All NBA,2nd Team All NBA,3rd Team All NBA,Finals MVPs,Playoff Leading PTS,Playoff Leading RBS,Playoff Leading ASTS,Playoff Leading WS
23754,2017.0,DeMar DeRozan,SG,27.0,TOR,74.0,74.0,2620.0,24.0,0.552,0.08,0.419,3.0,13.6,8.3,20.6,1.5,0.4,9.0,34.3,6.7,2.4,9.0,0.166,2.4,-1.5,0.9,1.9,721.0,1545.0,0.467,33.0,124.0,0.266,688.0,1421.0,0.484,0.477,545.0,647.0,0.842,70.0,316.0,386.0,290.0,78.0,13.0,180.0,134.0,2020.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,4,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,1,0,0,0,0,0
23755,2017.0,Serge Ibaka,PF,27.0,TOR,23.0,23.0,712.0,13.8,0.556,0.369,0.122,4.6,20.1,12.4,3.3,0.5,4.0,11.7,20.9,0.4,0.8,1.3,0.085,-1.5,-0.5,-2.1,0.0,128.0,279.0,0.459,41.0,103.0,0.398,87.0,176.0,0.494,0.532,30.0,34.0,0.882,29.0,127.0,156.0,15.0,7.0,33.0,39.0,76.0,327.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23756,2017.0,Cory Joseph,SG,25.0,TOR,80.0,22.0,2003.0,13.2,0.518,0.204,0.185,2.9,10.4,6.6,20.0,1.7,0.6,13.1,18.4,1.7,1.7,3.4,0.082,-0.4,-0.4,-0.8,0.6,299.0,661.0,0.452,48.0,135.0,0.356,251.0,526.0,0.477,0.489,94.0,122.0,0.77,51.0,184.0,235.0,265.0,66.0,13.0,108.0,140.0,740.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23757,2017.0,Kyle Lowry,PG,30.0,TOR,60.0,60.0,2244.0,22.9,0.623,0.51,0.398,2.4,12.1,7.3,29.9,2.0,0.7,13.8,24.9,7.8,2.3,10.1,0.216,7.1,-0.4,6.7,4.9,426.0,918.0,0.464,193.0,468.0,0.412,233.0,450.0,0.518,0.569,299.0,365.0,0.819,48.0,240.0,288.0,417.0,88.0,19.0,173.0,170.0,1344.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,6,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23758,2017.0,Lucas Nogueira,C,24.0,TOR,57.0,6.0,1088.0,15.5,0.682,0.077,0.429,8.5,16.7,12.6,5.4,2.4,7.2,19.9,9.5,1.7,2.0,3.7,0.164,-0.1,5.6,5.4,2.0,103.0,156.0,0.66,3.0,12.0,0.25,100.0,144.0,0.694,0.67,44.0,67.0,0.657,82.0,161.0,243.0,42.0,52.0,90.0,46.0,137.0,253.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23759,2017.0,Patrick Patterson,PF,27.0,TOR,65.0,8.0,1599.0,10.8,0.542,0.659,0.156,4.4,16.2,10.3,6.6,1.3,1.2,8.5,12.5,1.7,1.6,3.3,0.1,0.5,0.4,0.9,1.2,154.0,384.0,0.401,94.0,253.0,0.372,60.0,131.0,0.458,0.523,43.0,60.0,0.717,62.0,229.0,291.0,76.0,40.0,23.0,38.0,120.0,445.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23760,2017.0,Jakob Poeltl,C,21.0,TOR,54.0,4.0,626.0,12.2,0.589,0.0,0.496,14.1,15.7,14.9,2.7,1.4,2.8,17.2,12.1,0.9,0.7,1.6,0.125,-1.2,0.5,-0.7,0.2,67.0,115.0,0.583,0.0,0.0,,67.0,115.0,0.583,0.583,31.0,57.0,0.544,78.0,87.0,165.0,12.0,17.0,20.0,29.0,113.0,165.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23761,2017.0,Norman Powell,SG,23.0,TOR,76.0,18.0,1368.0,14.0,0.552,0.342,0.314,2.2,11.7,6.9,9.3,1.9,0.9,10.8,21.1,1.4,1.4,2.8,0.097,-0.4,-1.2,-1.5,0.2,227.0,506.0,0.449,56.0,173.0,0.324,171.0,333.0,0.514,0.504,126.0,159.0,0.792,26.0,142.0,168.0,82.0,51.0,14.0,70.0,127.0,636.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23762,2017.0,Terrence Ross,SF,25.0,TOR,54.0,0.0,1207.0,14.9,0.558,0.534,0.104,1.3,11.7,6.5,5.9,2.3,1.4,6.2,19.8,1.6,1.4,2.9,0.116,1.2,-0.8,0.4,0.7,211.0,479.0,0.441,96.0,256.0,0.375,115.0,223.0,0.516,0.541,41.0,50.0,0.82,14.0,125.0,139.0,45.0,54.0,20.0,33.0,82.0,559.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,0,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0
23763,2017.0,Pascal Siakam,PF,22.0,TOR,55.0,38.0,859.0,11.5,0.523,0.034,0.156,8.4,15.9,12.2,2.9,1.5,4.5,13.1,13.1,0.4,1.2,1.6,0.089,-3.0,1.8,-1.1,0.2,103.0,205.0,0.502,1.0,7.0,0.143,102.0,198.0,0.515,0.505,22.0,32.0,0.688,64.0,121.0,185.0,17.0,26.0,45.0,33.0,109.0,229.0,2016-17,NBA,51,31,2,26.1,6-6,222,82.0,19780.0,3211.0,6918.0,0.464,725.0,1996.0,0.363,2486.0,4922.0,0.505,1615.0,2028.0,0.796,871.0,2676.0,3547.0,1517.0,677.0,400.0,1041.0,1708.0,8762.0,Toronto Raptors,2619.0,831.0,3450.0,6801.0,2240.0,1200.0,1948.0,1,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),0,0,0,0,0,1,0,,0,0,0,0,0,0,0,0,0


In [109]:
stats_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23769 entries, 0 to 23768
Columns: 114 entries, Year to Playoff Leading WS
dtypes: float64(73), int64(12), object(29)
memory usage: 20.9+ MB


In [110]:
"""
define all of our functions to calculate career values for certain advanced statistics
as well as some more basic percentages that cannot be just averaged year to year because of different values/weights each year
these will then be applied to our dataframe of aggregate stats by player 
"""


def TrueShootingCalc(PTS, FGA, FTA):
    try:
        ts_perc = ((PTS)/(2 * (FGA + (0.44*FTA)))) * 100
        return(ts_perc)
    except:
        return 0

def offensiveReboundPercCalc(orb, team_mins, player_mins, team_off_reb, opp_def_reb):
    try:
        oreb_perc = ((orb * (team_mins / 5)) / (player_mins * (team_off_reb + opp_def_reb))) * 100
        return(oreb_perc)
    except:
        return 0

def defensiveReboundPercCalc(drb, team_mins, player_mins, team_def_reb, opp_off_reb):
    try:
        dreb_perc = ((drb * (team_mins / 5)) / (player_mins * (team_def_reb + opp_off_reb))) * 100
        return(dreb_perc)
    except:
        return 0

def totalReboundPercCalc(trb, team_mins, player_mins, team_tot_reb, opp_tot_reb):
    try:
        treb_perc = ((trb * (team_mins / 5)) / (player_mins * (team_tot_reb + opp_tot_reb))) * 100
        return(treb_perc)
    except:
        return 0

def assistPercentage(Assists, mins, team_mins, team_field_goals, field_goals):
    try:
        assist_perc = (Assists / (((mins /(team_mins/5)) * team_field_goals) - field_goals)) * 100
        return(assist_perc)
    except:
        return 0

def stealPercentage(steals, team_mins, mins, opp_pos):
    try:
        steal_perc = ((steals * (team_mins / 5)) / (mins * opp_pos)) * 100
        return(steal_perc)
    except:
        return 0

def blockPercentage(blks, team_mins, mins, opp_fga, opp_3fga):
    try:
        block_perc = ((blks * (team_mins / 5)) / (mins * (opp_fga - opp_3fga))) * 100
        return(block_perc)
    except:
        return 0

def turnoverPercentage(tov, fga, fta):
    try:
        to_perc = ((tov) / (fga + (0.44 * fta) + tov)) * 100
        return(to_perc)
    except:
        return 0

def usagePercentage(fga, fta, tov, team_mins, mins, team_fga, team_fta, team_tov):
    try:
        usg_perc = (((fga + (0.44 * fta) + tov) * (team_mins / 5)) / (mins * (team_fga + (0.44 * team_fta) + team_tov))) * 100
        return(usg_perc)
    except:
        return 0

def freeThrowPercentage(fta, ft):
    try:
        ft_perc = (ft / fta) * 100
        return(ft_perc)
    except:
        return 0

def fieldGoalPercentage(fga, fg):
    try:
        fg_perc = (fg / fga) * 100
        return(fg_perc)
    except:
        return 0

def twoPointPercentage(two_pa, two_p):
    try:
        two_perc = (two_p / two_pa) * 100
        return(two_perc)
    except:
        return 0

def threePointPercentage(three_pa, three_p):
    try:
        three_perc = (three_p / three_pa) * 100
        return(three_perc)
    except:
        return 0

def effectiveFieldGoal(fgs, three_fgs, fga):
    try:
        efg_perc = (((fgs + (0.5 * three_fgs)) / (fga))) * 100
        return(efg_perc)
    except:
        return 0
    
def hofCheck(player):
    if player in df_hof['playerName'].values:
        return 1
    else:
        return 0

In [111]:
# aggregate by all columns that can simply be summed 

stats_complete_agg = stats_complete.groupby('Player', as_index = False)[['G_Player', 'GS', 'MP_Player', 'OWS', 'DWS', 'WS', 'FG_Player',
                                                       'FGA_Player', '3P_Player', '3PA_Player', '2P_Player',
                                                       '2PA_Player', 'FT_Player', 'FTA_Player', 'ORB_Player', 'DRB_Player',
                                                       'TRB_Player', 'AST_Player', 'STL_Player', 
                                                       'BLK_Player', 'TOV_Player', 'PF_Player', 'PTS_Player', 
                                                       'G_Team', 'MP_Team', 'FG_Team',
                                                       'FGA_Team', '3P_Team', '3PA_Team', '2P_Team',
                                                       '2PA_Team', 'FT_Team', 'FTA_Team', 'ORB_Team', 'DRB_Team',
                                                       'TRB_Team', 'AST_Team', 'STL_Team', 
                                                       'BLK_Team', 'TOV_Team', 'PF_Team', 'PTS_Team', 'DRB_opp',
                                                       'ORB_opp', 'TRB_opp', 'FGA_opp', '3PA_opp', 'TOV_opp', 'FTA_opp',
                                                       'Championships', 'Runner-Ups', 'ROY', '6Man', '1st Team All NBA',
                                                       '2nd Team All NBA', '3rd Team All NBA', 'Finals MVPs',
                                                       'Playoff Leading PTS', 'Playoff Leading RBS', 'Playoff Leading ASTS',
                                                                        'Playoff Leading WS']].sum()

In [112]:
# Perform the slightly more verbose way of merging stats that are already summed as counts that was mentioned previously 

stats_complete_unique = stats_complete.drop_duplicates(subset = 'Player')

In [113]:
stats_complete_agg = stats_complete_agg.merge(right = stats_complete_unique[['Player', 'AllStarSelections', 'dpoy_count', 'mip_count',
                                                                     'abafinmvp_count', 'all_star_mvp_count']], on = 'Player')

In [114]:
# create a column for the number of seasons that a player played 

stats_complete_agg['Seasons_Played'] = stats_complete_agg.apply(
    lambda x: len(stats_complete[stats_complete['Player'] == x.Player]['Year'].unique()), axis = 1)

In [115]:
# Peform a groupby and mean for PER, without more advanced methods this is the best way to estimate this career value 

PER = stats_complete.groupby('Player')['PER'].mean()

stats_complete_agg = stats_complete_agg.merge(right = PER, on = 'Player')

stats_complete_agg['PER'] = stats_complete_agg['PER'].fillna(0)

In [116]:
# Check the list of the 25 highest scoring players of all time to make sure everything has still gone correctly so far 

stats_complete_agg.sort_values('PTS_Player', ascending = False).head(25)

Unnamed: 0,Player,G_Player,GS,MP_Player,OWS,DWS,WS,FG_Player,FGA_Player,3P_Player,3PA_Player,2P_Player,2PA_Player,FT_Player,FTA_Player,ORB_Player,DRB_Player,TRB_Player,AST_Player,STL_Player,BLK_Player,TOV_Player,PF_Player,PTS_Player,G_Team,MP_Team,FG_Team,FGA_Team,3P_Team,3PA_Team,2P_Team,2PA_Team,FT_Team,FTA_Team,ORB_Team,DRB_Team,TRB_Team,AST_Team,STL_Team,BLK_Team,TOV_Team,PF_Team,PTS_Team,DRB_opp,ORB_opp,TRB_opp,FGA_opp,3PA_opp,TOV_opp,FTA_opp,Championships,Runner-Ups,ROY,6Man,1st Team All NBA,2nd Team All NBA,3rd Team All NBA,Finals MVPs,Playoff Leading PTS,Playoff Leading RBS,Playoff Leading ASTS,Playoff Leading WS,AllStarSelections,dpoy_count,mip_count,abafinmvp_count,all_star_mvp_count,Seasons_Played,PER
2106,Kareem Abdul-Jabbar,2109.0,625.0,80779.0,266.5,138.8,405.0,22653.0,40822.0,1.0,18.0,22652.0,40804.0,9566.0,13402.0,3728.0,12114.0,25984.0,8081.0,1456.0,4022.0,2527.0,6382.0,54873.0,2214.0,534085.0,101705.0,203616.0,879.0,2906.0,100826.0,200710.0,44146.0,58302.0,21130.0,50170.0,105854.0,60593.0,13900.0,9026.0,29197.0,51169.0,248435.0,45688.0,24110.0,93482.0,194966.0,3107.0,27660.0,52304.0,6,4,2,0,15,6,0,2,0,3,1,0,19,0,0,0,0,20,24.8
1213,Elvin Hayes,1955.0,129.0,77843.0,55.3,137.6,192.8,17390.0,38790.0,5.0,34.0,17385.0,38756.0,8448.0,12578.0,3563.0,9533.0,26152.0,3746.0,1212.0,2400.0,1358.0,6202.0,43233.0,1968.0,474645.0,86209.0,190584.0,285.0,1080.0,85924.0,189504.0,40616.0,55565.0,17017.0,37550.0,100839.0,47132.0,9666.0,6091.0,21634.0,46982.0,213319.0,37413.0,16547.0,79698.0,154405.0,874.0,21185.0,42833.0,1,2,0,0,4,6,0,0,0,1,1,0,12,0,0,0,0,16,17.7375
1963,John Havlicek,1992.0,0.0,75499.0,101.6,119.2,221.0,16993.0,38452.0,0.0,0.0,16993.0,38452.0,8910.0,10854.0,1018.0,2266.0,13012.0,10396.0,778.0,194.0,204.0,5185.0,42896.0,2041.0,453465.0,91284.0,206610.0,0.0,0.0,91284.0,206610.0,46420.0,61168.0,10600.0,23806.0,120880.0,50336.0,4717.0,2264.0,13385.0,51094.0,228988.0,21098.0,8708.0,54764.0,111139.0,0.0,11773.0,31802.0,8,0,0,0,8,12,0,1,0,1,0,1,13,0,0,0,0,16,17.644
3768,Wilt Chamberlain,1466.0,0.0,66510.0,200.7,130.9,331.2,15860.0,28776.0,0.0,0.0,15860.0,28776.0,7676.0,15456.0,0.0,0.0,32400.0,6806.0,0.0,0.0,0.0,2969.0,39396.0,1699.0,314950.0,77934.0,170603.0,0.0,0.0,77934.0,170603.0,41856.0,60934.0,0.0,0.0,107517.0,43089.0,0.0,0.0,0.0,38624.0,197724.0,0.0,0.0,25886.0,50946.0,0.0,0.0,11324.0,2,4,1,0,8,5,0,1,0,1,6,1,13,0,0,0,1,14,24.828571
2801,Oscar Robertson,1541.0,0.0,63775.0,204.5,56.2,260.7,13266.0,27394.0,0.0,0.0,13266.0,27394.0,10478.0,12491.0,142.0,416.0,10543.0,14006.0,154.0,8.0,0.0,4154.0,37010.0,1708.0,335235.0,77928.0,165935.0,0.0,0.0,77928.0,165935.0,40549.0,53961.0,2266.0,5762.0,99729.0,43393.0,1452.0,1038.0,3388.0,41876.0,196405.0,4974.0,2538.0,31196.0,64152.0,0.0,3108.0,16864.0,1,1,1,0,11,4,0,0,0,0,0,2,12,0,0,0,3,14,21.766667
1816,Jerry West,1364.0,0.0,53235.0,179.8,57.7,238.1,13120.0,27488.0,0.0,0.0,13120.0,27488.0,10232.0,12548.0,60.0,172.0,7312.0,9740.0,162.0,46.0,0.0,3510.0,36472.0,1708.0,335385.0,76458.0,166494.0,0.0,0.0,76458.0,166494.0,43675.0,59442.0,2730.0,5940.0,101547.0,42462.0,1588.0,1306.0,3826.0,39741.0,196591.0,5572.0,3050.0,34508.0,67674.0,0.0,3438.0,15412.0,1,8,0,0,14,4,0,0,0,5,0,3,14,0,0,0,1,14,22.933333
2108,Karl Malone,1394.0,1389.0,51905.0,131.6,88.1,219.3,12776.0,24734.0,83.0,302.0,12693.0,24432.0,9198.0,12449.0,3393.0,10796.0,14189.0,4944.0,2006.0,1074.0,4293.0,4349.0,34833.0,1444.0,348635.0,56001.0,116071.0,3853.0,11144.0,52148.0,104927.0,32747.0,43311.0,17742.0,43217.0,60959.0,37955.0,12603.0,8350.0,23366.0,34297.0,148602.0,39954.0,18745.0,58699.0,118056.0,16535.0,23333.0,40932.0,0,3,0,0,11,1,1,0,0,0,1,0,14,0,0,0,2,18,23.45
1324,Gail Goodrich,1738.0,0.0,58974.0,101.4,36.8,138.4,13269.0,29116.0,0.0,0.0,13269.0,29116.0,7776.0,9601.0,738.0,1049.0,5756.0,8478.0,896.0,107.0,390.0,4847.0,34314.0,1883.0,454170.0,83781.0,181015.0,0.0,0.0,83781.0,181015.0,45139.0,60829.0,11410.0,25617.0,102526.0,47704.0,6481.0,4638.0,15784.0,45256.0,212701.0,25052.0,12739.0,63677.0,123151.0,0.0,14504.0,30849.0,1,3,0,0,2,0,0,0,0,0,0,0,5,0,0,0,0,14,16.682609
857,Dave Bing,1578.0,0.0,59004.0,84.4,41.5,125.6,12567.0,28479.0,0.0,0.0,12567.0,28479.0,8153.0,10524.0,706.0,1257.0,6126.0,9889.0,826.0,155.0,216.0,4616.0,33287.0,1721.0,415540.0,73937.0,161962.0,0.0,0.0,73937.0,161962.0,39349.0,53436.0,8862.0,21624.0,87501.0,37621.0,5621.0,3296.0,13313.0,41625.0,187223.0,20878.0,9153.0,55407.0,108062.0,0.0,12628.0,29733.0,0,0,1,0,4,2,0,0,0,0,0,0,7,0,0,0,1,12,17.780952
2593,Michael Jordan,1072.0,1039.0,41011.0,149.9,64.2,213.9,12192.0,24537.0,581.0,1778.0,11611.0,22759.0,7327.0,8772.0,1668.0,5004.0,6672.0,5633.0,2514.0,893.0,2924.0,2783.0,32292.0,1230.0,297075.0,50053.0,104246.0,3591.0,10264.0,46462.0,93982.0,24199.0,31810.0,17279.0,35691.0,52970.0,30723.0,10578.0,5999.0,18290.0,27501.0,127896.0,34056.0,15681.0,49737.0,101622.0,11631.0,19579.0,31730.0,6,0,1,0,10,1,0,6,0,8,0,0,14,1,0,0,3,15,27.373333


In [117]:
# Apply all of our calculations on our aggregate dataframe 

stats_complete_agg['TS%'] = stats_complete_agg.apply(lambda x: TrueShootingCalc(x.PTS_Player , x.FGA_Player , x.FTA_Player), axis = 1)

In [118]:
stats_complete_agg['ORB%'] = stats_complete_agg.apply(lambda x: offensiveReboundPercCalc(x.ORB_Player , x.MP_Team , x.MP_Player ,
                                                                                 x.ORB_Team , x.DRB_opp), axis = 1)

In [119]:
stats_complete_agg['DRB%'] = stats_complete_agg.apply(lambda x: defensiveReboundPercCalc(x.DRB_Player , x.MP_Team , x.MP_Player ,
                                                                                 x.DRB_Team , x.ORB_opp), axis = 1)

In [120]:
stats_complete_agg['TRB%'] = stats_complete_agg.apply(lambda x: totalReboundPercCalc(x.TRB_Player , x.MP_Team , x.MP_Player ,
                                                                                 x.TRB_Team , x.TRB_opp), axis = 1)

In [121]:
stats_complete_agg['AST%'] = stats_complete_agg.apply(lambda x: assistPercentage(x.AST_Player , x.MP_Player , x.MP_Team ,
                                                                                 x.FG_Team , x.FG_Player), axis = 1)

In [122]:
stats_complete_agg['STL%'] = stats_complete_agg.apply(lambda x: stealPercentage(x.STL_Player , x.MP_Team , x.MP_Player ,
                                                                                (x.FGA_opp + x.TOV_opp + (x.FTA_opp / 2)) ), axis = 1)

In [123]:
stats_complete_agg['BLK%'] = stats_complete_agg.apply(lambda x: blockPercentage(x.BLK_Player , x.MP_Team , x.MP_Player ,
                                                                                 x.FGA_opp , x['3PA_opp']), axis = 1)

In [124]:
stats_complete_agg['TOV%'] = stats_complete_agg.apply(lambda x: turnoverPercentage(x.TOV_Player , x.FGA_Player , x.FTA_Player), axis = 1)

In [125]:
stats_complete_agg['USG%'] = stats_complete_agg.apply(lambda x: usagePercentage(x.FGA_Player , x.FTA_Player , x.TOV_Player ,
                                                                                 x.MP_Team , x.MP_Player ,
                                                                                 x.FGA_Team , x.FTA_Team , x.TOV_Team), axis = 1)

In [126]:
stats_complete_agg['FT%'] = stats_complete_agg.apply(lambda x: freeThrowPercentage(x.FTA_Player , x.FT_Player), axis = 1)

In [127]:
stats_complete_agg['FG%'] = stats_complete_agg.apply(lambda x: fieldGoalPercentage(x.FGA_Player , x.FG_Player), axis = 1)

In [128]:
stats_complete_agg['2P%'] = stats_complete_agg.apply(lambda x: twoPointPercentage(x['2PA_Player'] , x['2P_Player']), axis = 1)

In [129]:
stats_complete_agg['3P%'] = stats_complete_agg.apply(lambda x: threePointPercentage(x['3PA_Player'] , x['3P_Player']), axis = 1)

In [130]:
stats_complete_agg['EFG%'] = stats_complete_agg.apply(lambda x: effectiveFieldGoal(x['FG_Player'] , x['3P_Player'] , x.FGA_Player), axis = 1)

In [131]:
stats_complete_agg['PPG_career'] = stats_complete_agg.apply(lambda x: x.PTS_Player / x.G_Player, axis = 1)

In [132]:
stats_complete_agg['RPG_career'] = stats_complete_agg.apply(lambda x: x.TRB_Player / x.G_Player, axis = 1)

In [133]:
stats_complete_agg['APG_career'] = stats_complete_agg.apply(lambda x: x.AST_Player / x.G_Player, axis = 1)

In [134]:
stats_complete_agg['WS_per_season'] = stats_complete_agg.apply(lambda x: x.WS / x.Seasons_Played, axis = 1)

In [135]:
# Merge our MVP dataframe to have the count of times winning the award per player 

stats_complete_agg = stats_complete_agg.merge(right = df_season_mvps, on = 'Player', how = 'left')

In [136]:
stats_complete_agg['season_mvp_count'] = stats_complete_agg['season_mvp_count'].fillna(0)

In [137]:
# Add a column of a binary value of whether or not a player is in the hall of fame, will be used as the target for our classifier 

stats_complete_agg['HOF'] = stats_complete_agg.apply(lambda x: hofCheck(x.Player), axis = 1)

In [138]:
# output an even longer list of leading scorers to dive even deeper into our new dataframe 

stats_complete_agg.sort_values('PTS_Player', ascending = False).head(50)

Unnamed: 0,Player,G_Player,GS,MP_Player,OWS,DWS,WS,FG_Player,FGA_Player,3P_Player,3PA_Player,2P_Player,2PA_Player,FT_Player,FTA_Player,ORB_Player,DRB_Player,TRB_Player,AST_Player,STL_Player,BLK_Player,TOV_Player,PF_Player,PTS_Player,G_Team,MP_Team,FG_Team,FGA_Team,3P_Team,3PA_Team,2P_Team,2PA_Team,FT_Team,FTA_Team,ORB_Team,DRB_Team,TRB_Team,AST_Team,STL_Team,BLK_Team,TOV_Team,PF_Team,PTS_Team,DRB_opp,ORB_opp,TRB_opp,FGA_opp,3PA_opp,TOV_opp,FTA_opp,Championships,Runner-Ups,ROY,6Man,1st Team All NBA,2nd Team All NBA,3rd Team All NBA,Finals MVPs,Playoff Leading PTS,Playoff Leading RBS,Playoff Leading ASTS,Playoff Leading WS,AllStarSelections,dpoy_count,mip_count,abafinmvp_count,all_star_mvp_count,Seasons_Played,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,FT%,FG%,2P%,3P%,EFG%,PPG_career,RPG_career,APG_career,WS_per_season,MVP_league,season_mvp_count,HOF
2107,Kareem Abdul-Jabbar,2109.0,625.0,80779.0,266.5,138.8,405.0,22653.0,40822.0,1.0,18.0,22652.0,40804.0,9566.0,13402.0,3728.0,12114.0,25984.0,8081.0,1456.0,4022.0,2527.0,6382.0,54873.0,2214.0,534085.0,101705.0,203616.0,879.0,2906.0,100826.0,200710.0,44146.0,58302.0,21130.0,50170.0,105854.0,60593.0,13900.0,9026.0,29197.0,51169.0,248435.0,45688.0,24110.0,93482.0,194966.0,3107.0,27660.0,52304.0,6,4,2,0,15,6,0,2,0,3,1,0,19,0,0,0,0,20,24.8,58.726793,7.377757,21.565403,17.237019,14.893076,0.773912,2.772055,5.131394,25.194665,71.377406,55.492137,55.514165,5.555556,55.493361,26.018492,12.320531,3.831674,20.25,NBA,6,1
1213,Elvin Hayes,1955.0,129.0,77843.0,55.3,137.6,192.8,17390.0,38790.0,5.0,34.0,17385.0,38756.0,8448.0,12578.0,3563.0,9533.0,26152.0,3746.0,1212.0,2400.0,1358.0,6202.0,43233.0,1968.0,474645.0,86209.0,190584.0,285.0,1080.0,85924.0,189504.0,40616.0,55565.0,17017.0,37550.0,100839.0,47132.0,9666.0,6091.0,21634.0,46982.0,213319.0,37413.0,16547.0,79698.0,154405.0,874.0,21185.0,42833.0,1,2,0,0,4,6,0,0,0,1,1,0,12,0,0,0,0,16,17.7375,48.768938,7.982829,21.489967,17.665178,7.027815,0.750242,1.906314,2.972704,23.539136,67.164891,44.831142,44.85757,14.705882,44.837587,22.114066,13.376982,1.916113,12.05,,0,1
1963,John Havlicek,1992.0,0.0,75499.0,101.6,119.2,221.0,16993.0,38452.0,0.0,0.0,16993.0,38452.0,8910.0,10854.0,1018.0,2266.0,13012.0,10396.0,778.0,194.0,204.0,5185.0,42896.0,2041.0,453465.0,91284.0,206610.0,0.0,0.0,91284.0,206610.0,46420.0,61168.0,10600.0,23806.0,120880.0,50336.0,4717.0,2264.0,13385.0,51094.0,228988.0,21098.0,8708.0,54764.0,111139.0,0.0,11773.0,31802.0,8,0,0,0,8,12,0,1,0,1,0,1,13,0,0,0,0,16,17.644,49.616265,3.857878,8.371862,8.899043,17.62094,0.673259,0.209685,0.469702,21.130181,82.089552,44.19276,44.19276,0.0,44.19276,21.534137,6.532129,5.218876,13.8125,,0,1
3769,Wilt Chamberlain,1466.0,0.0,66510.0,200.7,130.9,331.2,15860.0,28776.0,0.0,0.0,15860.0,28776.0,7676.0,15456.0,0.0,0.0,32400.0,6806.0,0.0,0.0,0.0,2969.0,39396.0,1699.0,314950.0,77934.0,170603.0,0.0,0.0,77934.0,170603.0,41856.0,60934.0,0.0,0.0,107517.0,43089.0,0.0,0.0,0.0,38624.0,197724.0,0.0,0.0,25886.0,50946.0,0.0,0.0,11324.0,2,4,1,0,8,5,0,1,0,1,6,1,13,0,0,0,1,14,24.828571,55.367792,0.0,0.0,23.001919,10.24551,0.0,0.0,0.0,17.067571,49.663561,55.115374,55.115374,0.0,55.115374,26.873124,22.100955,4.642565,23.657143,NBA,4,1
2802,Oscar Robertson,1541.0,0.0,63775.0,204.5,56.2,260.7,13266.0,27394.0,0.0,0.0,13266.0,27394.0,10478.0,12491.0,142.0,416.0,10543.0,14006.0,154.0,8.0,0.0,4154.0,37010.0,1708.0,335235.0,77928.0,165935.0,0.0,0.0,77928.0,165935.0,40549.0,53961.0,2266.0,5762.0,99729.0,43393.0,1452.0,1038.0,3388.0,41876.0,196405.0,4974.0,2538.0,31196.0,64152.0,0.0,3108.0,16864.0,1,1,1,0,11,4,0,0,0,0,0,2,12,0,0,0,3,14,21.766667,56.263233,2.061953,5.269193,8.465849,23.013855,0.213895,0.01311,0.0,17.909681,83.884397,48.426663,48.426663,0.0,48.426663,24.016872,6.841661,9.088903,18.621429,NBA,1,1
1816,Jerry West,1364.0,0.0,53235.0,179.8,57.7,238.1,13120.0,27488.0,0.0,0.0,13120.0,27488.0,10232.0,12548.0,60.0,172.0,7312.0,9740.0,162.0,46.0,0.0,3510.0,36472.0,1708.0,335385.0,76458.0,166494.0,0.0,0.0,76458.0,166494.0,43675.0,59442.0,2730.0,5940.0,101547.0,42462.0,1588.0,1306.0,3826.0,39741.0,196591.0,5572.0,3050.0,34508.0,67674.0,0.0,3438.0,15412.0,1,8,0,0,14,4,0,0,0,5,0,3,14,0,0,0,1,14,22.933333,55.245338,0.910636,2.410711,6.771705,20.479335,0.25898,0.085647,0.0,21.169187,81.542875,47.729919,47.729919,0.0,47.729919,26.739003,5.360704,7.140762,17.007143,,0,1
2109,Karl Malone,1394.0,1389.0,51905.0,131.6,88.1,219.3,12776.0,24734.0,83.0,302.0,12693.0,24432.0,9198.0,12449.0,3393.0,10796.0,14189.0,4944.0,2006.0,1074.0,4293.0,4349.0,34833.0,1444.0,348635.0,56001.0,116071.0,3853.0,11144.0,52148.0,104927.0,32747.0,43311.0,17742.0,43217.0,60959.0,37955.0,12603.0,8350.0,23366.0,34297.0,148602.0,39954.0,18745.0,58699.0,118056.0,16535.0,23333.0,40932.0,0,3,0,0,11,1,1,0,0,0,1,0,14,0,0,0,2,18,23.45,57.648463,7.900052,23.40611,15.929489,17.100567,1.664932,1.421151,12.441834,29.245287,73.885453,51.653594,51.952358,27.483444,51.821379,24.987805,10.178623,3.546628,12.183333,NBA,2,1
1324,Gail Goodrich,1738.0,0.0,58974.0,101.4,36.8,138.4,13269.0,29116.0,0.0,0.0,13269.0,29116.0,7776.0,9601.0,738.0,1049.0,5756.0,8478.0,896.0,107.0,390.0,4847.0,34314.0,1883.0,454170.0,83781.0,181015.0,0.0,0.0,83781.0,181015.0,45139.0,60829.0,11410.0,25617.0,102526.0,47704.0,6481.0,4638.0,15784.0,45256.0,212701.0,25052.0,12739.0,63677.0,123151.0,0.0,14504.0,30849.0,1,3,0,0,2,0,0,0,0,0,0,0,5,0,0,0,0,14,16.682609,51.460029,3.11748,4.212404,5.334206,20.614778,0.901527,0.133824,1.156226,23.238519,80.991563,45.572881,45.572881,0.0,45.572881,19.743383,3.311853,4.878021,9.885714,,0,1
857,Dave Bing,1578.0,0.0,59004.0,84.4,41.5,125.6,12567.0,28479.0,0.0,0.0,12567.0,28479.0,8153.0,10524.0,706.0,1257.0,6126.0,9889.0,826.0,155.0,216.0,4616.0,33287.0,1721.0,415540.0,73937.0,161962.0,0.0,0.0,73937.0,161962.0,39349.0,53436.0,8862.0,21624.0,87501.0,37621.0,5621.0,3296.0,13313.0,41625.0,187223.0,20878.0,9153.0,55407.0,108062.0,0.0,12628.0,29733.0,0,0,1,0,4,2,0,0,0,0,0,0,7,0,0,0,1,12,17.780952,50.267959,3.343683,5.752682,6.037843,24.768392,0.858264,0.202032,0.648151,23.613002,77.470544,44.127252,44.127252,0.0,44.127252,21.094423,3.882129,6.266793,10.466667,,0,1
2594,Michael Jordan,1072.0,1039.0,41011.0,149.9,64.2,213.9,12192.0,24537.0,581.0,1778.0,11611.0,22759.0,7327.0,8772.0,1668.0,5004.0,6672.0,5633.0,2514.0,893.0,2924.0,2783.0,32292.0,1230.0,297075.0,50053.0,104246.0,3591.0,10264.0,46462.0,93982.0,24199.0,31810.0,17279.0,35691.0,52970.0,30723.0,10578.0,5999.0,18290.0,27501.0,127896.0,34056.0,15681.0,49737.0,101622.0,11631.0,19579.0,31730.0,6,0,1,0,10,1,0,6,0,8,0,0,14,1,0,0,3,15,27.373333,56.85876,4.707369,14.111935,9.411346,25.195788,2.657243,1.437633,9.335685,33.234657,83.527132,49.688226,51.01718,32.677165,50.872152,30.123134,6.223881,5.254664,14.26,NBA,5,1


In [139]:
"""
Check the info of both our finalized dataframes, 
even after all modifications our complete dataframe still has a relatively low memory footprint at just about 21 mb
Our new aggregate dataframe obviously has a much lower memory footprint, and all values necessary for this analysis have no null values
(we're not concerned with mvp league right now)
"""
stats_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23769 entries, 0 to 23768
Columns: 114 entries, Year to Playoff Leading WS
dtypes: float64(73), int64(12), object(29)
memory usage: 20.9+ MB


In [140]:
stats_complete_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3807 entries, 0 to 3806
Data columns (total 90 columns):
Player                  3807 non-null object
G_Player                3807 non-null float64
GS                      3807 non-null float64
MP_Player               3807 non-null float64
OWS                     3807 non-null float64
DWS                     3807 non-null float64
WS                      3807 non-null float64
FG_Player               3807 non-null float64
FGA_Player              3807 non-null float64
3P_Player               3807 non-null float64
3PA_Player              3807 non-null float64
2P_Player               3807 non-null float64
2PA_Player              3807 non-null float64
FT_Player               3807 non-null float64
FTA_Player              3807 non-null float64
ORB_Player              3807 non-null float64
DRB_Player              3807 non-null float64
TRB_Player              3807 non-null float64
AST_Player              3807 non-null float64
STL_Player          

In [141]:
# Print a list of our columns so that we can use it to index the columns we want to select as features 

stats_complete_agg.columns

Index(['Player', 'G_Player', 'GS', 'MP_Player', 'OWS', 'DWS', 'WS',
       'FG_Player', 'FGA_Player', '3P_Player', '3PA_Player', '2P_Player',
       '2PA_Player', 'FT_Player', 'FTA_Player', 'ORB_Player', 'DRB_Player',
       'TRB_Player', 'AST_Player', 'STL_Player', 'BLK_Player', 'TOV_Player',
       'PF_Player', 'PTS_Player', 'G_Team', 'MP_Team', 'FG_Team', 'FGA_Team',
       '3P_Team', '3PA_Team', '2P_Team', '2PA_Team', 'FT_Team', 'FTA_Team',
       'ORB_Team', 'DRB_Team', 'TRB_Team', 'AST_Team', 'STL_Team', 'BLK_Team',
       'TOV_Team', 'PF_Team', 'PTS_Team', 'DRB_opp', 'ORB_opp', 'TRB_opp',
       'FGA_opp', '3PA_opp', 'TOV_opp', 'FTA_opp', 'Championships',
       'Runner-Ups', 'ROY', '6Man', '1st Team All NBA', '2nd Team All NBA',
       '3rd Team All NBA', 'Finals MVPs', 'Playoff Leading PTS',
       'Playoff Leading RBS', 'Playoff Leading ASTS', 'Playoff Leading WS',
       'AllStarSelections', 'dpoy_count', 'mip_count', 'abafinmvp_count',
       'all_star_mvp_count', 'Seasons_

In [142]:
# Creating pickle file to use later without having to rerun
# cells and re scrape all of our data

pickling_on = open("statsHOF.pickle", "wb")
pickle.dump(stats_complete_agg, pickling_on)
pickling_on.close()

In [143]:
# Creating pickle file for our team dataset to use later (for different future
# projects, we won't need it anymore here) without having to rerun
# cells and re scrape all of our data

pickling_on_teams = open("statsTeams.pickle", "wb")
pickle.dump(df_teams, pickling_on_teams)
pickling_on_teams.close()

In [144]:
# Creating pickle file for our opponents dataset to use later (for different future
# projects, we won't need it anymore here) without having to rerun
# cells and re scrape all of our data

pickling_on_opps = open("statsOpps.pickle", "wb")
pickle.dump(opps_df, pickling_on_opps)
pickling_on_opps.close()

In [None]:
# Closing down our kernel once pickle files are created to clear cache 

os._exit(00)