# Data Collection & DataFrame Structuring

Author: Holly Bok

## NBA API

In [1]:
# Importing packages
# Information and documentation on the NBA API maintained by user 'swar' can be found at:
# https://github.com/swar/nba_api

import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.static import players

### Active Players List

In [2]:
# Build 'players_df' (name, id and active flag for every player returned by
# players.get_players()) and keep the active rows in 'active_players_df'

all_players = players.get_players()

players_df = pd.DataFrame({
    'player_name': [entry['full_name'] for entry in all_players],
    'player_id': [entry['id'] for entry in all_players],
    'active': [entry['is_active'] for entry in all_players],
})

# Boolean mask of rows whose 'active' flag equals True
is_active = players_df['active'].eq(True)
active_players_df = players_df[is_active]

### Career Stats

In [3]:
# Iterating through all players in the active players DataFrame and gathering information
# from the PlayerCareerStats endpoint. Saving career statistics in a new DataFrame called
# 'career_stats_df'. Players whose career row cannot be read are recorded in 'errors'
# and are left out of 'career_stats_df'.

# Output key -> column name in the frame returned by the endpoint
CAREER_STAT_COLUMNS = {
    'player_id': 'PLAYER_ID',
    'league_id': 'LEAGUE_ID',
    'team_id': 'Team_ID',
    'gp': 'GP',
    'gs': 'GS',
    'min': 'MIN',
    'fgm': 'FGM',
    'fga': 'FGA',
    'fg_pct': 'FG_PCT',
    'fg3m': 'FG3M',
    'fg3a': 'FG3A',
    'fg3_pct': 'FG3_PCT',
    'ft_pct': 'FT_PCT',
    'oreb': 'OREB',
    'dreb': 'DREB',
    'reb': 'REB',
    'ast': 'AST',
    'stl': 'STL',
    'blk': 'BLK',
    'tov': 'TOV',
    'pf': 'PF',
    'pts': 'PTS',
}

# Pause (in seconds) between consecutive requests to the API
REQUEST_PAUSE_SECONDS = 3

career_stats = []
errors = []
count = 0
for each in active_players_df['player_id']:
    stats = playercareerstats.PlayerCareerStats(player_id=each)
    # Second frame returned by the endpoint (presumably the regular-season career
    # totals -- confirm against the nba_api endpoint documentation)
    stats = stats.get_data_frames()[1]
    try:
        player_career = {key: stats[column][0]
                         for key, column in CAREER_STAT_COLUMNS.items()}
    except Exception:
        # 'Exception' rather than a bare 'except' so that interrupting the kernel
        # still stops this long-running loop
        errors.append(each)
    else:
        # Append only on success. Appending unconditionally would re-add the previous
        # player's dict for a failed player (or raise NameError if the first one failed)
        career_stats.append(player_career)
    count += 1
    time.sleep(REQUEST_PAUSE_SECONDS)
    if count % 100 == 0:
        print('Count:', count)

print('Done!')

career_stats_df = pd.DataFrame(career_stats)

Count: 100
Count: 200
Count: 300
Count: 400
Count: 500
Done!


### Common Info

In [4]:
# Repeating the same process using the CommonPlayerInfo endpoint to gather basic
# information about each player. Saving common player information to a new DataFrame
# called 'common_info_df'

players_common_info = []
count = 0
for each in active_players_df['player_id']:
    common_info = commonplayerinfo.CommonPlayerInfo(player_id=each)
    first_result_set = common_info.get_dict()['resultSets'][0]
    # Pair every value of the first row with its header so fields are looked up by name.
    # Positional indices silently point at the wrong field when the endpoint adds or
    # reorders columns (previously 'school' and 'country' both read position 8).
    info_for_each = dict(zip(first_result_set['headers'],
                             first_result_set['rowSet'][0]))
    player = {
        'player_id': info_for_each['PERSON_ID'],
        'team': info_for_each['TEAM_ABBREVIATION'],
        'school': info_for_each['SCHOOL'],
        'country': info_for_each['COUNTRY'],
        'height': info_for_each['HEIGHT'],
        'weight': info_for_each['WEIGHT'],
        'seasons': info_for_each['SEASON_EXP'],
        'jersey_number': info_for_each['JERSEY'],
        'from_year': info_for_each['FROM_YEAR'],
        'to_year': info_for_each['TO_YEAR'],
        'draft_year': info_for_each['DRAFT_YEAR'],
        'draft_round': info_for_each['DRAFT_ROUND'],
        'draft_number': info_for_each['DRAFT_NUMBER'],
    }
    players_common_info.append(player)
    # Pause between requests to the API
    time.sleep(3)
    count += 1
    if count % 100 == 0:
        print(count)

print('Done!')

common_info_df = pd.DataFrame(players_common_info)

100
200
300
400
500
Done!


## Scraping of Hoops Hype

In [5]:
# Scraping current and future salary information from hoopshype.com. Current salaries
# are from the 19/20 season and future salaries are for the 20/21 season. Information is
# scraped using BeautifulSoup and saved in a new DataFrame called 'salary_df'

url = 'https://hoopshype.com/salaries/players/'
# A timeout keeps the cell from hanging indefinitely if the site does not answer
res = requests.get(url, timeout=30)
print(res)
# Stop here on an HTTP error instead of parsing an error page as if it were the table
res.raise_for_status()

soup = BeautifulSoup(res.content, 'html.parser')

# The page is fetched once above, so the rows are parsed straight from memory; no pause
# between rows is needed because no further requests are made.
players_with_salary = []
count = 0
for each in soup.find_all('tr')[1:]:  # [1:] skips the first table row
    cells = each.find_all('td')
    player = {'player_name': cells[1].text.strip(),
              '19_20_salary': cells[2].text.strip(),
              '20_21_salary': cells[3].text.strip(),
              }
    players_with_salary.append(player)
    count += 1
    if count % 100 == 0:
        print(count)

print('Done!')

salary_df = pd.DataFrame(players_with_salary)

<Response [200]>
100
200
300
400
500
Done!


## DataFrame Merging and Cleaning

In [9]:
# Combine the four source DataFrames into a single DataFrame, 'df'. The API frames are
# joined on 'player_id'; the scraped salaries are joined on 'player_name'.

df = (
    active_players_df
    .merge(career_stats_df, on='player_id')
    .merge(common_info_df, on='player_id')
    .merge(salary_df, on='player_name')
)

In [10]:
# Formatting salary features by removing dollar signs and commas and converting to
# integer. Null values for the 20/21 season represent players who do not have settled
# contracts or who will no longer be playing basketball and are replaced with 0s.

for salary_column in ('19_20_salary', '20_21_salary'):
    # regex=False: '$' is a regular-expression anchor, so it must be matched literally
    df[salary_column] = (df[salary_column]
                         .str.replace('$', '', regex=False)
                         .str.replace(',', '', regex=False))

df['19_20_salary'] = df['19_20_salary'].astype(int)
# Assign the filled column back; fillna(inplace=True) on a column selection is not
# guaranteed to modify 'df' itself
df['20_21_salary'] = df['20_21_salary'].fillna('0').astype(int)

# Replacing 20/21 salary feature with a binary feature where 1 means the player has a
# settled / announced future salary and 0 means the player does not

# Value comparison ('!= 0'); an identity test against the literal 0 only works by accident
df['future_salary'] = (df['20_21_salary'] != 0).astype(int)
df = df.drop(columns='20_21_salary')


# Deleting duplicates and unnecessary or redundant columns

df = df.drop_duplicates()
df = df.drop(columns=['jersey_number', 'school', 'to_year', 'country', 'team_id',
                      'active', 'league_id'])

In [11]:
# Renaming columns. Descriptions of each statistic can be found in the Data Dictionary
# in the 'READMe.md' file of this repository
#
# 'ft_pct' is intentionally not renamed: the previous mapping used the key 'fgt_pct',
# which matches no column and was silently ignored, and the column-reordering step
# selects the column as 'ft_pct'.
#
# NOTE(review): in NBA box-score terminology FGA / FG3A are *attempts* and AST is
# *assists*, so 'field_goals_assisted', '3_pntrs_assists' and 'assists_to_turnovers'
# are misleading names. They are kept unchanged here because later cells and the
# exported CSV refer to them.

df = df.rename(columns={
    'gp': 'games',
    'gs': 'games_started',
    'min': 'minutes_played',
    'fgm': 'field_goals_made',
    'fga': 'field_goals_assisted',
    'fg_pct': 'field_goals_pct_made',
    'fg3m': '3_pntrs_made',
    'fg3a': '3_pntrs_assists',
    'fg3_pct': '3_pntrs_pct_made',
    'oreb': 'offensive_rebounds',
    'dreb': 'defensive_rebounds',
    'reb': 'rebounds',
    'ast': 'assists_to_turnovers',
    'stl': 'steals',
    'blk': 'blocks',
    'tov': 'turnovers',
    'pf': 'personal_fouls',
    'pts': 'points',
    'from_year': 'start_year',
})

In [12]:
# Put the columns in their final order; any column not listed here is dropped from 'df'

ordered_columns = [
    # identity and physical attributes
    'player_id', 'player_name', 'team', 'height', 'weight', 'seasons',
    # scoring and playing time
    'points', 'games', 'games_started', 'minutes_played',
    # shooting
    'field_goals_made', '3_pntrs_made', 'field_goals_pct_made', '3_pntrs_pct_made',
    'ft_pct', 'field_goals_assisted', '3_pntrs_assists',
    # remaining box-score statistics
    'rebounds', 'assists_to_turnovers', 'offensive_rebounds', 'defensive_rebounds',
    'steals', 'blocks', 'turnovers', 'personal_fouls',
    # draft and career start
    'draft_year', 'draft_round', 'draft_number', 'start_year',
    # salary features
    '19_20_salary', 'future_salary',
]

df = df[ordered_columns]

In [13]:
# Several players do not have heights and weights listed through the NBA API. The
# height and weight data is manually entered. Information was found on the Wikipedia
# pages of each player.
#
# NOTE(review): the manual lists below are matched to players purely by row position,
# so they are only valid for the exact set and order of blank-weight rows they were
# written for. Keying them by player_id would be safer.
# NOTE(review): the manual weights are numbers, while the comparison with '' suggests
# the API weights are strings, so 'weight' presumably ends up with mixed types -- confirm.

missing_weight = df['weight'] == ''
# .copy() so the assignments below write to an independent frame, not a slice of 'df'
fill_in_df = df[missing_weight].copy()
df = df[~missing_weight]

manual_height_list = ['6-6', '6-9', '7-1', '6-6', '6-0', '6-8', '6-10', '6-0', '6-8',
                     '6-2', '6-9', '6-3', '6-6', '6-2', '6-1', '6-6', '6-5', '6-9',
                     '6-11', '6-0', '6-1', '6-5', '5-9', '6-3', '6-5']
manual_weight_list = [230.0, 240.0, 249.0, 215.0, 185.0, 255.0, 240, 170.0, 235, 215,
                     222, 186.0, 216.0, 185.0, 160.0, 220.0, 219.0, 230.0, 241.0, 175.0,
                     195.0, 215.0, 185.0, 210.0, 198.0 ]
manual_team_list = ['BKN', 'HOU', 'MIN', 'PHX', 'DAL', 'PHI', 'CLE', 'MIN', 'BKN', 'DET',
                   'OKC', 'ORL', 'PHX', 'UTA', 'MIA', 'ORL', 'WAS', 'BKN', 'ATL', 'OKC',
                   'TOR', 'WAS', 'BKN', 'WAS', 'ATL']

manual_values = {'height': manual_height_list,
                 'weight': manual_weight_list,
                 'team': manual_team_list}

# Fail with an explicit message before touching the frame if the hand-entered data no
# longer lines up with the rows that need it
for column_name, values in manual_values.items():
    if len(values) != len(fill_in_df):
        raise ValueError(
            'Manual {} list has {} entries but {} players have a blank weight'.format(
                column_name, len(values), len(fill_in_df)))

for column_name, values in manual_values.items():
    fill_in_df[column_name] = values

# Manually completed rows first, followed by the rows that already had a weight
df = pd.concat([fill_in_df, df])



In [14]:
# Feature engineering: average points scored per game and the share of games in which
# the player started, both relative to the 'games' column

games_played = df['games']
df['avg_pnts_per_game'] = df['points'].div(games_played)
df['pct_of_games_started'] = df['games_started'].div(games_played)

In [15]:
# Creating an additional height column, 'heights_inches', that shows the heights of
# each player in inches (so that height can be ranked more easily)

# Heights are 'feet-inches' strings such as '6-10'. Splitting on the dash (instead of
# slicing fixed character positions) keeps the feet and inches parts intact whatever
# their width, and the conversion runs on whole columns rather than one value at a time.
height_parts = df['height'].str.split('-')
feet = height_parts.str[0].astype('int64')
inches = height_parts.str[1].astype('int64')
df['heights_inches'] = feet * 12 + inches

In [16]:
# Replacing 'draft_round' with dummy columns for 1st-round picks, 2nd-round picks, and
# undrafted players. Creating new features 'draft_number_group' (which represents
# groups of draft picks in batches of 10) and 'draft_nmbr_grp_rank', which shows the rank
# of each draft group (where draft number group 1-10 is ranked 1, group 61-70 is ranked
# 7, and undrafted players are ranked 8)

# dtype is pinned so the dummy columns are 0/1 integers on every pandas version
# (the default dtype of get_dummies became bool in pandas 2.0)
draft_round_dummies = pd.get_dummies(df['draft_round'], dtype='uint8')
df['draft_1st_pick'] = draft_round_dummies['1']
df['draft_2nd_pick'] = draft_round_dummies['2']
df['draft_undrafted'] = draft_round_dummies['Undrafted']

# Lookup from each 'draft_number' value to its (group label, group rank):
# '1'..'10' -> ('1-10', 1), '11'..'20' -> ('11-20', 2), ..., '61'..'70' -> ('61-70', 7)
draft_lookup = {'Undrafted': ('Undrafted', 8)}
for group_rank, first_pick in enumerate(range(1, 71, 10), start=1):
    group_label = '{}-{}'.format(first_pick, first_pick + 9)
    for pick in range(first_pick, first_pick + 10):
        draft_lookup[str(pick)] = (group_label, group_rank)

# Report values that fit no group up front; skipping them silently would leave the new
# columns shorter than the DataFrame and fail later with an unhelpful length error
unrecognized = sorted({repr(value) for value in df['draft_number']
                       if value not in draft_lookup})
if unrecognized:
    raise ValueError(
        'Unrecognized draft_number values: {}'.format(', '.join(unrecognized)))

df['draft_number_group'] = [draft_lookup[value][0] for value in df['draft_number']]
df['draft_nmbr_grp_rank'] = [draft_lookup[value][1] for value in df['draft_number']]

df = df.drop(columns=['draft_round', 'draft_number', 'draft_year'])

In [17]:
# Exporting the finished, clean DataFrame as 'NBAPlayers.csv'. The 'datasets' directory
# is created first if it does not exist, so the export cannot fail on a fresh checkout
# after the long data-collection cells have already run.

os.makedirs('datasets', exist_ok=True)
df.to_csv('datasets/NBAPlayers.csv', index=False)