In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from urllib.request import urlopen
from bs4 import BeautifulSoup

The two functions below retrieve a player's game logs and advanced game logs from Basketball Reference. Each one takes the initial of the player's surname, the player's full name, and the season I want data for.

In [2]:
def game_log(initial, name, season):
    '''
    Download one season of a player's regular game log from Basketball Reference.

    initial : Initial of the player's surname. dtype=str
    name : Name of the player you want data for. Must be a key of player_code below. dtype=str
    season : The season you want game logs from. ex. 2018-19 season is 2019. dtype=str

    Returns a DataFrame of strings holding the rows in which the player took part.
    Rows whose eighth column reads 'Inactive', 'Did Not Play' or 'Did Not Dress',
    or is missing altogether, are removed. The surviving rows keep their original
    row labels, so callers that need a 0..n-1 index have to reset it themselves.

    Raises KeyError if name has no entry in player_code.
    '''
    player_code = {
        'James Harden':'hardeja01', 'Anthony Davis':'davisan02', 'LeBron James':'jamesle01',
        'Giannis Antetokounmpo':'antetgi01', 'Damian Lillard':'lillada01', 'Kevin Durant':'duranke01',
        'Russell Westbrook':'westbru01', 'Kyrie Irving':'irvinky01', 'LaMarcus Aldridge':'aldrila01',
        'Victor Oladipo':'oladivi01', 'Paul George':'georgpa01', 'Joel Embiid':'embiijo01',
        'Stephen Curry':'curryst01', 'Devin Booker':'bookede01', 'Kawhi Leonard':'leonaka01',
        'Bradley Beal':'bealbr01', 'Trae Young':'youngtr01', 'Luka Doncic':'doncilu01',
    }

    # Look the player up explicitly so an unknown name fails with a readable message.
    try:
        code = player_code[name]
    except KeyError:
        raise KeyError('No Basketball Reference code is known for player {!r}'.format(name)) from None

    url = 'https://www.basketball-reference.com/players/{}/{}/gamelog/{}'.format(initial, code, season)

    # The response is closed as soon as the page has been parsed. The parser is named
    # explicitly so the result does not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # Column headers from the first <thead>; the first header cell is skipped because
    # only <td> cells are collected from the rows below.
    # NOTE(review): presumably that first cell is a row-number column - confirm.
    headers = [th.get_text() for th in soup.find_all('thead')[0].find_all('th')][1:]

    # Search the document for its rows once (the first <tr> is skipped) and read
    # the text of every <td> cell in each of them.
    rows = soup.find_all('tr')[1:]
    stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    # Omits GameLog summaries which is data I don't need.
    # NOTE(review): 33 is tied to the page layout at the time of writing - re-check if the site changes.
    summary_rows = 33
    data = pd.DataFrame(stats[summary_rows:], columns=headers)
    if data.empty:
        return data

    # The eighth column carries the reason when the player did not take part, and is
    # missing for rows that have fewer cells than the header (e.g. rows without <td> cells).
    status = data.iloc[:, 7]
    unplayed = status.isin(['Inactive', 'Did Not Play', 'Did Not Dress']) | status.isna()

    return data.drop(index=status[unplayed].index)

In [3]:
def advanced_game_log(initial, name, season):
    '''
    Download one season of a player's advanced game log from Basketball Reference.

    initial : Initial of the player's surname. dtype=str
    name : Name of the player you want data for. Must be a key of player_code below. dtype=str
    season : The season you want game logs from. ex. 2018-19 season is 2019. dtype=str

    Returns a DataFrame of strings holding the rows in which the player took part.
    Rows whose eighth column reads 'Inactive', 'Did Not Play' or 'Did Not Dress',
    or is missing altogether, are removed. The surviving rows keep their original
    row labels, so callers that need a 0..n-1 index have to reset it themselves.

    Raises KeyError if name has no entry in player_code.
    '''
    player_code = {
        'James Harden':'hardeja01', 'Anthony Davis':'davisan02', 'LeBron James':'jamesle01',
        'Giannis Antetokounmpo':'antetgi01', 'Damian Lillard':'lillada01', 'Kevin Durant':'duranke01',
        'Russell Westbrook':'westbru01', 'Kyrie Irving':'irvinky01', 'LaMarcus Aldridge':'aldrila01',
        'Victor Oladipo':'oladivi01', 'Paul George':'georgpa01', 'Joel Embiid':'embiijo01',
        'Stephen Curry':'curryst01', 'Devin Booker':'bookede01', 'Kawhi Leonard':'leonaka01',
        'Bradley Beal':'bealbr01', 'Trae Young':'youngtr01', 'Luka Doncic':'doncilu01',
    }

    # Look the player up explicitly so an unknown name fails with a readable message.
    try:
        code = player_code[name]
    except KeyError:
        raise KeyError('No Basketball Reference code is known for player {!r}'.format(name)) from None

    url = 'https://www.basketball-reference.com/players/{}/{}/gamelog-advanced/{}'.format(initial, code, season)

    # The response is closed as soon as the page has been parsed. The parser is named
    # explicitly so the result does not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # Column headers from the first <thead>; the first header cell is skipped because
    # only <td> cells are collected from the rows below.
    # NOTE(review): presumably that first cell is a row-number column - confirm.
    headers = [th.get_text() for th in soup.find_all('thead')[0].find_all('th')][1:]

    # Search the document for its rows once (the first <tr> is skipped) and read
    # the text of every <td> cell in each of them.
    rows = soup.find_all('tr')[1:]
    stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    data = pd.DataFrame(stats, columns=headers)
    if data.empty:
        return data

    # The eighth column carries the reason when the player did not take part, and is
    # missing for rows that have fewer cells than the header (e.g. rows without <td> cells).
    status = data.iloc[:, 7]
    unplayed = status.isin(['Inactive', 'Did Not Play', 'Did Not Dress']) | status.isna()

    return data.drop(index=status[unplayed].index)

In [4]:
# One entry per player: (surname initial, full name, seasons to download).
# The three parallel lists used by the cells below are derived from this table,
# so position k in each list always refers to the same player.
_roster = [
    ('h', 'James Harden',          ['2018','2019','2020']),
    ('d', 'Anthony Davis',         ['2018','2020']),
    ('j', 'LeBron James',          ['2018']),
    ('a', 'Giannis Antetokounmpo', ['2018','2019','2020']),
    ('l', 'Damian Lillard',        ['2018','2019','2020']),
    ('d', 'Kevin Durant',          ['2018','2019']),
    ('w', 'Russell Westbrook',     ['2018','2020']),
    ('i', 'Kyrie Irving',          ['2018']),
    ('a', 'LaMarcus Aldridge',     ['2018']),
    ('o', 'Victor Oladipo',        ['2018']),
    ('g', 'Paul George',           ['2019']),
    ('e', 'Joel Embiid',           ['2019']),
    ('c', 'Stephen Curry',         ['2019']),
    ('b', 'Devin Booker',          ['2019','2020']),
    ('l', 'Kawhi Leonard',         ['2019','2020']),
    ('b', 'Bradley Beal',          ['2019','2020']),
    ('y', 'Trae Young',            ['2020']),
    ('d', 'Luka Doncic',           ['2020']),
]
initial = [entry[0] for entry in _roster]
name = [entry[1] for entry in _roster]
season = [entry[2] for entry in _roster]

In [5]:
# Download the feature data for every player and season, and save each table as a CSV file.

for player_initial, player_name, player_seasons in zip(initial, name, season):
    for year in player_seasons:
        gamelog = game_log(player_initial, player_name, year)
        advanced_gamelog = advanced_game_log(player_initial, player_name, year)

        # FG, FGA, 3P, 3PA, FT and FTA come from the regular game log; USG% and ORtg
        # come from the advanced one. Each selection gets a fresh 0..n-1 index so
        # the two tables line up row by row when they are placed side by side.
        # NOTE(review): rows are paired purely by position - this assumes both logs
        # contain the same games in the same order.
        basic = gamelog[['FG','FGA','3P','3PA','FT','FTA']].reset_index(drop=True)
        advanced = advanced_gamelog[['USG%','ORtg']].reset_index(drop=True)

        # Join the two tables column-wise into a single feature table.
        data = pd.concat([basic, advanced], axis=1)

        # One file per player and season, inside that player's folder.
        data.to_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\{}'.format(player_name, year))

In [6]:
# Download the label data (points per game) for every player and season, and save it as CSV files.

for player_initial, player_name, player_seasons in zip(initial, name, season):
    for year in player_seasons:
        gamelog = game_log(player_initial, player_name, year)

        # The labels are the PTS column of the regular game log, re-indexed from 0
        # so that the row numbers match those of the saved feature tables.
        points = gamelog['PTS'].reset_index(drop=True)

        # One file per player and season, inside that player's Points folder.
        points.to_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\Points\{}'.format(player_name, year))