### Requirements

In [2]:
# pip install ipykernel
# pip install numpy
# pip install pandas
# pip install lxml
# pip install requests
# conda install conda-forge::unidecode
# pip install tqdm

# -----------------------------------#
# pip install beautifulsoup4
# pip install nba_api

# Web scraping of NBA data

### Import

In [3]:
import os
import numpy as np
import pandas as pd
import random
import time
import requests
from unidecode import unidecode
from io import StringIO
from tqdm import tqdm

### Fixed variables from https://www.basketball-reference.com

In [4]:
# Team abbreviations, lower-case as they are interpolated into the game-log URLs
teams = [
    'atl', 'bos', 'brk', 'cho', 'chi', 'cle',
    'dal', 'den', 'det', 'gsw', 'hou', 'ind',
    'lac', 'lal', 'mem', 'mia', 'mil', 'min',
    'nop', 'nyk', 'okc', 'orl', 'phi', 'pho',
    'por', 'sac', 'sas', 'tor', 'uta', 'was',
]
print('Teams:', len(teams))

# Seasons to scrape, stored as strings because they are placed into the URL.
# Full history would be: ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
seasons = [str(year) for year in range(2019, 2022)]
print('Seasons:', len(seasons))

# Box-score stat columns: made / attempted / percentage for each shot type,
# followed by the remaining counting stats.
stats = [
    prefix + suffix
    for prefix in ('FG', '3P', 'FT')
    for suffix in ('', 'A', '%')
] + ['ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

Teams: 30
Seasons: 3


In [5]:
# Rename map for the scraped team's own stat columns, e.g. 'FG' -> 'Tm_FG'
tm_stats_dict = dict((name, 'Tm_' + name) for name in stats)

# Rename map for the opponent's stat columns, e.g. 'FG.1' -> 'OppFG'.
# The '.1' suffix is presumably what pandas appends to the second occurrence
# of a duplicated header - TODO confirm against the parsed table.
# Note: unlike the 'Tm_' prefix above, 'Opp' carries no underscore.
opp_stats_dict = dict((name + '.1', 'Opp' + name) for name in stats)

In [6]:
# # Empty dataframe
# nba_df = pd.DataFrame()

# # Iterate through the seasons
# for season in seasons:
#     # Iterate through the teams
#     for team in teams:
#         # URL
#         url = 'https://www.basketball-reference.com/teams/' + team + '/' + season + '/gamelog/'
#         print(url)
        
#         # Get game stats from 'tgl_basic' table (scraping statement)
#         team_df = pd.read_html(url, header=1, attrs={'id':'tgl_basic'})[0]

#         # Drop rows where 'Rk' is null or equals 'Rk'
#         team_df = team_df[(team_df['Rk'].str != '') & (team_df['Rk'].str.isnumeric())]

#         # Drop blank columns
#         team_df = team_df.drop(columns=['Rk', 'Unnamed: 24'])

#         # Rename columns
#         team_df = team_df.rename(columns={'Unnamed: 3': 'Home', 'Tm':'Tm_Pts', 'Opp.1':'Opp_Pts'})
#         team_df = team_df.rename(columns=tm_stats_dict)
#         team_df = team_df.rename(columns=opp_stats_dict)

#         # Replace values in columns 'Home' and 'Opp' of team_df
#         team_df['Home'] = team_df['Home'].apply(lambda x: 0 if x == '@' else 1)

#         # Add two columns to the front of team_df
#         team_df.insert(loc=0, column='Season', value=season)
#         team_df.insert(loc=1, column='Team', value=team.upper())

#         # Append current year and team gameLogs to aggregated dataframe
#         nba_df = pd.concat([nba_df, team_df], ignore_index=True)

#         # Pause program to abide by basketball-reference.com rules
#         time.sleep(random.randint(5, 10))


# # Display the aggregated dataframe
# print(nba_df)

### Web scraping

In [7]:
# HTTP request headers for the scraper: only a desktop-Chrome User-Agent is set
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'

In [8]:
# Retry / politeness settings for the scraper
MAX_ATTEMPTS = 5        # give up on a URL after this many tries instead of retrying forever
REQUEST_TIMEOUT = 30    # seconds before an unresponsive request is abandoned
MAX_BACKOFF = 120       # cap (seconds) for the exponential backoff after HTTP 429


def _clean_game_log(raw_df, season, team):
    """Turn one raw 'tgl_basic' game-log table into tidy per-game rows.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Table as returned by ``pd.read_html(..., header=1, attrs={'id': 'tgl_basic'})``.
    season : str
        Season label written to the new 'Season' column.
    team : str
        Lower-case team code; written upper-cased to the new 'Team' column.

    Returns
    -------
    pd.DataFrame
        Game rows only, with renamed stat columns and a 0/1 'Home' flag.

    Raises
    ------
    KeyError
        If an expected column ('Rk', 'Unnamed: 24', 'Home' source column) is missing.
    """
    # Keep only real game rows: 'Rk' holds the running game number there, while
    # repeated header rows ('Rk') and blank rows do not convert to a number.
    # (Comparing the `.str` accessor itself to '' is always True and filters nothing.)
    is_game_row = pd.to_numeric(raw_df['Rk'], errors='coerce').notna()
    game_df = raw_df[is_game_row]

    # Drop the rank column and the blank separator column
    game_df = game_df.drop(columns=['Rk', 'Unnamed: 24'])

    # Rename columns
    game_df = game_df.rename(columns={'Unnamed: 3': 'Home', 'Tm': 'Tm_Pts', 'Opp.1': 'Opp_Pts'})
    game_df = game_df.rename(columns=tm_stats_dict)
    game_df = game_df.rename(columns=opp_stats_dict)

    # '@' marks an away game -> 0, anything else -> 1 (home)
    game_df['Home'] = game_df['Home'].apply(lambda x: 0 if x == '@' else 1)

    # Add two columns to the front of the frame
    game_df.insert(loc=0, column='Season', value=season)
    game_df.insert(loc=1, column='Team', value=team.upper())
    return game_df


# One cleaned DataFrame per (season, team); concatenated once after the loop
# instead of growing nba_df with pd.concat on every iteration.
team_frames = []

# URLs that could not be scraped, so gaps in the data are visible afterwards
failed_urls = []

# Total tasks for progress bar (number of seasons * number of teams)
total_tasks = len(seasons) * len(teams)

# Progress bar
with tqdm(total=total_tasks, desc="Scraping Basketball Reference", unit="task") as pbar:
    for season in seasons:
        for team in teams:

            # Random waiting time between requests
            waiting_time = random.randint(5, 10)

            # URL
            url = f'https://www.basketball-reference.com/teams/{team}/{season}/gamelog/'

            # Number of 429 responses seen for this URL; drives the exponential backoff
            rate_limit_hits = 0

            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    # The timeout keeps a stalled connection from hanging the notebook
                    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    print(f"Error fetching data from {url}: {e}. Waiting {waiting_time} seconds...")
                    time.sleep(waiting_time)
                    continue

                if response.status_code == 200:
                    try:
                        raw_df = pd.read_html(StringIO(response.text), header=1, attrs={'id': 'tgl_basic'})[0]
                        team_frames.append(_clean_game_log(raw_df, season, team))
                    except Exception as e:
                        # The same page would fail the same way again, so do not retry
                        print(f"Error parsing data from {url}: {e}. Skipping this page.")
                        failed_urls.append(url)

                    # Pause program to abide by basketball-reference.com rules
                    time.sleep(waiting_time)
                    break

                if response.status_code == 429:
                    # Handle rate limiting: exponential backoff with a cap
                    rate_limit_hits += 1
                    wait_time = min(MAX_BACKOFF, waiting_time * (2 ** rate_limit_hits))
                    print(f"Rate limit hit for URL: {url}. Status Code: 429. Waiting {wait_time} seconds to retry...")
                    time.sleep(wait_time)
                else:
                    # Any other status: wait a fixed interval before the next attempt
                    print(f"Failed to fetch URL: {url}, Status Code: {response.status_code}. Waiting {waiting_time} seconds...")
                    time.sleep(waiting_time)
            else:
                # Loop ended without a `break`: every attempt failed
                print(f"Giving up on {url} after {MAX_ATTEMPTS} attempts.")
                failed_urls.append(url)

            # One task finished (scraped or given up)
            pbar.update(1)

# Aggregate all game logs into a single DataFrame
nba_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

if failed_urls:
    print(f"{len(failed_urls)} of {total_tasks} pages could not be scraped: {failed_urls}")

# Display the final DataFrame
print(nba_df)

Scraping Basketball Reference:   0%|          | 0/90 [00:10<?, ?task/s]


KeyboardInterrupt: 

In [9]:
# Show the aggregated game-log DataFrame as this cell's output
nba_df

Unnamed: 0,Season,Team,G,Date,Home,Opp,W/L,Tm_Pts,Opp_Pts,Tm_FG,...,OppFT,OppFTA,OppFT%,OppORB,OppTRB,OppAST,OppSTL,OppBLK,OppTOV,OppPF
0,2019,ATL,1,2018-10-17,0,NYK,L,107,126,41,...,24,31,.774,10,46,21,12,6,16,23
1,2019,ATL,2,2018-10-19,0,MEM,L,117,131,41,...,30,43,.698,7,41,32,9,4,17,29
2,2019,ATL,3,2018-10-21,0,CLE,W,133,111,48,...,9,14,.643,15,49,23,5,2,18,25
3,2019,ATL,4,2018-10-24,1,DAL,W,111,104,37,...,16,26,.615,9,52,25,7,9,17,27
4,2019,ATL,5,2018-10-27,1,CHI,L,85,97,27,...,13,16,.813,8,52,25,14,6,23,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6733,2021,WAS,68,2021-05-08,0,IND,W,133,132,49,...,25,30,.833,8,49,33,11,5,13,24
6734,2021,WAS,69,2021-05-10,0,ATL,L,124,125,51,...,23,26,.885,13,55,23,4,6,12,17
6735,2021,WAS,70,2021-05-12,0,ATL,L,116,120,45,...,22,28,.786,7,51,22,4,5,14,21
6736,2021,WAS,71,2021-05-14,1,CLE,W,120,105,41,...,17,21,.810,7,44,24,4,3,17,21


### Save as CSV

In [9]:
# Define filename
filename = 'nba_game_log_2019-2021.csv'

# The data folder is resolved relative to the working directory: <parent of cwd>/data
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
data_file_path = os.path.join(parent_dir, 'data')

# Create the data folder on first use; without it both os.listdir below and
# the later to_csv call would fail with FileNotFoundError.
os.makedirs(data_file_path, exist_ok=True)

# List the files already present in the data folder (shows what may be overwritten)
all_files = os.listdir(data_file_path)
print(f'all data files: {all_files}')

# Define full path including filename
file_path = os.path.join(data_file_path, filename)
print(f'new file path: {file_path}')

all data files: ['nba_game_log_2019-2021.csv', 'nba_game_log_2022-2024.csv']
new file path: c:\Users\Markus\Documents\Git\GitHub\SportBet\data\nba_game_log_test.csv


In [10]:
# Write the DataFrame to the file.
# Refuse to write an empty frame: after an interrupted or failed scrape this
# would silently replace an existing data file with an empty CSV.
if nba_df.empty:
    raise ValueError('nba_df is empty - nothing to save. Re-run the scraping cell before writing the CSV.')
nba_df.to_csv(file_path, index=False)