### Requirements

In [1]:
# pip install ipykernel
# pip install numpy
# pip install pandas
# pip install lxml
# pip install requests
# conda install conda-forge::unidecode
# pip install tqdm

# -----------------------------------#
# pip install beautifulsoup4
# pip install nba_api

# Web scraping of NBA data

### Import

In [2]:
import os
import numpy as np
import pandas as pd
import random
import time
import requests
from unidecode import unidecode
from io import StringIO
from tqdm import tqdm

### Fixed variables from https://www.basketball-reference.com

In [3]:
# Team codes to scrape (lower-case; upper-cased later for the 'Team' column)
teams = [
    'atl', 'bos', 'brk', 'cho', 'chi', 'cle',
    'dal', 'den', 'det', 'gsw', 'hou', 'ind',
    'lac', 'lal', 'mem', 'mia', 'mil', 'min',
    'nop', 'nyk', 'okc', 'orl', 'phi', 'pho',
    'por', 'sac', 'sas', 'tor', 'uta', 'was',
]
print(f'Teams: {len(teams)}')

# Seasons to scrape, as strings
# (previous multi-season run kept for reference)
# seasons = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
seasons = ['2025']
print(f'Seasons: {len(seasons)}')

# Per-game stat columns; each one appears once for the team and once more
# (with a '.1' suffix after parsing) for the opponent
stats = (
    ['FG', 'FGA', 'FG%']
    + ['3P', '3PA', '3P%']
    + ['FT', 'FTA', 'FT%']
    + ['ORB', 'TRB', 'AST']
    + ['STL', 'BLK', 'TOV', 'PF']
)

Teams: 30
Seasons: 1


In [4]:
# Rename map for the team's own stat columns: 'FG' -> 'Tm_FG', ...
tm_stats_dict = {name: f'Tm_{name}' for name in stats}

# Rename map for the opponent's stat columns, which carry a '.1' suffix:
# 'FG.1' -> 'OppFG', ...  (note: no underscore, unlike the 'Tm_' prefix)
opp_stats_dict = {f'{name}.1': f'Opp{name}' for name in stats}

In [5]:
# # Empty dataframe
# nba_df = pd.DataFrame()

# # Iterate through the seasons
# for season in seasons:
#     # Iterate through the teams
#     for team in teams:
#         # URL
#         url = 'https://www.basketball-reference.com/teams/' + team + '/' + season + '/gamelog/'
#         print(url)
        
#         # Get game stats from 'tgl_basic' table (scraping statement)
#         team_df = pd.read_html(url, header=1, attrs={'id':'tgl_basic'})[0]

#         # Drop rows where 'Rk' is null or equals 'Rk'
#         team_df = team_df[(team_df['Rk'].str != '') & (team_df['Rk'].str.isnumeric())]

#         # Drop blank columns
#         team_df = team_df.drop(columns=['Rk', 'Unnamed: 24'])

#         # Rename columns
#         team_df = team_df.rename(columns={'Unnamed: 3': 'Home', 'Tm':'Tm_Pts', 'Opp.1':'Opp_Pts'})
#         team_df = team_df.rename(columns=tm_stats_dict)
#         team_df = team_df.rename(columns=opp_stats_dict)

#         # Replace values in column 'Home' of team_df: 0 where the value is '@', 1 otherwise
#         team_df['Home'] = team_df['Home'].apply(lambda x: 0 if x == '@' else 1)

#         # Add two columns to the front of team_df
#         team_df.insert(loc=0, column='Season', value=season)
#         team_df.insert(loc=1, column='Team', value=team.upper())

#         # Append current year and team gameLogs to aggregated dataframe
#         nba_df = pd.concat([nba_df, team_df], ignore_index=True)

#         # Pause program to abide by basketball-reference.com rules
#         time.sleep(random.randint(5, 10))


# # Display the aggregated dataframe
# print(nba_df)

### Web scraping

In [6]:
# Set headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
}

In [7]:
# Retry / politeness settings for the scraper
MAX_ATTEMPTS = 5       # tries per URL before giving up (an unbounded loop never ends on a permanent error)
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks the cell indefinitely
MAX_BACKOFF = 120      # cap, in seconds, for the exponential backoff after HTTP 429


def _parse_gamelog(html, season, team):
    """Extract and tidy one team's game log from a game-log page.

    Parameters
    ----------
    html : str
        HTML text of the page; it must contain a table with id ``tgl_basic``.
    season : str
        Value written to the ``Season`` column.
    team : str
        Team code; written upper-cased to the ``Team`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of the table whose ``Rk`` value is numeric, with ``Season``
        and ``Team`` as the first two columns, ``Home`` encoded as 0/1 and
        the stat columns renamed via ``tm_stats_dict`` / ``opp_stats_dict``.

    Raises
    ------
    ValueError
        If ``pd.read_html`` finds no matching table.
    KeyError
        If an expected column (``Rk``, ``Unnamed: 24``) is missing.
    """
    team_df = pd.read_html(StringIO(html), header=1, attrs={'id': 'tgl_basic'})[0]

    # Keep only rows whose 'Rk' is a number. This drops rows where 'Rk' is
    # null or is the repeated header text 'Rk', and it works whether pandas
    # parsed the column as strings or as numbers (a plain `.str` accessor
    # raises on a numeric column).
    is_game_row = pd.to_numeric(team_df['Rk'], errors='coerce').notna()
    team_df = team_df[is_game_row]

    # Drop the rank column and the blank separator column
    team_df = team_df.drop(columns=['Rk', 'Unnamed: 24'])

    # Rename columns
    team_df = team_df.rename(columns={'Unnamed: 3': 'Home', 'Tm': 'Tm_Pts', 'Opp.1': 'Opp_Pts'})
    team_df = team_df.rename(columns=tm_stats_dict)
    team_df = team_df.rename(columns=opp_stats_dict)

    # Encode 'Home': 0 where the page shows '@', 1 otherwise
    team_df['Home'] = team_df['Home'].ne('@').astype('int64')

    # Add two columns to the front of team_df
    team_df.insert(loc=0, column='Season', value=season)
    team_df.insert(loc=1, column='Team', value=team.upper())

    return team_df


# Per-team frames are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration
team_frames = []

# URLs that still failed after MAX_ATTEMPTS tries
failed_urls = []

# Total tasks for progress bar (number of seasons * number of teams)
total_tasks = len(seasons) * len(teams)

# Progress bar
with tqdm(total=total_tasks, desc="Scraping Basketball Reference", unit="task") as pbar:
    # Iterate through the seasons
    for season in seasons:
        # Iterate through the teams
        for team in teams:

            # Random waiting time (seconds) used between requests to this URL
            waiting_time = random.randint(5, 10)

            # URL
            url = f'https://www.basketball-reference.com/teams/{team}/{season}/gamelog/'

            for attempt in range(1, MAX_ATTEMPTS + 1):
                wait_time = waiting_time
                retry_hint = f"Retrying in {wait_time} seconds..."

                try:
                    # Fetch data
                    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

                    if response.status_code == 200:
                        # Parse, clean and keep this team's game log
                        team_frames.append(_parse_gamelog(response.text, season, team))

                        # Pause program to abide by basketball-reference.com rules
                        time.sleep(waiting_time)
                        break

                    if response.status_code == 429:
                        # Rate limited: exponential backoff, capped at MAX_BACKOFF seconds
                        wait_time = min(MAX_BACKOFF, waiting_time * (2 ** attempt))
                        problem = f"Rate limit hit for URL: {url}. Status Code: 429"
                        retry_hint = f"Waiting {wait_time} seconds to retry..."
                    else:
                        # Any other status: retry after the fixed interval
                        problem = f"Failed to fetch URL: {url}, Status Code: {response.status_code}"

                except Exception as e:
                    # Network errors and parsing errors both land here
                    problem = f"Error fetching or parsing data from {url}: {e}"

                if attempt == MAX_ATTEMPTS:
                    # Stop instead of looping forever on a permanent failure
                    print(f"{problem}. Giving up after {MAX_ATTEMPTS} attempts.")
                    failed_urls.append(url)
                else:
                    print(f"{problem}. {retry_hint}")
                    time.sleep(wait_time)

            # Update progress bar once this team/season has been handled
            pbar.update(1)

# Build the aggregated dataframe in one step (pd.concat rejects an empty list)
nba_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

# Make incomplete results impossible to miss
if failed_urls:
    print(f"WARNING: no data collected for {len(failed_urls)} URL(s): {failed_urls}")

# Display the final DataFrame
print(nba_df)

Scraping Basketball Reference: 100%|██████████| 30/30 [04:06<00:00,  8.20s/task]

     Season Team   G        Date  Home  Opp W/L Tm_Pts Opp_Pts Tm_FG  ...  \
0      2025  ATL   1  2024-10-23     1  BRK   W    120     116    39  ...   
1      2025  ATL   2  2024-10-25     1  CHO   W    125     120    39  ...   
2      2025  ATL   3  2024-10-27     0  OKC   L    104     128    36  ...   
3      2025  ATL   4  2024-10-28     1  WAS   L    119     121    39  ...   
4      2025  ATL   5  2024-10-30     0  WAS   L    120     133    45  ...   
...     ...  ...  ..         ...   ...  ...  ..    ...     ...   ...  ...   
1217   2025  WAS  35  2025-01-08     0  PHI   L    103     109    42  ...   
1218   2025  WAS  36  2025-01-10     0  CHI   L    105     138    39  ...   
1219   2025  WAS  37  2025-01-12     1  OKC   L     95     136    28  ...   
1220   2025  WAS  38  2025-01-13     1  MIN   L    106     120    40  ...   
1221   2025  WAS  39  2025-01-16     1  PHO   L    123     130    46  ...   

     OppFT OppFTA OppFT% OppORB OppTRB OppAST OppSTL OppBLK OppTOV OppPF  





In [8]:
# Bare last expression: the notebook renders nba_df as this cell's output
nba_df

Unnamed: 0,Season,Team,G,Date,Home,Opp,W/L,Tm_Pts,Opp_Pts,Tm_FG,...,OppFT,OppFTA,OppFT%,OppORB,OppTRB,OppAST,OppSTL,OppBLK,OppTOV,OppPF
0,2025,ATL,1,2024-10-23,1,BRK,W,120,116,39,...,19,25,.760,12,43,21,8,6,19,32
1,2025,ATL,2,2024-10-25,1,CHO,W,125,120,39,...,25,31,.806,22,53,22,7,4,18,28
2,2025,ATL,3,2024-10-27,0,OKC,L,104,128,36,...,24,32,.750,11,45,26,13,12,10,23
3,2025,ATL,4,2024-10-28,1,WAS,L,119,121,39,...,18,22,.818,9,43,24,7,3,18,25
4,2025,ATL,5,2024-10-30,0,WAS,L,120,133,45,...,23,25,.920,6,37,32,10,5,16,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217,2025,WAS,35,2025-01-08,0,PHI,L,103,109,42,...,14,19,.737,10,36,21,8,11,9,14
1218,2025,WAS,36,2025-01-10,0,CHI,L,105,138,39,...,9,14,.643,11,55,39,11,7,15,12
1219,2025,WAS,37,2025-01-12,1,OKC,L,95,136,28,...,33,35,.943,15,56,26,11,4,13,22
1220,2025,WAS,38,2025-01-13,1,MIN,L,106,120,40,...,24,31,.774,14,53,20,8,6,19,15


### Save as CSV

In [9]:
# Define filename
filename = 'nba_game_log_2025.csv'

# Get the current working directory
cwd = os.getcwd()

# Get the parent directory
parent_dir = os.path.dirname(cwd)

# Data directory: '<parent of the working directory>/data'
data_file_path = os.path.join(parent_dir, 'data')

# Create the data directory if it is missing; otherwise both the listing
# below and the CSV write fail with FileNotFoundError
os.makedirs(data_file_path, exist_ok=True)

# List the files already present in the data directory
all_files = os.listdir(data_file_path)
print(f'all data files: {all_files}')

# Define full path including filename
file_path = os.path.join(data_file_path, filename)
print(f'new file path: {file_path}')

# Write the DataFrame to the file (without the index column)
nba_df.to_csv(file_path, index=False)

all data files: ['nba_game_log_2019-2021.csv', 'nba_game_log_2022-2024.csv', 'nba_historical_games.csv', 'preprocessed_nba_game_log.csv']
new file path: c:\Users\Markus\Documents\Git\GitHub\SportBet\data\nba_game_log_2025.csv
