# The purpose of this workbook is to scrape all player stats from 1980 onward.

- Last update 2/2/23
- Keep in mind that a season is skipped entirely if its file (`player_stats/<year>_player_stats.csv`) already exists.
- Example: If you scrape 2023 halfway through the season and then again at the end, the second run will not update the file. Delete that season's CSV first if you want it re-scraped.

In [1]:
import pandas as pd
import requests
from datetime import datetime
import os
import time


In [2]:
def remove_multiindex(df: pd.DataFrame):
    """Return *df* with the outermost level of its column index removed.

    The input frame is not modified; a new frame is returned.
    """
    flattened = df.droplevel(0, axis=1)
    return flattened

In [3]:
def getdata(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        The decoded body of the response, whatever its HTTP status code.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # The status code is intentionally not checked: the scraping loop in this
    # notebook inspects the returned body for the site's ban notice.
    r = requests.get(url, timeout=timeout)
    return r.text



In [4]:
# Scrape per-game and advanced player stats for every season from 1980 to the
# current year, writing one CSV per season into player_stats/ and collecting
# the per-season frames in `combined`. Seasons whose CSV already exists are
# skipped.
from io import StringIO

# Text that appears in the page body when the site has temporarily blocked us.
BAN_NOTICE = ('The owner of this website (www.basketball-reference.com) has '
              'banned you temporarily from accessing this website')

# Columns kept from the merged per-game + advanced table. 'G_x' and 'MP_x'
# carry the '_x' suffix that pd.merge adds to the left (per-game) table's
# columns when both tables have a non-key column with the same name.
player_stats_list = ['Player', 'Pos', 'Age', 'Tm', 'G_x', 'GS', 'MP_x', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
        'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
       'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
       'WS', 'WS/48','OBPM', 'DBPM', 'BPM', 'VORP']

# Final column names, position-for-position with player_stats_list; the only
# difference is that the merge suffixes are removed ('G_x' -> 'G', 'MP_x' -> 'MP').
player_stats_list_correct = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP']

# Create the output directory up front so to_csv does not fail on a fresh
# checkout where player_stats/ does not exist yet.
os.makedirs('player_stats', exist_ok=True)

combined = []

for year in range(1980, datetime.now().year + 1):

    # Never re-scrape a season that already has a file on disk.
    csv_path = f'player_stats/{year}_player_stats.csv'
    if os.path.exists(csv_path):
        print(f"{year} already exists")
        continue

    per_game_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    adv_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'

    per_game_data = getdata(per_game_url)
    adv_data = getdata(adv_url)

    # If either page came back as the ban notice, wait an hour and fetch both
    # pages again until real content is returned.
    while BAN_NOTICE in per_game_data or BAN_NOTICE in adv_data:
        print("You've been temporarily banned from accessing basketball reference.\nSleeping 1 hour then continuing.")
        time.sleep(3600)
        per_game_data = getdata(per_game_url)
        adv_data = getdata(adv_url)

    # Wrap the HTML in StringIO: passing literal HTML strings to read_html is
    # deprecated in recent pandas versions. The first table on each page is used.
    df = pd.read_html(StringIO(per_game_data))[0]
    df_adv = pd.read_html(StringIO(adv_data))[0]

    # Drop the header rows that are repeated inside the table body.
    df = df.drop(df[df['Player'] == 'Player'].index)
    df_adv = df_adv.drop(df_adv[df_adv['Player'] == 'Player'].index)

    new = pd.merge(df, df_adv, on=['Player', 'Age', 'Tm', 'Pos'])

    # Cuts down the columns to what is listed in player_stats_list, then
    # renames them to the clean names in player_stats_list_correct.
    new = new[player_stats_list]
    new.columns = player_stats_list_correct

    # This gets rid of traded players entirely (keep=False drops every row of
    # a player who appears more than once).
    new = new.drop_duplicates(subset='Player', keep=False).reset_index(drop=True)

    # Fills nan's with 0
    new.fillna(value=0, inplace=True)

    # Adds a year column
    new['Year'] = year

    combined.append(new)

    new.to_csv(csv_path)
    print(year, 'Complete')


1980 already exists
1981 already exists
1982 already exists
1983 already exists
1984 already exists
1985 already exists
1986 already exists
1987 already exists
1988 already exists
1989 already exists
1990 already exists
1991 already exists
1992 already exists
1993 already exists
1994 already exists
1995 already exists
1996 already exists
1997 already exists
1998 already exists
1999 already exists
2000 already exists
2001 already exists
2002 already exists
2003 already exists
2004 already exists
2005 already exists
2006 already exists
2007 already exists
2008 already exists
2009 already exists
2010 already exists
2011 already exists
2012 already exists
2013 already exists
2014 already exists
2015 already exists
2016 already exists
2017 already exists
2018 already exists
2019 already exists
2020 already exists
2021 already exists
2022 already exists
2023 already exists
2024 already exists
