In [None]:
import numpy as np
import pandas as pd
import random
import time
import json
# from unidecode import unidecode  # optional, currently unused (the package is `unidecode`, not `unicode`)


# Players to scrape: display name -> Basketball Reference player ID.
# The ID is used to build each player's page URL, and the display name is
# written to the output as the player's name, so both must be accurate.
player_dict = {
    "Giannis Antetokounmpo": "antetgi01",
    "Jimmy Butler": "butleji01",
    "Stephen Curry": "curryst01",
    "Mike Conley": "conlemi01",
    "Kevin Durant": "duranke01",
    "Anthony Davis": "davisan02",
    "DeMar DeRozan": "derozde01",
    "Joel Embiid": "embiijo01",
    "Rudy Gobert": "goberru01",
    "James Harden": "hardeja01",
    "LeBron James": "jamesle01",
    "Nikola Jokic": "jokicni01",
    "Kawhi Leonard": "leonaka01",
    # Was "Iman Quickley"; quickim01 is Immanuel Quickley.
    "Immanuel Quickley": "quickim01",
    "Karl-Anthony Towns": "townska01",
    # Was "Jonathan Valanciunas"; valanjo01 is Jonas Valanciunas.
    "Jonas Valanciunas": "valanjo01",
    "Russell Westbrook": "westbru01",
    "Trae Young": "youngtr01",
    "Ivica Zubac": "zubaciv01",
    "Donovan Mitchell": "mitchdo01",
}


# Smaller list for shorter testing response times.
# Derived from player_dict so names and IDs cannot drift between the two.
playerc = {
    name: player_dict[name]
    for name in ("Stephen Curry", "Jonas Valanciunas", "Ivica Zubac")
}

# Only want current season stats
# seasons = [2025, 2026]
# Labels assigned, in order, to the columns of the combined player table.
column_names = [
    # Game context
    'Date', 'Team', 'Home/Away', 'Opp', 'Result', 'GS', 'MP',
    # Shooting
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    # Rebounds and the remaining box-score counters
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PTS', 'GmSc', 'PLUS_MINUS',
    # Columns added per player by this script
    'Player_Code', 'Player_Name',
]

# Percentage columns, i.e. every label ending in '%': FG%, 3P%, FT%.
nullified_cols = [label for label in column_names if label.endswith('%')]


# Scrape the "last5" table from every player's page and stack the results
# into all_players_df (one row per game, plus Player_Code / Player_Name).
#
# Frames are collected in a list and concatenated once after the loop;
# calling pd.concat on every iteration re-copies all rows gathered so far.
player_frames = []
total_players = len(player_dict)
max_attempts = 2  # the initial request plus one retry after a rate-limit pause

for i, (player_name, player_code) in enumerate(player_dict.items(), start=1):
    url = f'https://www.basketball-reference.com/players/{player_code[0]}/{player_code}.html'
    print(f"Fetching {player_code} from {url}. ({i}/{total_players})")

    for _ in range(max_attempts):
        try:
            # Read the table whose HTML id is "last5"
            player_df = pd.read_html(url, header=0, attrs={'id': "last5"})[0]
            player_df["Player_Code"] = player_code
            player_df["Player_Name"] = player_name

            # Fill missing values with 0 in every column from position 7
            # (0-based) onwards. With the layout in column_names that is FG
            # onwards; MP sits at position 6 and is NOT filled.
            # NOTE(review): the previous comment said "MP onwards" - confirm
            # whether the slice should start at 6 instead.
            player_df.iloc[:, 7:] = player_df.iloc[:, 7:].fillna(0)
        except Exception as e:
            # Deliberately broad: scraping is best-effort, and one bad page
            # (HTTP error, missing table, parse failure) must not abort the
            # whole run.
            print(f"Error fetching data for {player_code}: {e}")
            if "429" not in str(e):
                # Not a rate limit, so retrying the same URL will not help.
                break
            print("Rate limit exceeded. Pausing for 30 seconds.")
            time.sleep(30)
            # Fall through to the next attempt (if any) instead of silently
            # dropping this player.
        else:
            player_frames.append(player_df)
            break

    # Pause between requests to avoid rate limits. This runs after failures
    # too, and is skipped after the final player where nothing follows.
    if i < total_players:
        time.sleep(random.randint(4, 6))

# pd.concat raises on an empty list, so fall back to an empty frame when
# every fetch failed.
all_players_df = (
    pd.concat(player_frames, ignore_index=True) if player_frames else pd.DataFrame()
)

# Clean the combined table: apply the canonical column labels, normalise the
# percentage / GS / Home-Away columns, and move Player_Name to the front.

# Fail with a clear message when the scraped table does not have the expected
# width, instead of pandas' generic length-mismatch error. A width of 0
# means no player table was fetched at all.
scraped_width = len(all_players_df.columns)
if scraped_width != len(column_names):
    raise ValueError(
        f"Expected {len(column_names)} columns but the scraped table has "
        f"{scraped_width}; either no player data was fetched or the source "
        "table layout has changed."
    )

# Set column names explicitly
all_players_df.columns = column_names

# Make percentage columns 0 if there is missing data (empty-string cells)
all_players_df[nullified_cols] = all_players_df[nullified_cols].replace('', 0.0)

## Convert 'GS' column to boolean: True only where the cell is '*'
all_players_df['GS'] = all_players_df['GS'].eq('*')

## Convert 'Home/Away' column to 'H' or 'A': '@' marks an away game and
## anything else (including missing values) is treated as home
all_players_df['Home/Away'] = np.where(
    all_players_df['Home/Away'].eq('@'), 'A', 'H'
)

# Reorder columns to put Player_Name first
all_players_df = all_players_df[
    ['Player_Name'] + [col for col in all_players_df.columns if col != 'Player_Name']
]


## Nest each player's rows under a 'Games' list so the backend receives one
## object per player instead of one flat row per game.

# Per-game fields copied into every entry of a player's 'Games' list
game_columns = [
    'Date', 'Team', 'Home/Away', 'Opp', 'Result', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PTS', 'GmSc', 'PLUS_MINUS',
]

# One dict per (Player_Name, Player_Code) pair, with that player's games
# attached as a list of row dicts
players = [
    {
        'Player_Name': name,
        'Player_Code': code,
        'Games': games[game_columns].to_dict(orient='records'),
    }
    for (name, code), games in all_players_df.groupby(['Player_Name', 'Player_Code'])
]


## CSV created for easier visualization when data cleaning (flat table, no index column)
all_players_df.to_csv(path_or_buf='../data/player_stats.csv', index=False)

## JSON created for easier integration with backend software (nested per-player records)
with open('../data/player_stats.json', mode='w') as json_file:
    json.dump(obj=players, fp=json_file, indent=2)


Fetching antetgi01 from https://www.basketball-reference.com/players/a/antetgi01.html. (1/20)
Fetching butleji01 from https://www.basketball-reference.com/players/b/butleji01.html. (2/20)
Fetching curryst01 from https://www.basketball-reference.com/players/c/curryst01.html. (3/20)
Fetching conlemi01 from https://www.basketball-reference.com/players/c/conlemi01.html. (4/20)
Fetching duranke01 from https://www.basketball-reference.com/players/d/duranke01.html. (5/20)
Fetching davisan02 from https://www.basketball-reference.com/players/d/davisan02.html. (6/20)
Fetching derozde01 from https://www.basketball-reference.com/players/d/derozde01.html. (7/20)
Fetching embiijo01 from https://www.basketball-reference.com/players/e/embiijo01.html. (8/20)
Fetching goberru01 from https://www.basketball-reference.com/players/g/goberru01.html. (9/20)
Fetching hardeja01 from https://www.basketball-reference.com/players/h/hardeja01.html. (10/20)
Fetching jamesle01 from https://www.basketball-reference.co