In [4]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import json
import re


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)


In [5]:
def get_months(year_soup, year):
    """Collect the per-month schedule links for one season.

    Parameters
    ----------
    year_soup : parsed season schedule page; only ``find_all('a')`` is used.
    year : str or int
        Season identifier as it appears in the ``/leagues/NBA_<year>_games-`` URLs.

    Returns
    -------
    list of str
        The ``href`` of every anchor containing ``/leagues/NBA_<year>_games-``,
        in document order.
    """
    marker = "/leagues/NBA_{}_games-".format(year)
    # Anchors without an href (e.g. <a name="...">) are skipped instead of
    # raising KeyError as plain subscripting would.
    return [
        anchor.get('href')
        for anchor in year_soup.find_all('a')
        if marker in (anchor.get('href') or '')
    ]

def get_box_scores(month):
    """Fetch one monthly schedule page and return its box-score links.

    Parameters
    ----------
    month : str
        Site-relative path of the month page (as returned by ``get_months``).

    Returns
    -------
    list of str
        The ``href`` of every anchor containing ``/boxscores/2``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be parsed and silently reported as a month with no games.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    url = "https://www.basketball-reference.com" + month
    # requests has no default timeout; without one a stalled connection hangs forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so parsing does not depend on which optional
    # parser libraries happen to be installed.
    month_page_soup = bs(response.text, "html.parser")

    # href=True skips anchors that have no href attribute (would be a KeyError).
    return [
        anchor['href']
        for anchor in month_page_soup.find_all('a', href=True)
        if "/boxscores/2" in anchor['href']
    ]

def get_bs_soup(game):
    """Download a box-score page and identify the two teams and the game date.

    Parameters
    ----------
    game : str
        Site-relative box-score path. Assumed to look like
        ``/boxscores/YYYYMMDD0TTT.html`` so that characters 11-18 are the
        date -- TODO confirm against the site's URL scheme.

    Returns
    -------
    tuple
        ``(game_page_soup, home, away, date)`` where ``home``/``away`` are the
        three characters following ``/teams/`` in the third/second team link
        on the page, and ``date`` is ``game[11:19]``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    IndexError
        If the page contains fewer than three ``/teams/`` links.
    """
    date = game[11:19]
    url = "https://www.basketball-reference.com" + game
    # requests has no default timeout; without one a stalled connection hangs forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Explicit parser: avoids the guessed-parser warning and keeps results
    # independent of which optional parsers are installed.
    game_page_soup = bs(response.text, "html.parser")

    # Get home and away team. href=True replaces the former bare
    # ``except: pass`` that only existed to skip anchors without an href.
    team_links = [
        anchor['href']
        for anchor in game_page_soup.find_all('a', href=True)
        if "/teams/" in anchor['href']
    ]
    if len(team_links) < 3:
        # Same exception type the unguarded indexing used to raise, but with context.
        raise IndexError(
            "expected at least 3 '/teams/' links on {}, found {}".format(url, len(team_links))
        )

    # The first team link is skipped (presumably a navigation entry -- verify);
    # the next two are taken as away, then home.
    away_link, home_link = team_links[1:3]
    # "/teams/XXX/..." -> "XXX"
    home = home_link[7:10]
    away = away_link[7:10]

    return game_page_soup, home, away, date

def _read_box_table(soup, team, kind):
    """Return the ``box-<team>-game-<kind>`` table as a DataFrame with the top header level dropped."""
    from io import StringIO  # local import keeps this block self-contained

    table_id = "box-{}-game-{}".format(team, kind)
    tables = soup.find_all('table', id=table_id)
    if not tables:
        raise ValueError("No table with id {!r} found on the box score page".format(table_id))
    # Passing literal HTML to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    frame = pd.read_html(StringIO(str(tables[0])))[0]
    frame.columns = frame.columns.droplevel(0)
    return frame


def _team_box_score(soup, team):
    """Join a team's basic and advanced tables on the 'Starters' (player name) column."""
    basic = _read_box_table(soup, team, "basic").set_index('Starters')
    # MP appears in both tables; keep only the copy from the basic table.
    advanced = _read_box_table(soup, team, "advanced").drop(columns='MP').set_index('Starters')
    return pd.concat([basic, advanced], axis=1, join='inner')


def _player_stats(rows, starter_flag):
    """Map each row label to ``[stats..., first_column, starter_flag]``, or ``[first_column]`` if not numeric."""
    stats = {}
    for name, row in rows.iterrows():
        values = row.values
        try:
            stats[name] = [float(value) for value in values[1:]] + [values[0], starter_flag]
        except (TypeError, ValueError):
            # Rows whose cells are not numeric (presumably did-not-play
            # entries -- verify) keep only their first column.
            stats[name] = [values[0]]
    return stats


def parse_box_score(soup, home, away, date, max_player_links=217):
    """Turn a parsed box-score page into a nested dict of per-player statistics.

    Parameters
    ----------
    soup : parsed box-score page (BeautifulSoup).
    home, away : str
        Team codes used in the ``box-<code>-game-basic`` / ``-advanced`` table ids.
    date : str
        Accepted for interface compatibility; not used by this function.
    max_player_links : int, optional
        Only the first ``max_player_links`` anchors whose href contains
        ``players`` are considered when building the inactive list.
        NOTE(review): the default of 217 is the hard-coded cut-off from the
        original code, which also contained a loop that located the second
        'Players' anchor but never used the result. Confirm the intended cut-off.

    Returns
    -------
    dict
        ``{home: {...}, away: {...}, 'Inactive': [...]}``. Each team dict maps
        a player name to ``[float stats..., first_column, starter_flag]``
        (``starter_flag`` is 1 for the first five rows, 0 for rows 6 up to but
        excluding the last), or to ``[first_column]`` when the row is not
        numeric. The key ``"Team"`` holds the last table row as floats.

    Raises
    ------
    ValueError
        If one of the four expected tables is missing, or the last row of a
        team table cannot be converted to floats.
    """
    game_dict = {}
    for team in (home, away):
        frame = _team_box_score(soup, team)
        team_stats = _player_stats(frame.iloc[:5], 1)
        # Row 5 is skipped and the last row is handled separately below
        # (presumably the "Reserves" header and team totals rows -- verify).
        team_stats.update(_player_stats(frame.iloc[6:-1], 0))
        team_stats["Team"] = [float(value) for value in frame.tail(1).values[0]]
        game_dict[team] = team_stats

    # Every label already recorded for either team (including "Team").
    players_game = set(game_dict[home]) | set(game_dict[away])

    # href=True skips anchors without an href, replacing a bare ``except: pass``.
    players_total = [
        anchor.text
        for anchor in soup.find_all('a', href=True)
        if 'players' in anchor['href']
    ][:max_player_links]

    # A linked name is treated as inactive when it is not in either box
    # score, is not the literal "Players" link text, and the text before its
    # first '.' is longer than one character (presumably to drop abbreviated
    # names such as "J. Doe" -- verify).
    game_dict['Inactive'] = [
        player
        for player in players_total
        if player not in players_game
        and player != "Players"
        and len(player.split('.')[0]) != 1
    ]

    return game_dict

In [6]:
def main(year):
    """Scrape every box score of a season into a nested dict.

    Parameters
    ----------
    year : str or int
        Season identifier used in the ``NBA_<year>_games.html`` schedule URL.

    Returns
    -------
    dict
        ``{month_name: {date: {home_team: game_dict}}}`` where ``game_dict``
        is the value returned by ``parse_box_score``.

    Raises
    ------
    requests.HTTPError
        If the season schedule page answers with a 4xx/5xx status.
    requests.Timeout
        If the season schedule page does not respond within 30 seconds.
    """
    main_json = {}

    url = "https://www.basketball-reference.com/leagues/NBA_{}_games.html".format(year)
    # requests has no default timeout; without one a stalled connection hangs forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Explicit parser keeps results independent of installed optional parsers.
    main_page_soup = bs(response.text, "html.parser")

    for month_link in get_months(main_page_soup, year):
        box_score_links = get_box_scores(month_link)
        # ".../NBA_<year>_games-december.html" -> "december"
        month = month_link.split(".")[0].split("-")[-1]
        print(month)
        main_json[month] = {}

        for box_score_link in tqdm(box_score_links):
            game_page_soup, home, away, date = get_bs_soup(box_score_link)
            game_json = parse_box_score(game_page_soup, home, away, date)
            # setdefault replaces a bare try/except used only to test for the key.
            main_json[month].setdefault(date, {})[home] = game_json

    return main_json

    
# Scrape the selected season; this issues one HTTP request per month page and per game.
year = '2021'
main_json = main(year)

# Write the nested month -> date -> home-team mapping to "<year>.json"
# in the current working directory.
output_path = '{}.json'.format(year)
with open(output_path, 'w') as fp:
    json.dump(main_json, fp)
    

  0%|                                                                                         | 0/67 [00:00<?, ?it/s]

december


100%|████████████████████████████████████████████████████████████████████████████████| 67/67 [00:33<00:00,  2.01it/s]
  0%|                                                                                         | 0/33 [00:00<?, ?it/s]

january


100%|████████████████████████████████████████████████████████████████████████████████| 33/33 [00:15<00:00,  2.08it/s]
0it [00:00, ?it/s]

february



0it [00:00, ?it/s]

march



