In [1]:
import json
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
class Game:
    """One box-score page, downloaded and parsed into ``game_dict``.

    Constructing an instance performs the HTTP request and the parsing
    immediately. Afterwards the instance exposes:

    * ``url``  -- the box-score URL that was passed in.
    * ``date`` -- ``url[-17:-8]``; callers use it as the per-day key.
    * ``home`` / ``away`` -- three-character team codes sliced out of the
      page's ``/teams/...`` links.
    * ``game_html`` / ``game_soup`` -- raw page text and its parsed tree.
    * ``game_dict`` -- ``{home: {...}, away: {...}, 'Inactive': [...]}``;
      see ``parse_game`` for the layout of the per-team dicts.
    """

    # Seconds to wait for the server before the download is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Download and parse the box score found at ``url``."""
        self.url = url
        self.load_game()
        self.parse_game()

    def load_game(self):
        """Fetch the page and determine ``date``, ``home`` and ``away``.

        Raises:
            requests.HTTPError: the server answered with an error status.
            ValueError: the page has fewer than three ``/teams/`` links.
        """
        # Fixed-position slice: the 9 characters that precede the last 8
        # characters of the URL. Assumes the URL ends in
        # "<9-char id><3-char team>.html" -- TODO confirm for other URL shapes.
        self.date = self.url[-17:-8]

        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Fail here with a clear HTTP error instead of an IndexError later
        # while slicing the links of an error page.
        response.raise_for_status()
        self.game_html = response.text
        # NOTE(review): no parser is named, so BeautifulSoup chooses one
        # itself; left as-is so the parse tree does not change.
        self.game_soup = bs(self.game_html)

        hrefs = [tag.get('href') for tag in self.game_soup.find_all('a')]
        self.away, self.home = self._team_codes(hrefs)

    @staticmethod
    def _team_codes(hrefs):
        """Return ``(away, home)`` codes from an iterable of anchor hrefs.

        Entries that are ``None`` or empty (anchors without ``href``) are
        ignored. Of the hrefs containing ``"/teams/"`` the first is skipped
        (presumably a site-navigation link -- confirm against the page); the
        second names the away team and the third the home team. The code is
        characters 7-9 of the href, i.e. what follows ``"/teams/"``.
        """
        team_hrefs = [href for href in hrefs if href and "/teams/" in href]
        if len(team_hrefs) < 3:
            raise ValueError(
                "expected at least 3 '/teams/' links, found {}".format(len(team_hrefs))
            )
        return team_hrefs[1][7:10], team_hrefs[2][7:10]

    def _team_frame(self, team):
        """Join one team's basic and advanced box-score tables on 'Starters'."""
        frames = []
        for kind in ("basic", "advanced"):
            tables = self.game_soup.find_all(
                'table', id="box-{}-game-{}".format(team, kind)
            )
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in recent pandas releases.
            frame = pd.read_html(StringIO(str(tables)))[0]
            # The tables are read with a two-level header; keep the lower one.
            frame.columns = frame.columns.droplevel(0)
            frames.append(frame)

        basic, advanced = frames
        # 'MP' exists in both tables; keep only the copy from the basic one.
        advanced = advanced.drop(columns='MP')
        return pd.concat(
            [basic.set_index('Starters'), advanced.set_index('Starters')],
            axis=1,
            join='inner',
        )

    @staticmethod
    def _player_rows(frame):
        """Convert one joined team table into ``{row label: [values...]}``.

        Rows 0-4 are treated as starters (flag ``1``) and rows 6 up to, but
        not including, the last row as reserves (flag ``0``). Row 5 is skipped
        (presumably the "Reserves" header row -- confirm). Each player maps to
        ``[float stats..., first column value, flag]``; when any stat cannot
        be converted to ``float`` only ``[first column value]`` is stored.
        The last row is stored as floats under the key ``"Team"``.
        """
        stats = {}
        for rows, starter_flag in ((frame.iloc[:5], 1), (frame.iloc[6:-1], 0)):
            for name, row in rows.iterrows():
                values = row.values
                try:
                    stats[name] = [float(v) for v in values[1:]] + [values[0], starter_flag]
                except (TypeError, ValueError):
                    # Non-numeric row (e.g. a player without stats).
                    stats[name] = [values[0]]
        stats["Team"] = [float(v) for v in frame.tail(1).values[0]]
        return stats

    @staticmethod
    def _inactive_players(link_texts, active_names):
        """Return link texts that name players who are not in the box score.

        ``link_texts`` is the text of every anchor whose href contains
        ``'players'``, in page order. The list is cut at the first
        ``'Players'`` entry found at a non-zero index, so links that follow
        that marker are never reported. Previously the computed index was
        ignored in favour of a fixed cut at 217 entries. When no such marker
        exists the whole list is scanned.

        Entries are dropped when they equal ``'Players'``, appear in
        ``active_names``, or have a single character before the first ``'.'``
        (abbreviated names such as ``"J. Smith"``).
        """
        cutoff = next(
            (i for i, text in enumerate(link_texts) if i != 0 and text == 'Players'),
            len(link_texts),
        )
        active = set(active_names)
        return [
            name
            for name in link_texts[:cutoff]
            if name != "Players"
            and name not in active
            and len(name.split('.')[0]) != 1
        ]

    def parse_game(self):
        """Build ``self.game_dict`` from the already-loaded ``game_soup``.

        Result layout::

            {home: {label: [...], ..., "Team": [...]},
             away: {label: [...], ..., "Team": [...]},
             "Inactive": [name, ...]}
        """
        home_frame = self._team_frame(self.home)
        away_frame = self._team_frame(self.away)

        self.game_dict = {}
        self.game_dict[self.home] = self._player_rows(home_frame)
        self.game_dict[self.away] = self._player_rows(away_frame)

        players_game = list(self.game_dict[self.home]) + list(self.game_dict[self.away])

        link_texts = [
            link.text
            for link in self.game_soup.find_all('a')
            if 'players' in (link.get('href') or '')
        ]
        self.game_dict['Inactive'] = self._inactive_players(link_texts, players_game)
        
class Fantasy_Year:
    """Scrape every box score of one season and write it to ``<year>.json``.

    Constructing an instance does all of the work: it downloads the season
    schedule page, follows each month link, collects the box-score links,
    builds a ``Game`` for each of them and finally dumps ``main_json`` to
    ``'{year}.json'`` in the current working directory.

    ``main_json`` has the layout ``{game.date: {game.home: game.game_dict}}``.
    """

    BASE_URL = "https://www.basketball-reference.com"
    # Seconds to wait for the server before a download is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, year):
        """Run the full scrape for ``year`` (interpolated into the URLs)."""
        self.year = year
        self.main_json = {}
        self.year_url = "https://www.basketball-reference.com/leagues/NBA_{}_games.html".format(self.year)
        self.year_html = self._fetch(self.year_url)
        self.main_page_soup = bs(self.year_html)

        self.box_scores_links = []
        self.month_links = self.get_months()
        for month in self.month_links:
            self.get_box_scores(month)

        self.iter_games()

        with open('{}.json'.format(self.year), 'w') as fp:
            json.dump(self.main_json, fp)

    def _fetch(self, url):
        """Return the body of ``url``; raise ``requests.HTTPError`` on an error status."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # An error page would otherwise be parsed and silently yield no links.
        response.raise_for_status()
        return response.text

    @staticmethod
    def _hrefs_containing(soup, fragment):
        """Return, in page order, every anchor href in ``soup`` containing ``fragment``.

        Anchors without an ``href`` attribute are skipped instead of raising
        ``KeyError``.
        """
        matches = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and fragment in href:
                matches.append(href)
        return matches

    def get_months(self):
        """Return the relative links to this season's per-month schedule pages."""
        return self._hrefs_containing(
            self.main_page_soup, "/leagues/NBA_{}_games-".format(self.year)
        )

    def get_box_scores(self, month):
        """Download one month page and append its box-score URLs.

        ``month`` is a site-relative link as returned by ``get_months``. The
        absolute URLs are appended to ``self.box_scores_links``; ``month_url``
        and ``month_html`` are overwritten with the page just fetched.
        """
        self.month_url = self.BASE_URL + month
        self.month_html = self._fetch(self.month_url)
        month_page_soup = bs(self.month_html)

        for href in self._hrefs_containing(month_page_soup, "/boxscores/2"):
            self.box_scores_links.append(self.BASE_URL + href)

    def iter_games(self):
        """Scrape every collected box score into ``main_json``, grouped by date."""
        for game_link in tqdm(self.box_scores_links):
            game = Game(game_link)
            self.main_json.setdefault(game.date, {})[game.home] = game.game_dict
    
    
# Constructing the object runs the whole scrape (one request per month page
# and per box score) and writes the result to '2018.json'.
a = Fantasy_Year('2018')

100%|██████████| 1312/1312 [11:37<00:00,  1.88it/s]


In [None]:
# Inspect one day's games. Keys of main_json are the 9-character slice
# url[-17:-8] taken from each box-score URL.
(a.main_json['201710180'])