In [1]:
import os
import time
from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.common.by import By
import asyncio
import requests

# Root folder for locally stored data.
DATA_DIR = 'data'
# Season identifiers to process: 2014 through 2023 (the upper bound of range() is exclusive).
SEASONS = range(2014,2024)
# Sub-folders of DATA_DIR; not referenced by the cells visible in this notebook.
STANDINGS_DIR = os.path.join(DATA_DIR, 'standings')
SCORES_DIR = os.path.join(DATA_DIR, 'scores')
# Site root that relative links are appended to (note the trailing slash).
BASE_URL = 'https://www.basketball-reference.com/'
# Event loop handle; NOTE(review): not used by any visible cell — confirm it is still needed.
loop = asyncio.get_event_loop()

In [2]:
# Sanity check: show the configured season range.
print(SEASONS)

range(2014, 2024)


In [3]:
#GET HTML ELEMENTS BY THEIR SELECTORS
def get_html(url, selector, duration=5, retries=3):
    """Open ``url`` in Chrome and return the innerHTML of every element matching ``selector``.

    Args:
        url: Page to load.
        selector: CSS selector used to locate the elements.
        duration: Base delay in seconds; attempt ``i`` waits ``duration * i``
            seconds before loading the page (this also delays the first attempt).
        retries: Maximum number of attempts.

    Returns:
        A list of innerHTML strings (possibly empty) from the first successful
        attempt, or ``None`` if every attempt failed or ``retries`` is 0.
    """
    html = None

    for attempt in range(1, retries + 1):
        time.sleep(duration * attempt)
        # Reset per attempt so ``finally`` never touches an unbound name or a
        # driver that was already closed by a previous attempt.
        browser = None
        try:
            browser = webdriver.Chrome()
            browser.get(url)
            print(browser.title)
            elements = browser.find_elements(By.CSS_SELECTOR, selector)
            html = [element.get_attribute("innerHTML") for element in elements]
        except Exception as e:
            print(f"Error on attempt {attempt}: {str(e)}")
            continue
        else:
            break
        finally:
            # Runs on success, failure and ``continue`` alike; only quit a
            # driver that was actually started in this attempt.
            if browser is not None:
                browser.quit()

    return html

In [4]:
# For every season, load the schedule page and keep the href of each link found
# inside the elements matching the '.filter' selector.
filter_links = {}
for season in SEASONS:
    schedule_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    filter_fragments = get_html(schedule_url, '.filter', 5, 3)
    filter_markup = BeautifulSoup(''.join(filter_fragments), 'html.parser')
    filter_links[season] = [anchor.get('href') for anchor in filter_markup.find_all("a")]

2013-14 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2022-23 NBA Schedule | Basketball-Reference.com


In [7]:
# Season -> list of box-score hrefs collected by get_box_scores().
box_score_links = {}
def get_box_scores(links):
    """Collect the box-score links of every season and cache them in text files.

    For each season in ``SEASONS`` that has no ``./BOXSCORES/BOXSCORES_{season}.txt``
    yet, every page listed in ``links[season]`` is loaded, the hrefs found in the
    cells matching ``[data-stat="box_score_text"]`` are gathered into
    ``box_score_links[season]``, and the list is written one link per line.

    Args:
        links: Mapping of season -> list of page links (relative to ``BASE_URL``).

    Raises:
        RuntimeError: If a page could not be loaded after all retries. Raised
            before the season file is written so an incomplete list is never cached.
        KeyError: If ``links`` has no entry for a season that still needs processing.
    """
    os.makedirs("./BOXSCORES", exist_ok=True)
    for season in SEASONS:
        file_path = f"./BOXSCORES/BOXSCORES_{season}.txt"
        if os.path.exists(file_path):
            # Already cached by a previous run.
            continue
        season_links = []
        box_score_links[season] = season_links
        for link in links[season]:
            box_scores = get_html(BASE_URL+link, '[data-stat="box_score_text"]')
            if box_scores is None:
                raise RuntimeError(
                    f"Could not load {link} for season {season}; box-score links not saved"
                )
            soup = BeautifulSoup(''.join(box_scores), 'html.parser')
            # Anchors without an href would otherwise break the file write below.
            season_links.extend(
                href
                for href in (element.get('href') for element in soup.find_all('a'))
                if href
            )
        with open(file_path, 'w') as file:
            for string in season_links:
                file.write(string + '\n')

In [8]:
# Collect the box-score links for every season and cache them under ./BOXSCORES.
get_box_scores(filter_links)

2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Refere

In [9]:
def get_key_from_value(dictionary, value):
    """Return the first key of ``dictionary`` whose value equals ``value``.

    Keys are examined in the dictionary's iteration order. ``None`` is returned
    when no entry matches.
    """
    matching_keys = (key for key, candidate in dictionary.items() if candidate == value)
    return next(matching_keys, None)

In [10]:
# Three-letter team code -> integer id (0..29), numbered in the order listed.
# NOTE(review): team codes taken from box-score table ids elsewhere in this
# notebook may follow a different abbreviation scheme — confirm before joining
# on these keys.
TEAMS = {
    code: position
    for position, code in enumerate((
        "ATL", "BOS", "BKN", "CHA", "CHI", "CLE",
        "DAL", "DEN", "DET", "GSW", "HOU", "IND",
        "LAC", "LAL", "MEM", "MIA", "MIL", "MIN",
        "NOP", "NYK", "OKC", "ORL", "PHI", "PHX",
        "POR", "SAC", "SAS", "TOR", "UTA", "WAS",
    ))
}

In [2]:
import pandas as pd
import numpy as np
from io import StringIO

In [3]:
def str_to_nan(df):
    """Overwrite non-participation markers in ``df`` with NaN, in place.

    Every cell equal to "Did Not Play", "Did Not Dress" or "Not With Team" is
    set to ``np.nan``. The frame is modified directly and nothing is returned.
    """
    absent_markers = ("Did Not Play", "Did Not Dress", "Not With Team")
    for row_pos in range(len(df)):
        n_cols = len(df.iloc[row_pos])
        for col_pos in range(n_cols):
            if df.iloc[row_pos, col_pos] in absent_markers:
                df.iloc[row_pos, col_pos] = np.nan

In [4]:
#Get Line Score Table to get the name of team
def get_team_df(link):
    """Build a one-row frame (TEAM, TEAM_OPP, WON) from a game's line-score table.

    The element with id ``div_line_score`` is fetched from ``BASE_URL + link``
    and its first table is parsed. The values of the table's first column become
    TEAM and TEAM_OPP, and WON is True when the last column of the first row is
    greater than the last column of the second row (both converted with ``int``).
    The three-column header means the table is expected to have exactly two rows.
    """
    fragments = get_html(BASE_URL+link, "[id='div_line_score']")
    table_tag = BeautifulSoup(fragments[0], 'html.parser').find('table')
    line_score_df = pd.read_html(StringIO(table_tag.prettify()))[0]

    labels = np.array(line_score_df.iloc[:, 0])
    first_total = int(line_score_df.iloc[0, -1])
    second_total = int(line_score_df.iloc[1, -1])
    won = first_total > second_total

    # Append the outcome after the first-column values, then shape as a single row.
    team_values = np.append(labels, won).reshape(1, -1)
    return pd.DataFrame(team_values, columns=['TEAM', 'TEAM_OPP', 'WON'])

In [5]:
def get_links_from_file(filepath):
    """Read ``filepath`` and return its lines with surrounding whitespace stripped.

    Blank lines are kept and come back as empty strings.
    """
    with open(filepath, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

def dataframe_processing(df):
    """Split a parsed box-score table into its body rows and its last row.

    Steps, in order:
      1. Non-participation markers in ``df`` are replaced with NaN in place
         (the caller's frame is modified by ``str_to_nan``).
      2. The row with index label 5 is dropped.
      3. Columns are sorted by label and the first and last sorted columns are
         removed.
      4. The last remaining row is extracted as an array and removed from the frame.

    Args:
        df: Table as returned by ``pd.read_html``.

    Returns:
        ``(body, values)``: the frame without its last row, and a 1-D array with
        that last row's values in the same column order.

    Raises:
        KeyError: If ``df`` has no row labelled 5.
    """
    str_to_nan(df)
    # NOTE(review): label 5 is presumably the repeated header row separating
    # starters from reserves — confirm against the saved pages.
    df = df.drop(5)
    df = df.sort_index(axis=1)
    # Discard the first and last columns of the sorted frame.
    df = df.iloc[:, 1:-1]
    values = np.array(df.iloc[-1])
    # Slice off the extracted row instead of dropping in place on a derived frame.
    df = df.iloc[:-1]
    return df, values



def add_max_values(df):
    """Return the per-column maximum of ``df``, ignoring NaN.

    Every column is converted with ``pd.to_numeric`` (unparseable cells become
    NaN), columns that end up entirely NaN are removed, and ``np.nanmax`` is
    taken down each remaining column. The result therefore has one entry per
    surviving column, in column order.
    """
    numeric = df.apply(pd.to_numeric, errors='coerce')
    # Drop columns (axis=1) that contain nothing but NaN.
    usable = numeric.dropna(axis=1, how='all')
    return np.nanmax(usable, axis=0)

def get_header(df):
    """Return the second element of every column label of ``df``, in column order."""
    names = []
    for column in df.columns:
        names.append(column[1])
    return names

def get_max_header(df):
    """Return ``<name>_MAX`` for each column of ``df``, minus the entry at position 13.

    ``<name>`` is the second element of the column label.
    NOTE(review): position 13 presumably corresponds to the column that
    ``add_max_values`` discards as all-NaN — confirm against the real tables.

    Raises:
        IndexError: If ``df`` has fewer than 14 columns.
    """
    headers = [column[1] + '_MAX' for column in df.columns]
    headers.pop(13)
    return headers

def get_opp_header(df):
    """Return ``OPP_<name>`` for each column of ``df``, where ``<name>`` is the label's second element."""
    names = []
    for column in df.columns:
        names.append('OPP_' + column[1])
    return names

def get_opp_max_header(df):
    """Return ``OPP_<name>_MAX`` for each column of ``df``, minus the entry at position 13.

    ``<name>`` is the second element of the column label.
    NOTE(review): position 13 presumably corresponds to the column that
    ``add_max_values`` discards as all-NaN — confirm against the real tables.

    Raises:
        IndexError: If ``df`` has fewer than 14 columns.
    """
    headers = ['OPP_' + column[1] + '_MAX' for column in df.columns]
    headers.pop(13)
    return headers

In [25]:
def get_all_html_files(season):
    """Download every box-score page of ``season`` into ``./HTML_FILES/{season}/``.

    Links are read from ``BOXSCORES/BOXSCORES_{season}.txt``. Pages already on
    disk are skipped, so an interrupted run can simply be restarted. The function
    waits 10 seconds before starting and 2 seconds before each request.

    Args:
        season: Season identifier used in the folder and link-file names.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            response is not written, so an error page is never mistaken for a
            cached game on the next run.
        requests.RequestException: On connection problems or timeouts.
    """
    season_dir = f"./HTML_FILES/{season}"
    os.makedirs("./HTML_FILES", exist_ok=True)
    print(f"!!! DOWNLOADING SEASON {season} !!!")
    time.sleep(10)
    os.makedirs(season_dir, exist_ok=True)
    boxscore_links = get_links_from_file(f'BOXSCORES/BOXSCORES_{season}.txt')
    for link in boxscore_links:
        if not link:
            # Blank line in the link file: nothing to download.
            continue
        filename = link.replace("/boxscores/", "").replace(".html", "")
        target_path = f"{season_dir}/{filename}.html"
        if os.path.exists(target_path):
            print("existe deja")
            continue
        time.sleep(2)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(BASE_URL + link, timeout=30)
        # Fail before writing so an error page is never cached as a game.
        response.raise_for_status()
        with open(target_path, 'wb') as file:
            file.write(response.content)
        
def _points_index(header):
    """Return the position of the 'PTS' column in ``header`` (8 if it is absent)."""
    # Looking the column up by name avoids depending on the alphabetical column
    # order; 8 is the position the previous implementation hard-coded.
    return header.index('PTS') if 'PTS' in header else 8


def _build_game_row(html_content):
    """Turn one saved box-score page into a single-row DataFrame.

    Returns ``None`` when the page does not contain exactly two tables whose id
    includes both 'box-' and '-game-basic'.
    """
    page = BeautifulSoup(html_content, 'html.parser')
    score_tables = page.find_all(
        "table",
        id=lambda value: value and 'box-' in value and '-game-basic' in value,
    )
    if len(score_tables) != 2:
        return None

    team_table = pd.read_html(StringIO(score_tables[0].prettify()))[0]
    opp_table = pd.read_html(StringIO(score_tables[1].prettify()))[0]

    df, row_values = dataframe_processing(team_table)
    opp_df, opp_row_values = dataframe_processing(opp_table)

    # First team: last-row values plus per-column maxima of the remaining rows.
    header = get_header(df)
    row = pd.DataFrame(row_values.reshape(1, -1), columns=header)
    max_row = pd.DataFrame(add_max_values(df).reshape(1, -1), columns=get_max_header(df))

    # Opposing team: same layout with OPP_-prefixed column names.
    opp_plain_header = get_header(opp_df)
    opp_row = pd.DataFrame(opp_row_values.reshape(1, -1), columns=get_opp_header(opp_df))
    opp_max_row = pd.DataFrame(
        add_max_values(opp_df).reshape(1, -1), columns=get_opp_max_header(opp_df)
    )

    # Team codes are whatever remains of the table ids after removing the fixed parts.
    teams = [
        score_table.get('id').replace("box-", "").replace("-game-basic", "")
        for score_table in score_tables
    ]

    team_final_score = row_values[_points_index(header)]
    opp_team_final_score = opp_row_values[_points_index(opp_plain_header)]
    teams.append(int(team_final_score) > int(opp_team_final_score))

    # np.array on a mixed str/bool list yields strings, so WON is stored as
    # 'True'/'False' text (kept as-is for compatibility with existing exports).
    teams_array = np.array(teams).reshape(1, -1)
    team_row = pd.DataFrame(teams_array, columns=['TEAM', 'TEAM_OPP', 'WON'])

    return pd.concat([row, opp_row, max_row, opp_max_row, team_row], axis=1)


def get_dataframes():
    """Build one DataFrame per season from the saved HTML pages and export it.

    For every season in ``SEASONS`` without an existing
    ``../DATAS/season_{season}.xlsx``, each file in ``./HTML_FILES/{season}/`` is
    parsed into one row (pages without exactly two basic box-score tables are
    skipped) and the rows are handed to ``export_dataframe``. Files are processed
    in sorted name order so the output is reproducible. Seasons whose HTML folder
    does not exist are reported and skipped.
    """
    for season in SEASONS:
        if os.path.exists(f"../DATAS/season_{season}.xlsx"):
            continue
        directory_path = f"./HTML_FILES/{season}/"
        if not os.path.isdir(directory_path):
            print(f"No HTML files found for season {season}, skipping")
            continue
        print(f"Start generating dataframe for season {season}...")

        filenames = sorted(
            f for f in os.listdir(directory_path)
            if os.path.isfile(os.path.join(directory_path, f))
        )
        game_rows = []
        for filename in filenames:
            with open(directory_path+filename, 'r', encoding='utf-8') as file:
                html_content = file.read()
            game_row = _build_game_row(html_content)
            if game_row is not None:
                game_rows.append(game_row)

        # Concatenate once instead of growing the frame inside the loop.
        if game_rows:
            season_dataframe = pd.concat(game_rows, ignore_index=True)
        else:
            season_dataframe = pd.DataFrame()

        print(f"Dataframe for season {season} done !!!")
        export_dataframe(season_dataframe, season)

def export_dataframe(df, season):
    """Write ``df`` to ``../DATAS/season_{season}.xlsx`` unless that file already exists.

    The ``../DATAS`` folder is created when missing. The index is not written.
    """
    output_dir = f"../DATAS"
    output_file = f'../DATAS/season_{season}.xlsx'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not os.path.exists(output_file):
        df.to_excel(output_file, index=False)

In [26]:
# Build each season's dataframe from the saved HTML files and export it to Excel.
get_dataframes()

Start generating dataframe for season 2020...
Dataframe for season 2020 done !!!
Start generating dataframe for season 2021...
Dataframe for season 2021 done !!!
Start generating dataframe for season 2022...
Dataframe for season 2022 done !!!
Start generating dataframe for season 2023...
Dataframe for season 2023 done !!!
ALL DONE !!!


{2014: None,
 2015: None,
 2016: None,
 2017: None,
 2018: None,
 2019: None,
 2020: None,
 2021: None,
 2022: None,
 2023: None}