# SLB boxscore data scrape

Extracts the data from each individual game and saves the boxscores in CSV format

First we need to load/import the various Python modules we need to do this. I have added comments to give some indication of what the modules do.

If you have not used selenium with Chrome before, you will probably need to download and install [Chrome for Testing](https://googlechromelabs.github.io/chrome-for-testing/)

In [1]:
from datetime import datetime # This allows us to manipulate dates
from pathlib import Path # For checking if folders/directories exist and creating them
import requests # For simple access to web pages
import time # We're going to use this to pause the web browser for a couple of seconds
import pandas as pd # For creating dataframes / CSV files
import numpy as np # Numerical manipulations
import re # For regular expressions - allows us to search with wildcards
from bs4 import BeautifulSoup # For messing around with the data from web pages
from selenium import webdriver # Automates using Chrome to access more complex web pages.

Let's grab the HTML from the SLB livestats page. We can use this to figure out where all of the boxscore information is kept. This will automatically open Chrome and should close it again.

In [2]:
# Load the SLB livestats page in Chrome and parse the frame that is searched
# for the results table further down.
results_url = "https://www.superleaguebasketballm.co.uk/livestats/"
driver = webdriver.Chrome()
try:
    driver.get(results_url)
    # The page source is taken from the page's second frame (index 1).
    driver.switch_to.frame(1)
    results_soup = BeautifulSoup(driver.page_source, 'html')
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and the finally clause makes sure that happens even if loading or
    # parsing failed; close() would only close the current window.
    driver.quit()

Now we want to extract all of the URLs for the games that have been played (it skips over any that currently have the score listed as 'Upcoming').

If we wanted to extract only the data for a specific competition, then we can uncomment the lines that include `comp_string`.

In [3]:
# Collect the link of every game that already has a result, using the first
# table found in the results frame.
tables = results_soup.find_all('table')
# Optional competition filter: None keeps every competition. Uncomment the
# line below to keep only the rows whose third cell mentions that competition.
comp_string = None
#comp_string = 'Championship 2024-25'
upcoming_string = 'Upcoming'
url_soup = []
for row in tables[0].find_all('tr'):
    if comp_string is not None:
        cells = row.find_all('td')
        # Rows with fewer than three cells (e.g. header rows) cannot name a
        # competition, so they are skipped rather than raising IndexError.
        if len(cells) < 3 or comp_string not in cells[2].get_text():
            continue
    for a in row.find_all('a', href=True):
        # A link whose first child is the text 'Upcoming' is a fixture that
        # has not been played yet. An anchor with no content at all shows no
        # score either, so it is skipped instead of failing on contents[0].
        if a.contents and a.contents[0] != upcoming_string:
            url_soup.append(a['href'])

We can use the SLB URLs gathered above to directly access the FIBA live stats. This next cell makes a list of the live stats URLs for all of the games identified above.

In [4]:
# The game id is the sixth '/'-separated piece of each SLB link. Links that
# are too short to contain one are reported and skipped instead of raising
# IndexError.
game_id = []
for url in url_soup:
    parts = url.split('/')
    if len(parts) > 5:
        game_id.append(parts[5])
    else:
        print("Unexpected game link format:", url)

league = 'SLB'
baseurl = 'https://www.fibalivestats.com/u/{}'.format(league)
games = []

# Keep only the live stats URLs that actually answer with HTTP 200.
for g_id in game_id:
    url = "{}/{}/".format(baseurl, g_id)
    try:
        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        resp = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # A network failure for one game should not abort the whole list.
        print("Couldn't resolve URL:", url, "-", err)
        continue
    if resp.status_code == 200:
        games.append(url)
    else:
        print("Couldn't resolve URL:", url)

We now define some functions that work with the live stats URLs to scrape all of the boxscore data and save the data as CSV.

In [5]:
# (output column, pattern searched for in the id of the <span> holding the value),
# listed in output column order and compiled once rather than once per row.
# "Points" and "Blocks" are also substrings of the "PlusMinusPoints" and
# "BlocksReceived" ids, so those two patterns carry a lookaround that excludes
# the longer id instead of relying on the order of the spans within a row.
_STAT_SPANS = (
    ("Mins", re.compile(r"Minutes")),
    ("PTS", re.compile(r"(?<!PlusMinus)Points")),
    ("FGM", re.compile(r"FieldGoalsMade")),
    ("FGA", re.compile(r"FieldGoalsAttempted")),
    ("FG%", re.compile(r"FieldGoalsPercentage")),
    ("2PM", re.compile(r"TwoPointersMade")),
    ("2PA", re.compile(r"TwoPointersAttempted")),
    ("2P%", re.compile(r"TwoPointersPercentage")),
    ("3PM", re.compile(r"ThreePointersMade")),
    ("3PA", re.compile(r"ThreePointersAttempted")),
    ("3P%", re.compile(r"ThreePointersPercentage")),
    ("FTM", re.compile(r"FreeThrowsMade")),
    ("FTA", re.compile(r"FreeThrowsAttempted")),
    ("FT%", re.compile(r"FreeThrowsPercentage")),
    ("OREB", re.compile(r"ReboundsOffensive")),
    ("DREB", re.compile(r"ReboundsDefensive")),
    ("REB", re.compile(r"ReboundsTotal")),
    ("AST", re.compile(r"Assists")),
    ("TO", re.compile(r"Turnovers")),
    ("STL", re.compile(r"Steals")),
    ("BLK", re.compile(r"Blocks(?!Received)")),
    ("BLKR", re.compile(r"BlocksReceived")),
    ("PF", re.compile(r"FoulsPersonal")),
    ("FOULON", re.compile(r"FoulsOn")),
    ("PLUSMINUS", re.compile(r"PlusMinusPoints")),
)


def _span_text(row, id_pattern):
    """Return the text of the first <span> in row whose id matches id_pattern."""
    return row.find_all('span', {"id": id_pattern})[0].get_text()


def stats_to_df(soup):
    """Converts the soup of FIBA livestats for a single game into a data frame.

    soup is the parsed boxscore page of one game.

    Returns a tuple (df, teams, date):
      df    - one row per player with the columns Name, Team and one column
              per entry of _STAT_SPANS; every value is the text from the page
      teams - the team names, in the order their "team-name" divs appear
      date  - the game date, converted from DD/MM/YYYY to YYYYMMDD

    Raises IndexError if the date div, a team name, a player's name link or
    one of the stat spans is missing, or if there are more boxscore tables
    than team names. Raises ValueError if the date text is not DD/MM/YYYY.
    """
    teams = [div.find_all('span')[0].get_text()
             for div in soup.find_all("div", {"class": "team-name"})]

    date_text = soup.find_all("div", {"class": "og-date"})[0].get_text()
    date = datetime.strptime(date_text, '%d/%m/%Y').strftime('%Y%m%d')

    # Build one record per player; the n-th boxscore table belongs to the n-th team.
    records = []
    for team_count, table in enumerate(soup.find_all("table", {"class": "boxscore"})):
        player_rows = table.find_all('tr', {"class": "player-row"})
        # The first player-row of every table is deliberately left out
        # (presumably a template/placeholder row - TODO confirm against the page).
        for row in player_rows[1:]:
            name = row.find_all('a', {"class": "playerpopup"})[0].find_all('span')[0].get_text()
            record = [name, teams[team_count]]
            record.extend(_span_text(row, pattern) for _, pattern in _STAT_SPANS)
            records.append(record)

    #Create the dataframe
    columns = ["Name", "Team"] + [label for label, _ in _STAT_SPANS]
    df = pd.DataFrame(records, columns=columns)

    return df, teams, date

In [6]:
def fiba_url_to_soup(game, wait_seconds=2):
    """Takes the base FIBA livestats URL, adds the extra info to request the boxscore, then returns the pagesoup

    game is the base URL of one game; 'bs.html' is appended to it directly, so
    it is expected to end in '/'.
    wait_seconds is how long to pause after requesting the page before its
    source is read (default 2, the value this notebook has always used).

    A Chrome session is started for the request and is always shut down again,
    even when loading or parsing the page raises.
    """
    url = game+'bs.html'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Pause before reading the source so the page has time to finish
        # populating (presumably the stats are filled in by scripts).
        time.sleep(wait_seconds)
        soup = BeautifulSoup(browser.page_source, 'html')
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the window, and this function is called once per game.
        browser.quit()

    return soup

In [7]:
def save_game_csv(df, teams, date, directory=None):
    """Saves the dataframe in CSV format, with the filename generated from the teams and date

    df is the data frame to write; teams holds the two team names (spaces in
    them become '-'); date is the date string placed at the end of the name,
    giving "<team0>-Vs-<team1>-<date>.csv".

    Optionally places the file into a directory, which is created together
    with any missing parent directories. Without a directory the file is
    written relative to the current working directory. Returns None.
    """
    filename = "{}-Vs-{}-{}.csv".format(
        teams[0].replace(" ", "-"), teams[1].replace(" ", "-"), date
    )
    if directory is None:
        target = Path(filename)
    else:
        folder = Path(directory)
        # parents/exist_ok make this safe for nested paths and for a
        # directory that already exists (no separate is_dir() check needed).
        folder.mkdir(parents=True, exist_ok=True)
        target = folder / filename
    df.to_csv(target)

    return

The next cell does the work of scraping the data and saving to CSV for each game identified above. Please note that it will launch a Chrome window (and eventually close it) for every game.

In [8]:
# Scrape every game URL collected above and write one CSV per game into the
# "data" directory.
for game in games:
    # Fetch and parse the boxscore page for this game.
    soup = fiba_url_to_soup(game)
    # Turn the page into a data frame plus the team names and game date.
    df, teams, date = stats_to_df(soup)
    save_game_csv(df, teams, date, directory="data")

We should now have a separate CSV for each game that has been played, all in the directory called data. Processing and analysing the data will take place in a separate notebook.