In [12]:
import time
import random
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

In [13]:
# Seasons to scrape, newest first. A season is labelled by the year it ends in,
# so 2024 means 2023-2024 and 2023 means 2022-2023.
seasons = list(range(2024, 2022, -1))
# The league used for this dataset is La Liga (the Spanish championship), competition 12 on FBref.
# The start URL is pinned to the newest season in `seasons`. The un-dated ".../12/La-Liga-Stats"
# page tracks whichever season is current on the site, so using it would attach the labels above
# to the wrong seasons as soon as a new campaign starts.
newest_season = seasons[0]
season_standings = (
    f"https://fbref.com/en/comps/12/{newest_season - 1}-{newest_season}/"
    f"{newest_season - 1}-{newest_season}-La-Liga-Stats"
)

In [14]:
# Collects one DataFrame per club per season: that club's La Liga matches joined with
# its per-match shooting stats. Reset here so re-running the cell starts from scratch.
all_games = []

# Seconds to wait for a response before giving up, so a stalled connection cannot hang the cell.
REQUEST_TIMEOUT = 30
# Columns taken from the shooting table; "Date" is the key used to join it to the fixtures table.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Walk back through the seasons using a local URL so the configured `season_standings`
# is never overwritten; otherwise a second run of this cell would start one season
# further back for every season already scraped, while still using the same labels.
standings_url = season_standings

for season in seasons:
    # Fetch the standings page for this season and fail loudly on an HTTP error
    # (e.g. a rate-limit response) instead of trying to parse an error page.
    standings_page = requests.get(standings_url, timeout=REQUEST_TIMEOUT)
    standings_page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "guessed parser" warning).
    soup = BeautifulSoup(standings_page.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    # Club pages are the anchors whose href contains '/squads/'. The hrefs are
    # site-relative, so they are turned into full URLs. Anchors without an href
    # return None from .get() and are skipped rather than raising a TypeError.
    club_urls = []
    for link in standings_table.find_all('a'):
        href = link.get('href')
        if href and '/squads/' in href:
            club_urls.append(f"https://fbref.com{href}")

    # The "previous season" link on this page is the standings URL for the next iteration.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # Collect the match data of every club found in this season's standings.
    for club in club_urls:
        # Derive a readable club name from the last URL segment,
        # e.g. ".../Real-Madrid-Stats" -> "Real Madrid".
        club_name = club.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Get the 'Scores & Fixtures' table from the club's page.
        club_page = requests.get(club, timeout=REQUEST_TIMEOUT)
        club_page.raise_for_status()
        games = pd.read_html(StringIO(club_page.text), match="Scores & Fixtures")[0]

        # Find the link to the club's all-competitions "Shooting" page.
        club_soup = BeautifulSoup(club_page.text, "html.parser")
        shooting_links = []
        for link in club_soup.find_all('a'):
            href = link.get('href')
            if href and 'all_comps/shooting/' in href:
                shooting_links.append(href)
        if not shooting_links:
            # Same exception type as indexing the empty list, but with a message that says why.
            raise IndexError(f"No 'all_comps/shooting/' link found on {club}")

        # Load the shooting table; its header has two levels, so drop the top
        # one to be left with plain column names such as "Sh" and "SoT".
        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}", timeout=REQUEST_TIMEOUT)
        shooting_page.raise_for_status()
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()

        # Merge fixtures and shooting stats on the match date. The merge can raise
        # ValueError when the shooting data is unusable; such a club is skipped.
        try:
            merged = games.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except ValueError:
            merged = None

        if merged is None:
            status = "skipped (shooting stats could not be merged)"
        else:
            # Keep only league matches, then tag every row with its season and club.
            # .assign() returns a new frame, so nothing is written into a filtered
            # slice of `merged` (which would trigger SettingWithCopyWarning).
            club_data = merged[merged["Comp"] == "La Liga"].assign(Season=season, Club=club_name)
            all_games.append(club_data)
            status = "done"

        # Wait between 20 and 30 seconds before scraping the next club to avoid getting
        # IP blocked (shorter wait times were tested and got blocked). This runs for
        # skipped clubs too, since their pages were requested all the same.
        wait_time = random.randint(20, 30)
        time.sleep(wait_time)
        print(f'{club_name}\'s stats for {season} {status}.')

Real Madrid's stats for 2024 done.
Barcelona's stats for 2024 done.
Girona's stats for 2024 done.
Atletico Madrid's stats for 2024 done.
Athletic Club's stats for 2024 done.
Real Sociedad's stats for 2024 done.
Real Betis's stats for 2024 done.
Villarreal's stats for 2024 done.
Valencia's stats for 2024 done.
Alaves's stats for 2024 done.
Osasuna's stats for 2024 done.
Getafe's stats for 2024 done.
Celta Vigo's stats for 2024 done.
Sevilla's stats for 2024 done.
Mallorca's stats for 2024 done.
Las Palmas's stats for 2024 done.
Rayo Vallecano's stats for 2024 done.
Cadiz's stats for 2024 done.
Almeria's stats for 2024 done.
Granada's stats for 2024 done.
Barcelona's stats for 2023 done.
Real Madrid's stats for 2023 done.
Atletico Madrid's stats for 2023 done.
Real Sociedad's stats for 2023 done.
Villarreal's stats for 2023 done.
Real Betis's stats for 2023 done.
Osasuna's stats for 2023 done.
Athletic Club's stats for 2023 done.
Mallorca's stats for 2023 done.
Girona's stats for 2023 do

In [15]:
# Sanity check: how many per-club, per-season frames the scraping loop appended to all_games.
len(all_games)

40

In [16]:
# Inspect the second collected frame to eyeball the merged fixture and shooting columns.
all_games[1]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Club
0,2023-08-13,21:30,La Liga,Matchweek 1,Sun,Away,D,0,0,Getafe,...,Match Report,,14,3,19.9,1.0,0,0,2024,Barcelona
1,2023-08-20,19:30,La Liga,Matchweek 2,Sun,Home,W,2,0,Cádiz,...,Match Report,,24,10,15.5,2.0,0,0,2024,Barcelona
2,2023-08-27,17:30,La Liga,Matchweek 3,Sun,Away,W,4,3,Villarreal,...,Match Report,,21,11,16.8,0.0,0,0,2024,Barcelona
3,2023-09-03,21:00,La Liga,Matchweek 4,Sun,Away,W,2,1,Osasuna,...,Match Report,,8,3,15.6,0.0,1,1,2024,Barcelona
4,2023-09-16,21:00,La Liga,Matchweek 5,Sat,Home,W,5,0,Betis,...,Match Report,,17,9,18.7,2.0,0,0,2024,Barcelona
6,2023-09-23,18:30,La Liga,Matchweek 6,Sat,Home,W,3,2,Celta Vigo,...,Match Report,,15,6,20.8,0.0,0,0,2024,Barcelona
7,2023-09-26,21:30,La Liga,Matchweek 7,Tue,Away,D,2,2,Mallorca,...,Match Report,,10,4,15.5,0.0,0,0,2024,Barcelona
8,2023-09-29,21:00,La Liga,Matchweek 8,Fri,Home,W,1,0,Sevilla,...,Match Report,,18,4,13.5,0.0,0,0,2024,Barcelona
10,2023-10-08,21:00,La Liga,Matchweek 9,Sun,Away,D,2,2,Granada,...,Match Report,,22,10,16.9,1.0,0,0,2024,Barcelona
11,2023-10-22,21:00,La Liga,Matchweek 10,Sun,Home,W,1,0,Athletic Club,...,Match Report,,12,7,15.4,0.0,0,0,2024,Barcelona


In [17]:
# Stack every per-club frame into one table. Each frame carries its own index from the
# merge, so the labels would repeat from club to club; ignore_index=True renumbers the
# rows 0..N-1 so the combined frame (and the CSV written from it) has a unique index.
df = pd.concat(all_games, ignore_index=True)

In [18]:
# Normalise every column label to lower case.
df.columns = list(map(str.lower, df.columns))

In [19]:
# Preview the first rows of the combined frame after the column names were lower-cased.
df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,club
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2,0,Athletic Club,...,Match Report,,14.0,8.0,16.0,2.0,0,0,2024,Real Madrid
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3,1,Almería,...,Match Report,,25.0,9.0,17.0,1.0,0,0,2024,Real Madrid
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1,0,Celta Vigo,...,Match Report,,9.0,2.0,19.4,0.0,0,1,2024,Real Madrid
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2,1,Getafe,...,Match Report,,26.0,12.0,17.7,0.0,0,0,2024,Real Madrid
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2,1,Real Sociedad,...,Match Report,,17.0,8.0,15.9,1.0,0,0,2024,Real Madrid


In [20]:
# Write the combined dataset to disk. to_csv is called with its defaults, so the frame's
# index is written as well, as an unnamed first column.
df.to_csv('la-liga.csv')
