In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Root of the La Liga competition pages on FBref; season paths are appended to it
base_url = "https://fbref.com/en/comps/12"

# The five campaigns to scrape, newest first ("2024-2025" down to "2020-2021")
seasons = [f"{start}-{start + 1}" for start in range(2024, 2019, -1)]

# Browser-like User-Agent sent with every request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# One DataFrame per scraped team/season is appended here
all_data = list()

from io import StringIO  # pd.read_html takes a file-like object; raw HTML strings are deprecated

REQUEST_TIMEOUT = 30  # seconds to wait before abandoning a stalled request

# Columns kept from each team's match log, in output order
shooting_columns = [
    "Date", "Time", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent",
    "Gls", "Sh", "SoT", "SoT%", "G/Sh", "G/SoT", "Dist", "FK", "PK", "PKatt",
]


def _fetch_soup(url, pause):
    """Download ``url`` and return the parsed page, or ``None`` on failure.

    Network errors, timeouts and non-2xx responses are reported and turned
    into ``None`` so that an error page is not parsed and then mistaken for a
    page that simply lacks the expected table. The function always sleeps for
    ``pause`` seconds afterwards to space out requests.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None
    finally:
        time.sleep(pause)
    return BeautifulSoup(response.text, "html.parser")


def _team_shooting_links(stats_table, season):
    """Return ``(shooting_url, team_name)`` pairs for each squad link in ``stats_table``."""
    links = []
    for link in stats_table.select('a[href]'):
        href = link.get('href')
        parts = href.split('/')
        # Assumes squad hrefs of the form /en/squads/<team_id>/.../<Team-Name>-Stats;
        # anything too short to carry a team id is skipped instead of raising IndexError.
        if "/squads/" not in href or len(parts) < 4:
            continue
        team_id = parts[3]
        team_name = parts[-1].split('-Stats')[0]
        shooting_link = (
            f"https://fbref.com/en/squads/{team_id}/{season}/matchlogs/c12/shooting/{team_name}-Match-Logs-La-Liga"
        )
        links.append((shooting_link, team_name))
    return links


def _clean_shooting_table(df, team_name, season):
    """Flatten the headers, keep ``shooting_columns`` and tag rows with team and season."""
    if isinstance(df.columns, pd.MultiIndex):
        # Join the header levels into a single label per column
        df.columns = [' '.join(col).strip() for col in df.columns]

    # Strip the group prefixes contributed by the top header row
    df.columns = df.columns.str.replace(r'^For [A-Za-zÀ-ÿ\s\-]+ ', '', regex=True)
    df.columns = df.columns.str.replace(r'^Standard ', '', regex=True)

    # Guarantee every expected column exists so all frames share one layout
    for col in shooting_columns:
        if col not in df.columns:
            df[col] = None

    return df[shooting_columns].assign(Team=team_name, Season=season)


for season in seasons:
    print(f"Fetching data for season: {season}")
    season_url = f"{base_url}/{season}/{season}-La-Liga-Stats"

    # Request the page for the season's stats
    soup = _fetch_soup(season_url, random.uniform(10, 15))
    if soup is None:
        continue

    # Find the squads table to extract team links
    stats_table = soup.select_one('#stats_squads_standard_for')
    if stats_table is None:
        print(f"Stats table NOT found for {season}!")
        continue
    print(f"Stats table found for {season}!")
    all_shooting_links = _team_shooting_links(stats_table, season)

    # Scrape data from each team's shooting stats page
    for shooting_link, team_name in all_shooting_links:
        print(f"Fetching shooting data from: {shooting_link}")
        soup = _fetch_soup(shooting_link, 10)
        if soup is None:
            continue

        # Locate the shooting stats table
        shooting_stats = soup.select_one('#matchlogs_for')
        if shooting_stats is None:
            print(f"Shooting table NOT found for {team_name} in {season}!")
            continue

        try:
            df = pd.read_html(StringIO(str(shooting_stats)))[0]
            all_data.append(_clean_shooting_table(df, team_name, season))
        except ValueError as e:
            print(f"Error reading the table for {team_name}: {e}")

# Combine all collected data into a single DataFrame and write it to disk
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    print("All data combined successfully!")

    # Put the identifying columns first, then the match and shooting columns
    desired_columns = ['Team', 'Season', 'Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK', 'PKatt']
    combined_df = combined_df[desired_columns].drop_duplicates()

    output_csv = "la_liga_data.csv"
    combined_df.to_csv(output_csv, index=False)
    # The previous f-string had an unmatched '}' and was a SyntaxError
    print(f"Saved to {output_csv}")
else:
    print("Error. No data collected.")

In [26]:
from pathlib import Path
import pandas as pd

# Read in the csv file.
# NOTE(review): assumed to be the file saved by the scraping cell
# ("la_liga_data.csv" in the working directory) — adjust if the CSV lives elsewhere.
# A literal Ellipsis is not a valid Path argument and raises TypeError.
data_path = Path("la_liga_data.csv")
la_liga_data = pd.read_csv(data_path)
la_liga_data

Unnamed: 0,Team,Season,Date,Time,Round,Day,Venue,Result,GF,GA,...,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt
0,Alaves,2024-2025,2024-08-16,19:00,Matchweek 1,Fri,Away,L,1,2,...,1,10,2,20.0,0.10,0.50,14.5,0,0,0
1,Alaves,2024-2025,2024-08-25,19:15,Matchweek 2,Sun,Home,D,0,0,...,0,5,1,20.0,0.00,0.00,18.6,0,0,0
2,Alaves,2024-2025,2024-08-28,21:30,Matchweek 3,Wed,Away,W,2,1,...,2,13,4,30.8,0.08,0.25,18.1,0,1,1
3,Alaves,2024-2025,2024-09-01,17:00,Matchweek 4,Sun,Home,W,2,0,...,2,14,5,35.7,0.14,0.40,19.6,1,0,1
4,Alaves,2024-2025,2024-09-14,16:15,Matchweek 5,Sat,Away,L,2,3,...,2,19,6,31.6,0.11,0.33,14.1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3499,Villarreal,2020-2021,2021-05-09,18:30,Matchweek 35,Sun,Home,L,2,4,...,2,15,6,40.0,0.07,0.17,14.1,0,1,1
3500,Villarreal,2020-2021,2021-05-13,19:00,Matchweek 36,Thu,Away,W,2,0,...,2,9,3,33.3,0.22,0.67,18.5,0,0,0
3501,Villarreal,2020-2021,2021-05-16,18:30,Matchweek 37,Sun,Home,W,4,0,...,4,12,5,41.7,0.33,0.80,20.6,1,0,0
3502,Villarreal,2020-2021,2021-05-22,18:00,Matchweek 38,Sat,Away,L,1,2,...,1,6,2,33.3,0.17,0.50,20.0,0,0,0


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

from io import StringIO  # pd.read_html takes a file-like object; raw HTML strings are deprecated

# URL for La Liga 24/25 fixtures
url = "https://fbref.com/en/comps/12/2024-2025/schedule/2024-2025-La-Liga-Scores-and-Fixtures"

# Browser-like User-Agent sent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Request and parse the page. A network error, timeout or non-2xx status leaves
# stats_table as None so an error page is never parsed as if it were the schedule.
stats_table = None
try:
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Request failed for {url}: {e}")
else:
    soup = BeautifulSoup(response.text, "html.parser")
    # Locate the schedule table
    stats_table = soup.select_one('#sched_2024-2025_12_1')
finally:
    time.sleep(5)  # Avoid hitting rate limits

if stats_table is not None:
    print("Schedule table found!")

    # Extract the table and keep only the fixture columns
    df = pd.read_html(StringIO(str(stats_table)))[0]
    desired_columns = ['Wk', 'Day', 'Date', 'Time', 'Venue', 'Home', 'Away']
    df = df[desired_columns].drop_duplicates()

    # Save the data to a CSV file
    df.to_csv("la_liga_24_25.csv", index=False)
    print("Saved to la_liga_24_25.csv")
else:
    print("Error. No data collected.")

In [27]:
from pathlib import Path
import pandas as pd

# Read in the csv file.
# NOTE(review): assumed to be the file saved by the fixtures scraping cell
# ("la_liga_24_25.csv" in the working directory) — adjust if the CSV lives elsewhere.
# A literal Ellipsis is not a valid Path argument and raises TypeError.
data_path = Path("la_liga_24_25.csv")
la_liga_24_25 = pd.read_csv(data_path)
la_liga_24_25

Unnamed: 0,Wk,Day,Date,Time,Venue,Home,Away
0,1.0,Thu,2024-08-15,19:00,San Mamés,Athletic Club,Getafe
1,1.0,Thu,2024-08-15,21:30,Estadio Benito Villamarín,Betis,Girona
2,1.0,Fri,2024-08-16,19:00,Estadio Abanca Balaídos,Celta Vigo,Alavés
3,1.0,Fri,2024-08-16,20:30,Estadio de Gran Canaria,Las Palmas,Sevilla
4,1.0,Sat,2024-08-17,19:00,Estadio El Sadar,Osasuna,Leganés
...,...,...,...,...,...,...,...
376,38.0,Sun,2025-05-25,,,Betis,Valencia
377,38.0,Sun,2025-05-25,,,Villarreal,Sevilla
378,38.0,Sun,2025-05-25,,,Girona,Atlético Madrid
379,38.0,Sun,2025-05-25,,,Alavés,Osasuna
