This notebook scrapes football data from fbref.com.

Much of the scraping code is taken from this repository: https://github.com/chmartin/FBref_EPL.

Run the first cell (imports, browser setup and helper functions) first, then run the later cells to fetch data for whichever leagues you want.

All data is courtesy of StatsBomb via FBref. Find me on Twitter @pathaleee!


In [None]:
from seleniumbase import Driver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import pathlib as Path
import os


# The import cell binds the *module* to the name `Path` (`import pathlib as Path`),
# so `Path.cwd()` raises AttributeError on a fresh kernel.  Rebind `Path` to the
# class here so the lookup below works regardless of how the import is spelled.
from pathlib import Path


def _find_repo_root(start):
    """Return the nearest directory at or above ``start`` that contains ``.git``.

    Raises:
        FileNotFoundError: if neither ``start`` nor any of its parents contains
            a ``.git`` entry (instead of leaking a bare ``StopIteration``).
    """
    for candidate in [start, *start.parents]:
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError(
        f"No .git directory found in {start} or any of its parents; "
        "cannot locate the repository's data/fbref folder."
    )


# Scraped CSVs are written under <repo root>/data/fbref.
DATA_PATH = _find_repo_root(Path.cwd()) / "data" / "fbref"

# One browser session shared by every get_tables() call in this notebook.
# NOTE(review): `uc=True` is presumably SeleniumBase's undetected-chromedriver
# mode - confirm against the SeleniumBase docs.  Call `driver.quit()` once all
# scraping cells have finished.
driver = Driver(uc=True)


# --- Core function using Selenium instead of requests ---
def get_tables(url, text, category):
    """Load an FBref stats page and return the player table and one squad table.

    The page is fetched with the module-level ``driver`` (which is left open so
    later calls can reuse it) and parsed with BeautifulSoup.  Each table is
    located by its ``id`` and its feature names are read from the last header
    row of *that same* table, so bodies and feature lists cannot get out of
    step if the page contains extra tables.

    Args:
        url: Full URL of the category page.
        text: ``"for"`` to return the squad "for" table, ``"vs"`` to return the
            squad "against" table.
        category: URL category segment (e.g. ``"stats"``, ``"keepers"``,
            ``"shooting"``); mapped to the slug used in the table ids.

    Returns:
        ``(player_tbody, team_tbody, player_features, team_features)`` where the
        feature lists hold the ``data-stat`` attribute of each header cell.

    Raises:
        ValueError: if ``text`` is not ``"for"``/``"vs"``, or if an expected
            table (or its header/body) is missing from the page.
    """
    # Validate before touching the browser.  Previously an unknown `text` fell
    # through to `driver.quit()`, killing the shared session and returning None.
    if text not in ("for", "vs"):
        raise ValueError(f"text must be 'for' or 'vs', got {text!r}")

    # Some URL segments differ from the slug used inside the table ids.
    table_slug = {
        "stats": "standard",
        "keepers": "keeper",
        "keepersadv": "keeper_adv",
        "playingtime": "playing_time",
    }.get(category, category)

    driver.get(url)
    time.sleep(3)  # give JS time to load
    html = driver.page_source
    print(f"getting table from : {url}")
    soup = BeautifulSoup(html, "html.parser")

    page_text = soup.text
    blocked = "Cloudflare" in page_text or "Verify you are human" in page_text
    if blocked:
        # Kept as a warning: the marker text alone does not prove the tables
        # are absent.  If they are, the error below mentions the block.
        print(f"BLOCKED by Cloudflare on URL: {url}")

    def _tbody_and_features(table_id):
        """Return (tbody, feature names) for the table with the given id."""
        hint = " (page looks like a Cloudflare challenge)" if blocked else ""
        table = soup.find("table", {"id": table_id})
        if table is None:
            raise ValueError(
                f"Could not find expected tables: no table {table_id!r} at {url}{hint}"
            )
        body = table.find("tbody")
        head = table.find("thead")
        header_rows = head.find_all("tr") if head is not None else []
        if body is None or not header_rows:
            raise ValueError(
                f"Could not find expected tables: table {table_id!r} at {url} "
                f"has no header or body{hint}"
            )
        # The last header row carries the per-column data-stat names.
        features = [th["data-stat"] for th in header_rows[-1].find_all("th")]
        return body, features

    team_side = "for" if text == "for" else "against"
    player_table, features_player_table = _tbody_and_features(f"stats_{table_slug}")
    team_table, features_team_table = _tbody_and_features(
        f"stats_squads_{table_slug}_{team_side}"
    )
    return player_table, team_table, features_player_table, features_team_table


# --- Table -> DataFrame helpers ---
# Columns that are always kept as text, even when the value looks numeric.
_TEXT_COLUMNS = frozenset(
    {"player", "nationality", "position", "team", "age", "birth_year"}
)


def _to_number(text):
    """Convert ``text`` to ``float`` when it is a plain decimal number.

    Thousands separators (``"1,234"``) and a leading sign (``"-0.5"``,
    ``"+1.2"``) are accepted; anything else is returned unchanged.
    """
    cleaned = text.replace(",", "")
    unsigned = cleaned[1:] if cleaned[:1] in ("+", "-") else cleaned
    if unsigned.replace(".", "", 1).isdigit():
        return float(cleaned)
    return text


def _cell_value(row, stat):
    """Return the value of the cell tagged ``data-stat=stat`` in ``row``.

    Both ``<th>`` and ``<td>`` cells are searched, so a value stored in the
    row-header cell is not silently replaced by the default.  Missing or empty
    cells become ``"0"`` before conversion; columns listed in ``_TEXT_COLUMNS``
    are never converted to numbers.
    """
    cell = row.find(["th", "td"], {"data-stat": stat})
    text = cell.text.strip() if cell is not None else "0"
    if text == "":
        text = "0"
    if stat in _TEXT_COLUMNS:
        return text
    return _to_number(text)


def get_frame(endpoint, features, player_table):
    """Build a player DataFrame from a parsed table body.

    Args:
        endpoint: Label used only in the progress messages.
        features: ``data-stat`` names to extract; one column per name.
        player_table: Object exposing ``find_all("tr")`` whose rows expose
            ``find(name, attrs)`` (a BeautifulSoup ``<tbody>``).

    Returns:
        A DataFrame with one row per table row that has a ``<th scope="row">``
        cell (other rows, e.g. repeated headers, are skipped) and one column
        per feature, present even when no rows match.
    """
    print(f"getting features [{len(features)}] from {endpoint}")
    print(f"features : {features}")

    columns = {stat: [] for stat in features}
    for row in player_table.find_all("tr"):
        if row.find("th", {"scope": "row"}) is None:
            continue
        for stat in features:
            columns[stat].append(_cell_value(row, stat))

    df_player = pd.DataFrame.from_dict(columns)
    print(len(df_player))
    return df_player


def get_frame_team(endpoint, features, team_table):
    """Build a squad DataFrame from a parsed table body.

    The ``team`` column is always first and is read from the row's
    ``<th data-stat="team">`` cell (falling back to the row-header cell).
    ``features`` is not modified, and it does not have to contain ``"team"``.

    Args:
        endpoint: Label used only in the progress messages.
        features: ``data-stat`` names from the table header.
        team_table: Object exposing ``find_all("tr")`` (a BeautifulSoup
            ``<tbody>``).

    Returns:
        A DataFrame with one row per squad row and columns ``team`` followed
        by every feature other than ``"team"``.
    """
    # Work on a filtered copy: `features.remove("team")` mutated the caller's
    # list and raised ValueError when "team" was absent.
    stat_columns = [stat for stat in features if stat != "team"]
    print(f"getting features [{len(stat_columns)}] from {endpoint}")
    print(f"features : {stat_columns}")

    columns = {"team": []}
    columns.update((stat, []) for stat in stat_columns)
    for row in team_table.find_all("tr"):
        row_header = row.find("th", {"scope": "row"})
        if row_header is None:
            continue
        name_cell = row.find("th", {"data-stat": "team"})
        if name_cell is None:
            name_cell = row_header
        columns["team"].append(name_cell.text.strip())
        for stat in stat_columns:
            columns[stat].append(_cell_value(row, stat))

    df_team = pd.DataFrame.from_dict(columns)
    print(len(df_team))
    return df_team


# --- Category helper wrappers ---
def frame_for_category(category, top, end):
    """Scrape one category page and return its player table as a DataFrame.

    The page URL is ``top + category + end``; the player table body and its
    feature names from ``get_tables`` are handed to ``get_frame``.
    """
    page_url = "{}{}{}".format(top, category, end)
    tbody, _team_tbody, player_features, _team_features = get_tables(
        page_url, "for", category
    )
    endpoint_label = category + end
    return get_frame(endpoint_label, player_features, tbody)


def frame_for_category_team(category, top, end, text):
    """Scrape one category page and return a squad table as a DataFrame.

    ``text`` is forwarded to ``get_tables`` to choose between the "for" and
    "vs" squad tables; the chosen body and its feature names are handed to
    ``get_frame_team``.
    """
    page_url = "{}{}{}".format(top, category, end)
    _player_tbody, squad_tbody, _player_features, squad_features = get_tables(
        page_url, text, category
    )
    endpoint_label = category + end
    return get_frame_team(endpoint_label, squad_features, squad_tbody)


def get_outfield_data(top, end):
    """Scrape every outfield category for one competition/season.

    Each category page is fetched via ``frame_for_category`` and the resulting
    frames are concatenated column-wise; columns whose name already appeared in
    an earlier frame are dropped (first occurrence wins).

    NOTE(review): frames are joined by row position, which assumes every
    category page lists the same players in the same order - confirm.

    Args:
        top: URL part before the category segment.
        end: URL part after the category segment.

    Returns:
        One DataFrame with a row per player and the union of all category
        columns.
    """
    categories = (
        "stats",
        "shooting",
        "passing",
        "passing_types",
        "gca",
        "defense",
        "possession",
        "misc",
    )
    frames = [frame_for_category(category, top, end) for category in categories]
    df = pd.concat(frames, axis=1)
    df = df.loc[:, ~df.columns.duplicated()]
    # `print(df.describe)` printed the bound method's repr, not the summary.
    # describe() raises on a frame with no columns, so skip it in that case.
    if len(df.columns):
        print(df.describe())
    return df


# Goalkeeping and advanced goalkeeping data for one competition/season
def get_keeper_data(top, end):
    """Return the keeper and advanced-keeper player tables joined column-wise.

    Both pages are fetched through ``frame_for_category``; columns whose name
    already appeared in the first frame are dropped from the result.
    """
    frames = [
        frame_for_category(category, top, end)
        for category in ("keepers", "keepersadv")
    ]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]


# Team-wise data across all categories for one competition/season
def get_team_data(top, end, text):
    """Return every squad-level category table joined column-wise.

    Each category is fetched through ``frame_for_category_team`` with the same
    ``text`` selector; columns whose name already appeared in an earlier frame
    are dropped from the result.
    """
    categories = (
        "stats",
        "keepers",
        "keepersadv",
        "shooting",
        "passing",
        "passing_types",
        "gca",
        "defense",
        "possession",
        "misc",
        "playingtime",
    )
    frames = [
        frame_for_category_team(category, top, end, text) for category in categories
    ]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]


In [26]:
# Outfield player data for a competition: one DataFrame per season.
#
# How to build the arguments: open the league's "Standard Stats" page, e.g.
#   https://fbref.com/en/comps/9/stats/Premier-League-Stats
# and split the URL around the category segment ("stats"):
#   top = everything before it -> "https://fbref.com/en/comps/9/"
#   end = everything after it  -> "/Premier-League-Stats"
# URLs for a specific season carry the season in both parts, as below.

PL_outfield_25_26 = get_outfield_data(
    top="https://fbref.com/en/comps/9/",
    end="/Premier-League-Stats",
)
PL_outfield_24_25 = get_outfield_data(
    top="https://fbref.com/en/comps/9/2024-2025/",
    end="/2024-2025-Premier-League-Stats",
)
PL_outfield_23_24 = get_outfield_data(
    top="https://fbref.com/en/comps/9/2023-2024/",
    end="/2023-2024-Premier-League-Stats",
)
PL_outfield_22_23 = get_outfield_data(
    top="https://fbref.com/en/comps/9/2022-2023/",
    end="/2022-2023-Premier-League-Stats",
)
PL_outfield_21_22 = get_outfield_data(
    top="https://fbref.com/en/comps/9/2021-2022/",
    end="/2021-2022-Premier-League-Stats",
)
PL_outfield_20_21 = get_outfield_data(
    top="https://fbref.com/en/comps/9/2020-2021/",
    end="/2020-2021-Premier-League-Stats",
)


getting table from : https://fbref.com/en/comps/9/stats/Premier-League-Stats
getting features [37] from stats/Premier-League-Stats
features : ['ranker', 'player', 'nationality', 'position', 'team', 'age', 'birth_year', 'games', 'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists', 'goals_assists', 'goals_pens', 'pens_made', 'pens_att', 'cards_yellow', 'cards_red', 'xg', 'npxg', 'xg_assist', 'npxg_xg_assist', 'progressive_carries', 'progressive_passes', 'progressive_passes_received', 'goals_per90', 'assists_per90', 'goals_assists_per90', 'goals_pens_per90', 'goals_assists_pens_per90', 'xg_per90', 'xg_assist_per90', 'xg_xg_assist_per90', 'npxg_per90', 'npxg_xg_assist_per90', 'matches']
497
getting table from : https://fbref.com/en/comps/9/shooting/Premier-League-Stats
getting features [26] from shooting/Premier-League-Stats
features : ['ranker', 'player', 'nationality', 'position', 'team', 'age', 'birth_year', 'minutes_90s', 'goals', 'shots', 'shots_on_target', 'shots_on_target_p

In [32]:
# Goalkeeper data for a competition: one DataFrame per season.
#
# How to build the arguments: open the league's "Standard Stats" page, e.g.
#   https://fbref.com/en/comps/9/stats/Premier-League-Stats
# and split the URL around the category segment ("stats"):
#   first argument  = everything before it -> "https://fbref.com/en/comps/9/"
#   second argument = everything after it  -> "/Premier-League-Stats"
# URLs for a specific season carry the season in both parts, as below.
#
# (A bare `PL_keeper_25_26` expression used to sit between the first two calls;
# only a cell's last expression is displayed, so it did nothing and was removed.)
PL_keeper_25_26 = get_keeper_data(
    "https://fbref.com/en/comps/9/", "/Premier-League-Stats"
)
PL_keeper_24_25 = get_keeper_data(
    "https://fbref.com/en/comps/9/2024-2025/", "/2024-2025-Premier-League-Stats"
)
PL_keeper_23_24 = get_keeper_data(
    "https://fbref.com/en/comps/9/2023-2024/", "/2023-2024-Premier-League-Stats"
)
PL_keeper_22_23 = get_keeper_data(
    "https://fbref.com/en/comps/9/2022-2023/", "/2022-2023-Premier-League-Stats"
)
PL_keeper_21_22 = get_keeper_data(
    "https://fbref.com/en/comps/9/2021-2022/", "/2021-2022-Premier-League-Stats"
)
PL_keeper_20_21 = get_keeper_data(
    "https://fbref.com/en/comps/9/2020-2021/", "/2020-2021-Premier-League-Stats"
)


getting table from : https://fbref.com/en/comps/9/keepers/Premier-League-Stats
getting features [27] from keepers/Premier-League-Stats
features : ['ranker', 'player', 'nationality', 'position', 'team', 'age', 'birth_year', 'gk_games', 'gk_games_starts', 'gk_minutes', 'minutes_90s', 'gk_goals_against', 'gk_goals_against_per90', 'gk_shots_on_target_against', 'gk_saves', 'gk_save_pct', 'gk_wins', 'gk_ties', 'gk_losses', 'gk_clean_sheets', 'gk_clean_sheets_pct', 'gk_pens_att', 'gk_pens_allowed', 'gk_pens_saved', 'gk_pens_missed', 'gk_pens_save_pct', 'matches']
30
getting table from : https://fbref.com/en/comps/9/keepersadv/Premier-League-Stats
getting features [34] from keepersadv/Premier-League-Stats
features : ['ranker', 'player', 'nationality', 'position', 'team', 'age', 'birth_year', 'minutes_90s', 'gk_goals_against', 'gk_pens_allowed', 'gk_free_kick_goals_against', 'gk_corner_kick_goals_against', 'gk_own_goals_against', 'gk_psxg', 'gk_psnpxg_per_shot_on_target_against', 'gk_psxg_net',

In [35]:
# Squad-level "for" data for a competition: one DataFrame per season.
#
# How to build the arguments: open the league's "Standard Stats" page, e.g.
#   https://fbref.com/en/comps/9/stats/Premier-League-Stats
# and split the URL around the category segment ("stats"):
#   top  = everything before it -> "https://fbref.com/en/comps/9/"
#   end  = everything after it  -> "/Premier-League-Stats"
#   text = "for" selects the squads' own stats
# Example: get_team_data('https://fbref.com/en/comps/9/', '/Premier-League-Stats', 'for')

PL_team_25_26 = get_team_data(
    top="https://fbref.com/en/comps/9/",
    end="/Premier-League-Stats",
    text="for",
)
PL_team_24_25 = get_team_data(
    top="https://fbref.com/en/comps/9/2024-2025/",
    end="/2024-2025-Premier-League-Stats",
    text="for",
)
PL_team_23_24 = get_team_data(
    top="https://fbref.com/en/comps/9/2023-2024/",
    end="/2023-2024-Premier-League-Stats",
    text="for",
)
PL_team_22_23 = get_team_data(
    top="https://fbref.com/en/comps/9/2022-2023/",
    end="/2022-2023-Premier-League-Stats",
    text="for",
)
PL_team_21_22 = get_team_data(
    top="https://fbref.com/en/comps/9/2021-2022/",
    end="/2021-2022-Premier-League-Stats",
    text="for",
)
PL_team_20_21 = get_team_data(
    top="https://fbref.com/en/comps/9/2020-2021/",
    end="/2020-2021-Premier-League-Stats",
    text="for",
)


getting table from : https://fbref.com/en/comps/9/stats/Premier-League-Stats
getting features [31] from stats/Premier-League-Stats
features : ['players_used', 'avg_age', 'possession', 'games', 'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists', 'goals_assists', 'goals_pens', 'pens_made', 'pens_att', 'cards_yellow', 'cards_red', 'xg', 'npxg', 'xg_assist', 'npxg_xg_assist', 'progressive_carries', 'progressive_passes', 'goals_per90', 'assists_per90', 'goals_assists_per90', 'goals_pens_per90', 'goals_assists_pens_per90', 'xg_per90', 'xg_assist_per90', 'xg_xg_assist_per90', 'npxg_per90', 'npxg_xg_assist_per90']
20
getting table from : https://fbref.com/en/comps/9/keepers/Premier-League-Stats
getting features [20] from keepers/Premier-League-Stats
features : ['players_used', 'gk_games', 'gk_games_starts', 'gk_minutes', 'minutes_90s', 'gk_goals_against', 'gk_goals_against_per90', 'gk_shots_on_target_against', 'gk_saves', 'gk_save_pct', 'gk_wins', 'gk_ties', 'gk_losses', 'gk_clean_sh

In [36]:
# Write every scraped frame to DATA_PATH/<folder>/<folder>_<season>.csv.
# Each list below must be in the same season order as `years`.
data_categories = {
    "PL_outfield": [
        PL_outfield_25_26,
        PL_outfield_24_25,
        PL_outfield_23_24,
        PL_outfield_22_23,
        PL_outfield_21_22,
        PL_outfield_20_21,
    ],
    "PL_keeper": [
        PL_keeper_25_26,
        PL_keeper_24_25,
        PL_keeper_23_24,
        PL_keeper_22_23,
        PL_keeper_21_22,
        PL_keeper_20_21,
    ],
    "PL_team": [
        PL_team_25_26,
        PL_team_24_25,
        PL_team_23_24,
        PL_team_22_23,
        PL_team_21_22,
        PL_team_20_21,
    ],
}

years = ["25_26", "24_25", "23_24", "22_23", "21_22", "20_21"]

# zip() stops at the shorter input, so a list that is out of step with `years`
# would silently drop seasons.  Check every list before writing anything.
for folder, df_list in data_categories.items():
    if len(df_list) != len(years):
        raise ValueError(
            f"{folder} has {len(df_list)} frames but {len(years)} seasons are listed"
        )

for folder, df_list in data_categories.items():
    # Ensure folder exists
    target_dir = os.path.join(DATA_PATH, folder)
    os.makedirs(target_dir, exist_ok=True)

    for df, year in zip(df_list, years):
        file_name = f"{folder}_{year}.csv"
        save_path = os.path.join(target_dir, file_name)
        df.to_csv(save_path, index=False)
        print(f"Saved: {save_path}")

Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_25_26.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_24_25.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_23_24.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_22_23.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_21_22.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_outfield/PL_outfield_20_21.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_keeper/PL_keeper_25_26.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_keeper/PL_keeper_24_25.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_keeper/PL_keeper_23_24.csv
Saved: /home/kheaw/projects/special_scoop/notebooks/scrape/data/PL_keeper/PL_keeper_22_23.csv
Saved: /home/kheaw/projects/special_