## This file will focus strictly on scraping data from the website fbref.com


In [7]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [8]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import random


### Defining functions


`get_data_from_txt()` takes a file path, reads the HTML content, and returns a dataframe

In [9]:
def get_data_from_txt(file_path):
    # Read the HTML content from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the table
    table = soup.find('table')
    
    if not table:
        raise ValueError("No table found in the HTML content")
    
    # Extract column names from thead
    thead = table.find('thead')
    if thead:
        column_headers = thead.find_all('th')
        column_names = [th.get('aria-label', th.text.strip()) for th in column_headers]
    else:
        column_names = []
    
    # Extract data from tbody
    tbody = table.find('tbody')
    data = []
    if tbody:
        for row in tbody.find_all('tr'):
            row_data = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            data.append(row_data)
    
    # Create dataframe
    df = pd.DataFrame(data)
    
    # Assign column names, truncating or padding as necessary
    if len(df.columns) > len(column_names):
        # If there are more columns in the data than names, use the first len(column_names) columns
        df = df.iloc[:, :len(column_names)]
    elif len(df.columns) < len(column_names):
        # If there are fewer columns in the data than names, truncate the column names
        column_names = column_names[:len(df.columns)]
    
    df.columns = column_names
    
    return df
    
    

### Getting Team data

`get_squad_stats()` scrapes fbref link html for standard squad stats, puts it in txt file, and returns txt file address

In [10]:
def get_squad_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    # 🧢 Pretend to be a real Chrome browser
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/124.0.0.0 Safari/537.36'
        )
    }
    
    base_url = "https://fbref.com/en/comps/Big5/{}/stats/squads/{}-Big-5-European-Leagues-Stats"
    seasons = ["2022-2023"]  # Start small to test

    for season in seasons:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers=headers)
            response.raise_for_status()  # Raise error for 403s or other HTTP issues

            soup = BeautifulSoup(response.content, 'html.parser')
            table_div = soup.find('div', id='div_stats_teams_standard_for')
            
            if table_div:
                with open(f'data_html/squad_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved squad stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            time.sleep(random.uniform(5, 10))  # 💤 Nap time

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")
            
    print("✅ All seasons processed")

In [11]:
get_squad_stats()

Failed to fetch data for season 2022-2023: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2022-2023/stats/squads/2022-2023-Big-5-European-Leagues-Stats
✅ All seasons processed


`get_squad_wages()` scrapes fbref link html for squad wages, puts it in txt file, and returns txt file address

In [12]:
def get_squad_wages():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/wages/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_squad_wages')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/squad_wages_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved squad wages for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/squad_wages_{season}.txt'

### Getting Player data


`get_standard_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [13]:
def get_standard_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/stats/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
            #url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
        #else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_standard')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/standard_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved standard stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/standard_stats_{season}.txt'
    

`get_defensive_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [14]:
def get_defensive_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/defense/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # /else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_defense')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/defensive_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved defensive stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/defensive_stats_{season}.txt'


`get_passing_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [15]:
def get_passing_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/passing/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_passing')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/passing_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved passing stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/passing_stats_{season}.txt'


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os

def get_shooting_stats(seasons=None):
    if seasons is None:
        seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]

    options = Options()
    options.add_argument("--headless")  # Run in headless mode (no browser UI)
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)

    os.makedirs("data_html", exist_ok=True)

    for season in seasons:
        url = f"https://fbref.com/en/comps/Big5/{season}/shooting/players/{season}-Big-5-European-Leagues-Stats"
        print(f"Fetching: {url}")
        driver.get(url)
        try:
            # Wait up to 20 seconds for the table div to appear
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_shooting"))
            )
            table_div = driver.find_element(By.ID, "div_stats_shooting")
            html = table_div.get_attribute('outerHTML')
            with open(f"data_html/shooting_stats_{season}_selenium.txt", "w", encoding="utf-8") as f:
                f.write(html)
            print(f"Saved shooting stats for {season}")
        except Exception as e:
            print(f"Could not get table for {season}: {e}")
        # Space out requests to avoid being flagged
        time.sleep(8)

    driver.quit()
    print("All requested seasons processed.")

# Example usage: only run this for shooting stats
#get_shooting_stats()  # Or add more seasons as needed


Fetching: https://fbref.com/en/comps/Big5/2017-2018/shooting/players/2017-2018-Big-5-European-Leagues-Stats
Saved shooting stats for 2017-2018
Fetching: https://fbref.com/en/comps/Big5/2018-2019/shooting/players/2018-2019-Big-5-European-Leagues-Stats
Saved shooting stats for 2018-2019
Fetching: https://fbref.com/en/comps/Big5/2019-2020/shooting/players/2019-2020-Big-5-European-Leagues-Stats
Saved shooting stats for 2019-2020
Fetching: https://fbref.com/en/comps/Big5/2020-2021/shooting/players/2020-2021-Big-5-European-Leagues-Stats
Saved shooting stats for 2020-2021
Fetching: https://fbref.com/en/comps/Big5/2021-2022/shooting/players/2021-2022-Big-5-European-Leagues-Stats
Saved shooting stats for 2021-2022
Fetching: https://fbref.com/en/comps/Big5/2022-2023/shooting/players/2022-2023-Big-5-European-Leagues-Stats
Saved shooting stats for 2022-2023
Fetching: https://fbref.com/en/comps/Big5/2023-2024/shooting/players/2023-2024-Big-5-European-Leagues-Stats
Saved shooting stats for 2023-2024

`get_goalkeeping_stats()` scrapes fbref link html for goalkeeper stats, puts it in txt file, and returns txt file address

In [19]:
def get_goalkeeping_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/keepers/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_keeper')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/goalkeeping_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved goalkeeping stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/goalkeeping_stats_{season}.txt'


### Get all data

`get_all_seasons()` loops through all the txt files, extracts the data, and adds it to a dataframe for all seasons

In [20]:
def get_all_seasons():
    # List of seasons to loop through
    seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']
    # dataframes to store squad stats, and wages
    seasons_squads_stats = []
    seasons_shooting_stats = []
    seasons_squads_wages = []
    seasons_standard_stats = []
    seasons_defensive_stats = []
    seasons_passing_stats = []
    seasons_goalkeeping_stats = []
    
    # Getting squads stats data
    get_squad_stats()
    for season in seasons:
        file_path = f'data_html/squad_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_squads_stats.append(df)
        
    # Getting squad wages data
    get_squad_wages()
    for season in seasons:
        file_path = f'data_html/squad_wages_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_squads_wages.append(df)
    
    # Getting standard player stats
    get_standard_stats()
    for season in seasons:
        file_path = f'data_html/standard_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_standard_stats.append(df)
        
    # Getting defensive player stats
    get_defensive_stats()
    for season in seasons:
        file_path = f'data_html/defensive_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_defensive_stats.append(df)
    
    # Getting passing player stats
    get_passing_stats()
    for season in seasons:
        file_path = f'data_html/passing_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_passing_stats.append(df)
        
    get_shooting_stats()
    for season in seasons:
        file_path = f'data_html/shooting_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_shooting_stats.append(df)
        
    # Getting goalkeeping player stats
    get_goalkeeping_stats()
    for season in seasons:
        file_path = f'data_html/goalkeeping_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        df["Season"] = season
        seasons_goalkeeping_stats.append(df)
    
    
    squads_stats = pd.concat(seasons_squads_stats, ignore_index=True)
    squads_wages = pd.concat(seasons_squads_wages, ignore_index=True)
    standard_stats = pd.concat(seasons_standard_stats, ignore_index=True)
    defensive_stats = pd.concat(seasons_defensive_stats, ignore_index=True)
    passing_stats = pd.concat(seasons_passing_stats, ignore_index=True)
    shooting_stats = pd.concat(seasons_shooting_stats, ignore_index=True)
    goalkeeping_stats = pd.concat(seasons_goalkeeping_stats, ignore_index=True)
    
    return squads_stats, squads_wages, standard_stats, defensive_stats, passing_stats, shooting_stats, goalkeeping_stats


    

In [21]:
def scrape_all_data():
    # get squads stats, squad wages, standard player stats, defensive stats, passing stats, and goalkeeping for all seasons
    squads_stats, squads_wages, standard_stats, defensive_stats, passing_stats, shooting_stats, goalkeeping_stats = get_all_seasons()

    # output paths
    stats_output_path = 'uncleaned_data_csv\seasons_stats.csv'
    wages_output_path = 'uncleaned_data_csv\seasons_wages.csv'
    standard_output_path = 'uncleaned_data_csv\standard.csv'
    defending_output_path = 'uncleaned_data_csv\defending.csv'
    shooting_output_path = 'uncleaned_data_csv\shooting.csv'
    passing_output_path = 'uncleaned_data_csv\passing.csv'
    
    # To csv
    squads_stats.to_csv(stats_output_path, index=False)
    squads_wages.to_csv(wages_output_path, index=False)
    standard_stats.to_csv(standard_output_path, index=False)
    defensive_stats.to_csv(defending_output_path, index=False)
    passing_stats.to_csv(passing_output_path, index=False)
    shooting_stats.to_csv(shooting_output_path, index=False)
    
    # print them out
    print(squads_stats.head())
    print(squads_wages.head())
    print(standard_stats.head())
    print(shooting_stats.head())
    print(defensive_stats.head())
    print(passing_stats.head())
    print(goalkeeping_stats.head())
    

In [22]:
scrape_all_data()

Failed to fetch data for season 2022-2023: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2022-2023/stats/squads/2022-2023-Big-5-European-Leagues-Stats
✅ All seasons processed
Failed to fetch data for season 2017-2018: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2017-2018/wages/2017-2018-Big-5-European-Leagues-Stats
Failed to fetch data for season 2018-2019: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2018-2019/wages/2018-2019-Big-5-European-Leagues-Stats
Failed to fetch data for season 2019-2020: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2019-2020/wages/2019-2020-Big-5-European-Leagues-Stats
Failed to fetch data for season 2020-2021: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2020-2021/wages/2020-2021-Big-5-European-Leagues-Stats
Failed to fetch data for season 2021-2022: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/Big5/2021-2022/wages/2021-2

FileNotFoundError: [Errno 2] No such file or directory: 'data_html/shooting_stats_2017-2018.txt'

#Sanity Check#

In [None]:
df = pd.read_csv('uncleaned_data_csv/defending.csv')
print(df['Season'].value_counts())

Season
2021-2022    3036
2022-2023    3004
2023-2024    2966
2020-2021    2934
2019-2020    2841
2017-2018    2799
2018-2019    2762
Name: count, dtype: int64


In [None]:
df = pd.read_csv('uncleaned_data_csv/passing.csv')
df.groupby('Season').size()

Season
2017-2018    2799
2018-2019    2762
2019-2020    2841
2020-2021    2934
2021-2022    3036
2022-2023    3004
2023-2024    2966
dtype: int64

In [None]:
df.isna().mean().sort_values(ascending=False)

Progressive Passing Distance    0.057025
Passes Attempted                0.022417
Year of birth                   0.015289
Nation                          0.006243
Unnamed: 2                      0.000442
90s Played                      0.000295
Age                             0.000295
Pass Completion % (Short)       0.000295
Passes Attempted (Short)        0.000295
Pass Completion % (Medium)      0.000295
Total Passing Distance          0.000295
Pass Completion %               0.000295
Passes Completed                0.000295
Passes Completed (Long)         0.000295
Squad                           0.000295
Competition                     0.000295
Passes Attempted (Medium)       0.000295
Position                        0.000295
Player                          0.000295
Rk                              0.000295
Passes Attempted (Long)         0.000295
Pass Completion % (Long)        0.000295
Passes Completed (Medium)       0.000295
Unnamed: 7                      0.000246
Unnamed: 6      