## This file will focus strictly on scraping data from the website fbref.com


In [55]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [56]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import random


### Defining functions


`get_data_from_txt()` takes a file path, reads the HTML content, and returns a dataframe

In [57]:
def get_data_from_txt(file_path):
    # Read the HTML content from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the table
    table = soup.find('table')
    
    if not table:
        raise ValueError("No table found in the HTML content")
    
    # Extract column names from thead
    thead = table.find('thead')
    if thead:
        column_headers = thead.find_all('th')
        column_names = [th.get('aria-label', th.text.strip()) for th in column_headers]
    else:
        column_names = []
    
    # Extract data from tbody
    tbody = table.find('tbody')
    data = []
    if tbody:
        for row in tbody.find_all('tr'):
            row_data = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            data.append(row_data)
    
    # Create dataframe
    df = pd.DataFrame(data)
    
    # Assign column names, truncating or padding as necessary
    if len(df.columns) > len(column_names):
        # If there are more columns in the data than names, use the first len(column_names) columns
        df = df.iloc[:, :len(column_names)]
    elif len(df.columns) < len(column_names):
        # If there are fewer columns in the data than names, truncate the column names
        column_names = column_names[:len(df.columns)]
    
    df.columns = column_names
    
    return df
    
    

### Getting Team data

`get_squad_stats()` scrapes fbref link html for standard squad stats, puts it in txt file, and returns txt file address

In [58]:
def get_squad_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/stats/squads/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_teams_standard_for')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/squad_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved squad stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")
            
    print("All seasons processed")
    return f'data_html/squad_stats_{season}.txt'

`get_squad_wages()` scrapes fbref link html for squad wages, puts it in txt file, and returns txt file address

In [59]:
def get_squad_wages():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/wages/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_squad_wages')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/squad_wages_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved squad wages for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/squad_wages_{season}.txt'

### Getting Player data


`get_standard_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [60]:
def get_standard_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/stats/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
            #url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
        #else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_standard')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/standard_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved standard stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/standard_stats_{season}.txt'
    

`get_defensive_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [61]:
def get_defensive_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/defense/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_defense')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/defensive_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved defensive stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/defensive_stats_{season}.txt'


`get_passing_stats()` scrapes fbref link html for defensive stats, puts it in txt file, and returns txt file address

In [62]:
def get_passing_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/passing/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_passing')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/passing_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved passing stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/passing_stats_{season}.txt'


`get_goalkeeping_stats()` scrapes fbref link html for goalkeeper stats, puts it in txt file, and returns txt file address

In [63]:
def get_goalkeeping_stats():
    session = requests.Session()
    
    retry = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    
    base_url = "https://fbref.com/en/comps/Big5/{}/keepers/players/{}-Big-5-European-Leagues-Stats"
    seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
    
    for season in seasons:
        #if season == "2023-2024":
           # url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
       # else:
        url = base_url.format(season, season)
        
        try:
            response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the div containing the table
            table_div = soup.find('div', id='div_stats_keeper')
            
            if table_div:
                # Save the HTML content of the div to a file
                with open(f'data_html/goalkeeping_stats_{season}.txt', 'w', encoding='utf-8') as file:
                    file.write(str(table_div))
                print(f"Saved goalkeeping stats for {season}")
            else:
                print(f"Table div not found for {season}")
            
            # Introduce a random delay between 5 and 10 seconds
            time.sleep(random.uniform(5, 10))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for season {season}: {e}")

    print("All seasons processed")
    return f'data_html/goalkeeping_stats_{season}.txt'


### Get all data

`get_all_seasons()` loops through all the txt files, extracts the data, and adds it to a dataframe for all seasons

In [64]:
def get_all_seasons():
    # List of seasons to loop through
    seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']
    # dataframes to store squad stats, and wages
    seasons_squads_stats = []
    seasons_squads_wages = []
    seasons_standard_stats = []
    seasons_defensive_stats = []
    seasons_passing_stats = []
    seasons_goalkeeping_stats = []
    
    # Getting squads stats data
    get_squad_stats()
    for season in seasons:
        file_path = f'data_html/squad_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_squads_stats.append(df)
        
    # Getting squad wages data
    get_squad_wages()
    for season in seasons:
        file_path = f'data_html/squad_wages_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_squads_wages.append(df)
    
    # Getting standard player stats
    get_standard_stats()
    for season in seasons:
        file_path = f'data_html/standard_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_standard_stats.append(df)
        
    # Getting defensive player stats
    get_defensive_stats()
    for season in seasons:
        file_path = f'data_html/defensive_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_defensive_stats.append(df)
    
    # Getting passing player stats
    get_passing_stats()
    for season in seasons:
        file_path = f'data_html/passing_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_passing_stats.append(df)
        
    # Getting goalkeeping player stats
    get_goalkeeping_stats()
    for season in seasons:
        file_path = f'data_html/goalkeeping_stats_{season}.txt'
        df = get_data_from_txt(file_path)
        seasons_goalkeeping_stats.append(df)
    
    
    squads_stats = pd.concat(seasons_squads_stats, ignore_index=True)
    squads_wages = pd.concat(seasons_squads_wages, ignore_index=True)
    standard_stats = pd.concat(seasons_standard_stats, ignore_index=True)
    defensive_stats = pd.concat(seasons_defensive_stats, ignore_index=True)
    passing_stats = pd.concat(seasons_passing_stats, ignore_index=True)
    goalkeeping_stats = pd.concat(seasons_goalkeeping_stats, ignore_index=True)
    
    return squads_stats, squads_wages, standard_stats, defensive_stats, passing_stats, goalkeeping_stats


    

In [71]:
def scrape_all_data():
    # get squads stats, squad wages, standard player stats, defensive stats, passing stats, and goalkeeping for all seasons
    squads_stats, squads_wages, standard_stats, defensive_stats, passing_stats, goalkeeping_stats = get_all_seasons()

    # output paths
    stats_output_path = 'uncleaned_data_csv\seasons_stats.csv'
    wages_output_path = 'uncleaned_data_csv\seasons_wages.csv'
    standard_output_path = 'uncleaned_data_csv\standard.csv'
    defending_output_path = 'uncleaned_data_csv\defending.csv'
    passing_output_path = 'uncleaned_data_csv\passing.csv'
    goalkeeping_output_path = 'uncleaned_data_csv\goalkeeping.csv'
    
    # To csv
    squads_stats.to_csv(stats_output_path, index=False)
    squads_wages.to_csv(wages_output_path, index=False)
    standard_stats.to_csv(standard_output_path, index=False)
    defensive_stats.to_csv(defending_output_path, index=False)
    passing_stats.to_csv(passing_output_path, index=False)
    goalkeeping_stats.to_csv(goalkeeping_output_path, index=False)
    
    # print them out
    print(squads_stats.head())
    print(squads_wages.head())
    print(standard_stats.head())
    print(defensive_stats.head())
    print(passing_stats.head())
    print(goalkeeping_stats.head())
    

  stats_output_path = 'uncleaned_data_csv\seasons_stats.csv'
  wages_output_path = 'uncleaned_data_csv\seasons_wages.csv'
  standard_output_path = 'uncleaned_data_csv\standard.csv'
  defending_output_path = 'uncleaned_data_csv\defending.csv'
  passing_output_path = 'uncleaned_data_csv\passing.csv'
  goalkeeping_output_path = 'uncleaned_data_csv\goalkeeping.csv'


In [66]:
scrape_all_data()

Saved squad stats for 2017-2018
Saved squad stats for 2018-2019
Saved squad stats for 2019-2020
Saved squad stats for 2020-2021
Saved squad stats for 2021-2022
Saved squad stats for 2022-2023
Saved squad stats for 2023-2024
All seasons processed
Saved squad wages for 2017-2018
Saved squad wages for 2018-2019
Saved squad wages for 2019-2020
Saved squad wages for 2020-2021
Saved squad wages for 2021-2022
Saved squad wages for 2022-2023
Saved squad wages for 2023-2024
All seasons processed
Saved standard stats for 2017-2018
Saved standard stats for 2018-2019
Saved standard stats for 2019-2020
Saved standard stats for 2020-2021
Saved standard stats for 2021-2022
Saved standard stats for 2022-2023
Saved standard stats for 2023-2024
All seasons processed
Saved defensive stats for 2017-2018
Saved defensive stats for 2018-2019
Saved defensive stats for 2019-2020
Saved defensive stats for 2020-2021
Saved defensive stats for 2021-2022
Saved defensive stats for 2022-2023
Saved defensive stats for