This notebook scrapes football data from fbref.com.

Much of the scraping code is taken from this repository: https://github.com/chmartin/FBref_EPL.

Run the first cell, then run the remaining cells in order to get data for whichever leagues you want.

All data is courtesy of StatsBomb via FBref. Find me on Twitter @pathaleee!


In [1]:
#@title
import csv
import getopt
import os
import re
import sys
import time
import traceback
from functools import wraps

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError


#Functions to get the data in a dataframe using BeautifulSoup

In [19]:
# NOTE: the most complete data seems to appear after 2018
# (e.g. Distance for shooting in the Championship).

# Country code -> {FBref competition id: competition name as used in FBref URLs}.
# Insertion order matters: SeasonData treats the first entry of each country as
# division 1 and the second entry as division 2.
COMPETITION_MAPPING = {
    # England is commented out (EPL: 9, Championship: 10):
    # 'ENG': {9: 'Premier-League', 10: 'Championship'},
    'SPA': {12: 'La-Liga', 17: 'Segunda-Division'},
    'GER': {20: 'Bundesliga', 33: '2-Bundesliga'},
    'ITA': {11: 'Serie-A', 18: 'Serie-B'},
    'FRA': {13: 'Ligue-1', 60: 'Ligue-2'},
}

# How many times a request-making SeasonData method is attempted when it
# raises an HTTP error.
MAX_RETRIES = 3

In [22]:
class SeasonData:
    """Team statistics for one season across a country's two mapped FBref divisions.

    The combined frame keeps division-1 teams that were not relegated plus
    division-2 teams that were promoted; it is written to a
    ``{country}{year}_teams_for_{year+1}.csv`` file by save_to_csv().

    Attributes:
        country: key into COMPETITION_MAPPING (e.g. 'SPA').
        year: calendar year in which the season starts.
        comp_ids: competition ids of the country, in mapping order
            (index 0 is used as division 1, index 1 as division 2).
        urls: per-division dict with 'pre' (season page URL) and 'suf'
            (suffix placed after a category to form a stats page URL).
        table_urls: the 'pre' URL of each division.
        promoted_teams: names collected by get_promoted_teams().
        relegated_teams: names collected by get_relegated_teams().
        df_team: DataFrame built by get_team_data(); None until then.
    """

    # Stat categories; each one is a path segment of a stats page URL.
    categories = ['stats', 'keepers', 'keepersadv', 'shooting', 'passing', 'passing_types', 'gca', 'defense', 'possession', 'misc']

    # Seconds to pause after each category so requests are spread out.
    request_delay_seconds = 5
    # Wait before the first retry of a failed request; doubled on each further retry.
    retry_delay_seconds = 5
    # Upper bound in seconds for one HTTP request, so a stalled connection cannot hang the run.
    request_timeout_seconds = 30

    def __init__(self, year, country):
        """Build the URLs for ``country``'s two divisions in the season starting in ``year``.

        Args:
            year: int, start year of the season (the season is ``year``-``year+1``).
            country: key of COMPETITION_MAPPING.

        Raises:
            KeyError: if ``country`` is not in COMPETITION_MAPPING.
            IndexError: if the country has fewer than two competitions mapped.
        """
        self.country = country
        self.year = year
        competitions = COMPETITION_MAPPING[country]
        self.comp_ids = list(competitions.keys())

        season = f"{year}-{year+1}"
        self.urls = {}
        for division in (1, 2):
            comp_id = self.comp_ids[division - 1]
            self.urls[division] = {
                'pre': f"https://fbref.com/en/comps/{comp_id}/{season}/",  # season table url
                'suf': f"/{season}-{competitions[comp_id]}-Stats",  # get data url
            }

        self.table_urls = [div_urls['pre'] for div_urls in self.urls.values()]
        self.promoted_teams = []
        self.relegated_teams = []
        self.df_team = None

    def catch_too_many_requests(func):
        """Decorator that retries ``func`` when it raises HTTPError.

        ``func`` is attempted up to MAX_RETRIES times (at least once). Between
        attempts the wrapper sleeps ``retry_delay_seconds``, doubling the wait
        each time. If the final attempt also fails, the HTTPError is re-raised
        so callers never continue with a silently missing result.

        Used as a plain function inside the class body, so it takes the
        decorated function rather than ``self``.
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts = max(1, MAX_RETRIES)
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except HTTPError as http_err:
                    # The response can be absent on an HTTPError, so read it defensively.
                    response = getattr(http_err, 'response', None)
                    if getattr(response, 'status_code', None) == 429:
                        print("Rate limit exceeded.")
                    else:
                        print(f"HTTP error occurred: {http_err}")
                    if attempt == attempts:
                        raise
                    delay = SeasonData.retry_delay_seconds * 2 ** (attempt - 1)
                    print(f"Retrying in {delay} seconds (attempt {attempt + 1} of {attempts})")
                    time.sleep(delay)
        return wrapper

    @staticmethod
    def _fetch_first_table_body(url):
        """Download ``url`` and return the first ``<tbody>`` element of the page.

        Raises:
            HTTPError: if the response status signals an error.
            IndexError: if the page contains no ``<tbody>``.
        """
        res = requests.get(url, timeout=SeasonData.request_timeout_seconds)
        res.raise_for_status()
        # Strip the comment delimiters to get around comments breaking the parsing.
        html = re.sub("<!--|-->", "", res.text)
        soup = BeautifulSoup(html, 'lxml')
        all_tables = soup.find_all("tbody")
        if not all_tables:
            raise IndexError(f"No <tbody> found at {url}")
        return all_tables[0]

    @catch_too_many_requests
    def get_promoted_teams(self):
        """Fill ``promoted_teams`` from the division-2 season table.

        The list is rebuilt on every call, so re-running does not duplicate names.
        """
        results_table = self._fetch_first_table_body(self.urls[2]['pre'])
        self.promoted_teams = []
        for row in results_table.find_all('tr'):
            self.find_special_teams(row, 'promote')

    @catch_too_many_requests
    def get_relegated_teams(self):
        """Fill ``relegated_teams`` from the division-1 season table.

        The list is rebuilt on every call, so re-running does not duplicate names.
        """
        results_table = self._fetch_first_table_body(self.urls[1]['pre'])
        self.relegated_teams = []
        for row in results_table.find_all('tr'):
            self.find_special_teams(row, 'relegate')

    def find_special_teams(self, row, class_name): # promoted or relegated
        """Record the team of ``row`` if the row is flagged with ``class_name``.

        A row counts as flagged when it has a ``th`` whose class list contains
        ``class_name``. 'promote' appends to ``promoted_teams``; any other
        value appends to ``relegated_teams``. A flagged row without a usable
        team name only prints a warning.
        """
        marker = row.find(
            lambda tag: tag.name == 'th' and tag.has_attr('class') and class_name in tag['class']
        )
        if marker is None:
            return

        team_td = row.find('td', {'data-stat': 'team'})
        link = team_td.find('a') if team_td is not None else None
        team_name = link.text.strip() if link is not None else ''
        if not team_name:
            print(f"{class_name}d team found but was not appended to {class_name}d teams array")
            return

        if class_name == 'promote':
            self.promoted_teams.append(team_name)
        else:
            self.relegated_teams.append(team_name)

    @staticmethod
    @catch_too_many_requests
    def get_team_table(url):
        """Return the first ``<tbody>`` of the page at ``url`` (with HTTP-error retries)."""
        return SeasonData._fetch_first_table_body(url)

    def get_frame_team(self, team_table, url):
        """Convert a squad table into a DataFrame with one row per kept team.

        Args:
            team_table: element whose ``tr`` rows hold a ``th`` with
                ``data-stat="team"`` and ``td`` cells carrying ``data-stat``.
            url: the division's 'pre' URL; decides which filter applies.
                For division 2 only promoted teams are kept; for division 1
                relegated teams are dropped.

        Returns:
            DataFrame with a 'squad' column followed by one column per
            ``data-stat``. Empty cells become missing values.
        """
        keep_only_promoted = url == self.urls[2]['pre']
        drop_relegated = url == self.urls[1]['pre']

        records = []
        for row in team_table.find_all('tr'):
            team_header = row.find('th', {"data-stat": "team"})
            if row.find('th', {"scope": "row"}) is None or team_header is None:
                continue  # header/spacer rows carry no team
            name = team_header.text.strip()
            if keep_only_promoted and name not in self.promoted_teams:
                continue
            if drop_relegated and name in self.relegated_teams:
                continue

            record = {'squad': name}
            for cell in row.find_all('td'):
                text = cell.text.strip()
                # setdefault keeps the first cell if a data-stat repeats within a row
                record.setdefault(cell.get('data-stat'), text if text != '' else None)
            records.append(record)

        # Building from records tolerates rows that lack some columns.
        return pd.DataFrame(records)

    def frame_for_category_team(self, category):
        """Return one frame holding the kept teams of both divisions for ``category``."""
        division_frames = []
        for div_urls in self.urls.values(): # first div url, second div url
            team_table = self.get_team_table(div_urls['pre'] + category + div_urls['suf'])
            division_frames.append(self.get_frame_team(team_table, div_urls['pre']))

        # A fresh index avoids duplicate labels, which the column-wise concat
        # in get_team_data() would otherwise have to align on.
        return pd.concat(division_frames, ignore_index=True)

    #Function to get team-wise data across all categories as mentioned above
    def get_team_data(self):
        """Scrape every category and store the combined frame in ``df_team``.

        Category frames are joined side by side, columns whose name repeats
        are dropped (first occurrence wins), and 'season_start_year' /
        'season_end_year' columns are added.
        """
        category_frames = []
        for category in self.categories:
            category_frames.append(self.frame_for_category_team(category))
            time.sleep(self.request_delay_seconds)
        combined = pd.concat(category_frames, axis=1)
        self.df_team = combined.loc[:, ~combined.columns.duplicated()].copy()
        self.df_team['season_start_year'] = self.year
        self.df_team['season_end_year'] = self.year + 1

    def save_to_csv(self, output_dir="./data/teams/raw"):
        """Write ``df_team`` to ``{output_dir}/{country}{year}_teams_for_{year+1}.csv``.

        The directory is created when missing. Failures are reported with
        print() rather than raised, so a batch run keeps going.
        """
        if self.df_team is None:
            print(f"Nothing to save for {self.country} {self.year}: run get_team_data() first")
            return
        filename = f"{self.country}{self.year}_teams_for_{self.year+1}.csv"
        try:
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            self.df_team.to_csv(os.path.join(output_dir, filename), index=False)
            print(f"Saved successfully ({self.year})")
        except Exception as e:
            print(e)

In [23]:
# This cell scrapes the data for all teams of every country in COMPETITION_MAPPING
# and writes one CSV per country and season.
#
# To add a league, open its 'Standard stats' page on FBref. For the Premier League
# the link is https://fbref.com/en/comps/9/stats/Premier-League-Stats: the number
# after /comps/ is the competition id and the last segment without '-Stats' is the
# competition name. Add both to COMPETITION_MAPPING.
#
# NOTE: may need to add delays between requests to prevent 429 (getting blocked by fbref)

# Start years of the first and last season to scrape (both inclusive).
FIRST_SEASON_START_YEAR = 2018
LAST_SEASON_START_YEAR = 2022

for country in COMPETITION_MAPPING:
    for year in range(FIRST_SEASON_START_YEAR, LAST_SEASON_START_YEAR + 1):

        season_data = SeasonData(year, country)

        # Promotion/relegation must be known before the stats tables are filtered.
        season_data.get_promoted_teams()
        season_data.get_relegated_teams()

        season_data.get_team_data() # sets season_data.df_team

        season_data.save_to_csv()

Saved successfully (2018)
Saved successfully (2019)
Saved successfully (2020)
Saved successfully (2021)
Saved successfully (2022)
Saved successfully (2018)
Saved successfully (2019)
Saved successfully (2020)
Saved successfully (2021)
Saved successfully (2022)
Saved successfully (2018)
Saved successfully (2019)
Saved successfully (2020)
Saved successfully (2021)
Saved successfully (2022)
Saved successfully (2018)
Saved successfully (2019)
Saved successfully (2020)
Saved successfully (2021)
Saved successfully (2022)
