In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import io
from time import sleep
import datetime
from functools import reduce



In [2]:
# First season to scrape, identified by the year the season begins in.
# Author's notes (kept from the original comment):
#   - seasons earlier than 2017 don't have some of the required columns and tables
#   - "earlier than 2019 raises an error at 2019 1" -- NOTE(review): meaning unclear, TODO confirm
yearStart = 2010
# Last season to scrape (inclusive: the scraping loop iterates range(yearStart, yearEnd + 1)).
yearEnd = 2023

In [3]:
def get_current_year():
	"""Return the current calendar year (an int) according to the local date."""
	return datetime.date.today().year

In [4]:
def get_premier_league_url(season_begin_year: int, current_season_begin_year: int = 2023):
	"""Build the fbref.com Premier League overview URL for one season.

	Args:
		season_begin_year: Year in which the wanted season starts
			(e.g. 2010 for the 2010-2011 season).
		current_season_begin_year: Year in which the season currently in
			progress started. That season is served from the undated URL, every
			earlier one from a "<begin>-<end>" URL. Defaults to 2023, the value
			that used to be hard-coded, so existing callers are unaffected.

	Returns:
		The overview page URL as a string.

	Raises:
		ValueError: If season_begin_year is later than current_season_begin_year.
	"""
	base_url = "https://fbref.com/en/comps/9/"
	page_name = "Premier-League-Stats"

	if season_begin_year > current_season_begin_year:
		raise ValueError("Year " + str(season_begin_year) + " didn't happen yet!")
	if season_begin_year == current_season_begin_year:
		# The running season has no season prefix in its URL.
		return base_url + page_name

	# Past seasons repeat the "<begin>-<end>" label as a path segment and as a page-name prefix.
	season_label = str(season_begin_year) + "-" + str(season_begin_year + 1)
	return base_url + season_label + "/" + season_label + "-" + page_name

In [5]:
def get_team_urls(premier_league_url: str):
	"""Collect the absolute squad-page URLs linked from a league overview page.

	Args:
		premier_league_url: URL of the league overview page to download.

	Returns:
		A list of "https://fbref.com..." URLs, one per link in the first
		`table.stats_table` element whose href contains "squad"
		(presumably the league table -- TODO confirm against the page layout).
		Returns None when the server does not answer with HTTP 200; in that
		case the status code, and the Retry-After header if one was sent,
		are printed.

	Raises:
		IndexError: If the page contains no `table.stats_table` element.
	"""
	response = requests.get(premier_league_url)

	if response.status_code != 200:
		print(response.status_code)
		# Retry-After is optional and may be an HTTP-date rather than a number of
		# seconds, so look it up defensively and print it without int() conversion.
		retry_after = response.headers.get('Retry-After')
		if retry_after is not None:
			print(retry_after)
		return None

	soup = BeautifulSoup(response.text)
	teams_table = soup.select('table.stats_table')[0]

	team_urls = []
	for anchor in teams_table.find_all('a'):
		href = anchor.get("href")
		# Anchors without an href yield None; skip them instead of failing on `in`.
		if href and "squad" in href:
			team_urls.append(f"https://fbref.com{href}")

	return team_urls


In [6]:
def write_list_into_csv(all_matches: list, name: str):
	"""Concatenate per-team match frames and save them as a single CSV file.

	Args:
		all_matches: List of DataFrames to stack row-wise with pd.concat.
		name: Path of the CSV file to write.

	Column labels are lower-cased before writing, missing values are written
	as the text 'NaN', and the frame index is written too (to_csv default).
	"""
	combined = pd.concat(all_matches)

	lowered_labels = []
	for label in combined.columns:
		lowered_labels.append(label.lower())
	combined.columns = lowered_labels

	combined.to_csv(name, na_rep='NaN')

In [7]:
# Match-log tables scraped for every team, as (URL slug, caption given to pd.read_html).
# The first entry is the base table; the remaining ones are outer-merged onto it by 'Date'.
# Un-comment an entry to include that table as well.
MATCHLOG_TABLES = [
	("schedule", "Scores & Fixtures"),
	("shooting", "Shooting"),
	("keeper", "Goalkeeping"),
	# ("passing", "Passing"),  # NOTE: the substring 'matchlogs/c9/passing' also matches passing_types links
	# ("passing_types", "Pass Types"),
	# ("gca", "Goal and Shot Creation"),
	# ("defense", "Defensive Actions"),
	# ("possession", "Possession"),
	# ("misc", "Miscellaneous Stats"),
]

# Pause before every request to fbref.com (seconds).
REQUEST_DELAY_SECONDS = 4


def _fetch_matchlog_table(team_links: list, slug: str, caption: str):
	"""Download one match-log table of a team and return it as a DataFrame.

	Args:
		team_links: All href values found on the team page (entries may be None).
		slug: Path segment after 'matchlogs/c9/' that identifies the table.
		caption: Text passed to pd.read_html(match=...) to select the table.

	Returns:
		The first matching table. If its header has more than one level, the
		top level is dropped so the frame can be merged on the 'Date' column.

	Raises:
		RuntimeError: If the team page has no link for the requested table.
		requests.HTTPError: If the table page answers with a 4xx/5xx status.
	"""
	needle = f"matchlogs/c9/{slug}"
	candidates = [link for link in team_links if link and needle in link]
	if not candidates:
		raise RuntimeError(f"No '{needle}' link found on the team page")

	sleep(REQUEST_DELAY_SECONDS)
	response = requests.get(f"https://fbref.com{candidates[0]}")
	# Fail with the real HTTP error instead of a later "No tables found".
	response.raise_for_status()

	table = pd.read_html(io.StringIO(response.text), match=caption)[0]
	if table.columns.nlevels > 1:
		table.columns = table.columns.droplevel()
	return table


all_matches = []
for year in range(yearStart, yearEnd + 1):
	team_urls = get_team_urls(get_premier_league_url(year))
	if team_urls is None:
		# get_team_urls already printed the HTTP status; stop here rather than
		# failing with "'NoneType' object is not iterable". Frames collected so
		# far remain available in all_matches.
		raise RuntimeError("Could not load the team list for season " + str(year) + "-" + str(year + 1))

	for position, team_url in enumerate(team_urls, start=1):
		sleep(REQUEST_DELAY_SECONDS)
		team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

		print("Current year: " + str(year) + " in interval " + str(yearStart) + "-" + str(yearEnd))
		print("Current team: " + team_name + " being " + str(position) + "/" + str(len(team_urls)))
		print(team_url)

		team_data = requests.get(team_url)
		team_data.raise_for_status()
		team_soup = BeautifulSoup(team_data.text)

		team_links = [l.get("href") for l in team_soup.find_all('a')]

		# One request per configured table, fetched in list order.
		dfs = [_fetch_matchlog_table(team_links, slug, caption) for slug, caption in MATCHLOG_TABLES]

		# Columns already present on the left side win; duplicates coming from the
		# right side receive the '_drop' suffix and are removed afterwards.
		team_data_end = reduce(lambda left, right: pd.merge(left, right, on='Date', suffixes=('', '_drop'), how='outer'), dfs)
		# Match the suffix exactly so a genuine column containing "drop" is not lost.
		team_data_end = team_data_end.drop(columns=[col for col in team_data_end.columns if str(col).endswith('_drop')])

		team_data_end["Season"] = str(year) + "-" + str(year + 1)
		team_data_end["Team"] = team_name

		all_matches.append(team_data_end)

Current year: 2010 in interval 2010-2023
Current team: Manchester United being 1/20
https://fbref.com/en/squads/19538871/2010-2011/Manchester-United-Stats
Current year: 2010 in interval 2010-2023
Current team: Chelsea being 2/20
https://fbref.com/en/squads/cff3d9bb/2010-2011/Chelsea-Stats
Current year: 2010 in interval 2010-2023
Current team: Manchester City being 3/20
https://fbref.com/en/squads/b8fd03ef/2010-2011/Manchester-City-Stats
Current year: 2010 in interval 2010-2023
Current team: Arsenal being 4/20
https://fbref.com/en/squads/18bb7c10/2010-2011/Arsenal-Stats
Current year: 2010 in interval 2010-2023
Current team: Tottenham Hotspur being 5/20
https://fbref.com/en/squads/361ca564/2010-2011/Tottenham-Hotspur-Stats
Current year: 2010 in interval 2010-2023
Current team: Liverpool being 6/20
https://fbref.com/en/squads/822bd0ba/2010-2011/Liverpool-Stats
Current year: 2010 in interval 2010-2023
Current team: Everton being 7/20
https://fbref.com/en/squads/d3fd31cc/2010-2011/Everton-S

In [9]:
# Output file name encodes the scraped season range, e.g. "premier_league_2010-2023.csv".
name = f"premier_league_{yearStart}-{yearEnd}.csv"
# write_list_into_csv(all_matches, name)
all_matches

[          Date  Time         Round  Day Venue Result   GF   GA  \
 0   2010-08-16   NaN   Matchweek 1  Mon  Home      W  3.0  0.0   
 1   2010-08-22   NaN   Matchweek 2  Sun  Away      D  2.0  2.0   
 2   2010-08-28   NaN   Matchweek 3  Sat  Home      W  3.0  0.0   
 3   2010-09-11   NaN   Matchweek 4  Sat  Away      D  3.0  3.0   
 4   2010-09-19   NaN   Matchweek 5  Sun  Home      W  3.0  2.0   
 5   2010-09-26   NaN   Matchweek 6  Sun  Away      D  2.0  2.0   
 6   2010-10-02   NaN   Matchweek 7  Sat  Away      D  0.0  0.0   
 7   2010-10-16   NaN   Matchweek 8  Sat  Home      D  2.0  2.0   
 8   2010-10-24   NaN   Matchweek 9  Sun  Away      W  2.0  1.0   
 9   2010-10-30   NaN  Matchweek 10  Sat  Home      W  2.0  0.0   
 10  2010-11-06   NaN  Matchweek 11  Sat  Home      W  2.0  1.0   
 11  2010-11-10   NaN  Matchweek 12  Wed  Away      D  0.0  0.0   
 12  2010-11-13   NaN  Matchweek 13  Sat  Away      D  2.0  2.0   
 13  2010-11-20   NaN  Matchweek 14  Sat  Home      W  2.0  0.