In [1]:
import difflib
import io
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from urllib3.exceptions import InsecureRequestWarning

# Turn off SSL Warning
# Silence only the certificate warning caused by verify = False below;
# any other urllib3 warning stays visible.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Define URL & Header
curr_season = '2022-2023'
transfer_season = curr_season[0:4]  # first four characters of the season string, e.g. '2022'
url = 'https://fbref.com/en/comps/9/{0}/{1}-Premier-League-Stats'.format(curr_season, curr_season)
header_ = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "Accept-Language": "en-US,en;q=0.9"}

# Make Request to URL
# NOTE(review): verify = False skips TLS certificate validation — confirm this is still required.
req = requests.get(url, headers = header_, verify = False, timeout = 30)
# Stop on an HTTP error status instead of failing later with an
# IndexError when the expected table is missing from an error page.
req.raise_for_status()

# Parse HTML With BeautifulSoup
# Name the parser explicitly so the result does not depend on which
# parsers happen to be installed (lxml is already imported by this notebook).
dat = BeautifulSoup(req.text, 'lxml')
league_table = dat.select('table.stats_table')[0]

# Find Links in HTML Text
# Anchors without an href yield None, so guard before the substring test.
team_links = [anchor.get('href') for anchor in league_table.find_all('a')]
team_links = [link for link in team_links if link and '/squads/' in link]

# Complete Each Link By Adding Prefix
pre = 'https://fbref.com'
team_links = [pre + link for link in team_links]

In [3]:
# Scrape Data from Transfermarkt.co.uk
urlT = 'https://www.transfermarkt.co.uk/premier-league/transfers/wettbewerb/GB1/plus/?saison_id={}&s_w=&leihe=0&intern=0&intern=1'.format(transfer_season)
# NOTE(review): verify = False skips TLS certificate validation — confirm this is still required.
trans_req = requests.get(urlT, headers = header_, verify = False, timeout = 30)
trans_req.raise_for_status()  # fail here on an HTTP error rather than parsing an error page

# Parse HTML & Find Span Classes Containing Expenditures
transers = BeautifulSoup(trans_req.text, 'lxml')
transfer_data = transers.find_all("span", {"class":"transfer-einnahmen-ausgaben redtext"})
# Only the first 20 <h2> tags are used — presumably one per club, with
# unrelated headings after them; TODO confirm against the live page.
title_tags = transers.find_all("h2")[0:20]
# The team name is taken as the second-to-last double-quoted value in the tag's HTML.
tnames = [str(tag).split('"')[-2] for tag in title_tags]


def _expenditure_in_millions(text):
    """Return the amount found in an expenditure label, expressed in millions.

    The first number in ``text`` is read together with an optional unit
    suffix, so amounts of any width keep all of their digits (no fixed
    character offsets).

    Assumes Transfermarkt writes amounts like '12.30m', '900Th.' or
    '1.23bn' — TODO confirm against the live page.

    Parameters
    ----------
    text : str
        Text content of one expenditure ``<span>``.

    Returns
    -------
    float
        Amount in millions; 0.0 when ``text`` contains no number.
    """
    found = re.search(r'(\d+(?:\.\d+)?)\s*(bn|m|Th|k)?\b', text)
    if found is None:
        return 0.0
    amount = float(found.group(1))
    unit = found.group(2)
    if unit in ('Th', 'k'):
        return amount / 1000
    if unit == 'bn':
        return amount * 1000
    # 'm', or no unit at all.
    # NOTE(review): a bare number is taken as millions — confirm.
    return amount


# Clean-Up Expenditure Data
expenditures = [_expenditure_in_millions(x.text) for x in transfer_data]

# Names and amounts are paired by position, so the counts must agree.
if len(expenditures) != len(tnames):
    raise ValueError(
        "Found {} team names but {} expenditure values on {}".format(
            len(tnames), len(expenditures), urlT))

# Create Dataframe for Each Team's Transfer Expenses
team_transfers = pd.DataFrame({"Team": tnames, "Expenditures": expenditures})

# Sort & Match Team Names
# Candidate names are derived from the team URLs: last path segment,
# without the "-Stats" suffix, with hyphens turned into spaces.
names = [u.split("/")[-1].replace("-Stats", "").replace("-", " ") for u in team_links]

matched_names = []
for check_word in team_transfers["Team"]:
    # Keep only the single best candidate with a similarity ratio of at least 0.8.
    close_match = difflib.get_close_matches(check_word, names, n = 1, cutoff = 0.8)
    if not close_match:
        raise ValueError(
            "No team name in {} is close enough to {!r}".format(names, check_word))
    matched_names.append(close_match[0])

# Replace the whole column in one assignment; writing element-by-element
# through team_transfers["Team"] is chained assignment, which does not
# update the frame when pandas Copy-on-Write is active.
team_transfers["Team"] = matched_names

In [4]:
all_stats = []

# Map team name -> transfer expenditure once, so a team with no entry is
# detected instead of silently reusing the previous team's value.
expenditure_by_team = dict(zip(team_transfers["Team"], team_transfers["Expenditures"]))

# Loop Through Each Team URL
# The loop variable is named team_url so the league `url` defined earlier is not overwritten.
for team_url in team_links:
    team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
    if team_name not in expenditure_by_team:
        raise KeyError("No transfer expenditure found for team {!r}".format(team_name))
    team_exp = expenditure_by_team[team_name]

    # NOTE(review): verify = False skips TLS certificate validation — confirm this is still required.
    team_req = requests.get(team_url, headers = header_, verify = False, timeout = 30)
    team_req.raise_for_status()  # surface an HTTP error status directly
    time.sleep(1)  # pause between requests

    # Use Pandas to Read the Scores & Fixtures Table for Each Team
    # The HTML is wrapped in StringIO because newer pandas releases no
    # longer accept a raw HTML string. Only ValueError (no matching table)
    # triggers the fallback pattern; anything else propagates.
    try:
        scores_fixtures = pd.read_html(io.StringIO(team_req.text), match = "Scores & Fixtures")[0]
    except ValueError:
        scores_fixtures = pd.read_html(io.StringIO(team_req.text), match = "Scores & Fixtures ")[0]
    # .copy() so the in-place inserts below act on an independent frame, not a filtered view.
    scores_fixtures = scores_fixtures[scores_fixtures["Comp"] == "Premier League"].copy()
    scores_fixtures.insert(loc = 9, column = "Team", value = team_name)
    scores_fixtures.insert(loc = 19, column = "Transfers - Home", value = team_exp)
    scores_fixtures = scores_fixtures.drop(columns = ["Comp", "xG", "xGA", "Attendance", "Captain", "Referee", "Match Report", "Notes"])

    # Parse HTML With BeautifulSoup & Find Links
    data = BeautifulSoup(team_req.text, 'lxml')
    squad_links = [anchor.get('href') for anchor in data.find_all('a')]
    squad_links = [link for link in squad_links if link and 'matchlogs/all_comps/' in link]
    squad_links = list(dict.fromkeys(squad_links))  # drop duplicates, keep first-seen order
    # Skip the first match-log link — presumably the Scores & Fixtures log
    # already read above; TODO confirm.
    squad_links = squad_links[1:]

    # Loop Through Extra Tables
    stats_list = []
    for link in squad_links:
        table_page = requests.get(pre + link, headers = header_, verify = False, timeout = 30)
        table_page.raise_for_status()
        table = pd.read_html(io.StringIO(table_page.text))[0]
        table.columns = table.columns.droplevel()  # drop the top level of the two-level header
        table = table[table["Comp"] == "Premier League"]
        if "/shooting/" in link:
            # Only the shooting table keeps "Date"; it is the merge key used below.
            table = table.rename(columns={'Sh':'Shots', 'FK':'FK Shots'})
        else:
            table = table.drop(columns = "Date")
        if "/passing_types/" in link:
            table = table.rename(columns={'Live':'Live Passes', 'Cmp':'Passes Completed', 'Press':'Passes Under Pressure'})
        stats_list.append(table)
        time.sleep(1)  # pause between requests

    # Combine Tables for Current Team
    # Tables are placed side by side, aligned on their row index.
    combined = pd.concat(stats_list, axis = 1)
    combined = combined[["Date","Shots", "SoT%", "G/SoT", "Dist", "FK Shots", "PK", "SoTA", "Save%", "CS", "PKA", "#OPA", "Live Passes", "Passes Completed", "Passes Under Pressure", 
                        "Ground", "High", "SCA", "Press", "Succ%", "CrdR", "CrdY", "Fls", "OG", "Won%"]]
    team_stats = scores_fixtures.merge(combined, on = "Date")
    all_stats.append(team_stats)

# Create Dataframe for All Teams and Save to CSV
# Mapping from the scraped column abbreviations to the labels written to the CSV.
readable_columns = {
    'Poss': 'Possession',
    'SoTA': 'SoT Against',
    'Dist': 'Avg. Distance of Shots',
    'CS': 'Clean Sheet',
    'PKA': 'PK Allowed',
    '#OPA': '# Defensive Actions OPA',
    'Ground': 'Ground Passes',
    'High': 'High Passes',
    'SCA': 'Shot-Creating Actions',
    'Press': 'Presses (Applied)',
    'Succ%': 'Successful Dribbles',
    'CrdR': 'Red Cards',
    'CrdY': 'Yellow Cards',
    'Fls': 'Fouls',
    'OG': 'Own Goals',
    'Won%': 'Aerial Battles Won (%)',
}

# Stack every team's frame, relabel the columns, and write one file named after the season.
df = pd.concat(all_stats).rename(columns = readable_columns)
output_file = "{}.csv".format(curr_season)
df.to_csv(output_file)