Scraping the first page with requests

In [1]:
from io import StringIO

import requests

In [2]:
# Standings page that is downloaded below and searched for the stats table
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Downloading the page.
# A timeout keeps the cell from hanging, and raise_for_status() surfaces an
# error response here instead of letting an error page reach the parser
# (where it shows up later as an unexplained IndexError).
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

Parsing HTML links with BeautifulSoup

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Initializing the soup object with the html.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed (and no parser-guess warning is shown).
soup = BeautifulSoup(data.text, "html.parser")

In [10]:
# Selecting the standings table with a CSS selector: a <table> whose class is "stats_table"
stats_tables = soup.select('table.stats_table')
if not stats_tables:
    # Same exception type as the bare [0] lookup, but with a message that
    # points at the download rather than at a list index.
    raise IndexError(
        f"no table.stats_table found in the downloaded page (HTTP status {data.status_code})"
    )
standings_table = stats_tables[0]

IndexError: list index out of range

In [None]:
# Collect every anchor (<a>) tag found inside the standings table
links = standings_table.find_all(name='a')

In [None]:
# Getting the href property of each link, using a list comprehension.
# The anchors are read straight from the table so this cell can be re-run:
# iterating over `links` itself would call .get() on the strings produced
# by the first run.
links = [a.get("href") for a in standings_table.find_all('a')]

In [None]:
# Filtering the links to only have the squad links.
# `l and` skips anchors that had no href (None); without it the `in` test
# raises TypeError on the first such anchor.
links = [l for l in links if l and '/squads/' in l]

In [None]:
# Display the links that remain after the '/squads/' filter
links

In [None]:
# Turn the relative squad links into absolute URLs by prefixing the site domain
team_urls = [f"https://fbref.com{href}" for href in links]

In [None]:
# Display the absolute team URLs
team_urls

Extract Match Stats Using Pandas And Requests

In [None]:
# Take the first team URL; the following cells scrape this single team page
team_url = team_urls[0]

In [None]:
# Download the team page; fail here on a timeout or an error response rather
# than handing an error page to the table parser below.
data = requests.get(team_url, timeout=30)
data.raise_for_status()

In [None]:
import pandas as pd

In [None]:
# read_html returns a list of the tables whose text matches "Scores & Fixtures".
# The markup is wrapped in StringIO because passing literal HTML strings to
# read_html is deprecated in recent pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [None]:
# Display the first table matched by "Scores & Fixtures"
matches[0]

In [None]:
# Getting the shooting records / match stats

In [None]:
# Parse the team page; the parser is named explicitly so the result does not
# depend on which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")

In [None]:
# Collect every anchor (<a>) tag on the team page
links = soup.find_all(name='a')

In [None]:
# Extract the href of each anchor. Reading from `soup` (not from `links`)
# keeps the cell re-runnable: a second run would otherwise call .get() on
# the strings produced by the first run.
links = [a.get("href") for a in soup.find_all('a')]

In [None]:
# Keep only the hrefs that are present and contain 'all_comps/shooting/'
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [None]:
# Download the page behind the first shooting link; fail here on a timeout
# or an error response instead of parsing an error page.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [None]:
# First table whose text matches "Shooting". The markup is wrapped in StringIO
# because passing literal HTML strings to read_html is deprecated in recent
# pandas releases.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

Cleaning and Merging Scraped Data with Pandas

In [None]:
# Preview the first rows of the shooting table as scraped
shooting.head()

In [None]:
# Removing the top level of the multi-level column header.
# Guarded by nlevels so re-running the cell on already-flattened columns is a
# no-op instead of an error.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [None]:
# Preview the shooting table again after the column level was dropped
shooting.head()

In [None]:
# List the column labels of the shooting table
shooting.columns

In [None]:
 team_data = matches[0].merge(shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']], on='Date')

In [None]:
# Preview the first ten rows of the merged fixtures + shooting table
team_data.head(10)

  Scraping Data for Multiple Seasons and Teams with a Loop

In [None]:
# Season labels to scrape, counting down from 2023 and stopping before 2022
years = [*range(2023, 2022, -1)]

In [None]:
# Display the season labels the loop below will iterate over
years

In [None]:
# Accumulator for the per-team match tables appended by the scraping loop
all_matches = list()

In [None]:
# Starting standings page for the loop below, which reassigns this variable
# to the previous season's page on every iteration
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
import time
# Scrape one season per entry in `years`, walking backwards through the
# standings pages and appending one merged table per team to `all_matches`.
for year in years:
    # Standings page of the season currently pointed at by standings_url.
    data = requests.get(standings_url, timeout=30)
    # Stop on an error response here instead of failing later with an
    # IndexError when the expected table is missing from an error page.
    data.raise_for_status()
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    # `l and` skips anchors without an href (None), which would otherwise
    # make the `in` test raise TypeError.
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Point standings_url at the previous season for the next iteration.
    # lstrip('/') avoids the "https://fbref.com//..." double slash that
    # results when the href already starts with a slash.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com/{previous_season.lstrip('/')}"

    for team_url in team_urls:
        # Pause before every team, including teams later skipped by
        # `continue`, so the request rate stays bounded on every path.
        time.sleep(1)

        # NOTE(review): hyphens are removed rather than replaced by spaces
        # ("Manchester-City" -> "ManchesterCity") — confirm this is intended.
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", "")

        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas.
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        soup = BeautifulSoup(data.text, "html.parser")
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # Drop the top header level so the stat columns can be selected by name.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']], on='Date')
        except ValueError:
            # NOTE(review): a missing shooting column raises KeyError, which
            # this guard does not catch — confirm which failure it targets.
            continue

        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        
        

In [None]:
# Stack all per-team tables row-wise into one frame
match_df = pd.concat(all_matches, axis=0)

In [None]:
# Lower-case every column label
match_df.columns = list(map(str.lower, match_df.columns))

In [None]:
# Display the combined match table
match_df

In [None]:
# Write the combined table to a CSV file in the current working directory
match_df.to_csv("matches_last_sn.csv")

The data contains matches played from the 2019/2020 season to date. I will need to clean it and only use matches from the 2020/2021 and 2021/2022 seasons.