In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
from io import StringIO
import time

# Scrape Premier League match results and shooting stats from fbref.com for
# every season listed in `years`, then export the combined rows to matches.csv.

# Seconds to pause after every page load so that all requests are spaced out,
# including those for teams that end up being skipped.
REQUEST_DELAY_SECONDS = 1


def _load_soup(url):
    """Load *url* in the shared browser and return the parsed page.

    Uses the module-level ``driver`` and sleeps for REQUEST_DELAY_SECONDS
    after the page has loaded, so every request is followed by a pause.
    """
    driver.get(url)
    page = BeautifulSoup(driver.page_source, 'html.parser')
    time.sleep(REQUEST_DELAY_SECONDS)
    return page


# Setup Selenium WebDriver
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)
driver = webdriver.Chrome(options=options)

# Starting URL for the most recent Premier League season
standing_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# This list will hold one DataFrame per successfully scraped team/season
all_matches = []

# Season labels to scrape, newest first: [2025, 2024]
years = list(range(2025, 2023, -1))

# Everything that uses the browser runs inside try/finally so the Chrome
# process is always shut down, even when a page is missing an expected element.
try:
    driver.set_page_load_timeout(120)

    # Loop over each season
    for year in years:
        if standing_url is None:
            print(f"No previous-season link found; stopping before season {year}.")
            break

        # Load standings page for the current season
        standings_soup = _load_soup(standing_url)

        # Extract the first stats table (raises IndexError if the page has none)
        standings_table = standings_soup.select("table.stats_table")[0]

        # Get all team links
        links = [a.get("href") for a in standings_table.find_all('a') if a.get("href") and '/squads/' in a.get("href")]
        team_urls = [f"https://fbref.com{l}" for l in links]

        # Get link to previous season for the next iteration; a missing link
        # only matters if another season still has to be scraped.
        prev_links = standings_soup.select("a.prev")
        previous_season = prev_links[0].get("href") if prev_links else None
        standing_url = f"https://fbref.com{previous_season}" if previous_season else None

        # Loop through each team in the current season
        for team_url in team_urls:
            # NOTE(review): the "_" replacement looks like a no-op for
            # hyphen-separated slugs ("Some-Team-Stats" -> "Some-Team");
            # kept as-is to preserve the existing Team values - confirm intent.
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("_", "")

            # Get team match data (Scores & Fixtures)
            team_soup = _load_soup(team_url)
            try:
                matches = pd.read_html(StringIO(str(team_soup)), match="Scores & Fixtures")[0]
            except ValueError:
                print(f"Skipping {team_name} ({year}): no Scores & Fixtures table")
                continue

            # Find link to shooting stats
            shooting_links = [a.get("href") for a in team_soup.find_all("a") if a.get("href") and "all_comps/shooting/" in a.get("href")]
            if not shooting_links:
                print(f"Skipping {team_name} ({year}): no shooting link")
                continue

            # Load shooting stats
            shooting_soup = _load_soup(f"https://fbref.com{shooting_links[0]}")
            try:
                shooting = pd.read_html(StringIO(str(shooting_soup)), match="Shooting")[0]
                # Drop the top header level so columns can be selected by name
                shooting.columns = shooting.columns.droplevel()
            except (ValueError, IndexError):
                print(f"Skipping {team_name} ({year}): no usable Shooting table")
                continue

            # Merge match data with shooting data on the 'Date' column.
            # A missing column raises KeyError, a failed merge ValueError.
            try:
                team_data = matches.merge(
                    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                    on="Date"
                )
            except (KeyError, ValueError):
                print(f"Skipping {team_name} ({year}): could not merge shooting data")
                continue

            # Filter only Premier League matches; copy so the column
            # assignments below do not write into a slice of the merged frame.
            team_data = team_data[team_data["Comp"] == "Premier League"].copy()

            # Add season and team metadata
            team_data["Season"] = year
            team_data["Team"] = team_name

            # Append to our master list
            all_matches.append(team_data)
finally:
    # Close the browser
    driver.quit()

# Combine all data into a single DataFrame and export to CSV
if all_matches:
    match_df = pd.concat(all_matches, ignore_index=True)
    match_df.columns = [col.lower() for col in match_df.columns]
    match_df.to_csv("matches.csv", index=False)
else:
    # pd.concat raises on an empty list; report the situation instead and
    # leave any existing matches.csv untouched.
    match_df = pd.DataFrame()
    print("No match data was scraped; matches.csv was not written.")


In [None]:
# Summary:
# This script scrapes Premier League match data for the last two seasons from fbref.com.
# For each team, it collects match results (Scores & Fixtures) and shooting stats,
# merges them on the match date, filters for Premier League matches, and adds season/team info.
# Finally, all team data is combined and saved to a CSV file named 'matches.csv'.
