In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Seasons to scrape, newest first: 2024 down to 2021.
years = sorted(range(2021, 2025), reverse=True)

In [3]:
# Accumulator for the per-team DataFrames; concatenated once scraping is done.
all_matches = list()

In [4]:
# Starting page for the crawl: FBref's Champions League stats page.
standings_url = (
    "https://fbref.com/en/comps/8/Champions-League-Stats"
)

In [None]:
# Scrape Champions League match logs plus shooting stats for every season in
# `years`. Starting from `standings_url`, each season page supplies the team
# pages to visit and the link to the previous season's page. One DataFrame per
# team/season is appended to `all_matches`.
REQUEST_DELAY_S = 3     # pause before each request to avoid overwhelming the server
REQUEST_TIMEOUT_S = 30  # give up on a stalled connection instead of hanging forever

# Walk the seasons with a local URL so the global `standings_url` is left
# untouched and re-running this cell always starts from the newest season.
season_url = standings_url

for season in years:
    if season_url is None:
        # The previous page had no "previous season" link, so there is nowhere
        # to go for the remaining seasons.
        print(f"No previous-season link found; stopping before season {season}")
        break

    time.sleep(REQUEST_DELAY_S)
    response = requests.get(season_url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    soup_data = BeautifulSoup(response.text, "html.parser")

    # Handle different table positions based on season:
    # the 2024 page is read from the first stats table, earlier seasons from the ninth.
    table_index = 0 if season == 2024 else 8
    stats_table = soup_data.select("table.stats_table")[table_index]

    # Extract team URLs from the standings table. Anchors without an href
    # return None from .get(), so skip those before the substring test.
    team_pages = [
        f"https://fbref.com{href}"
        for href in (anchor.get("href") for anchor in stats_table.find_all("a"))
        if href and "/squads/" in href
    ]

    # Remember the previous season's page for the next iteration (None if absent).
    prev_anchor = soup_data.select_one("a.prev")
    prev_href = prev_anchor.get("href") if prev_anchor is not None else None
    season_url = f"https://fbref.com{prev_href}" if prev_href else None

    # Loop through each team's URL to get their match data
    for page_url in team_pages:
        time.sleep(REQUEST_DELAY_S)
        # Extract team name from URL, e.g. ".../Real-Madrid-Stats" -> "Real Madrid"
        club_name = page_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_response = requests.get(page_url, timeout=REQUEST_TIMEOUT_S)
        team_response.raise_for_status()

        # Get basic match data. read_html is given a file-like object because
        # passing literal HTML strings is deprecated in recent pandas versions.
        match_data = pd.read_html(
            StringIO(team_response.text), match="Scores & Fixtures"
        )[0]
        soup_team = BeautifulSoup(team_response.text, "html.parser")

        # Find link to shooting stats
        shooting_links = [
            href
            for href in (anchor.get("href") for anchor in soup_team.find_all("a"))
            if href and "all_comps/shooting/" in href
        ]
        if not shooting_links:
            print(f"No shooting page linked for {club_name} ({season}); skipping")
            continue

        # Get shooting statistics
        time.sleep(REQUEST_DELAY_S)
        shooting_response = requests.get(
            f"https://fbref.com{shooting_links[0]}", timeout=REQUEST_TIMEOUT_S
        )
        shooting_response.raise_for_status()
        shooting_data = pd.read_html(
            StringIO(shooting_response.text), match="Shooting"
        )[0]
        # The shooting table has a two-level header; drop the top level.
        shooting_data.columns = shooting_data.columns.droplevel()

        # Merge match data with shooting data. Skip the team when the merge is
        # not possible; selecting absent shooting columns raises KeyError.
        try:
            combined_data = match_data.merge(
                shooting_data[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                on="Date",
            )
        except (KeyError, ValueError):
            continue

        # Filter for Champions League matches only. Copy so the column
        # assignments below act on an independent frame, not a slice.
        combined_data = combined_data[combined_data["Comp"] == "Champions Lg"].copy()

        # Clean up opponent names: trim whitespace, drop a leading run of two
        # or more lowercase letters followed by whitespace (presumably a
        # country-code prefix -- confirm against the data), then title-case.
        combined_data["Opponent"] = (
            combined_data["Opponent"]
            .str.strip()
            .str.replace(r'^[a-z]{2,}\s+', '', regex=True)
            .str.title()
        )

        # Add season and team name columns
        combined_data["Season"] = season
        combined_data["Team"] = club_name

        # Append to all_matches list
        all_matches.append(combined_data)
        time.sleep(REQUEST_DELAY_S)

In [None]:
# Stack all per-team frames row-wise into one table; each frame keeps its own
# index labels, so the combined index is not unique.
match_df = pd.concat(all_matches, axis=0)

In [33]:
# Normalize every column label to lowercase.
match_df.columns = list(map(str.lower, match_df.columns))

In [36]:
# Persist the combined table to disk (the index is written as the first column).
match_df.to_csv(path_or_buf="matches.csv")