In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Seasons to scrape, newest first: 2024, 2023, 2022, 2021.
years = list(reversed(range(2021, 2025)))

In [3]:
# Accumulator for the per-team DataFrames appended by the scraping loop.
all_matches = list()

In [4]:
# URL of the Champions League stats page on fbref; the scraping loop below
# starts from this page.
standings_url = "https://fbref.com/en/comps/8/Champions-League-Stats"

In [None]:
# Scrape match and shooting data from fbref for every season in `years`.
# Starting from `standings_url`, each iteration reads one season's standings
# page, visits every team linked from the standings table, merges that team's
# "Scores & Fixtures" table with its shooting table, keeps the Champions
# League rows, and appends the result to `all_matches`. The next (older)
# season is reached through the page's "previous season" link.
#
# NOTE: this cell appends to `all_matches`; reset that list before re-running
# the cell, otherwise the same rows are collected twice.

REQUEST_DELAY_SECONDS = 3     # pause before each request to avoid overwhelming the server
REQUEST_TIMEOUT_SECONDS = 30  # give up on a stalled connection instead of hanging forever
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Walk a local copy of the start URL so that re-running this cell always
# begins at `standings_url` rather than wherever a previous run stopped
# (which would pair the wrong page with each `year` label).
season_url = standings_url

for year in years:
    if season_url is None:
        print(f"No previous-season link found; stopping before season {year}")
        break

    time.sleep(REQUEST_DELAY_SECONDS)
    season_response = requests.get(season_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP errors here instead of as an IndexError further down when
    # the expected tables are missing from an error page.
    season_response.raise_for_status()
    season_soup = BeautifulSoup(season_response.text)

    # Handle different table positions based on season:
    # the 2024 page has a different table structure than previous years.
    stats_tables = season_soup.select('table.stats_table')
    standings_table = stats_tables[0] if year == 2024 else stats_tables[8]

    # Extract team URLs from the standings table. Anchors without an href
    # yield None, so they are filtered out before the substring test.
    standings_hrefs = [a.get("href") for a in standings_table.find_all('a')]
    team_urls = [f"https://fbref.com{h}" for h in standings_hrefs if h and '/squads/' in h]

    # Remember where the previous season's standings live for the next
    # iteration; None means the page offered no such link.
    prev_links = season_soup.select("a.prev")
    if prev_links and prev_links[0].get("href"):
        season_url = f"https://fbref.com{prev_links[0].get('href')}"
    else:
        season_url = None

    # Loop through each team's URL to get their match data
    for team_url in team_urls:
        time.sleep(REQUEST_DELAY_SECONDS)
        # Team name is the last URL segment without "-Stats", dashes -> spaces
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        team_response = requests.get(team_url, timeout=REQUEST_TIMEOUT_SECONDS)
        team_response.raise_for_status()
        # StringIO: passing literal HTML to read_html is deprecated in pandas
        matches = pd.read_html(StringIO(team_response.text), match="Scores & Fixtures")[0]
        team_soup = BeautifulSoup(team_response.text)

        # Find link to shooting stats
        team_hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_hrefs = [h for h in team_hrefs if h and 'all_comps/shooting/' in h]
        if not shooting_hrefs:
            # Skip the team rather than crash on shooting_hrefs[0]
            print(f"Skipping {team_name} ({year}): no shooting stats link found")
            continue

        # Get shooting statistics
        time.sleep(REQUEST_DELAY_SECONDS)
        shooting_response = requests.get(
            f"https://fbref.com{shooting_hrefs[0]}", timeout=REQUEST_TIMEOUT_SECONDS
        )
        shooting_response.raise_for_status()
        shooting = pd.read_html(StringIO(shooting_response.text), match="Shooting")[0]
        # The shooting table has multi-level headers; keep only the inner level
        shooting.columns = shooting.columns.droplevel()

        # Merge match data with shooting data. A missing shooting column
        # raises KeyError from the column selection; incompatible key dtypes
        # raise ValueError from merge. Either way the team is skipped.
        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError) as err:
            print(f"Skipping {team_name} ({year}): could not merge shooting data ({err})")
            continue

        # Filter for Champions League matches only; .copy() so the column
        # assignments below write to an independent frame, not a slice view.
        team_data = team_data[team_data["Comp"] == "Champions Lg"].copy()

        # Clean up opponent names: trim whitespace, drop a leading run of two
        # or more lowercase letters followed by whitespace, then title-case.
        team_data["Opponent"] = team_data["Opponent"].str.strip()
        team_data["Opponent"] = team_data["Opponent"].str.replace(r'^[a-z]{2,}\s+', '', regex=True)
        team_data["Opponent"] = team_data["Opponent"].str.title()

        # Add season and team name columns
        team_data["Season"] = year
        team_data["Team"] = team_name

        all_matches.append(team_data)
        # Extra pause after each successfully processed team (kept from the
        # original pacing).
        time.sleep(REQUEST_DELAY_SECONDS)

In [None]:
# Stack the per-team frames into a single table. ignore_index=True renumbers
# the rows 0..n-1; without it every frame keeps its own row labels, so the
# same label repeats across teams and label-based lookups return several rows.
match_df = pd.concat(all_matches, ignore_index=True)

In [33]:
# Normalise every column label to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [36]:
# Write the combined match table to matches.csv (relative path).
match_df.to_csv(path_or_buf="matches.csv")