In [238]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import time
from io import StringIO
import random

In [239]:
# Enhanced scraper with better rate limiting and realistic browser behavior
def create_enhanced_scraper():
    """Create a scraper with realistic browser headers to avoid detection"""
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'chrome',
            'platform': 'darwin',  # macOS
            'desktop': True
        }
    )
    # Add realistic headers to appear more human-like
    scraper.headers.update({
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1'
    })
    return scraper

def smart_request(scraper, url, max_retries=3, base_delay=5):
    """
    Make requests with intelligent rate limiting and retry logic
    
    Why this is important:
    - Exponential backoff: Wait longer after each failure
    - Random jitter: Prevents synchronized requests that look robotic
    - Multiple status code handling: Different responses for different errors
    - Timeout protection: Prevents hanging requests
    """
    for attempt in range(max_retries):
        try:
            # Add random delay to appear more human (except first attempt)
            if attempt > 0:
                jitter = random.uniform(0.5, 2.0)  # Random jitter
                delay = base_delay * (2 ** attempt) + jitter  # Exponential backoff + jitter
                print(f"  ⏳ Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
            
            print(f"  📡 Requesting: {url.split('/')[-1]} (attempt {attempt + 1}/{max_retries})")
            response = scraper.get(url, timeout=30)
            
            if response.status_code == 200:
                print(f"  ✅ Success!")
                return response
            elif response.status_code == 429:
                # Rate limited - wait longer with exponential backoff
                wait_time = base_delay * (3 ** attempt) + random.uniform(5, 15)
                print(f"  🚫 Rate limited (429). Waiting {wait_time:.1f} seconds...")
                time.sleep(wait_time)
            elif response.status_code == 403:
                # Forbidden - might be blocked, wait even longer
                wait_time = base_delay * (4 ** attempt) + random.uniform(10, 30)
                print(f"  🔒 Forbidden (403). Waiting {wait_time:.1f} seconds...")
                time.sleep(wait_time)
            else:
                print(f"  ❌ Error {response.status_code}: {response.reason}")
                time.sleep(base_delay)
                
        except Exception as e:
            print(f"  💥 Request failed: {e}")
            time.sleep(base_delay + random.uniform(1, 5))
    
    print(f"  💀 Failed after {max_retries} attempts")
    return None

# Initialize enhanced scraper
scraper = create_enhanced_scraper()
standings_url = "https://fbref.com/en/comps/8/Champions-League-Stats"

In [240]:
# Selecting which years to pull data from
years = list(range(2026, 2023, -1))
all_matches = []
print(f"🎯 Scraping data for years: {years}")

🎯 Scraping data for years: [2026, 2025, 2024]


In [241]:
scraper = cloudscraper.create_scraper()
url = "https://fbref.com/en/comps/8/Champions-League-Stats"
resp = scraper.get(url)   # no verify=False needed
soup = BeautifulSoup(resp.text, "html.parser")
standings_table = soup.select("table.stats_table")[0]
# Finding all of the a tags within the table, which contains the link to more data for each team
links = standings_table.find_all("a")
# Extracting the href properties from the links
links = [link.get("href") for link in links]
# Filtering the links to only include those that contain "/squads/", which is the pattern for team data pages
links = [l for l in links if l and "/squads/" in l]
# Constructing the full URLs for each team
team_urls = [f"https://fbref.com{l}" for l in links]



In [242]:
# --- TEST MODE SWITCH ---
TEST_MODE = False   # flip to False for full scrape

if TEST_MODE:
    team_urls = team_urls[:2]        # only 1 team

#

for i, team_url in enumerate(team_urls):
    # Change team name to a readable format
    team_name = team_url.split("/")[-1].replace("-", " ").replace("_Stats", "")
    print(f"\n🏃 Processing team {i+1}/{len(team_urls)}: {team_name}")

    current_url = team_url

    for year in years:
        print(f"\n🏆 Season {year}: {current_url}")

        # Get team page with retries
        data = smart_request(scraper, current_url, max_retries=3, base_delay=3)
        if not data:
            print(f"    ❌ Failed to get team data for {team_name}, season {year}")
            break

        # Try to get matches data
        try:
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")
            if not matches:
                print(f"    ❌ No matches data for {team_name}, season {year}")
                break
            print(f"    📋 Matches found ({matches[0].shape[0]} rows)")
        except Exception as e:
            print(f"    ❌ Failed to parse matches for {team_name}, season {year}: {e}")
            break

        # ---------- Helper function for stats ---------- #
        def get_stats_data(data_type, url_pattern, table_match):
            print(f"    📊 Getting {data_type} data...")
            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
            links = [l for l in links if url_pattern in l]

            if not links:
                print(f"    ⚠️ No {data_type} links found")
                return None

            stats_url = f"https://fbref.com{links[0]}"
            stats_response = smart_request(scraper, stats_url, max_retries=2, base_delay=2)
            if not stats_response:
                print(f"    ❌ Failed request for {data_type} data")
                return None

            try:
                stats_df = pd.read_html(StringIO(stats_response.text), match=table_match)[0]
                if hasattr(stats_df.columns, "nlevels") and stats_df.columns.nlevels > 1:
                    stats_df.columns = stats_df.columns.droplevel()
                print(f"    ✅ {data_type} data loaded ({stats_df.shape[0]} rows)")
                return stats_df
            except Exception as e:
                print(f"    ❌ Failed to parse {data_type} data: {e}")
                return None

        # ---------- Collect stats ---------- #
        shooting   = get_stats_data("Shooting", "all_comps/shooting/", "Shooting"); time.sleep(random.uniform(1, 3))
        possession = get_stats_data("Possession", "all_comps/possession/", "Possession"); time.sleep(random.uniform(1, 3))
        gsc        = get_stats_data("Goal/Shot Creation", "all_comps/gca", "Goal and Shot Creation"); time.sleep(random.uniform(1, 3))
        defense    = get_stats_data("Defense", "all_comps/defense", "Defensive Actions"); time.sleep(random.uniform(1, 3))

        # ---------- Merge stats ---------- #
        try:
            team_data = matches[0].copy()
            print(f"    🔗 Starting merge with {team_data.shape[1]} base columns")

            # Merge helpers
            def safe_merge(df, source, cols, label):
                if source is not None:
                    keep = [c for c in cols if c in source.columns]
                    if len(keep) > 1:
                        # rename all columns except 'Date' to avoid duplicates
                        rename_map = {c: f"{c}_{label}" for c in keep if c != "Date"}
                        source_renamed = source[keep].rename(columns=rename_map)
                        df = df.merge(source_renamed, on="Date", how="left")
                        print(f"       + {label} ({len(keep)-1} cols)")
                return df

            # Merge each stats table
            team_data = safe_merge(team_data, shooting, ["Date", "Sh", "SoT", "Dist"], "shooting")
            team_data = safe_merge(team_data, possession,
                                   ["Date", "Touches", "Def Pen", "Def 3rd", "Mid 3rd",
                                    "Att 3rd", "Att Pen", "Succ%", "PrgDist", "1/3"],
                                   "poss")
            team_data = safe_merge(team_data, gsc, ["Date", "SCA", "PassLive"], "gsc")
            team_data = safe_merge(team_data, defense, ["Date", "Tkl+Int", "TklW", "Tkl%"], "def")

            # Add team & season info
            team_data["Team"] = team_name
            team_data["Season"] = f"{year-1}-{str(year)[-2:]}"
            all_matches.append(team_data)

            print(f"    ✅ Finished {team_name} {year-1}-{str(year)[-2:]} → {team_data.shape[1]} cols")

        except Exception as e:
            print(f"    ❌ Merge failed for {team_name}, season {year}: {e}")
            continue

        # ---------- Move to previous season ---------- #
        soup = BeautifulSoup(data.text, "html.parser")
        prev_link_tag = soup.find("a", string=lambda s: s and "Previous Season" in s)
        if prev_link_tag:
            current_url = f"https://fbref.com{prev_link_tag.get('href')}"
            print(f"➡️  Moving to previous season → {current_url}")
        else:
            print("❌ No previous season found — stopping loop")
            break

        # Respectful delay
        delay = random.uniform(10, 20)
        print(f"    ⏳ Cooling off {delay:.1f}s before next year...")
        time.sleep(delay)

    # Longer delay between teams
    team_delay = random.uniform(20, 40)
    print(f"✅ Completed {team_name}. Cooling off {team_delay:.1f}s before next team...\n")
    time.sleep(team_delay)



🏃 Processing team 1/36: Liverpool Stats

🏆 Season 2026: https://fbref.com/en/squads/822bd0ba/Liverpool-Stats
  📡 Requesting: Liverpool-Stats (attempt 1/3)
  ✅ Success!
    📋 Matches found (48 rows)
    📊 Getting Shooting data...
  📡 Requesting: Liverpool-Match-Logs-All-Competitions (attempt 1/2)
  ✅ Success!
    ✅ Shooting data loaded (5 rows)
    📊 Getting Possession data...
  📡 Requesting: Liverpool-Match-Logs-All-Competitions (attempt 1/2)
  ✅ Success!
    ✅ Possession data loaded (5 rows)
    📊 Getting Goal/Shot Creation data...
  📡 Requesting: Liverpool-Match-Logs-All-Competitions (attempt 1/2)
  ✅ Success!
    ✅ Goal/Shot Creation data loaded (5 rows)
    📊 Getting Defense data...
  📡 Requesting: Liverpool-Match-Logs-All-Competitions (attempt 1/2)
  ✅ Success!
    ✅ Defense data loaded (5 rows)
    🔗 Starting merge with 20 base columns
       + shooting (3 cols)
       + poss (9 cols)
       + gsc (2 cols)
       + def (3 cols)
    ✅ Finished Liverpool Stats 2025-26 → 40 cols
➡️

In [243]:
# More robust concatenation with error handling and duplicate column fix
if all_matches:
    # First, fix any duplicate columns in each DataFrame
    fixed_matches = []
    for i, df in enumerate(all_matches):
        # Remove duplicate columns by keeping only the first occurrence
        df_fixed = df.loc[:, ~df.columns.duplicated()]
        
        # Also ensure all column names are strings (sometimes they can be tuples)
        df_fixed.columns = [str(col) if not isinstance(col, str) else col for col in df_fixed.columns]
        
        fixed_matches.append(df_fixed)
        print(f"DataFrame {i}: {df.shape[1]} → {df_fixed.shape[1]} columns (removed {df.shape[1] - df_fixed.shape[1]} duplicates)")
    
    try:
        # First try normal concat with fixed DataFrames
        all_match_data = pd.concat(fixed_matches, ignore_index=True)
        print(f"\n🎉 Scraping complete! Total matches collected: {all_match_data.shape[0]}")
    except (ValueError, KeyError) as e:
        print(f"⚠️ Normal concat failed: {e}")
        print("Trying with outer join to handle column mismatches...")
        # Use outer join to handle different column structures
        all_match_data = pd.concat(fixed_matches, ignore_index=True, join='outer', sort=False)
        print(f"✅ Concatenation successful: {all_match_data.shape[0]} rows, {all_match_data.shape[1]} columns")
else:
    print("❌ No data collected")
    all_match_data = pd.DataFrame()

DataFrame 0: 40 → 39 columns (removed 1 duplicates)
DataFrame 1: 40 → 39 columns (removed 1 duplicates)
DataFrame 2: 40 → 39 columns (removed 1 duplicates)
DataFrame 3: 40 → 39 columns (removed 1 duplicates)
DataFrame 4: 23 → 23 columns (removed 0 duplicates)
DataFrame 5: 40 → 39 columns (removed 1 duplicates)
DataFrame 6: 40 → 39 columns (removed 1 duplicates)
DataFrame 7: 40 → 39 columns (removed 1 duplicates)
DataFrame 8: 40 → 39 columns (removed 1 duplicates)
DataFrame 9: 40 → 39 columns (removed 1 duplicates)
DataFrame 10: 40 → 39 columns (removed 1 duplicates)
DataFrame 11: 40 → 39 columns (removed 1 duplicates)
DataFrame 12: 40 → 39 columns (removed 1 duplicates)
DataFrame 13: 23 → 23 columns (removed 0 duplicates)
DataFrame 14: 40 → 39 columns (removed 1 duplicates)
DataFrame 15: 40 → 39 columns (removed 1 duplicates)
DataFrame 16: 40 → 39 columns (removed 1 duplicates)
DataFrame 17: 40 → 39 columns (removed 1 duplicates)
DataFrame 18: 40 → 39 columns (removed 1 duplicates)
Dat

In [244]:
all_match_data

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Succ%_poss,PrgDist_poss,1/3_poss,SCA_gsc,PassLive_gsc,Tkl+Int_def,TklW_def,Tkl%_def,Team,Season
0,2025-08-10,15:00,FA Community Shield,FA Community Shield,Sun,Neutral,D,2 (2),2 (3),Crystal Palace,...,,,,,,0.0,8.0,,Liverpool Stats,2025-26
1,2025-08-15,20:00,Premier League,Matchweek 1,Fri,Home,W,4,2,Bournemouth,...,38.1,914.0,14.0,36.0,32.0,20.0,13.0,57.9,Liverpool Stats,2025-26
2,2025-08-25,20:00,Premier League,Matchweek 2,Mon,Away,W,3,2,Newcastle Utd,...,50.0,787.0,8.0,9.0,8.0,15.0,8.0,52.9,Liverpool Stats,2025-26
3,2025-08-31,16:30,Premier League,Matchweek 3,Sun,Home,W,1,0,Arsenal,...,46.7,939.0,8.0,16.0,13.0,18.0,9.0,62.5,Liverpool Stats,2025-26
4,2025-09-14,14:00,Premier League,Matchweek 4,Sun,Away,,,,Burnley,...,,,,,,,,,Liverpool Stats,2025-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4339,2024-04-28,18:30,Pro League A,Champions play-offs,Sun,Home,W,4,1,Antwerp,...,57.9,1192.0,28.0,22.0,20.0,30.0,17.0,68.8,Union SG Stats,2023-24
4340,2024-05-05,18:30,Pro League A,Champions play-offs,Sun,Home,D,0,0,Anderlecht,...,18.2,1034.0,13.0,30.0,24.0,22.0,10.0,62.5,Union SG Stats,2023-24
4341,2024-05-13,20:30,Pro League A,Champions play-offs,Mon,Away,D,2,2,Club Brugge,...,46.7,743.0,14.0,28.0,23.0,30.0,13.0,47.6,Union SG Stats,2023-24
4342,2024-05-19,13:30,Pro League A,Champions play-offs,Sun,Away,W,2,1,Cercle Brugge,...,61.1,495.0,10.0,26.0,17.0,26.0,10.0,55.0,Union SG Stats,2023-24


In [245]:
all_match_data.to_csv("champions_league_match_stats.csv", index=False)