In [23]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [24]:
# Accumulator for the per-team DataFrames gathered by the scraping loop.
all_teams = list()

In [None]:
# Download the Premier League overview page and keep its first stats table
# in `table` for the cells that follow.
url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

# Send the same browser-like User-Agent that the per-team requests use.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error directly instead of letting an error page be parsed
# and misreported as "no table found".
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html = response.text

# Parse with the 'html.parser' backend (no lxml dependency).
soup = BeautifulSoup(html, 'html.parser')

# Find all tables with the class 'stats_table'
tables = soup.find_all('table', class_='stats_table')

# Fail loudly here: without a table the next cell would only hit a
# NameError on `table`, which hides the real cause.
if not tables:
    raise RuntimeError("No table found with class 'stats_table'")

table = tables[0]  # Get the first table
print(f"Found {len(tables)} stats table(s); using the first one.")

In [26]:
# Collect the href of every anchor in the table and keep only squad pages.
# .get("href") returns None for an anchor that has no href attribute, so the
# `href and ...` guard prevents a TypeError from the substring test.
anchor_hrefs = (anchor.get("href") for anchor in table.find_all('a'))
links = [href for href in anchor_hrefs if href and '/squads/' in href]

In [27]:
# Prefix each site-relative squad path with the fbref host to get full URLs.
team_urls = list(map("https://fbref.com{}".format, links))

In [None]:
# Scrape the first stats table of every team page into `all_teams`.

# Start from an empty list so that re-running this cell does not append
# every team a second time.
all_teams = []

# Request headers are the same for every team, so build them once.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for position, team_url in enumerate(team_urls):
    # Pause before every request except the first to avoid getting blocked.
    # Sleeping at the top of the loop means skipped teams (the `continue`
    # paths below) are throttled as well.
    if position:
        time.sleep(5)

    team_name = team_url.split("/")[-1].replace("-Stats", "")  # Extract team name
    print(f"Scraping data for {team_name}...")

    # Best effort per team: a failed or timed-out request skips this team
    # rather than aborting the whole scrape.
    try:
        response = requests.get(team_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {team_name} ({exc}), skipping...")
        continue

    data = response.text
    soup = BeautifulSoup(data, 'html.parser')

    stats_tables = soup.find_all('table', class_="stats_table")
    if not stats_tables:
        print(f"No stats table found for {team_name}, skipping...")
        continue

    stats = stats_tables[0]  # Get the first table

    # Convert table HTML to DataFrame. The markup is wrapped in StringIO
    # because passing a literal HTML string to read_html is deprecated.
    team_data = pd.read_html(StringIO(str(stats)))[0]

    # Drop the top header level if the table has a two-level header
    if isinstance(team_data.columns, pd.MultiIndex):
        team_data.columns = team_data.columns.droplevel(0)

    team_data["Team"] = team_name
    all_teams.append(team_data)  # Append to list



In [None]:
# Align every team's DataFrame to the columns they all share, stack them
# into one frame, and write the result to stats.csv.

# set.intersection() with no arguments raises a cryptic TypeError, so report
# the real problem when nothing was scraped.
if not all_teams:
    raise ValueError("No team data was scraped; nothing to combine or save.")

# Ensure all DataFrames have the same structure
shared_columns = set.intersection(*(set(df.columns) for df in all_teams))

# Take the column order from the first DataFrame instead of from the set, so
# the CSV layout is the same on every run. dict.fromkeys() removes repeated
# labels while keeping their first-seen order.
common_columns = [
    column for column in dict.fromkeys(all_teams[0].columns)
    if column in shared_columns
]
all_teams = [df[common_columns] for df in all_teams]  # Keep only common columns
print(f"Total teams scraped: {len(all_teams)}")
for i, df in enumerate(all_teams):
    print(f"DataFrame {i} shape: {df.shape}")

# Concatenate; ignore_index=True already gives the result a fresh 0..n-1 index
stat_df = pd.concat(all_teams, ignore_index=True)

# Save to CSV
stat_df.to_csv("stats.csv", index=False)
print("CSV successfully saved!")