In [6]:
# Import all required libraries upfront
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [7]:
# URL for Premier League Stats
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [8]:
def get_fbref_data(url):
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--window-size=1920,1080')  # Set window size
    
    # Initialize the driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )
    
    try:
        # Add random delay between 1-3 seconds
        time.sleep(2)
        driver.get(url)
        
        # Get the page source
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup
        
    finally:
        driver.quit()

In [None]:
# Main scraping function
def scrape_premier_league_data(start_year=2025, end_year=2023):
    all_matches = []
    standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
    
    years = list(range(start_year, end_year, -1))
    for year in years:
        print(f"\nScraping {year} season...")
        
        # Get standings page using Selenium
        soup = get_fbref_data(standings_url)
        standings_table = soup.select('table.stats_table')[0]
        
        # Get team URLs
        links = [l.get("href") for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f"https://fbref.com{l}" for l in links]
        
        # Get URL for previous season
        previous_season = soup.select("a.prev")[0].get("href")
        standings_url = f"https://fbref.com{previous_season}"
        
        # Process each team
        for team_url in team_urls:
            try:
                team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
                print(f"\nProcessing {team_name}...")
                
                # Get team page data using Selenium
                team_soup = get_fbref_data(team_url)
                matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
                
                # Get shooting stats URL
                links = [l.get("href") for l in team_soup.find_all('a')]
                shooting_links = [l for l in links if l and 'all_comps/shooting/' in l]
                
                if shooting_links:
                    # Get shooting stats using Selenium
                    shooting_soup = get_fbref_data(f"https://fbref.com{shooting_links[0]}")
                    shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]
                    shooting.columns = shooting.columns.droplevel()
                    
                    try:
                        # Merge matches with shooting data
                        team_data = matches.merge(
                            shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], 
                            on="Date"
                        )
                        
                        # Filter Premier League matches only
                        team_data = team_data[team_data["Comp"] == "Premier League"]
                        
                        # Add season and team info
                        team_data["Season"] = year
                        team_data["Team"] = team_name
                        
                        all_matches.append(team_data)
                        print(f"Successfully scraped {len(team_data)} matches")
                        
                    except ValueError as e:
                        print(f"Error merging data for {team_name}: {e}")
                        continue
                
                time.sleep(2)  # Reduced delay between teams
                
            except Exception as e:
                print(f"Error processing {team_url}: {e}")
                continue+
    
    # Combine all data
    if all_matches:
        match_df = pd.concat(all_matches, ignore_index=True)
        # Convert column names to lowercase to match matches.csv
        match_df.columns = [c.lower() for c in match_df.columns]
        return match_df
    return None

In [10]:
# Run the scraper
match_df = scrape_premier_league_data(2022, 2020)
if match_df is not None:
    match_df.to_csv("test.csv", index=False)
    print("Data successfully saved to matches.csv")


Scraping 2022 season...

Processing Liverpool...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Arsenal...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Manchester City...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Chelsea...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Newcastle United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Aston Villa...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Nottingham Forest...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Brighton and Hove Albion...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Bournemouth...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Error processing https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats: Document is empty

Processing Brentford...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Fulham...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Crystal Palace...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Everton...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing West Ham United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Manchester United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Wolverhampton Wanderers...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Tottenham Hotspur...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Leicester City...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Error processing https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats: Document is empty

Processing Ipswich Town...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Southampton...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Scraping 2021 season...

Processing Manchester City...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Arsenal...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Liverpool...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Aston Villa...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Tottenham Hotspur...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Chelsea...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Newcastle United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Manchester United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing West Ham United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Crystal Palace...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Brighton and Hove Albion...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Bournemouth...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Fulham...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Wolverhampton Wanderers...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Everton...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Brentford...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Nottingham Forest...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Luton Town...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Burnley...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches

Processing Sheffield United...


  matches = pd.read_html(team_soup.prettify(), match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_soup.prettify(), match="Shooting")[0]


Successfully scraped 38 matches
Data successfully saved to matches.csv
