In [19]:
# Load packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import random


In [21]:
def setup_driver():
    """Set up Chrome WebDriver with appropriate options."""
    chrome_options = Options()
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920x1080')
    
    # Add user agent
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    # Additional settings to avoid detection
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    # Use ChromeDriverManager to handle driver installation
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    return driver

def get_season_url(year):
    """Generate URL for a specific Premier League season."""
    if year == 2024:  # Current season
        return "https://fbref.com/en/comps/9/stats/Premier-League-Stats"
    else:
        return f"https://fbref.com/en/comps/9/{year}-{year+1}/stats/{year}-{year+1}-Premier-League-Stats"

def scrape_season(year):
    """Scrape data for a specific Premier League season."""
    url = get_season_url(year)
    
    try:
        # Set up the driver
        driver = setup_driver()
        
        # Load the page
        print("Loading page...")
        driver.get(url)
        
        # Wait for the table to load
        print("Waiting for table to load...")
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table#stats_standard')))
        
        # Get the page source
        print("Parsing data...")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find the stats table
        stats_table = soup.find('table', {'id': 'stats_standard'})
        if not stats_table:
            print("No stats table found!")
            return None
            
        # Extract player data
        all_players = []
        for row in stats_table.find('tbody').find_all('tr'):
            try:
                # Get player data from the row
                player_name = row.find('td', {'data-stat': 'player'}).text.strip()
                squad_name = row.find('td', {'data-stat': 'team'}).text.strip()
                age = row.find('td', {'data-stat': 'age'}).text.strip()
                nation = row.find('td', {'data-stat': 'nationality'}).text.strip() if row.find('td', {'data-stat': 'nationality'}) else ''
                
                all_players.append({
                    'Player': player_name,
                    'Squad': squad_name,
                    'Age': age,
                    'Nationality': nation
                })
                
            except (AttributeError, IndexError) as e:
                print(f"Error parsing row: {str(e)}")
                continue
        
        # Create DataFrame
        df = pd.DataFrame(all_players)
        df['Season'] = f'{year}-{year+1}'
        
        print(f"Successfully scraped {len(df)} players")
        return df
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return None
        
    finally:
        driver.quit()

In [23]:
# Run the scraper for multiple seasons
all_seasons = []

# Start with recent seasons first
for year in range(2024, 1991, -1):  # From 2023 back to 1992
    print(f"\nScraping season {year}-{year+1}...")
    df = scrape_season(year)
    if df is not None:
        all_seasons.append(df)
        print(f"Success! Found {len(df)} players")
    else:
        print(f"Failed to scrape season {year}-{year+1}")
    
    # Add a delay between seasons
    time.sleep(random.uniform(3, 5))

# Combine all seasons
if all_seasons:
    final_df = pd.concat(all_seasons, ignore_index=True)
    print(f"\nTotal players found across all seasons: {len(final_df)}")
    
    # Save to CSV
    final_df.to_csv('premier_league_players_all_seasons.csv', index=False)
    print("\nData saved to 'premier_league_players_all_seasons.csv'")
    
    # Display sample
    print("\nSample of data (3 rows per season):")
    print(final_df.groupby('Season').head(3))
else:
    print("No data was successfully scraped")



Scraping season 2024-2025...
Loading page...
Waiting for table to load...
Parsing data...
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute 'text'
Error parsing row: 'NoneType' object has no attribute '

In [25]:
final_df.head()

Unnamed: 0,Player,Squad,Age,Nationality,Season
0,Max Aarons,Bournemouth,25-111,eng ENG,2024-2025
1,Joshua Acheampong,Chelsea,18-355,eng ENG,2024-2025
2,Tyler Adams,Bournemouth,26-070,us USA,2024-2025
3,Tosin Adarabioyo,Chelsea,27-213,eng ENG,2024-2025
4,Simon Adingra,Brighton,23-114,ci CIV,2024-2025
