In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import time
from datetime import datetime
from io import StringIO



In [2]:
# Data directory configuration
DATA_DIR = "data"
RAW_DATA_DIR = os.path.join("..", DATA_DIR, "raw", "Big5")
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [3]:
# Seasons to scrape (FBref format)
SEASONS = {
    "2023-2024": "https://fbref.com/en/comps/24/2023-2024/stats/2023-2024-Serie-A-Stats",
    "2022-2023": "https://fbref.com/en/comps/24/2022-2023/stats/2022-2023-Serie-A-Stats",
    "2021-2022": "https://fbref.com/en/comps/24/2021-2022/stats/2021-2022-Serie-A-Stats",
    "2020-2021": "https://fbref.com/en/comps/24/2020-2021/stats/2020-2021-Serie-A-Stats"
}

In [4]:
# Function to scrape FBref stats for a given season
def get_player_stats_section(url):
    """Extract the player stats table from the page"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the player stats comment section
        comments = soup.find_all(string=lambda text: isinstance(text, str) and 'id="all_stats_standard"' in text)
        if not comments:
            return None
            
        # Extract the table HTML from the comment
        comment_soup = BeautifulSoup(comments[0], 'html.parser')
        table = comment_soup.find('table', {'id': 'stats_standard'})
        return str(table) if table else None
        
    except Exception as e:
        print(f"Error fetching {url}: {str(e)}")
        return None

In [5]:
# Function to extract comments from HTML
def scrape_season_stats():
    all_seasons_data = []
    
    for season, url in SEASONS.items():
        print(f"\nScraping {season} Serie A...")
        start_time = time.time()
        
        table_html = get_player_stats_section(url)
        if not table_html:
            print(f"Warning: Could not extract player stats table for {season}")
            continue
            
        try:
            # Parse the HTML table
            df = pd.read_html(StringIO(table_html))[0]
            
            # Clean the dataframe
            df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]
            df = df[df['Rk'].ne('Rk')].reset_index(drop=True)  # Remove header rows
            df['Season'] = season
            df['ScrapeDate'] = datetime.now().strftime('%Y-%m-%d')
            
            all_seasons_data.append(df)
            elapsed = time.time() - start_time
            print(f"Success: Added {len(df)} players from {season} ({elapsed:.1f}s)")
            
            # Save individual season data
            season_file = os.path.join(RAW_DATA_DIR, f'serie_a_{season.replace("-", "_")}.csv')
            df.to_csv(season_file, index=False)
            print(f"Saved {season} data to {season_file}")
            
        except Exception as e:
            print(f"Error processing {season} data: {str(e)}")
            
        time.sleep(45 + abs(hash(season)) % 30)  # Randomized delay 45-75s
    
    return pd.concat(all_seasons_data, ignore_index=True) if all_seasons_data else None

In [6]:
def main():
    start_time = time.time()
    print("Starting Serie A season scraping...")
    
    combined_df = scrape_season_stats()
    
    if combined_df is not None:
        output_path = os.path.join(RAW_DATA_DIR, 'serie_a_combined_seasons.csv')
        combined_df.to_csv(output_path, index=False)
        
        print(f"\nSuccess! Saved combined data to {output_path}")
        print(f"Total players scraped: {len(combined_df)}")
        
        print("\nSeason Distribution:")
        print(combined_df['Season'].value_counts().to_string())
        
        print("\nSample Data (Forwards Only):")
        print(combined_df[combined_df['Pos'].str.contains('FW', na=False)]
              [['Player', 'Pos', 'Squad', 'Season', 'Gls', 'Ast', 'npxG']]
              .head(5).to_string(index=False))
    else:
        print("\nScraping failed - no data collected")
    
    print(f"\nTotal execution time: {time.time() - start_time:.1f} seconds")

if __name__ == "__main__":
    main()

Starting Serie A season scraping...

Scraping 2023-2024 Serie A...

Scraping 2022-2023 Serie A...

Scraping 2021-2022 Serie A...

Scraping 2020-2021 Serie A...

Scraping failed - no data collected

Total execution time: 0.7 seconds
