In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import time
from datetime import datetime
from io import StringIO



In [2]:
#Data directory configuration
DATA_DIR = "data"
RAW_DATA_DIR = os.path.join("..", DATA_DIR, "raw")
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [3]:
#League configuration with proper naming
LEAGUES = {
    "Big5_Europe": "https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats",
    "Portugal": "https://fbref.com/en/comps/32/stats/Primeira-Liga-Stats#all_stats_standard",
    "Turkey": "https://fbref.com/en/comps/26/stats/Super-Lig-Stats#all_stats_standard",
    "Brazil": "https://fbref.com/en/comps/24/stats/Serie-A-Stats#all_stats_standard",
    "Netherlands": "https://fbref.com/en/comps/23/stats/Eredivisie-Stats#all_stats_standard"
}

In [4]:
def get_player_stats_url(league_url):
    """Convert league URL to player stats URL"""
    return league_url.replace("-Stats", "/stats/players/") + "-Stats"

def scrape_league_stats():
    all_data = []
    
    for league_name, league_url in LEAGUES.items():
        print(f"\nScraping {league_name}...")
        player_stats_url = get_player_stats_url(league_url)
        
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        
        try:
            # Request with timeout
            response = requests.get(player_stats_url, headers=headers, timeout=15)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', {'id': 'stats_standard'})
            
            if not table:
                print(f"Warning: Stats table not found at {player_stats_url}")
                continue
                
            # Proper HTML parsing to avoid FutureWarning
            html_string = str(table)
            df = pd.read_html(StringIO(html_string))[0]
            
            # Clean and standardize data
            df.columns = df.columns.droplevel(0) if isinstance(df.columns, pd.MultiIndex) else df.columns
            df['League'] = league_name
            df['ScrapeDate'] = datetime.now().strftime('%Y-%m-%d')
            
            all_data.append(df)
            print(f"Success: Added {len(df)} players from {league_name}")
            
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {league_name}: {str(e)}")
        except Exception as e:
            print(f"Error processing {league_name}: {str(e)}")
            
        time.sleep(25 + abs(hash(league_name)) % 15)  # Randomized delay
    
    return pd.concat(all_data, ignore_index=True) if all_data else None

In [5]:
def main():
    start_time = time.time()
    print("Starting FBref scraping...")
    
    df = scrape_league_stats()
    
    if df is not None:
        output_path = os.path.join(RAW_DATA_DIR, f"fbref_strikers_raw_{datetime.now().strftime('%Y%m%d')}.csv")
        df.to_csv(output_path, index=False)
        print(f"\nSuccess! Saved data to {output_path}")
        print(f"Total players scraped: {len(df)}")
        
        # Basic data validation
        print("\nLeague Distribution:")
        print(df['League'].value_counts())
        
        print("\nSample Data:")
        print(df[['Player', 'Pos', 'Squad', 'League']].head(3))
    else:
        print("\nScraping failed - no data collected")
    
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")

if __name__ == "__main__":
    main()

Starting FBref scraping...

Scraping Big5_Europe...
Success: Added 2895 players from Big5_Europe

Scraping Portugal...

Scraping Turkey...

Scraping Brazil...

Scraping Netherlands...

Success! Saved data to ../data/raw/fbref_strikers_raw_20250408.csv
Total players scraped: 2895

League Distribution:
League
Big5_Europe    2895
Name: count, dtype: int64

Sample Data:
           Player    Pos        Squad       League
0      Max Aarons     DF  Bournemouth  Big5_Europe
1      Max Aarons  DF,MF     Valencia  Big5_Europe
2  Rodrigo Abajas     DF     Valencia  Big5_Europe

Execution time: 35.10 seconds
