In [1]:
import os
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd



In [None]:
# Data directory configuration
DATA_DIR = "data"
# Raw scrape output goes under <parent dir>/data/raw/serie a brazil,
# resolved relative to the current working directory.
RAW_DATA_DIR = os.path.join(os.pardir, DATA_DIR, "raw", "serie a brazil")
# Create the folder up front (no error if it already exists) so later
# writes into RAW_DATA_DIR do not fail on a missing directory.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [3]:
# FBref URLs for Brazilian Serie A seasons
# Every season page uses the same URL pattern and differs only by the year,
# so the mapping (season label -> stats page URL) is built from that pattern.
# Insertion order is newest season first.
SEASONS = {
    str(year): f"https://fbref.com/en/comps/24/{year}/stats/{year}-Serie-A-Stats"
    for year in (2024, 2023, 2022, 2021)
}

In [4]:
# Function to download an FBref page and locate one of its stats tables
def get_fbref_table(url, timeout=30, table_id="stats_standard"):
    """Download ``url`` and return the ``<table>`` element whose id is ``table_id``.

    The table is looked up in the regular document first and then inside the
    page's HTML comments, because FBref hides tables inside HTML comments.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.
        table_id: ``id`` attribute of the wanted ``<table>``.

    Returns:
        The matching BeautifulSoup ``table`` tag.

    Raises:
        Exception: If the response status is not 200, or if no table with
            ``table_id`` is found in the page or its comments.
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (for example on timeout or connection failure).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url} with status code {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    # Check the regular document first in case the table is not commented out.
    table = soup.find("table", id=table_id)
    if table is not None:
        return table

    # FBref hides tables inside HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        # Only re-parse comments that mention the id. This skips the cost of
        # parsing every unrelated comment; it presumably also avoids the
        # BeautifulSoup warning seen when a short, non-markup comment is
        # handed to the parser (NOTE(review): confirm on a fresh run).
        if table_id not in comment:
            continue
        table = BeautifulSoup(str(comment), "html.parser").find("table", id=table_id)
        if table is not None:
            return table

    raise Exception(f"Could not find the '{table_id}' table in the page or its comments.")

In [5]:
# Function to parse HTML table into DataFrame
def parse_table_to_df(table):
    """Convert a parsed HTML ``<table>`` element into a pandas DataFrame.

    Column names are taken from the ``<th>`` cells of the last ``<tr>`` in the
    table's ``<thead>``. Each ``<tr>`` in ``<tbody>`` becomes one row of
    stripped cell text, except that:

    * rows carrying the CSS class ``thead`` (repeated sub-headers) are skipped;
    * rows whose cell count differs from the header count are reported with a
      printed warning and skipped.

    Args:
        table: A BeautifulSoup-style element exposing ``find``, ``find_all``,
            ``get`` and ``get_text`` on itself and its descendants.

    Returns:
        pandas.DataFrame with one string-valued column per header cell.
        Header names are used as-is, so repeated header texts yield repeated
        column names.

    Raises:
        ValueError: If the table has no ``<thead>``/``<tbody>``, the header has
            no rows or cells, or no body row matches the header length.
    """
    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("Table is missing a <thead> or <tbody> section.")

    header_rows = thead.find_all("tr")
    if not header_rows:
        raise ValueError("Table header contains no rows.")

    # The last header row holds the column names; earlier rows are not used.
    headers = [th.get_text(strip=True) for th in header_rows[-1].find_all("th")]
    if not headers:
        raise ValueError("Table header contains no <th> cells.")

    data = []
    for row in tbody.find_all("tr"):
        # Look the class attribute up once; a missing attribute counts as empty.
        if "thead" in (row.get("class") or []):
            continue  # Skip sub-header rows

        row_data = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]

        if len(row_data) != len(headers):
            print(f"⚠️ Skipping row due to length mismatch ({len(row_data)} vs {len(headers)}): {row_data[:5]}")
            continue

        data.append(row_data)

    if not data:
        raise ValueError("No valid rows found in table.")

    df = pd.DataFrame(data, columns=headers)

    # Clean column names
    df.columns = [col.replace("\n", " ").strip() for col in df.columns]

    return df

In [6]:
# Save DataFrame to CSV
def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` inside ``RAW_DATA_DIR`` and print the path.

    Args:
        df: Object with a ``to_csv`` method (a pandas DataFrame in this notebook).
        filename: Name of the CSV file to create under ``RAW_DATA_DIR``.
    """
    destination = os.path.join(RAW_DATA_DIR, filename)
    # index=False keeps the row index out of the file.
    df.to_csv(destination, index=False)
    print(f"✅ Saved data to {destination}")

In [7]:
# Main script runner
def main():
    """Fetch, parse and save the standard stats table for each entry in ``SEASONS``.

    Any exception raised while handling one season is printed and the loop
    moves on, so one failing season does not stop the remaining ones.
    """
    for season in SEASONS:
        season_url = SEASONS[season]
        output_name = f"brasileiro_serie_a_{season}_standard_stats.csv"
        try:
            stats_table = get_fbref_table(season_url)
            season_df = parse_table_to_df(stats_table)
            save_to_csv(season_df, output_name)
        except Exception as err:
            print(f"❌ Error for season {season}: {err}")


# Run the scrape when this code is executed as the main module.
if __name__ == "__main__":
    main()

  comment_soup = BeautifulSoup(str(comment), "html.parser")


✅ Saved data to ../data/raw/Big5/brasileiro_serie_a_2024_standard_stats.csv
✅ Saved data to ../data/raw/Big5/brasileiro_serie_a_2023_standard_stats.csv
✅ Saved data to ../data/raw/Big5/brasileiro_serie_a_2022_standard_stats.csv
✅ Saved data to ../data/raw/Big5/brasileiro_serie_a_2021_standard_stats.csv
