In [1]:
import os
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd



In [2]:
# Data directory configuration
# DATA_DIR is the folder name; RAW_DATA_DIR resolves to ../data/raw relative to the
# current working directory (presumably the notebook's own folder -- confirm).
DATA_DIR = "data"
RAW_DATA_DIR = os.path.join("..", DATA_DIR, "raw")
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [3]:
# FBref URL for Brazilian Serie A 2024 Standard Stats
# NOTE(review): the path contains no season segment, so the page served is presumably
# whatever FBref treats as the current season -- confirm it still matches the "2024"
# used in the output filename.
# The "#all_stats_standard" fragment is a browser-side anchor; it is not part of the
# HTTP request and does not select the table (table_id does that).
URL = "https://fbref.com/en/comps/24/stats/Serie-A-Stats#all_stats_standard"

In [4]:
# Function to extract table from FBref HTML comments
def get_fbref_table(url, table_id="stats_standard", timeout=30):
    """Download ``url`` and return the ``<table>`` element whose id is ``table_id``.

    The table is looked for inside the page's HTML comments first. If no comment
    contains it, the live document is searched as a fallback.

    Args:
        url: Address of the page to download.
        table_id: Value of the ``id`` attribute of the wanted ``<table>``.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The matching BeautifulSoup ``<table>`` tag.

    Raises:
        Exception: If the response status is not 200, or if no table with
            ``table_id`` exists in the comments or in the document itself.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Without an explicit timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url} with status code {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for comment in comments:
        # Only comments that actually carry table markup are worth re-parsing.
        # Skipping the rest saves work and keeps plain-text comments away from
        # BeautifulSoup, which warns when its input does not look like markup.
        if "<table" not in comment.lower():
            continue
        comment_soup = BeautifulSoup(str(comment), "html.parser")
        table = comment_soup.find("table", id=table_id)
        if table is not None:
            return table

    # Fallback: the table may be served un-commented in the document itself.
    table = soup.find("table", id=table_id)
    if table is not None:
        return table

    raise Exception(f"Could not find the table '{table_id}' in the HTML comments.")

In [None]:
# Function to parse HTML table into DataFrame
def parse_table_to_df(table):
    """Convert a parsed HTML ``<table>`` element into a pandas DataFrame of strings.

    Column names come from the second ``<tr>`` of ``<thead>`` (the first one is
    treated as a grouping row). When ``<thead>`` has a single row, that row is
    used instead. Body rows whose ``class`` contains ``"thead"`` are repeated
    sub-headers and are skipped. Rows shorter than the header are padded with
    ``None``; longer rows are reported via ``print`` and truncated.

    Args:
        table: A BeautifulSoup ``<table>`` tag (anything offering ``find``,
            ``find_all``, ``get`` and ``get_text`` in the same way works).

    Returns:
        pandas.DataFrame with one column per header cell and one row per data row.

    Raises:
        Exception: If the header or body is missing or malformed, if the header
            row has no cells, or if no data rows are found.
    """
    try:
        header_rows = table.find("thead").find_all("tr")
        # Prefer the second row; fall back to the only row for single-row headers.
        header_row = header_rows[1] if len(header_rows) > 1 else header_rows[0]
        headers = [th.get_text(strip=True) for th in header_row.find_all("th")]
    except (AttributeError, IndexError) as exc:
        # Chain the cause so the traceback still shows what was actually missing.
        raise Exception("Header not found or malformed.") from exc
    if not headers:
        raise Exception("Header contains no data.")

    tbody = table.find("tbody")
    if tbody is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise Exception("Table body not found or malformed.")

    n_cols = len(headers)
    data_rows = []
    for row in tbody.find_all("tr"):
        if "thead" in (row.get("class") or []):
            continue  # Skip sub-header rows

        row_data = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]

        # Pad row_data with None if it's shorter than headers
        if len(row_data) < n_cols:
            row_data.extend([None] * (n_cols - len(row_data)))
        elif len(row_data) > n_cols:
            print(f"⚠️ Row length exceeds header length ({len(row_data)} vs {len(headers)}): {row_data}")
            row_data = row_data[:n_cols]
        data_rows.append(row_data)

    if not data_rows:
        raise Exception("❌ Error: No valid rows found in table.")

    return pd.DataFrame(data_rows, columns=headers)

In [6]:
# Save DataFrame to CSV
def save_to_csv(df, filename):
    """Write ``df`` without its index to ``filename`` inside RAW_DATA_DIR.

    Prints the destination path once the file has been written.
    """
    destination = os.path.join(RAW_DATA_DIR, filename)
    df.to_csv(destination, index=False)
    print("✅ Saved data to {}".format(destination))

In [7]:
# Main script runner
def main():
    """Fetch the table at URL, convert it to a DataFrame and save it as CSV."""
    output_name = "serie_a_brazil_2024_standard_stats.csv"
    stats_df = parse_table_to_df(get_fbref_table(URL))
    save_to_csv(stats_df, output_name)


if __name__ == "__main__":
    main()

  comment_soup = BeautifulSoup(str(comment), "html.parser")


✅ Saved data to ../data/raw/serie_a_brazil_2024_standard_stats.csv
