In [1]:
import os
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd



In [2]:
# Where scraped files are stored: <parent of working dir>/data/raw
DATA_DIR = "data"
RAW_DATA_DIR = os.path.join(os.pardir, DATA_DIR, "raw")

# Create the output folder up front (no error if it already exists)
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [3]:
# FBref URL for Süper Lig 2024-2025 Standard Stats
# NOTE(review): the URL itself contains no season identifier — confirm it
# resolves to the 2024-2025 season and not whichever season is current.
URL = "https://fbref.com/en/comps/26/stats/Super-Lig-Stats#all_stats_standard"

In [None]:
# Function to download an FBref page and extract one stats table from it
def get_fbref_table(url, table_id="stats_standard", timeout=30):
    """Download ``url`` and return the ``<table>`` element with id ``table_id``.

    The table is looked up in the regular page markup first and then inside
    HTML comments, because FBref hides tables inside HTML comments.

    Args:
        url: Address of the page to download.
        table_id: ``id`` attribute of the table to extract.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The BeautifulSoup tag of the matching ``<table>``.

    Raises:
        Exception: If the response status is not 200 or no matching table
            is found. Network errors raised by ``requests`` propagate as-is.
    """
    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url} with status code {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    # Use the table directly when it is present in the live markup.
    table = soup.find("table", id=table_id)
    if table is not None:
        return table

    # FBref hides tables inside HTML comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for comment in comments:
        # Only re-parse comments that mention the table id: this skips the
        # cost of parsing unrelated comments and keeps non-HTML comment text
        # away from the parser.
        if table_id not in comment:
            continue
        comment_soup = BeautifulSoup(str(comment), "html.parser")
        table = comment_soup.find("table", id=table_id)
        if table is not None:
            return table

    raise Exception(f"Could not find the '{table_id}' table in the page or its comments.")

In [None]:
# Function to parse HTML table into DataFrame
def parse_table_to_df(table):
    """Convert a parsed HTML ``<table>`` element into a DataFrame of strings.

    Column labels come from the ``<th>`` cells of the last row in ``<thead>``.
    Body rows carrying the CSS class ``thead`` (repeated sub-headers) are
    skipped, and rows whose cell count differs from the header count are
    reported with ``print`` and skipped.

    Args:
        table: Table element exposing the BeautifulSoup tag API
            (``find``, ``find_all``, ``get``, ``get_text``).

    Returns:
        pandas.DataFrame with one row per kept body row; every value is the
        stripped cell text (no numeric conversion is done here).

    Raises:
        ValueError: If the table has no header row, no ``<tbody>``, or no
            body row that matches the header length.
    """
    # Fail with a clear message instead of an AttributeError/IndexError when
    # the table lacks the expected sections.
    thead = table.find("thead")
    header_rows = thead.find_all("tr") if thead is not None else []
    if not header_rows:
        raise ValueError("Table has no header row (<thead> containing <tr>).")

    # With stacked header rows, the last one holds the per-column labels.
    # NOTE(review): labels are used as-is, so repeated labels yield duplicate
    # column names — confirm whether downstream code needs them de-duplicated.
    headers = [th.get_text(strip=True) for th in header_rows[-1].find_all("th")]

    tbody = table.find("tbody")
    if tbody is None:
        raise ValueError("Table has no <tbody> section.")

    data = []

    for row in tbody.find_all("tr"):
        if "thead" in (row.get("class") or []):
            continue  # Skip sub-header rows

        cells = row.find_all(["th", "td"])
        row_data = [cell.get_text(strip=True) for cell in cells]

        if len(row_data) != len(headers):
            print(f"⚠️ Skipping row due to length mismatch ({len(row_data)} vs {len(headers)}): {row_data[:5]}")
            continue

        data.append(row_data)

    if not data:
        raise ValueError("No valid rows found in table.")

    df = pd.DataFrame(data, columns=headers)

    # Clean column names
    df.columns = [col.replace("\n", " ").strip() for col in df.columns]

    return df

In [None]:
# Function to write a DataFrame to a CSV file in the raw data directory
def save_to_csv(df, filename):
    """Write ``df`` as CSV (without the index) to ``RAW_DATA_DIR/filename``.

    Prints the destination path once the file has been written.
    """
    destination = os.path.join(RAW_DATA_DIR, filename)
    df.to_csv(destination, index=False)
    print(f"✅ Saved data to {destination}")

In [None]:
# Entry point: download the table, convert it, and store it as CSV
def main():
    """Run fetch -> parse -> save for ``URL``.

    Any exception raised along the way is caught and reported with
    ``print``; nothing is re-raised.
    """
    output_name = "super_lig_Turkye_2024_2025_stats.csv"
    try:
        stats_table = get_fbref_table(URL)
        stats_df = parse_table_to_df(stats_table)
        save_to_csv(stats_df, output_name)
    except Exception as err:
        print(f"❌ Error: {err}")


if __name__ == "__main__":
    main()

  comment_soup = BeautifulSoup(str(comment), "html.parser")


✅ Saved data to ../data/raw/super_lig_Turkye_2024_2025_stats.csv
