In [None]:
import os
import re
from glob import glob
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup


HTML_DIR = "html"  # folder holding the saved pages: tm_scorers_1992.html, tm_scorers_1993.html, ...
OUT_CSV = "../data/transfermarkt_ucl_scorers_1992_2025.csv"  # path of the combined CSV written by main()


def to_int(text: str):
    """Return the digits found in ``text`` joined into a single int.

    Every non-digit character (separators, signs, units, whitespace) is
    dropped before conversion, so ``"1.234'"`` becomes ``1234``.
    Non-string values are converted with ``str()`` first.

    Returns None when ``text`` is None or contains no digit at all.
    """
    if text is None:
        return None

    raw = text if isinstance(text, str) else str(text)
    # Keep only the digit characters.
    only_digits = re.sub(r"[^\d]", "", raw)
    if not only_digits:
        return None
    return int(only_digits)


def parse_tm_scorers_html(html_path: str | Path) -> pd.DataFrame:
    """
    Parse one saved Transfermarkt top-scorers page into a DataFrame.

    Columns produced, in order: season_year, player_name, nationality,
    team_name, apps, assists, penalties, goals. A row is emitted for each
    direct ``<tr>`` child of the table body that carries the class ``odd``
    or ``even`` and has at least 12 direct ``<td>`` children.

    season_year is the value of the selected ``<option>`` inside
    ``<select name="saison_id">`` when that value is all digits; otherwise
    the first four-digit run in the file name; otherwise None.

    Returns an empty DataFrame (after printing a warning) when the page has
    no ``table.items`` or that table has no ``<tbody>``.
    """
    source = Path(html_path)
    print(f"➡️ Procesando {source}")

    document = BeautifulSoup(
        source.read_text(encoding="utf-8", errors="ignore"),
        "html.parser",
    )

    # 1) Season: prefer the page's own season selector ...
    season_year = None
    season_select = document.find("select", {"name": "saison_id"})
    chosen = season_select.find("option", selected=True) if season_select else None
    if chosen and chosen.get("value", "").isdigit():
        season_year = int(chosen["value"])

    # ... and fall back to a year embedded in the file name.
    if season_year is None:
        year_match = re.search(r"(\d{4})", source.name)
        if year_match:
            season_year = int(year_match.group(1))

    # 2) Main scorers table
    table = document.find("table", class_="items")
    if not table:
        print(f"   ⚠ No se encontró la tabla .items en {source}")
        return pd.DataFrame()

    body = table.find("tbody")
    if body is None:
        print(f"   ⚠ No se encontró <tbody> en {source}")
        return pd.DataFrame()

    # Expected column layout of a player row (direct <td> children):
    #   0 rank            1 nested table: image + name + position
    #   2 nationality     3 age
    #   4 club            5 appearances
    #   6 assists         7 penalties
    #   8 minutes played  9 minutes per goal
    #  10 goals per match 11 goals
    stat_columns = {"apps": 5, "assists": 6, "penalties": 7, "goals": 11}

    records = []
    for row in body.find_all("tr", recursive=False):
        # Player rows are the ones tagged odd/even; skip everything else.
        row_classes = row.get("class", [])
        if not ("odd" in row_classes or "even" in row_classes):
            continue

        # Direct children only, so the nested table's cells are not counted.
        cells = row.find_all("td", recursive=False)
        if len(cells) < 12:
            continue

        # ---- Player ----
        name_cell = cells[1]
        name_link = name_cell.select_one("td.hauptlink a")
        player_name = (
            name_link.get_text(strip=True)
            if name_link
            else name_cell.get_text(" ", strip=True)
        )

        # ---- Nationality: one flag image per country ----
        flag_titles = [
            flag["title"].strip()
            for flag in cells[2].select("img[title]")
            if flag.get("title")
        ]
        nationality = " / ".join(flag_titles)

        # ---- Club: link title, or the cell text if the markup differs ----
        team_cell = cells[4]
        team_link = team_cell.select_one("a[title]")
        if team_link and team_link.get("title"):
            team_name = team_link["title"].strip()
        else:
            team_name = team_cell.get_text(" ", strip=True)

        record = {
            "season_year": season_year,
            "player_name": player_name,
            "nationality": nationality,
            "team_name": team_name,
        }
        # ---- Stats ----
        for column, index in stat_columns.items():
            record[column] = to_int(cells[index].get_text(strip=True))

        records.append(record)

    return pd.DataFrame(records)


def main():
    """Parse every saved scorers page and write the combined table to OUT_CSV.

    Pages are the files matching ``tm_scorers_*.html`` inside HTML_DIR,
    processed in sorted order. Pages that yield no rows are skipped. If no
    file matches, or no page yields any row, a message is printed and
    nothing is written.
    """
    pattern = os.path.join(HTML_DIR, "tm_scorers_*.html")
    files = sorted(glob(pattern))

    if not files:
        print(f"❌ No se encontraron archivos HTML con patrón {pattern}")
        return

    all_dfs = []

    for f in files:
        df = parse_tm_scorers_html(f)
        if not df.empty:
            all_dfs.append(df)

    if not all_dfs:
        print("❌ No se ha generado ningún dato (¿las tablas cambian o están vacías?).")
        return

    df_all = pd.concat(all_dfs, ignore_index=True)

    # Create the directory the CSV is actually written to. The previous
    # hard-coded "data" folder did not match OUT_CSV ("../data/..."), so the
    # write could fail while an unused ./data directory was left behind.
    Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)

    df_all.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"\n✅ CSV de goleadores generado: {OUT_CSV}")
    print(df_all.head())


# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


➡️ Procesando html\tm_scorers_1992.html
➡️ Procesando html\tm_scorers_1993.html
➡️ Procesando html\tm_scorers_1994.html
➡️ Procesando html\tm_scorers_1995.html
➡️ Procesando html\tm_scorers_1996.html
➡️ Procesando html\tm_scorers_1997.html
➡️ Procesando html\tm_scorers_1998.html
➡️ Procesando html\tm_scorers_1999.html
➡️ Procesando html\tm_scorers_2000.html
➡️ Procesando html\tm_scorers_2001.html
➡️ Procesando html\tm_scorers_2002.html
➡️ Procesando html\tm_scorers_2003.html
➡️ Procesando html\tm_scorers_2004.html
➡️ Procesando html\tm_scorers_2005.html
➡️ Procesando html\tm_scorers_2006.html
➡️ Procesando html\tm_scorers_2007.html
➡️ Procesando html\tm_scorers_2008.html
➡️ Procesando html\tm_scorers_2009.html
➡️ Procesando html\tm_scorers_2010.html
➡️ Procesando html\tm_scorers_2011.html
➡️ Procesando html\tm_scorers_2012.html
➡️ Procesando html\tm_scorers_2013.html
➡️ Procesando html\tm_scorers_2014.html
➡️ Procesando html\tm_scorers_2015.html
➡️ Procesando html\tm_scorers_2016.html


In [None]:
import os
import re
from glob import glob
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup


HTML_DIR = "html"  # folder holding the saved pages: tm_goalkeepers_1992.html, tm_goalkeepers_1993.html, ...
OUT_CSV = "../data/transfermarkt_ucl_goalkeepers_1992_2025.csv"  # path of the combined CSV written by main()


def to_int(text: str):
    """Convert ``text`` to an int built from its digit characters only.

    Anything that is not a digit (symbols, commas, dots, spaces) is removed
    before conversion. Values that are not strings go through ``str()``
    first.

    Returns None for a None input or when no digit remains.
    """
    if text is None:
        return None

    cleaned = re.sub(r"[^\d]", "", text if isinstance(text, str) else str(text))
    return None if not cleaned else int(cleaned)


def parse_tm_goalkeepers_html(html_path: str | Path) -> pd.DataFrame:
    """
    Parse one saved Transfermarkt goalkeepers page into a DataFrame.

    Columns produced, in order: season_year, player_name, team_name,
    nationality, apps, clean_sheets, goals_conceded,
    minutes_per_goal_conceded. A row is emitted for each direct ``<tr>``
    child of the table body that carries the class ``odd`` or ``even`` and
    has at least 8 direct ``<td>`` children.

    season_year is the value of the selected ``<option>`` inside
    ``<select name="saison_id">`` when that value is all digits; otherwise
    the first four-digit run in the file name; otherwise None.

    Returns an empty DataFrame (after printing a warning) when the page has
    no ``table.items`` or that table has no ``<tbody>``.
    """
    source = Path(html_path)
    print(f"➡️ Procesando {source}")

    document = BeautifulSoup(
        source.read_text(encoding="utf-8", errors="ignore"),
        "html.parser",
    )

    # ---- 1) Season: page selector first, file name as fallback ----
    season_year = None
    season_select = document.find("select", {"name": "saison_id"})
    chosen = season_select.find("option", selected=True) if season_select else None
    if chosen and chosen.get("value", "").isdigit():
        season_year = int(chosen["value"])

    if season_year is None:
        year_match = re.search(r"(\d{4})", source.name)
        if year_match:
            season_year = int(year_match.group(1))

    # ---- 2) Goalkeepers table ----
    table = document.find("table", class_="items")
    if not table:
        print(f"   ⚠ No se encontró la tabla .items en {source}")
        return pd.DataFrame()

    body = table.find("tbody")
    if not body:
        print(f"   ⚠ No se encontró <tbody> en {source}")
        return pd.DataFrame()

    # Typical column layout of a goalkeeper row (direct <td> children):
    #   0 rank              1 nested table: image + name + club
    #   2 country (flag)    3 appearances
    #   4 clean sheets      5 goals conceded
    #   6 minutes played    7 minutes per goal conceded
    #   8 save percentage (not needed; recent pages have these 9 columns)
    stat_columns = {
        "apps": 3,
        "clean_sheets": 4,
        "goals_conceded": 5,
        "minutes_per_goal_conceded": 7,
    }

    records = []
    for row in body.find_all("tr", recursive=False):
        row_classes = row.get("class", [])
        if not ("odd" in row_classes or "even" in row_classes):
            continue

        cells = row.find_all("td", recursive=False)
        # Index 7 is the highest one read below.
        if len(cells) < 8:
            continue

        # ---- Player and club both live in cell 1 ----
        identity_cell = cells[1]
        cell_text = identity_cell.get_text(" ", strip=True)

        # Player name: link title, then link text, then the whole cell text.
        name_link = identity_cell.select_one("td.hauptlink a[title]")
        if name_link:
            player_name = name_link.get("title", "").strip() or name_link.get_text(strip=True)
        else:
            player_name = cell_text

        # Club: the link whose href contains "startseite/verein".
        team_link = identity_cell.select_one("a[href*='startseite/verein']")
        if team_link:
            team_name = team_link.get("title", "").strip() or team_link.get_text(strip=True)
        else:
            # Fallback in case the markup differs slightly.
            team_name = cell_text

        # ---- Country / nationality: one flag image per country ----
        flag_titles = [
            flag["title"].strip()
            for flag in cells[2].select("img[title]")
            if flag.get("title")
        ]

        record = {
            "season_year": season_year,
            "player_name": player_name,
            "team_name": team_name,
            "nationality": " / ".join(flag_titles),
        }
        # ---- Stats ----
        for column, index in stat_columns.items():
            record[column] = to_int(cells[index].get_text(strip=True))

        records.append(record)

    return pd.DataFrame(records)


def main():
    """Parse every saved goalkeepers page and write the combined table to OUT_CSV.

    Pages are the files matching ``tm_goalkeepers_*.html`` inside HTML_DIR,
    processed in sorted order. Pages that yield no rows are skipped. If no
    file matches, or no page yields any row, a message is printed and
    nothing is written.
    """
    pattern = os.path.join(HTML_DIR, "tm_goalkeepers_*.html")
    files = sorted(glob(pattern))

    if not files:
        print(f"❌ No se encontraron archivos HTML con patrón {pattern}")
        return

    all_dfs = []

    for f in files:
        df = parse_tm_goalkeepers_html(f)
        if not df.empty:
            all_dfs.append(df)

    if not all_dfs:
        print("❌ No se ha generado ningún dato (¿las tablas están vacías o la estructura es distinta?).")
        return

    df_all = pd.concat(all_dfs, ignore_index=True)

    # Create the directory the CSV is actually written to. The previous
    # hard-coded "data" folder did not match OUT_CSV ("../data/..."), so the
    # write could fail while an unused ./data directory was left behind.
    Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    df_all.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

    print(f"\n✅ CSV de porteros generado: {OUT_CSV}")
    print(df_all.head())


# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


➡️ Procesando html\tm_goalkeepers_1992.html
➡️ Procesando html\tm_goalkeepers_1993.html
➡️ Procesando html\tm_goalkeepers_1994.html
➡️ Procesando html\tm_goalkeepers_1995.html
➡️ Procesando html\tm_goalkeepers_1996.html
➡️ Procesando html\tm_goalkeepers_1997.html
➡️ Procesando html\tm_goalkeepers_1998.html
➡️ Procesando html\tm_goalkeepers_1999.html
➡️ Procesando html\tm_goalkeepers_2000.html
➡️ Procesando html\tm_goalkeepers_2001.html
➡️ Procesando html\tm_goalkeepers_2002.html
➡️ Procesando html\tm_goalkeepers_2003.html
➡️ Procesando html\tm_goalkeepers_2004.html
➡️ Procesando html\tm_goalkeepers_2005.html
➡️ Procesando html\tm_goalkeepers_2006.html
➡️ Procesando html\tm_goalkeepers_2007.html
➡️ Procesando html\tm_goalkeepers_2008.html
➡️ Procesando html\tm_goalkeepers_2009.html
➡️ Procesando html\tm_goalkeepers_2010.html
➡️ Procesando html\tm_goalkeepers_2011.html
➡️ Procesando html\tm_goalkeepers_2012.html
➡️ Procesando html\tm_goalkeepers_2013.html
➡️ Procesando html\tm_goalkeeper