## Voy pegando aquí abajo el código útil

In [4]:
from pathlib import Path
import os

# Establece la raíz del proyecto manualmente
project_root = Path("F:/JCMDataCenter/Cursos/Evolve Academy/Data Scientist IA/Futpeak") # sobremesa
#project_root = Path("C:/Users/juanm/Desktop/FUTPEAK/Futpeak") # portátil

# Cambia el directorio de trabajo actual a esa raíz
os.chdir(project_root)

print("📁 Directorio de trabajo actual:", Path.cwd())

📁 Directorio de trabajo actual: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


In [None]:
country_urls = {
    "Argentina": "https://fbref.com/en/country/players/ARG/Argentina-Football-Players",
    "Brazil": "https://fbref.com/en/country/players/BRA/Brazil-Football-Players",
    "France": "https://fbref.com/en/country/players/FRA/France-Football-Players",
    "Spain": "https://fbref.com/en/country/players/ESP/Spain-Football-Players",
    "England": "https://fbref.com/en/country/players/ENG/England-Football-Players",
    "Germany": "https://fbref.com/en/country/players/GER/Germany-Football-Players",
    "Italy": "https://fbref.com/en/country/players/ITA/Italy-Football-Players",
    "Belgium": "https://fbref.com/en/country/players/BEL/Belgium-Football-Players",
    "Netherlands": "https://fbref.com/en/country/players/NED/Netherlands-Football-Players",
    "Portugal": "https://fbref.com/en/country/players/POR/Portugal-Football-Players"
}

In [None]:
# Scraping de jugadores de top 10 países FIFA para crear el yaml

import yaml
import time
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
BASE_URL = "https://fbref.com"
OUTPUT_YAML = Path("data/meta/top_10_countries_players.yaml")
OUTPUT_YAML.parent.mkdir(parents=True, exist_ok=True)
WAIT_PER_PAGE = 10
WAIT_BETWEEN_COUNTRIES = 15

# === SETUP SELENIUM (visible) ===
options = Options()
# Comment out to see browser:
# options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === INIT YAML clean ===
with open(OUTPUT_YAML, "w", encoding="utf-8") as f:
    yaml.dump([], f)

# === SCRAPE ALL COUNTRIES ===
total_players = 0
consecutive_fails = 0

for country, url in country_urls.items():
    print(f"\n🌍 Opening: {country}...")
    driver.get(url)
    print(f"⏳ Waiting {WAIT_PER_PAGE}s to load or solve CAPTCHA...")
    time.sleep(WAIT_PER_PAGE)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Dynamically find player section
    section = None
    for candidate in soup.select("div.section_content"):
        p_tags = candidate.find_all("p")
        valid = [
            p for p in p_tags
            if p.find("a", href=True) and "/en/players/" in p.find("a")["href"]
        ]
        if len(valid) > 10:
            section = candidate
            break

    if not section:
        print(f"⚠️ Could not find player block for {country}. Skipping.")
        consecutive_fails += 1
        if consecutive_fails >= 3:
            print("⛔ Too many skips. Stopping early.")
            break
        continue

    consecutive_fails = 0
    player_tags = section.find_all("p")
    print(f"🔍 Found {len(player_tags)} players in {country}")

    players = []

    for tag in player_tags:
        a = tag.find("a")
        if not a or not a.get("href"):
            continue

        name_raw = a.text.strip()
        href = a["href"].strip()
        full_url = BASE_URL + href

        try:
            player_id = href.split("/")[3]
        except IndexError:
            player_id = None

        trailing = tag.get_text(separator=" ").replace(name_raw, "").strip()
        parts = [p.strip() for p in trailing.split("·")]

        players.append({
            "id": player_id,
            "name": name_raw.replace(" ", "_").lower(),
            "display_name": name_raw,
            "url": full_url,
            "years": parts[0] if len(parts) > 0 else None,
            "position": parts[1] if len(parts) > 1 else None,
            "clubs": parts[2] if len(parts) > 2 else None,
            "country": country
        })

    # Save incrementally
    with open(OUTPUT_YAML, "r", encoding="utf-8") as f:
        current_data = yaml.safe_load(f) or []

    current_data.extend(players)

    with open(OUTPUT_YAML, "w", encoding="utf-8") as f:
        yaml.dump(current_data, f, allow_unicode=True)

    total_players += len(players)
    print(f"✅ {country} done. Total so far: {total_players}")
    time.sleep(WAIT_BETWEEN_COUNTRIES)

driver.quit()
print(f"\n💾 Saved all players to: {OUTPUT_YAML}")


In [None]:
# Get all players from 2014 and with 10+ years career

# 📦 Imports
import yaml
import pandas as pd
from pathlib import Path

# === Paths ===
input_path = Path("data/meta/top_10_countries_players.yaml")
output_path = Path("data/meta/top_10_countries_players_filtered.yaml")

# === Load YAML ===
with open(input_path, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Filter players
filtered_players = []

for p in all_players:
    years = p.get("years", "")
    if not years or "-" not in years:
        continue
    try:
        start, end = years.split("-")
        start_year = int(start)
        end_year = int(end)
        if start_year >= 2014 and (end_year - start_year + 1) >= 10:
            filtered_players.append(p)
    except:
        continue

# === Save result
with open(output_path, "w", encoding="utf-8") as f:
    yaml.dump(filtered_players, f, allow_unicode=True)

print(f"✅ Total filtered players: {len(filtered_players)}")
print(f"📄 Saved to: {output_path}")


In [None]:
# Código scraping matchlogs fbref para seguir desde un jugador con un ID específico

# 📦 Imports
import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
INPUT_YAML = Path("data/meta/top_10_countries_players_filtered.yaml")
OUTPUT_CSV = Path("data/raw/top_10_countries_matchlogs_filtered.csv")
RESUME_FROM_ID = "42fd9c7f" # Kylian Mbappé
SLEEP_TIME = 8

# === Load YAML players ===
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Find resume point
resume_index = 0
for i, player in enumerate(all_players):
    if player.get("id") == RESUME_FROM_ID:
        resume_index = i
        break
remaining_players = all_players[resume_index:]
print(f"🔁 Resuming from index {resume_index}: {remaining_players[0]['name']}")

# === Setup Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Check if CSV exists
output_exists = OUTPUT_CSV.exists()
if not output_exists:
    pd.DataFrame().to_csv(OUTPUT_CSV, index=False)

# === Scraping from RESUME point onward ===
for player in remaining_players:
    player_id = player.get("id")
    player_name = player.get("name")
    profile_url = player.get("url")
    if not player_id or not player_name or not profile_url:
        continue

    print(f"\n🧍 Scraping {player_name} | ID: {player_id}")

    try:
        driver.get(profile_url)
        time.sleep(SLEEP_TIME)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # === Extract summary match log seasons
        summary_section = soup.find("div", id="inner_nav")
        matchlog_links = []
        if summary_section:
            matchlog_links = {
                a["href"] for a in summary_section.find_all("a", href=True)
                if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
        }


        if not matchlog_links:
            print("   ⚠️ No Match Logs (Summary) seasons found.")
            continue

        player_rows = []

        for rel_url in matchlog_links:
            full_url = "https://fbref.com" + rel_url
            season = rel_url.split("/matchlogs/")[1].split("/")[0]
            print(f"   🔍 {season} — {full_url}")

            try:
                driver.get(full_url)
                time.sleep(SLEEP_TIME)
                sub_soup = BeautifulSoup(driver.page_source, "html.parser")
                table = sub_soup.find("table", {"id": "matchlogs_all"})

                if table:
                    header_rows = table.find("thead").find_all("tr")
                    final_header = header_rows[-1]
                    columns = [th.text.strip() for th in final_header.find_all("th")]

                    for row in table.find("tbody").find_all("tr"):
                        if "class" in row.attrs and "thead" in row["class"]:
                            continue
                        cells = row.find_all(["th", "td"])
                        values = [cell.text.strip() for cell in cells]
                        row_data = dict(zip(columns, values))
                        row_data["season"] = season
                        row_data["player_name"] = player_name
                        row_data["player_id"] = player_id
                        player_rows.append(row_data)
                else:
                    print("      ⚠️ No matchlogs_all table in season.")

            except Exception as e:
                print(f"      ❌ Error processing season {season}: {e}")

        if player_rows:
            df_player = pd.DataFrame(player_rows)
            df_player.to_csv(OUTPUT_CSV, mode="a", index=False, header=not output_exists)
            output_exists = True
            print(f"   ✅ {len(df_player)} rows added for {player_name}")
        else:
            print(f"   🚫 No data scraped for {player_name}")

    except Exception as e:
        print(f"❌ Error with {player_name}: {e}")

driver.quit()
print(f"\n📦 Done. Appended new match logs to: {OUTPUT_CSV}")

In [None]:
# Scraping metadata players filtered

# 📦 Imports
import yaml
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from pathlib import Path
import random

# === Paths ===
YAML_PATH = "data/meta/top_10_countries_players_filtered.yaml"
OUTPUT_FILE = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

# === Load YAML
with open(YAML_PATH, "r", encoding="utf-8") as f:
    players = yaml.safe_load(f)

# === Set the last scraped URL (for continuation)
# Leave it empty "" to scrape from the beginning
last_scraped_url = ""

# === Find starting point
start_index = 0  # By default start from 0

if last_scraped_url:
    for idx, player in enumerate(players):
        if player["url_template"] == last_scraped_url:
            start_index = idx + 1  # Start AFTER the last scraped player
            break
    else:
        raise ValueError("❌ last_scraped_url not found in male_players.yaml!")

players = players[start_index:]  # Only keep players after last scraped

print(f"🚀 Starting scraping from index {start_index} ({players[0]['name']})")

# === Setup Selenium
options = Options()
# options.add_argument("--headless")  # Optional: hide browser if you want
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Metadata extraction
def extract_metadata(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    meta = soup.find("div", id="meta")
    if not meta:
        return {}

    try:
        full_name = meta.find("p").text.strip()
    except:
        full_name = None

    position = footed = None
    try:
        pos_block = meta.find("strong", string="Position:").parent
        if pos_block:
            text = pos_block.get_text(separator="|")
            parts = text.split("|")
            position = parts[1].strip() if len(parts) > 1 else None
            footed = parts[3].strip() if "Footed:" in text and len(parts) > 3 else None
    except:
        pass

    birth_date = age = birth_place = None
    try:
        birth_tag = meta.find("strong", string="Born:")
        if birth_tag:
            birth_block = birth_tag.parent
            date_span = birth_block.find("span")
            if date_span:
                birth_date = date_span.get("data-birth")
                if not birth_date:
                    raw_text = date_span.text.strip()
                    try:
                        birth_date = pd.to_datetime(raw_text).strftime("%Y-%m-%d")
                    except:
                        birth_date = None

            nobr = birth_block.find("nobr")
            if nobr:
                raw_age = nobr.text
                match = re.search(r"Age:\s*([\d\-]+)", raw_age)
                age = match.group(1) if match else None

            birth_place_span = nobr.find_next("span") if nobr else None
            if birth_place_span:
                birth_place = birth_place_span.text.strip()
    except:
        pass

    nationality = None
    try:
        nat_tag = meta.find("strong", string="National Team:")
        if nat_tag:
            nationality = nat_tag.find_next("a").text.strip()
    except:
        pass

    if not nationality:
        try:
            citizen_tag = meta.find("strong", string="Citizenship:")
            if citizen_tag:
                nationality = citizen_tag.find_next("a").text.strip()
        except:
            pass

    club = None
    try:
        club_tag = meta.find("strong", string="Club:")
        if club_tag:
            club = club_tag.find_next("a").text.strip()
    except:
        pass

    return {
        "full_name": full_name,
        "position": position,
        "footed": footed,
        "birth_date": birth_date,
        "age": age,
        "birth_place": birth_place,
        "nationality": nationality,
        "club": club
    }

# === Create output file if not exists
if not OUTPUT_FILE.exists():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(columns=[
        "player_name", "url_template", "full_name", "position", "footed",
        "birth_date", "age", "birth_place", "nationality", "club"
    ]).to_csv(OUTPUT_FILE, index=False)

# === Main loop
for i, player in enumerate(players, start=start_index + 1):
    player_name = player["name"]
    player_url = player["url_template"]

    print(f"\n🔍 [{i}] Scraping: {player_url}")

    retries = 0
    max_retries = 3
    success = False

    while retries < max_retries and not success:
        try:
            driver.get(player_url)
            sleep_time = random.uniform(8, 12)
            print(f"⏳ Waiting {sleep_time:.2f} seconds after loading...")
            time.sleep(sleep_time)

            data = extract_metadata(driver)
            if not data:
                print(f"⚠️ No metadata found for {player_name}")
                break

            data["player_name"] = player_name
            data["url_template"] = player_url

            pd.DataFrame([data]).to_csv(OUTPUT_FILE, mode="a", header=False, index=False)
            print(f"✅ Saved metadata for {player_name}")
            success = True

        except WebDriverException as e:
            if "ERR_INTERNET_DISCONNECTED" in str(e):
                retries += 1
                print(f"⚠️ Internet disconnected. Retrying ({retries}/{max_retries})...")
                time.sleep(10)
            else:
                print(f"❌ WebDriver error: {e}")
                break

driver.quit()
print(f"\n💾 Done! Full metadata saved to: {OUTPUT_FILE}")

In [None]:
# Test scraping Mbappé

# 📦 Imports
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"  # Ajusta si hace falta
OUTPUT_CSV = Path("data/debug/mbappe_matchlogs_raw.csv")
SLEEP_TIME = 8

# === Mbappé Info
player_id = "42fd9c7f"
player_name = "kylian_mbappe"
profile_url = "https://fbref.com/en/players/42fd9c7f/matchlogs/"

# === Selenium Setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Scrape Mbappé
driver.get(profile_url)
time.sleep(SLEEP_TIME)
soup = BeautifulSoup(driver.page_source, "html.parser")

summary_section = soup.find("div", id="inner_nav")
matchlog_links = []
if summary_section:
    matchlog_links = {
        a["href"] for a in summary_section.find_all("a", href=True)
        if "/matchlogs/" in a["href"]
        and "Match-Logs" in a["href"]
        and "summary" in a["href"].lower() 
    }

if not matchlog_links:
    print("⚠️ No Match Logs (Summary) seasons found.")
else:
    player_rows = []

    for rel_url in matchlog_links:
        full_url = "https://fbref.com" + rel_url
        season = rel_url.split("/matchlogs/")[1].split("/")[0]
        print(f"🔍 {season} — {full_url}")

        try:
            driver.get(full_url)
            time.sleep(SLEEP_TIME)
            sub_soup = BeautifulSoup(driver.page_source, "html.parser")
            table = sub_soup.find("table", {"id": "matchlogs_all"})

            if table:
                header_rows = table.find("thead").find_all("tr")
                final_header = header_rows[-1]
                columns = [th.text.strip() for th in final_header.find_all("th")]

                for row in table.find("tbody").find_all("tr"):
                    if "class" in row.attrs and "thead" in row["class"]:
                        continue
                    cells = row.find_all(["th", "td"])
                    values = [cell.text.strip() for cell in cells]
                    row_data = dict(zip(columns, values))

                    # ✅ Añadir datos clave al principio
                    row_data_ordered = {
                        "player_name": player_name,
                        "player_id": player_id,
                        "season": season
                    }
                    row_data_ordered.update(row_data)
                    player_rows.append(row_data_ordered)
            else:
                print("   ⚠️ No matchlogs_all table found.")

        except Exception as e:
            print(f"   ❌ Error processing season {season}: {e}")

    if player_rows:
        df_mbappe = pd.DataFrame(player_rows)
        OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

        # ✅ Asegurar orden de columnas
        ordered_cols = ["player_name", "player_id", "season"]
        remaining_cols = [col for col in df_mbappe.columns if col not in ordered_cols]
        df_mbappe = df_mbappe[ordered_cols + remaining_cols]

        df_mbappe.to_csv(OUTPUT_CSV, index=False)
        print(f"\n✅ Guardado: {len(df_mbappe)} filas en {OUTPUT_CSV}")
    else:
        print("🚫 No se extrajeron datos para Mbappé.")

driver.quit()



🔍 2022-2023 — https://fbref.com/en/players/42fd9c7f/matchlogs/2022-2023/summary/Kylian-Mbappe-Match-Logs
🔍 2023-2024 — https://fbref.com/en/players/42fd9c7f/matchlogs/2023-2024/summary/Kylian-Mbappe-Match-Logs
🔍 2024-2025 — https://fbref.com/en/players/42fd9c7f/matchlogs/2024-2025/summary/Kylian-Mbappe-Match-Logs
🔍 2019-2020 — https://fbref.com/en/players/42fd9c7f/matchlogs/2019-2020/summary/Kylian-Mbappe-Match-Logs
🔍 2018-2019 — https://fbref.com/en/players/42fd9c7f/matchlogs/2018-2019/summary/Kylian-Mbappe-Match-Logs
🔍 2016-2017 — https://fbref.com/en/players/42fd9c7f/matchlogs/2016-2017/summary/Kylian-Mbappe-Match-Logs
🔍 2017-2018 — https://fbref.com/en/players/42fd9c7f/matchlogs/2017-2018/summary/Kylian-Mbappe-Match-Logs
🔍 2021-2022 — https://fbref.com/en/players/42fd9c7f/matchlogs/2021-2022/summary/Kylian-Mbappe-Match-Logs
🔍 2015-2016 — https://fbref.com/en/players/42fd9c7f/matchlogs/2015-2016/summary/Kylian-Mbappe-Match-Logs
🔍 2020-2021 — https://fbref.com/en/players/42fd9c7f/mat

In [None]:
# DEFINITIVO, 100% REAL NO FAKE

# Scraping matchlogs FBref desde YAML FILTERED (alineado y reanudable)

import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
INPUT_YAML = Path("data/meta/top_10_countries_players_filtered.yaml")
OUTPUT_CSV = Path("data/raw/top_10_countries_matchlogs_filtered.csv")
RESUME_FROM_ID = "fd58113c"  # Cambia si quieres reanudar desde otro jugador
SLEEP_TIME = 8
LIMIT_PLAYERS = None  # Usa None para recorrer todos los jugadores

# === Column definitions
stat_columns = [
    "Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start",
    "Pos", "Min", "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR", "Fls", "Fld", "Off",
    "Crs", "TklW", "Int", "OG", "PKwon", "PKcon", "Touches", "Tkl", "Blocks", "xG", "npxG", "xAG",
    "SCA", "GCA", "Cmp", "Att", "Cmp%", "PrgP", "Carries", "PrgC", "Succ"
]
final_columns = ["player_name", "player_id", "season"] + stat_columns

# === Load YAML
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Resume from specific ID
resume_index = 0
for i, player in enumerate(all_players):
    if player.get("id") == RESUME_FROM_ID:
        resume_index = i
        break
remaining_players = all_players[resume_index:]
if LIMIT_PLAYERS:
    remaining_players = remaining_players[:LIMIT_PLAYERS]

print(f"🔁 Resuming from index {resume_index}: {remaining_players[0]['name']}")

# === Selenium setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Check if CSV exists and whether it is empty
output_exists = OUTPUT_CSV.exists()
if output_exists:
    try:
        existing_df = pd.read_csv(OUTPUT_CSV, dtype=str)
        output_exists = not existing_df.empty
    except Exception:
        output_exists = False

if not output_exists:
    pd.DataFrame(columns=final_columns).to_csv(OUTPUT_CSV, index=False)

# === Scraping loop
for player in remaining_players:
    player_id = player.get("id")
    player_name = player.get("name")
    profile_url = player.get("url")
    if not player_id or not player_name or not profile_url:
        continue

    print(f"\n🧍 Scraping {player_name} | ID: {player_id}")
    try:
        driver.get(profile_url)
        time.sleep(SLEEP_TIME)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # === Get matchlog links (all seasons)
        summary_section = soup.find("div", id="inner_nav")
        matchlog_links = {
            a["href"] for a in summary_section.find_all("a", href=True)
            if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
        } if summary_section else set()

        if not matchlog_links:
            print("   ⚠️ No match logs found.")
            continue

        player_rows = []

        for rel_url in matchlog_links:
            full_url = "https://fbref.com" + rel_url
            season = rel_url.split("/matchlogs/")[1].split("/")[0]
            print(f"   🔍 {season} — {full_url}")

            try:
                driver.get(full_url)
                time.sleep(SLEEP_TIME)
                sub_soup = BeautifulSoup(driver.page_source, "html.parser")
                table = sub_soup.find("table", {"id": "matchlogs_all"})

                if table:
                    header = table.find("thead").find_all("tr")[-1]
                    columns = [th.text.strip() for th in header.find_all("th")]

                    for row in table.find("tbody").find_all("tr"):
                        if "class" in row.attrs and "thead" in row["class"]:
                            continue
                        cells = row.find_all(["th", "td"])
                        values = [cell.text.strip() for cell in cells]
                        raw_data = dict(zip(columns, values))

                        aligned = {
                            "player_name": player_name,
                            "player_id": player_id,
                            "season": season
                        }
                        for col in stat_columns:
                            aligned[col] = raw_data.get(col, "")
                        player_rows.append(aligned)
                else:
                    print("      ⚠️ No table found.")

            except Exception as e:
                print(f"      ❌ Error scraping season {season}: {e}")

        if player_rows:
            df_player = pd.DataFrame(player_rows)[final_columns]
            df_player.to_csv(OUTPUT_CSV, mode="a", index=False, header=not output_exists)
            output_exists = True
            print(f"   ✅ {len(df_player)} rows added for {player_name}")
        else:
            print(f"   🚫 No data for {player_name}")

    except Exception as e:
        print(f"❌ Error with {player_name}: {e}")

driver.quit()
print(f"\n📦 Done. Appended logs to: {OUTPUT_CSV}")





🔁 Resuming from index 2416: lewis_kinsella

🧍 Scraping lewis_kinsella | ID: fd58113c
   🔍 2019-2020 — https://fbref.com/en/players/fd58113c/matchlogs/2019-2020/summary/Lewis-Kinsella-Match-Logs
   🔍 2014-2015 — https://fbref.com/en/players/fd58113c/matchlogs/2014-2015/summary/Lewis-Kinsella-Match-Logs
   🔍 2023-2024 — https://fbref.com/en/players/fd58113c/matchlogs/2023-2024/summary/Lewis-Kinsella-Match-Logs
   🔍 2016-2017 — https://fbref.com/en/players/fd58113c/matchlogs/2016-2017/summary/Lewis-Kinsella-Match-Logs
   🔍 2020-2021 — https://fbref.com/en/players/fd58113c/matchlogs/2020-2021/summary/Lewis-Kinsella-Match-Logs
   🔍 2021-2022 — https://fbref.com/en/players/fd58113c/matchlogs/2021-2022/summary/Lewis-Kinsella-Match-Logs
   🔍 2018-2019 — https://fbref.com/en/players/fd58113c/matchlogs/2018-2019/summary/Lewis-Kinsella-Match-Logs
   🔍 2017-2018 — https://fbref.com/en/players/fd58113c/matchlogs/2017-2018/summary/Lewis-Kinsella-Match-Logs
   🔍 2022-2023 — https://fbref.com/en/playe

KeyboardInterrupt: 

In [18]:
# DEFINITIVO, 100% REAL NO FAKE

# Scraping matchlogs FBref desde YAML (alineado y reanudable)

# JUGADORES JÓVENES (FUNCIONAL)

import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
INPUT_YAML = Path("data/meta/top_10_countries_players.yaml")
OUTPUT_CSV = Path("data/raw/top_10_countries_matchlogs_young_players.csv")
RESUME_FROM_ID = "6b9960cf"  # Cambia si quieres reanudar desde otro jugador
SLEEP_TIME = 12
LIMIT_PLAYERS = 1  # Usa None para recorrer todos los jugadores

# === Column definitions
stat_columns = [
    "Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start",
    "Pos", "Min", "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR", "Fls", "Fld", "Off",
    "Crs", "TklW", "Int", "OG", "PKwon", "PKcon", "Touches", "Tkl", "Blocks", "xG", "npxG", "xAG",
    "SCA", "GCA", "Cmp", "Att", "Cmp%", "PrgP", "Carries", "PrgC", "Succ"
]
final_columns = ["player_name", "player_id", "season"] + stat_columns

# === Load YAML
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Resume from specific ID
resume_index = 0
for i, player in enumerate(all_players):
    if player.get("id") == RESUME_FROM_ID:
        resume_index = i
        break
remaining_players = all_players[resume_index:]
if LIMIT_PLAYERS:
    remaining_players = remaining_players[:LIMIT_PLAYERS]

print(f"🔁 Resuming from index {resume_index}: {remaining_players[0]['name']}")

# === Selenium setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Check if CSV exists and whether it is empty
output_exists = OUTPUT_CSV.exists()
if output_exists:
    try:
        existing_df = pd.read_csv(OUTPUT_CSV, dtype=str)
        output_exists = not existing_df.empty
    except Exception:
        output_exists = False

if not output_exists:
    pd.DataFrame(columns=final_columns).to_csv(OUTPUT_CSV, index=False)

# === Scraping loop
for player in remaining_players:
    player_id = player.get("id")
    player_name = player.get("name")
    profile_url = player.get("url")
    if not player_id or not player_name or not profile_url:
        continue

    print(f"\n🧍 Scraping {player_name} | ID: {player_id}")
    try:
        driver.get(profile_url)
        time.sleep(SLEEP_TIME)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # === Get matchlog links (all seasons)
        summary_section = soup.find("div", id="inner_nav")
        matchlog_links = {
            a["href"] for a in summary_section.find_all("a", href=True)
            if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
        } if summary_section else set()

        if not matchlog_links:
            print("   ⚠️ No match logs found.")
            continue

        player_rows = []

        for rel_url in matchlog_links:
            full_url = "https://fbref.com" + rel_url
            season = rel_url.split("/matchlogs/")[1].split("/")[0]
            print(f"   🔍 {season} — {full_url}")

            try:
                driver.get(full_url)
                time.sleep(SLEEP_TIME)
                sub_soup = BeautifulSoup(driver.page_source, "html.parser")
                table = sub_soup.find("table", {"id": "matchlogs_all"})

                if table:
                    header = table.find("thead").find_all("tr")[-1]
                    columns = [th.text.strip() for th in header.find_all("th")]

                    for row in table.find("tbody").find_all("tr"):
                        if "class" in row.attrs and "thead" in row["class"]:
                            continue
                        cells = row.find_all(["th", "td"])
                        values = [cell.text.strip() for cell in cells]
                        raw_data = dict(zip(columns, values))

                        aligned = {
                            "player_name": player_name,
                            "player_id": player_id,
                            "season": season
                        }
                        for col in stat_columns:
                            aligned[col] = raw_data.get(col, "")
                        player_rows.append(aligned)
                else:
                    print("      ⚠️ No table found.")

            except Exception as e:
                print(f"      ❌ Error scraping season {season}: {e}")

        if player_rows:
            df_player = pd.DataFrame(player_rows)[final_columns]
            df_player.to_csv(OUTPUT_CSV, mode="a", index=False, header=not output_exists)
            output_exists = True
            print(f"   ✅ {len(df_player)} rows added for {player_name}")
        else:
            print(f"   🚫 No data for {player_name}")

    except Exception as e:
        print(f"❌ Error with {player_name}: {e}")

driver.quit()
print(f"\n📦 Done. Appended logs to: {OUTPUT_CSV}")

🔁 Resuming from index 30907: warren_zaïre-emery

🧍 Scraping warren_zaïre-emery | ID: 6b9960cf
   🔍 2022-2023 — https://fbref.com/en/players/6b9960cf/matchlogs/2022-2023/summary/Warren-Zaire-Emery-Match-Logs
   🔍 2024-2025 — https://fbref.com/en/players/6b9960cf/matchlogs/2024-2025/summary/Warren-Zaire-Emery-Match-Logs
   🔍 2023-2024 — https://fbref.com/en/players/6b9960cf/matchlogs/2023-2024/summary/Warren-Zaire-Emery-Match-Logs
   🔍 nat_tm — https://fbref.com/en/players/6b9960cf/matchlogs/nat_tm/summary/Warren-Zaire-Emery-Match-Logs
   ✅ 184 rows added for warren_zaïre-emery

📦 Done. Appended logs to: data\raw\top_10_countries_matchlogs_young_players.csv


In [None]:
# Scraping metadata young players (FUNCIONAL) 

# 📦 Imports
import yaml
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from pathlib import Path
import random

# === Paths ===
YAML_PATH = "data/meta/top_10_countries_players.yaml"
OUTPUT_FILE = Path("data/raw/future_stars_raw_metadata.csv")
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

# === Load YAML
with open(YAML_PATH, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# 🎯 Filter only young players by Player_ID
target_id = "829aa60c"
players = [p for p in all_players if target_id in p["url"]]

if not players:
    raise ValueError(f"❌ No player found with ID {target_id} in the YAML.")
else:
    print(f"✅ Found player: {players[0]['name']} ({players[0]['url']})")

# === Set the last scraped URL (for continuation)
# Leave it empty "" to scrape from the beginning
last_scraped_url = ""

# === Find starting point
start_index = 0  # By default start from 0

if last_scraped_url:
    for idx, player in enumerate(players):
        if player["url_template"] == last_scraped_url:
            start_index = idx + 1  # Start AFTER the last scraped player
            break
    else:
        raise ValueError("❌ last_scraped_url not found in male_players.yaml!")

players = players[start_index:]  # Only keep players after last scraped

print(f"🚀 Starting scraping from index {start_index} ({players[0]['name']})")

# === Setup Selenium
options = Options()
# options.add_argument("--headless")  # Optional: hide browser if you want
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Metadata extraction
def extract_metadata(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    meta = soup.find("div", id="meta")
    if not meta:
        return {}

    try:
        full_name = meta.find("p").text.strip()
    except:
        full_name = None

    position = footed = None
    try:
        pos_block = meta.find("strong", string="Position:").parent
        if pos_block:
            text = pos_block.get_text(separator="|")
            parts = text.split("|")
            position = parts[1].strip() if len(parts) > 1 else None
            footed = parts[3].strip() if "Footed:" in text and len(parts) > 3 else None
    except:
        pass

    birth_date = age = birth_place = None
    try:
        birth_tag = meta.find("strong", string="Born:")
        if birth_tag:
            birth_block = birth_tag.parent
            date_span = birth_block.find("span")
            if date_span:
                birth_date = date_span.get("data-birth")
                if not birth_date:
                    raw_text = date_span.text.strip()
                    try:
                        birth_date = pd.to_datetime(raw_text).strftime("%Y-%m-%d")
                    except:
                        birth_date = None

            nobr = birth_block.find("nobr")
            if nobr:
                raw_age = nobr.text
                match = re.search(r"Age:\s*([\d\-]+)", raw_age)
                age = match.group(1) if match else None

            birth_place_span = nobr.find_next("span") if nobr else None
            if birth_place_span:
                birth_place = birth_place_span.text.strip()
    except:
        pass

    nationality = None
    try:
        nat_tag = meta.find("strong", string="National Team:")
        if nat_tag:
            nationality = nat_tag.find_next("a").text.strip()
    except:
        pass

    if not nationality:
        try:
            citizen_tag = meta.find("strong", string="Citizenship:")
            if citizen_tag:
                nationality = citizen_tag.find_next("a").text.strip()
        except:
            pass

    club = None
    try:
        club_tag = meta.find("strong", string="Club:")
        if club_tag:
            club = club_tag.find_next("a").text.strip()
    except:
        pass

    return {
        "full_name": full_name,
        "position": position,
        "footed": footed,
        "birth_date": birth_date,
        "age": age,
        "birth_place": birth_place,
        "nationality": nationality,
        "club": club
    }

# === Create output file if not exists
if not OUTPUT_FILE.exists():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(columns=[
        "player_name", "url_template", "full_name", "position", "footed",
        "birth_date", "age", "birth_place", "nationality", "club"
    ]).to_csv(OUTPUT_FILE, index=False)

# === Main loop
for i, player in enumerate(players, start=start_index + 1):
    player_name = player["name"]
    player_url = player["url"]

    print(f"\n🔍 [{i}] Scraping: {player_url}")

    retries = 0
    max_retries = 3
    success = False

    while retries < max_retries and not success:
        try:
            driver.get(player_url)
            sleep_time = random.uniform(8, 12)
            print(f"⏳ Waiting {sleep_time:.2f} seconds after loading...")
            time.sleep(sleep_time)

            data = extract_metadata(driver)
            if not data:
                print(f"⚠️ No metadata found for {player_name}")
                break

            data["player_name"] = player_name
            data["url_template"] = player_url

            pd.DataFrame([data]).to_csv(OUTPUT_FILE, mode="a", header=False, index=False)
            print(f"✅ Saved metadata for {player_name}")
            success = True

        except WebDriverException as e:
            if "ERR_INTERNET_DISCONNECTED" in str(e):
                retries += 1
                print(f"⚠️ Internet disconnected. Retrying ({retries}/{max_retries})...")
                time.sleep(10)
            else:
                print(f"❌ WebDriver error: {e}")
                break

driver.quit()
print(f"\n💾 Done! Full metadata saved to: {OUTPUT_FILE}")

✅ Found player: mathys_tel (https://fbref.com/en/players/829aa60c/Mathys-Tel)
🚀 Starting scraping from index 0 (mathys_tel)

🔍 [1] Scraping: https://fbref.com/en/players/829aa60c/Mathys-Tel
⏳ Waiting 9.52 seconds after loading...
✅ Saved metadata for mathys_tel

💾 Done! Full metadata saved to: data\raw\future_stars_raw_metadata.csv


In [None]:
# Scraping total de top_10_countries_players.yaml para aumentar la muestra

# Ignora lo que ya está en el csv de raw data.

# 📦 Imports
import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
INPUT_YAML = Path("data/meta/top_10_countries_players.yaml")
OUTPUT_CSV = Path("data/raw/top_10_countries_matchlogs_filtered.csv")
SLEEP_TIME = 8
LIMIT_PLAYERS = None  # Optional limit

# === Column definitions
stat_columns = [
    "Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start",
    "Pos", "Min", "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR", "Fls", "Fld", "Off",
    "Crs", "TklW", "Int", "OG", "PKwon", "PKcon", "Touches", "Tkl", "Blocks", "xG", "npxG", "xAG",
    "SCA", "GCA", "Cmp", "Att", "Cmp%", "PrgP", "Carries", "PrgC", "Succ"
]
final_columns = ["player_name", "player_id", "season"] + stat_columns

# === Load all players from YAML
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Detect already scraped player_ids in CSV
if OUTPUT_CSV.exists():
    try:
        existing_df = pd.read_csv(OUTPUT_CSV, usecols=["player_id"], dtype=str)
        scraped_ids = set(existing_df["player_id"].dropna().unique())
        print(f"🔁 Found {len(scraped_ids)} already-scraped players in CSV.")
    except Exception as e:
        print(f"⚠️ Couldn't read existing CSV, will continue as if empty. {e}")
        scraped_ids = set()
else:
    scraped_ids = set()

# === Filter unscripted players
remaining_players = [p for p in all_players if p.get("id") not in scraped_ids]
if LIMIT_PLAYERS:
    remaining_players = remaining_players[:LIMIT_PLAYERS]

print(f"🎯 {len(remaining_players)} players pending scraping.")

# === Prepare CSV header if needed
output_exists = OUTPUT_CSV.exists() and OUTPUT_CSV.stat().st_size > 0
if not output_exists:
    pd.DataFrame(columns=final_columns).to_csv(OUTPUT_CSV, index=False)

# === Setup Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Scrape loop
for player in remaining_players:
    player_id = player.get("id")
    player_name = player.get("name")
    profile_url = player.get("url")

    if not player_id or not player_name or not profile_url:
        continue

    print(f"\n🧍 Scraping {player_name} | ID: {player_id}")

    try:
        driver.get(profile_url)
        time.sleep(SLEEP_TIME)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        summary_section = soup.find("div", id="inner_nav")
        matchlog_links = {
            a["href"] for a in summary_section.find_all("a", href=True)
            if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
        } if summary_section else set()

        if not matchlog_links:
            print("   ⚠️ No match logs found.")
            continue

        player_rows = []

        for rel_url in matchlog_links:
            full_url = "https://fbref.com" + rel_url
            season = rel_url.split("/matchlogs/")[1].split("/")[0]
            print(f"   🔍 {season} — {full_url}")

            try:
                driver.get(full_url)
                time.sleep(SLEEP_TIME)
                sub_soup = BeautifulSoup(driver.page_source, "html.parser")
                table = sub_soup.find("table", {"id": "matchlogs_all"})

                if table:
                    header = table.find("thead").find_all("tr")[-1]
                    columns = [th.text.strip() for th in header.find_all("th")]

                    for row in table.find("tbody").find_all("tr"):
                        if "class" in row.attrs and "thead" in row["class"]:
                            continue
                        cells = row.find_all(["th", "td"])
                        values = [cell.text.strip() for cell in cells]
                        raw_data = dict(zip(columns, values))

                        aligned = {
                            "player_name": player_name,
                            "player_id": player_id,
                            "season": season
                        }
                        for col in stat_columns:
                            aligned[col] = raw_data.get(col, "")
                        player_rows.append(aligned)
                else:
                    print("      ⚠️ No table found.")

            except Exception as e:
                print(f"      ❌ Error scraping season {season}: {e}")

        if player_rows:
            df_player = pd.DataFrame(player_rows)[final_columns]
            df_player.to_csv(OUTPUT_CSV, mode="a", index=False, header=not output_exists)
            output_exists = True
            print(f"   ✅ {len(df_player)} rows added for {player_name}")
        else:
            print(f"   🚫 No data for {player_name}")

    except Exception as e:
        print(f"❌ Error with {player_name}: {e}")

driver.quit()
print(f"\n📦 Done. Match logs saved to: {OUTPUT_CSV}")


🔁 Found 8761 already-scraped players in CSV.
🎯 78219 players pending scraping.

🧍 Scraping francisco_aguirre | ID: 02b5a342
   ⚠️ No match logs found.

🧍 Scraping matías_alasia | ID: a03caaef
   ⚠️ No match logs found.

🧍 Scraping martín_albano_pautasso | ID: eedd1bcb
   ⚠️ No match logs found.

🧍 Scraping esteban_alberto_gonzález | ID: f4d66862
   ⚠️ No match logs found.

🧍 Scraping carlos_alberto_massara | ID: ada913ae
   ⚠️ No match logs found.

🧍 Scraping lucas_alessandría | ID: 1a9b820c
   ⚠️ No match logs found.

🧍 Scraping javier_almirón | ID: 8b8c57b7
   ⚠️ No match logs found.

🧍 Scraping jorge_almirón | ID: 2730c92b
   ⚠️ No match logs found.

🧍 Scraping leonel_altobelli | ID: abc4431d
   ⚠️ No match logs found.

🧍 Scraping luciano_álvarez | ID: efb353a4
   ⚠️ No match logs found.

🧍 Scraping franco_amoroso | ID: 30aaf515
   ⚠️ No match logs found.

🧍 Scraping daniel_andrés_ríos | ID: 820b991d
   ⚠️ No match logs found.

🧍 Scraping martín_andrizzi | ID: 20081075
   ⚠️ No matc

In [6]:
# Scraping total de top_10_countries_players.yaml para aumentar la muestra con jugadores de 2000 y 10+ años de carrera

# FUNCIONAL

# 📦 Imports
import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === CONFIGURATION ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
INPUT_YAML = Path("data/meta/top_10_countries_players.yaml")
OUTPUT_CSV = Path("data/raw/top_10_countries_matchlogs_filtered.csv")
SLEEP_TIME = 8
LIMIT_PLAYERS = None  # Optional limit

# === Column definitions
stat_columns = [
    "Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start",
    "Pos", "Min", "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR", "Fls", "Fld", "Off",
    "Crs", "TklW", "Int", "OG", "PKwon", "PKcon", "Touches", "Tkl", "Blocks", "xG", "npxG", "xAG",
    "SCA", "GCA", "Cmp", "Att", "Cmp%", "PrgP", "Carries", "PrgC", "Succ"
]
final_columns = ["player_name", "player_id", "season"] + stat_columns

# === Load all players from YAML
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

# === Detect already scraped player_ids in CSV
if OUTPUT_CSV.exists():
    try:
        existing_df = pd.read_csv(OUTPUT_CSV, usecols=["player_id"], dtype=str)
        scraped_ids = set(existing_df["player_id"].dropna().unique())
        print(f"🔁 Found {len(scraped_ids)} already-scraped players in CSV.")
    except Exception as e:
        print(f"⚠️ Couldn't read existing CSV, will continue as if empty. {e}")
        scraped_ids = set()
else:
    scraped_ids = set()

# === Filter players: from 2000, 10+ years, and NOT already scraped
filtered_players = []
for p in all_players:
    years = p.get("years", "")
    if not years or "-" not in years:
        continue
    try:
        start, end = years.split("-")
        start_year = int(start)
        end_year = int(end)
        if start_year >= 2000 and (end_year - start_year + 1) >= 10:
            if p.get("id") not in scraped_ids:
                filtered_players.append(p)
    except:
        continue

# === Optional limit
if LIMIT_PLAYERS:
    filtered_players = filtered_players[:LIMIT_PLAYERS]

print(f"🎯 {len(filtered_players)} players pending scraping.")

# === Prepare CSV header if needed
output_exists = OUTPUT_CSV.exists() and OUTPUT_CSV.stat().st_size > 0
if not output_exists:
    pd.DataFrame(columns=final_columns).to_csv(OUTPUT_CSV, index=False)

# === Setup Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(CHROME_PATH), options=options)

# === Scrape loop
for player in filtered_players:
    player_id = player.get("id")
    player_name = player.get("name")
    profile_url = player.get("url")

    if not player_id or not player_name or not profile_url:
        continue

    print(f"\n🧍 Scraping {player_name} | ID: {player_id}")

    try:
        driver.get(profile_url)
        time.sleep(SLEEP_TIME)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        summary_section = soup.find("div", id="inner_nav")
        matchlog_links = {
            a["href"] for a in summary_section.find_all("a", href=True)
            if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
        } if summary_section else set()

        if not matchlog_links:
            print("   ⚠️ No match logs found.")
            continue

        player_rows = []

        for rel_url in matchlog_links:
            full_url = "https://fbref.com" + rel_url
            season = rel_url.split("/matchlogs/")[1].split("/")[0]
            print(f"   🔍 {season} — {full_url}")

            try:
                driver.get(full_url)
                time.sleep(SLEEP_TIME)
                sub_soup = BeautifulSoup(driver.page_source, "html.parser")
                table = sub_soup.find("table", {"id": "matchlogs_all"})

                if table:
                    header = table.find("thead").find_all("tr")[-1]
                    columns = [th.text.strip() for th in header.find_all("th")]

                    for row in table.find("tbody").find_all("tr"):
                        if "class" in row.attrs and "thead" in row["class"]:
                            continue
                        cells = row.find_all(["th", "td"])
                        values = [cell.text.strip() for cell in cells]
                        raw_data = dict(zip(columns, values))

                        aligned = {
                            "player_name": player_name,
                            "player_id": player_id,
                            "season": season
                        }
                        for col in stat_columns:
                            aligned[col] = raw_data.get(col, "")
                        player_rows.append(aligned)
                else:
                    print("      ⚠️ No table found.")

            except Exception as e:
                print(f"      ❌ Error scraping season {season}: {e}")

        if player_rows:
            df_player = pd.DataFrame(player_rows)[final_columns]
            df_player.to_csv(OUTPUT_CSV, mode="a", index=False, header=not output_exists)
            output_exists = True
            print(f"   ✅ {len(df_player)} rows added for {player_name}")
        else:
            print(f"   🚫 No data for {player_name}")

    except Exception as e:
        print(f"❌ Error with {player_name}: {e}")

driver.quit()
print(f"\n📦 Done. Match logs saved to: {OUTPUT_CSV}")


🔁 Found 9852 already-scraped players in CSV.
🎯 5512 players pending scraping.

🧍 Scraping rubén_marcelo_gómez | ID: 0fae9840
   ⚠️ No match logs found.

🧍 Scraping daniel_pendín | ID: d58d3881
   ⚠️ No match logs found.

🧍 Scraping andradina | ID: 57f6ec3e
   ⚠️ No match logs found.

🧍 Scraping elton_da_costa | ID: c8093328
   ⚠️ No match logs found.

🧍 Scraping cristiano_dias | ID: 6d439dd2
   ⚠️ No match logs found.

🧍 Scraping leonardo | ID: 42fa8fc4
   ⚠️ No match logs found.

🧍 Scraping mauricio_salles | ID: 2a5b0f97
   ⚠️ No match logs found.

🧍 Scraping elvis_santana | ID: b67f7a40
   ⚠️ No match logs found.

🧍 Scraping sidney_cristiano_dos_santos | ID: 180380c2
   ⚠️ No match logs found.

🧍 Scraping dennis_souza | ID: ce7a00b5
   ⚠️ No match logs found.

🧍 Scraping magno_vieira | ID: b5535c41
   ⚠️ No match logs found.

🧍 Scraping olivier_caillas | ID: 8ba34161
   ⚠️ No match logs found.

🧍 Scraping grégory_christ | ID: 37d02e4b
   ⚠️ No match logs found.

🧍 Scraping mathias_do

KeyboardInterrupt: 