In [6]:
from pathlib import Path

# Project root = the parent of the kernel's current working directory (adjust if needed).
# NOTE(review): this is only right when the cwd is a direct child of the project
# (e.g. 'notebooks/'). If the cwd is already the project folder, this resolves one
# level too high — confirm against the printed path before relying on it.
project_root = Path.cwd().parents[0]  # assuming you're inside 'notebooks/'

# Raw-data folder; created together with any missing parents, no error if it already exists.
raw_data_path = project_root / "data" / "raw"
raw_data_path.mkdir(parents=True, exist_ok=True)


In [7]:
# Echo the resolved paths as a sanity check.
print(f"Project root: {project_root}")
print(f"CSV will be saved to: {raw_data_path}")


Project root: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA
CSV will be saved to: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\data\raw


In [4]:
# Show the kernel's current working directory (relative paths resolve against it).

from pathlib import Path

print("📂 Current directory:", Path.cwd())

📂 Current directory: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


In [3]:
import os
from pathlib import Path

# Set your absolute project root.
# NOTE(review): machine-specific absolute path — it has to be edited by hand on every machine.
project_root = Path(r"F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak") # desktop PC
# project_root = Path(r"C:\Users\juanm\Desktop\FUTPEAK\Futpeak") # laptop
os.chdir(project_root)  # from here on, relative paths resolve against the project root

print("📂 Project root set to:", Path.cwd())

📂 Project root set to: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


In [None]:
# Get every URL

from itertools import product
import string

# Root of FBref's alphabetical player index
base_url = "https://fbref.com/en/players/"

# Every two-letter index key, in order: aa, ab, ..., zz
suffixes = [
    first + second
    for first in string.ascii_lowercase
    for second in string.ascii_lowercase
]

# One index-page URL per key
player_index_urls = [base_url + suffix + "/" for suffix in suffixes]

# Report the total and preview the first ten URLs
print(f"✅ Total index URLs generated: {len(player_index_urls)}")
for url in player_index_urls[:10]:
    print(url)


✅ Total index URLs generated: 676
https://fbref.com/en/players/aa/
https://fbref.com/en/players/ab/
https://fbref.com/en/players/ac/
https://fbref.com/en/players/ad/
https://fbref.com/en/players/ae/
https://fbref.com/en/players/af/
https://fbref.com/en/players/ag/
https://fbref.com/en/players/ah/
https://fbref.com/en/players/ai/
https://fbref.com/en/players/aj/


In [42]:
# Import necessary libraries
import time
import re
import yaml
import random
import os
from itertools import product
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ------------------ SELENIUM CONFIGURATION ------------------

# Run Chrome headless (no visible window).
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# Chrome expects the size as "width,height"; the "1920x1080" spelling is not a valid size.
options.add_argument("--window-size=1920,1080")

# Start the ChromeDriver session.
driver = webdriver.Chrome(options=options)

# ------------------ SCRAPING CONFIGURATION ------------------

# Base URL that lists all players alphabetically
base_url = "https://fbref.com/en/players/"

# All two-letter index keys from 'aa' to 'zz'
suffixes = [''.join(pair) for pair in product("abcdefghijklmnopqrstuvwxyz", repeat=2)]

# Accumulates one record per player link found
players = []

# Player links look like /en/players/<8-char id>/<Slug>; compiled once, reused for every link.
player_href_pattern = re.compile(r"/en/players/([a-zA-Z0-9]{8})/([A-Za-z0-9\-]+)")

# ------------------ SCRAPE EVERY INDEX SUB-PAGE ------------------

# try/finally guarantees the browser is closed even when the run is interrupted
# (e.g. KeyboardInterrupt), so no headless Chrome process is left behind.
try:
    # Iterate through each player subindex page (e.g., /aa/, /ab/, ..., /zz/)
    for suffix in suffixes:
        url = f"{base_url}{suffix}/"
        print(f"🔍 Scraping: {url}")

        try:
            # Load the page using Selenium
            driver.get(url)

            # Randomised pause so requests are spaced out
            time.sleep(random.uniform(1.5, 3.0))

            # Parse the loaded page content with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Only the first <a> of each <p> is inspected
            for p in soup.find_all("p"):
                a = p.find("a", href=True)
                if a and "/en/players/" in a["href"]:
                    # Extract the player ID and URL slug from the href
                    match = player_href_pattern.search(a["href"])
                    if match:
                        player_id = match.group(1)
                        player_slug = match.group(2)  # Used in the URL
                        player_name = player_slug.strip().lower().replace("-", "_")

                        # "{season}" is left as a literal placeholder to be filled in later
                        matchlog_url_template = (
                            f"https://fbref.com/en/players/{player_id}/matchlogs/{{season}}/{player_slug}-Match-Logs"
                        )

                        players.append({
                            "name": player_name,
                            "id": player_id,
                            "url_template": matchlog_url_template
                        })

        except Exception as e:
            # Best effort: a failing index page is logged and skipped
            print(f"❌ Error scraping {url}: {e}")
finally:
    # Close the browser when done (or when interrupted)
    driver.quit()

# ------------------ SAVE YAML FILE ------------------

# The YAML file is written to the current working directory.
# NOTE(review): other cells read "data/meta/players_config.yaml" — confirm the file is
# moved there (or adjust this path) before running them.
output_filename = "players_config.yaml"
output_path = os.path.join(os.getcwd(), output_filename)

# Write the data to a YAML file, keeping the key order of each record
with open(output_path, "w", encoding="utf-8") as f:
    yaml.dump(players, f, sort_keys=False, allow_unicode=True)

print(f"\n✅ Scraping complete. YAML saved to: {output_path}")
print(f"📦 Total players scraped: {len(players)}")


🔍 Scraping: https://fbref.com/en/players/aa/
🔍 Scraping: https://fbref.com/en/players/ab/
🔍 Scraping: https://fbref.com/en/players/ac/
🔍 Scraping: https://fbref.com/en/players/ad/
🔍 Scraping: https://fbref.com/en/players/ae/
🔍 Scraping: https://fbref.com/en/players/af/
🔍 Scraping: https://fbref.com/en/players/ag/
🔍 Scraping: https://fbref.com/en/players/ah/
🔍 Scraping: https://fbref.com/en/players/ai/
🔍 Scraping: https://fbref.com/en/players/aj/
🔍 Scraping: https://fbref.com/en/players/ak/
🔍 Scraping: https://fbref.com/en/players/al/
🔍 Scraping: https://fbref.com/en/players/am/
🔍 Scraping: https://fbref.com/en/players/an/
🔍 Scraping: https://fbref.com/en/players/ao/
🔍 Scraping: https://fbref.com/en/players/ap/
🔍 Scraping: https://fbref.com/en/players/aq/
🔍 Scraping: https://fbref.com/en/players/ar/
🔍 Scraping: https://fbref.com/en/players/as/
🔍 Scraping: https://fbref.com/en/players/at/
🔍 Scraping: https://fbref.com/en/players/au/
🔍 Scraping: https://fbref.com/en/players/av/
🔍 Scraping

In [27]:
# 🔍 Scrape Male Competitions FBREF

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

from pathlib import Path

# Setup Selenium
chrome_path = "C:/Windows/System32/chromedriver.exe"

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=chrome_path)
driver = webdriver.Chrome(service=service, options=options)

# Go to competitions page
url = "https://fbref.com/en/comps/"

# try/finally so the browser is always closed, even if loading or parsing fails
try:
    driver.get(url)

    # Wait 10 seconds for the page to finish loading
    time.sleep(10)

    # Get page source and parse it
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()  # close the browser

# Scrape male competitions: rows carrying the "gender-m" class
competitions = []
for row in soup.select("tr.gender-m"):
    th = row.find("th", {"data-stat": "league_name"})
    a = th.find("a", href=True) if th else None
    if a:
        comp_name = a.text.strip()
        comp_url = urljoin(url, a["href"])  # make relative links absolute
        competitions.append({"name": comp_name, "url": comp_url})

# Result — explicit columns keep the CSV header present even when nothing was found
df = pd.DataFrame(competitions, columns=["name", "url"])
output_csv = Path("data/meta/male_competitions.csv")
output_csv.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create folders
df.to_csv(output_csv, index=False)

print(f"✅ {len(df)} male competitions scraped.")
df.head()



✅ 118 male competitions scraped.


Unnamed: 0,name,url
0,Copa Libertadores de América,https://fbref.com/en/comps/14/history/Copa-Lib...
1,Copa CONMEBOL Sudamericana,https://fbref.com/en/comps/205/history/Copa-Su...
2,UEFA Champions League,https://fbref.com/en/comps/8/history/Champions...
3,UEFA Europa League,https://fbref.com/en/comps/19/history/Europa-L...
4,UEFA Conference League,https://fbref.com/en/comps/882/history/Confere...


In [None]:
# Metadata players

# 📦 Imports
import yaml
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from pathlib import Path
import random

# === Paths ===
YAML_PATH = "data/meta/male_players.yaml"
OUTPUT_FILE = Path("data/meta/male_players_raw_metadata.csv")
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

# === Load YAML (an empty file parses to None, so fall back to an empty list)
with open(YAML_PATH, "r", encoding="utf-8") as f:
    players = yaml.safe_load(f) or []

# === Set the last scraped URL (for continuation)
# Leave it empty "" to scrape from the beginning
last_scraped_url = "https://fbref.com/en/players/8d5ce962/matchlogs/{season}/Victor-Aquino-Match-Logs"

# === Find starting point
start_index = 0  # By default start from 0

if last_scraped_url:
    for idx, player in enumerate(players):
        if player["url_template"] == last_scraped_url:
            start_index = idx + 1  # Start AFTER the last scraped player
            break
    else:
        # for/else: only reached when the loop finished without finding the URL
        raise ValueError("❌ last_scraped_url not found in male_players.yaml!")

players = players[start_index:]  # Only keep players after last scraped

# Guard the preview: when the resume point is the last entry, nothing remains and
# players[0] would raise IndexError.
if players:
    print(f"🚀 Starting scraping from index {start_index} ({players[0]['name']})")
else:
    print(f"✅ Nothing left to scrape after index {start_index}.")

# === Setup Selenium
options = Options()
# options.add_argument("--headless")  # Optional: hide browser if you want
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Metadata extraction
def _linked_text_after(meta, label):
    """Return the stripped text of the first <a> after the <strong> whose text is `label`.

    Returns None when the label is absent or the lookup fails.
    """
    try:
        tag = meta.find("strong", string=label)
        if tag:
            return tag.find_next("a").text.strip()
    except Exception:
        pass
    return None


def extract_metadata(driver):
    """Parse the player header (<div id="meta">) of the page currently loaded in `driver`.

    Args:
        driver: object exposing `page_source` (the Selenium driver).

    Returns:
        dict with keys full_name, position, footed, birth_date, age, birth_place,
        nationality and club. A field that cannot be read is None. Returns an
        empty dict when the page has no <div id="meta">.

    Every field is best-effort: a parsing failure leaves that field unset and never
    raises. Only `Exception` is caught, so KeyboardInterrupt/SystemExit still
    propagate and a long scraping run can be stopped.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    meta = soup.find("div", id="meta")
    if not meta:
        return {}

    # Full name: text of the first <p> inside the meta block
    try:
        full_name = meta.find("p").text.strip()
    except Exception:
        full_name = None

    # Position / footedness: read positionally from the "Position:" paragraph,
    # whose text nodes are joined with "|"
    position = footed = None
    try:
        pos_block = meta.find("strong", string="Position:").parent
        if pos_block:
            text = pos_block.get_text(separator="|")
            parts = text.split("|")
            position = parts[1].strip() if len(parts) > 1 else None
            footed = parts[3].strip() if "Footed:" in text and len(parts) > 3 else None
    except Exception:
        pass

    # Birth date, age and birth place all come from the "Born:" paragraph
    birth_date = age = birth_place = None
    try:
        birth_tag = meta.find("strong", string="Born:")
        if birth_tag:
            birth_block = birth_tag.parent
            date_span = birth_block.find("span")
            if date_span:
                # Prefer the data-birth attribute; otherwise parse the visible text
                birth_date = date_span.get("data-birth")
                if not birth_date:
                    raw_text = date_span.text.strip()
                    try:
                        birth_date = pd.to_datetime(raw_text).strftime("%Y-%m-%d")
                    except Exception:
                        birth_date = None

            nobr = birth_block.find("nobr")
            if nobr:
                raw_age = nobr.text
                match = re.search(r"Age:\s*([\d\-]+)", raw_age)
                age = match.group(1) if match else None

            # Birth place is taken from the first <span> after the <nobr> age element
            birth_place_span = nobr.find_next("span") if nobr else None
            if birth_place_span:
                birth_place = birth_place_span.text.strip()
    except Exception:
        pass

    # Nationality: national team first, citizenship as fallback
    nationality = _linked_text_after(meta, "National Team:")
    if not nationality:
        citizenship = _linked_text_after(meta, "Citizenship:")
        if citizenship is not None:
            nationality = citizenship

    club = _linked_text_after(meta, "Club:")

    return {
        "full_name": full_name,
        "position": position,
        "footed": footed,
        "birth_date": birth_date,
        "age": age,
        "birth_place": birth_place,
        "nationality": nationality,
        "club": club
    }

# === Output schema: the single source of truth for the CSV column order
OUTPUT_COLUMNS = [
    "player_name", "url_template", "full_name", "position", "footed",
    "birth_date", "age", "birth_place", "nationality", "club"
]
MAX_RETRIES = 3

# === Create output file (header only) if it does not exist
if not OUTPUT_FILE.exists():
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(columns=OUTPUT_COLUMNS).to_csv(OUTPUT_FILE, index=False)

# === Main loop
# try/finally guarantees the browser is closed even when the run is interrupted
# or an unexpected (non-WebDriver) error escapes.
try:
    for i, player in enumerate(players, start=start_index + 1):
        player_name = player["name"]
        player_url = player["url_template"]

        print(f"\n🔍 [{i}] Scraping: {player_url}")

        retries = 0
        success = False

        while retries < MAX_RETRIES and not success:
            try:
                driver.get(player_url)
                sleep_time = random.uniform(8, 12)
                print(f"⏳ Waiting {sleep_time:.2f} seconds after loading...")
                time.sleep(sleep_time)

                data = extract_metadata(driver)
                if not data:
                    print(f"⚠️ No metadata found for {player_name}")
                    break

                data["player_name"] = player_name
                data["url_template"] = player_url

                # Rows are appended without a header, so they must follow the header's
                # column order. Without `columns=` the row is written in dict insertion
                # order (full_name ... club, player_name, url_template), which does not
                # match the header.
                # NOTE(review): rows appended by earlier runs used that old order —
                # check any existing CSV before mixing old and new rows.
                pd.DataFrame([data], columns=OUTPUT_COLUMNS).to_csv(
                    OUTPUT_FILE, mode="a", header=False, index=False
                )
                print(f"✅ Saved metadata for {player_name}")
                success = True

            except WebDriverException as e:
                # Only a dropped connection is retried; other driver errors skip the player
                if "ERR_INTERNET_DISCONNECTED" in str(e):
                    retries += 1
                    print(f"⚠️ Internet disconnected. Retrying ({retries}/{MAX_RETRIES})...")
                    time.sleep(10)
                else:
                    print(f"❌ WebDriver error: {e}")
                    break
finally:
    driver.quit()

print(f"\n💾 Done! Full metadata saved to: {OUTPUT_FILE}")




🚀 Starting scraping from index 4930 (jacob_akanyirige)

🔍 [4931] Scraping: https://fbref.com/en/players/760e85e2/matchlogs/{season}/Jacob-Akanyirige-Match-Logs
⏳ Waiting 11.26 seconds after loading...
✅ Saved metadata for jacob_akanyirige

🔍 [4932] Scraping: https://fbref.com/en/players/2680235d/matchlogs/{season}/Ilias-Akaouch-Match-Logs
⏳ Waiting 9.45 seconds after loading...
✅ Saved metadata for ilias_akaouch

🔍 [4933] Scraping: https://fbref.com/en/players/f2a68b97/matchlogs/{season}/Carlos-Akapo-Match-Logs
⏳ Waiting 10.68 seconds after loading...
✅ Saved metadata for carlos_akapo

🔍 [4934] Scraping: https://fbref.com/en/players/51625162/matchlogs/{season}/Akin-Akar-Match-Logs
⏳ Waiting 11.49 seconds after loading...
✅ Saved metadata for akin_akar

🔍 [4935] Scraping: https://fbref.com/en/players/caf407eb/matchlogs/{season}/Shuhei-Akasaki-Match-Logs
⏳ Waiting 8.84 seconds after loading...
✅ Saved metadata for shuhei_akasaki

🔍 [4936] Scraping: https://fbref.com/en/players/5aec0369/m

In [None]:
# Metadata processing



In [9]:
import pandas as pd

# Load the cleaned CSV
df = pd.read_csv("data/male_players_metadata_cleaned_correct_nationality.csv")

# Player to look up, identified by the name slug used in FBref URLs
PLAYER_SLUG = "Lamine-Yamal"  # ← change this to the player you want to find

# The old lookup compared against a plain (non f-) string, so "{player_id}" and
# "{{season}}" stayed literal and nothing ever matched. Matching on the URL ending
# needs neither the player id nor the season placeholder.
# NOTE(review): assumes URL values end with "/<Slug>-Match-Logs", like the templates
# built elsewhere in this notebook — confirm against the CSV.
URL_SUFFIX = f"/{PLAYER_SLUG}-Match-Logs"

# Keep the rows whose URL ends with that suffix (missing URLs count as no match)
result = df[df["URL"].str.endswith(URL_SUFFIX, na=False)]

# Show the result
print(result)


Empty DataFrame
Columns: [ID, URL, Full Name, Player Name, Birthdate, Age, Birth Place, Nationality, Club, Position]
Index: []


In [None]:
# 📦 Scrape Lamine Yamal's match logs (example) — imports
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# ⚙️ Selenium Setup
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# 🧍 Player Info
player_name = "lamine_yamal"
player_id = "82ec26c1"
# "{season}" stays as a literal placeholder and is substituted per season below
base_url = f"https://fbref.com/en/players/{player_id}/matchlogs/{{season}}/Lamine-Yamal-Match-Logs"
seasons = [f"{y}-{y + 1}" for y in range(2014, 2025)]  # "2014-2015" ... "2024-2025"

# 📦 One dict per match row, across all seasons
all_data = []

# 🔁 Loop seasons
# try/finally guarantees the headless browser is closed even if the run is interrupted.
try:
    for season in seasons:
        url = base_url.replace("{season}", season)
        print(f"\n🔍 {player_name} | {season}")
        print(f"🌐 {url}")

        try:
            driver.get(url)
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table", {"id": "matchlogs_all"})

            if table:
                # Column labels come from the LAST header row only
                header_rows = table.find("thead").find_all("tr")
                final_header = header_rows[-1]
                columns = [th.text.strip() for th in final_header.find_all("th")]

                # Data rows
                for row in table.find("tbody").find_all("tr"):
                    if "class" in row.attrs and "thead" in row["class"]:
                        continue  # Skip repeated sub-header rows inside the body
                    cells = row.find_all(["th", "td"])
                    values = [cell.text.strip() for cell in cells]
                    # zip pairs labels with cells positionally and stops at the shorter one
                    row_data = dict(zip(columns, values))
                    row_data["player"] = player_name
                    row_data["season"] = season
                    all_data.append(row_data)
            else:
                print("⚠️ No se encontró la tabla matchlogs_all.")

        except Exception as e:
            # Best effort: a failing season is logged and skipped
            print(f"❌ Error en {season}: {e}")
finally:
    driver.quit()

# 📊 Convert to DataFrame
df_lamine = pd.DataFrame(all_data)
print(f"\n✅ Total rows collected for {player_name}: {len(df_lamine)}")



🔍 lamine_yamal | 2014-2015
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2014-2015/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2015-2016
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2015-2016/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2016-2017
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2016-2017/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2017-2018
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2017-2018/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2018-2019
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2018-2019/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2019-2020
🌐 https://fbref.com/en/players/82ec26c1/matchlogs/2019-2020/Lamine-Yamal-Match-Logs
⚠️ No se encontró la tabla matchlogs_all.

🔍 lamine_yamal | 2020-2021
🌐 https://fbref.com/en/players/82ec26c1/matchlog

In [None]:
# Show the DataFrame as a nicely formatted table

from tabulate import tabulate
import pandas as pd

# Disable pandas' truncation of rows and columns when it displays frames.
# NOTE(review): these options configure pandas' own display; the table below is
# rendered by tabulate, which presumably does not consult them — confirm whether
# they are needed here. They also stay in effect for the rest of the kernel session.
pd.set_option("display.max_rows", None)         # ← show every row
pd.set_option("display.max_columns", None)      # ← show every column
pd.set_option("display.width", 1000)            # ← wide enough to avoid horizontal wrapping

# Print the whole DataFrame as a text table (prints every row of df_lamine)
print(tabulate(df_lamine, headers="keys", tablefmt="pretty"))


+-----+------------+-----+----------------------+----------------+---------+---------------+--------------+----------------------+-------+-------------------------------------+--------------+--------------+-----------+-----+-----+-----+-------+-----+-----+------+------+---------+-----+-----+--------+-----+------+-----+-----+-----+-----+-----+-------+------+---------+------+------+--------------+
|     |    Date    | Day |         Comp         |     Round      |  Venue  |    Result     |    Squad     |       Opponent       | Start |                 Pos                 |     Min      |    player    |  season   | Gls | Ast | PK  | PKatt | Sh  | SoT | CrdY | CrdR | Touches | Tkl | Int | Blocks | xG  | npxG | xAG | SCA | GCA | Cmp | Att | Cmp%  | PrgP | Carries | PrgC | Succ | Match Report |
+-----+------------+-----+----------------------+----------------+---------+---------------+--------------+----------------------+-------+-------------------------------------+--------------+-----------

In [29]:
# Keep in mind naming discrepancies between competitions: e.g. the Croatian league appears under one name in the CSV and under another on the page; the same applies to World Cup qualifiers (WCQ) and friendlies.

# 📦 Imports
import time
import yaml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path
from difflib import SequenceMatcher

# === Config
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
BLOCK_SIZE = 6000  # maximum number of players checked per run
SLEEP_TIME = 5     # seconds to wait after each page load

# === Paths
players_config_path = Path("data/meta/players_config.yaml")
male_competitions_path = Path("data/meta/male_competitions.csv")
players_male_path = Path("data/meta/players_male.yaml")

# === Load data (an empty YAML file parses to None, so fall back to an empty list)
with open(players_config_path, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f) or []

male_comps = pd.read_csv(male_competitions_path)
# Names are compared stripped and lower-cased
valid_comp_names = set(male_comps["name"].str.strip().str.lower())

# === Load already saved players
if players_male_path.exists():
    with open(players_male_path, "r", encoding="utf-8") as f:
        # `or []`: an existing but empty file would otherwise give None and break
        # the set comprehension below
        players_male = yaml.safe_load(f) or []
else:
    players_male = []

# === Create a set of saved names for tracking
# Only players already saved as male are skipped; everyone else is checked again.
already_processed = {p["name"] for p in players_male}
remaining_players = [p for p in all_players if p["name"] not in already_processed]
batch = remaining_players[:BLOCK_SIZE]

# === Selenium Setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Utility: fuzzy match
def is_match(comp_name: str, valid_set: set, threshold: float = 0.75) -> bool:
    """Return True when `comp_name` loosely matches any name in `valid_set`.

    A candidate matches when its SequenceMatcher similarity ratio reaches
    `threshold`, or when either string contains the other.

    Args:
        comp_name: competition name as scraped; compared stripped and lower-cased.
        valid_set: reference names, expected to be lower-cased already.
        threshold: minimum similarity ratio (0-1) for a fuzzy match.

    Returns:
        bool. Always False for an empty or whitespace-only name.
    """
    comp_name = comp_name.strip().lower()
    # An empty string is a substring of every string, so without this guard a link
    # with no text would "match" every competition.
    if not comp_name:
        return False
    for valid in valid_set:
        ratio = SequenceMatcher(None, comp_name, valid).ratio()
        if ratio >= threshold or comp_name in valid or valid in comp_name:
            return True
    return False

# === Scraping block
valid_new_players = []

# try/finally guarantees the headless browser is closed even when the run is
# interrupted (e.g. KeyboardInterrupt during a long batch).
try:
    for i, player in enumerate(batch, 1):
        name = player["name"]
        # The profile URL is rebuilt from the stored name (underscores back to hyphens)
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}/{len(batch)}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # A player is kept as soon as one competition link inside a table matches
            found = False
            for tag in soup.select("table a[href*='/en/comps/']"):
                comp_name = tag.text.strip()
                if is_match(comp_name, valid_comp_names):
                    print(f"✅ Found: '{comp_name}' matches a valid male competition.")
                    valid_new_players.append(player)
                    found = True
                    break

            if not found:
                print("⚠️ No valid competitions matched.")

        except Exception as e:
            # Best effort: a failing player page is logged and skipped
            print(f"❌ Error processing {name}: {e}")
finally:
    driver.quit()

# === Save new players (the file is rewritten with old + new entries)
if valid_new_players:
    updated = players_male + valid_new_players
    with open(players_male_path, "w", encoding="utf-8") as f:
        yaml.dump(updated, f, allow_unicode=True)
    print(f"\n✅ Saved {len(valid_new_players)} new male players to {players_male_path}")
else:
    print("\n🚫 No valid male players found in this batch.")





🔍 [1/6000] Checking: temurkhuja_abdukholiqov
🌍 https://fbref.com/en/players/56814550/temurkhuja-abdukholiqov
⚠️ No valid competitions matched.

🔍 [2/6000] Checking: iyaan_abdul_aleem
🌍 https://fbref.com/en/players/9d0cf90a/iyaan-abdul-aleem
⚠️ No valid competitions matched.

🔍 [3/6000] Checking: khalid_abdul_basit
🌍 https://fbref.com/en/players/7bb0fe4a/khalid-abdul-basit
⚠️ No valid competitions matched.

🔍 [4/6000] Checking: hamidu_abdul_fatawu
🌍 https://fbref.com/en/players/8c003373/hamidu-abdul-fatawu
⚠️ No valid competitions matched.

🔍 [5/6000] Checking: anwar_abdul_ghanee
🌍 https://fbref.com/en/players/6322cf13/anwar-abdul-ghanee
⚠️ No valid competitions matched.

🔍 [6/6000] Checking: maeesha_abdul_hannan
🌍 https://fbref.com/en/players/4ac493d5/maeesha-abdul-hannan
⚠️ No valid competitions matched.

🔍 [7/6000] Checking: aslam_abdul_raheem
🌍 https://fbref.com/en/players/956ebae6/aslam-abdul-raheem
⚠️ No valid competitions matched.

🔍 [8/6000] Checking: faizal_abdul_rashid
🌍 http

KeyboardInterrupt: 

In [30]:
# Explicit exceptions: name as "displayed" on FBref → standard name in male_competitions.csv
EXCEPTION_COMPETITIONS = {
    "Friendlies (M)": "International Friendlies (M)",
    "Europa Lg": "UEFA Europa League",
    "1. HNL": "Croatian Football League",
    "HNL": "Croatian Football League",
    "Série A": "Campeonato Brasileiro Série A"
}


In [31]:
from difflib import SequenceMatcher

def normalize(text):
    """Return `text` stripped of surrounding whitespace and lower-cased."""
    return text.strip().lower()

def is_valid_competition(comp_name, valid_names_set, exception_map, threshold=0.75):
    """Decide whether a scraped competition name refers to a known competition.

    Checks, in order:
      1. exact match of the normalized name against `valid_names_set`;
      2. alias lookup in `exception_map` (displayed name -> standard name). When the
         name is a known alias, the result depends only on whether the mapped
         standard name is valid — there is no fuzzy fallback;
      3. fuzzy match (SequenceMatcher ratio >= `threshold`) against every valid name.

    Args:
        comp_name: competition name as scraped.
        valid_names_set: set of valid names, expected to be normalized already.
        exception_map: dict of alias -> standard name; keys are matched
            case-insensitively and ignoring surrounding whitespace.
        threshold: minimum similarity ratio (0-1) for the fuzzy step.

    Returns:
        bool
    """
    name_norm = normalize(comp_name)

    # 1. Exact match
    if name_norm in valid_names_set:
        return True

    # 2. Alias lookup on NORMALIZED keys. Looking the raw name up after a normalized
    #    membership test rejected aliases that differed only in case or whitespace.
    normalized_exceptions = {normalize(alias): target for alias, target in exception_map.items()}
    mapped = normalized_exceptions.get(name_norm)
    if mapped is not None:
        return normalize(mapped) in valid_names_set

    # 3. Fuzzy matching
    for valid in valid_names_set:
        ratio = SequenceMatcher(None, name_norm, normalize(valid)).ratio()
        if ratio >= threshold:
            return True
    return False


In [32]:
# Test 2 with exceptions

import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Configuration
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
BLOCK_SIZE = 5   # small batch: this cell is a test run
SLEEP_TIME = 5   # seconds to wait after each page load

# === Paths
players_config_path = Path("data/meta/players_config.yaml")
male_competitions_path = Path("data/meta/male_competitions.csv")
players_male_path = Path("data/meta/players_male.yaml")

# === Load data (an empty YAML file parses to None, so fall back to an empty list)
with open(players_config_path, "r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f) or []

male_comps = pd.read_csv(male_competitions_path)
valid_comp_names = set(male_comps["name"].str.strip().str.lower())

# === Players already saved
if players_male_path.exists():
    with open(players_male_path, "r", encoding="utf-8") as f:
        # `or []`: an existing but empty file would otherwise give None and break
        # the set comprehension below
        players_male = yaml.safe_load(f) or []
else:
    players_male = []

already_processed = {p["name"] for p in players_male}
remaining_players = [p for p in all_players if p["name"] not in already_processed]
batch = remaining_players[:BLOCK_SIZE]

# === Selenium Setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape
valid_new_players = []

# try/finally guarantees the headless browser is closed even when the run is interrupted.
try:
    for i, player in enumerate(batch, 1):
        name = player["name"]
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}/{len(batch)}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Collect the unique competition names linked from tables
            comp_tags = soup.select("table a[href*='/en/comps/']")
            competitions_found = set(tag.text.strip() for tag in comp_tags)

            # Relies on is_valid_competition and EXCEPTION_COMPETITIONS already being
            # defined in the kernel.
            matched = False
            for comp_name in competitions_found:
                if is_valid_competition(comp_name, valid_comp_names, EXCEPTION_COMPETITIONS):
                    print(f"✅ Match found: {comp_name}")
                    valid_new_players.append(player)
                    matched = True
                    break

            if not matched:
                print("⚠️ No valid competitions matched.")

        except Exception as e:
            # Best effort: a failing player page is logged and skipped
            print(f"❌ Error processing {name}: {e}")
finally:
    driver.quit()

# === Save valid players (the file is rewritten with old + new entries)
if valid_new_players:
    updated = players_male + valid_new_players
    with open(players_male_path, "w", encoding="utf-8") as f:
        yaml.dump(updated, f, allow_unicode=True)
    print(f"\n✅ Saved {len(valid_new_players)} new male players to {players_male_path}")
else:
    print("\n🚫 No valid male players found in this batch.")



🔍 [1/5] Checking: temurkhuja_abdukholiqov
🌍 https://fbref.com/en/players/56814550/temurkhuja-abdukholiqov
✅ Match found: 1. HNL

🔍 [2/5] Checking: iyaan_abdul_aleem
🌍 https://fbref.com/en/players/9d0cf90a/iyaan-abdul-aleem
⚠️ No valid competitions matched.

🔍 [3/5] Checking: khalid_abdul_basit
🌍 https://fbref.com/en/players/7bb0fe4a/khalid-abdul-basit
✅ Match found: Europa Lg

🔍 [4/5] Checking: hamidu_abdul_fatawu
🌍 https://fbref.com/en/players/8c003373/hamidu-abdul-fatawu
⚠️ No valid competitions matched.

🔍 [5/5] Checking: anwar_abdul_ghanee
🌍 https://fbref.com/en/players/6322cf13/anwar-abdul-ghanee
⚠️ No valid competitions matched.

✅ Saved 2 new male players to data\meta\players_male.yaml


In [None]:
# Definitive version with exceptions. It re-scrapes players that did not make it into male_players, which is useful for spotting new exceptions and reviewing the results.

import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Configuration ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
BLOCK_SIZE = 1000
SLEEP_TIME = 5

# === Paths (all metadata files sit in the same folder) ===
meta_dir = Path("data/meta")
players_config_path = meta_dir / "players_config.yaml"
male_competitions_path = meta_dir / "male_competitions.csv"
players_male_path = meta_dir / "male_players.yaml"

# === Load Data ===
with players_config_path.open("r", encoding="utf-8") as f:
    all_players = yaml.safe_load(f)

male_comps = pd.read_csv(male_competitions_path)
# Competition names are compared stripped and lower-cased
normalized_comp_names = male_comps["name"].str.strip().str.lower()
valid_comp_names = set(normalized_comp_names)

# === Exception Dictionary: displayed name -> standard name ===
EXCEPTION_COMPETITIONS = {
    "Friendlies (M)": "International Friendlies (M)",
    "Europa Lg": "UEFA Europa League",
    "1. HNL": "Croatian Football League",
    "HNL": "Croatian Football League",
    "Série A": "Campeonato Brasileiro Série A",
}

# === Functions ===
def is_valid_competition(comp_name: str, valid_names: set, exceptions: dict) -> bool:
    """Return True when a scraped competition name counts as a valid competition.

    A name is accepted when any of these holds (all comparisons on lower-cased text):
      * it contains "wcq";
      * it is exactly one of `valid_names` (expected to be lower-cased already);
      * it contains an alias from `exceptions`, or the standard name that alias maps to.

    Args:
        comp_name: competition name as scraped.
        valid_names: set of lower-cased valid competition names.
        exceptions: dict of alias -> standard name.

    Returns:
        bool
    """
    comp_name_lower = comp_name.lower()

    if "wcq" in comp_name_lower:
        return True

    if comp_name_lower in valid_names:
        return True

    for alias, real_name in exceptions.items():
        # Both sides must be lower-cased: comparing the raw alias (e.g. "Europa Lg")
        # with the lower-cased name could never match.
        if alias.lower() in comp_name_lower or real_name.lower() in comp_name_lower:
            return True
    return False

# === Load Already Processed ===
if players_male_path.exists():
    with open(players_male_path, "r", encoding="utf-8") as f:
        # An existing but empty file parses to None; fall back to an empty list so
        # the set comprehension below does not fail.
        players_male = yaml.safe_load(f) or []
else:
    players_male = []

# Only players already saved as male are skipped; everyone else is checked again.
already_processed_ids = {p["id"] for p in players_male}
remaining_players = [p for p in all_players if p["id"] not in already_processed_ids]
batch = remaining_players[:BLOCK_SIZE]

# === Selenium Setup ===
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape Block ===
valid_new_players = []

# try/finally guarantees the headless browser is closed even when the run is
# interrupted (e.g. KeyboardInterrupt during a long batch).
try:
    for i, player in enumerate(batch, 1):
        name = player["name"]
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}/{len(batch)}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find unique competitions linked from tables
            comp_tags = soup.select("table a[href*='/en/comps/']")
            competitions_found = set(tag.text.strip() for tag in comp_tags)

            # The first matching competition is enough to keep the player
            matched = False
            for comp in competitions_found:
                if is_valid_competition(comp, valid_comp_names, EXCEPTION_COMPETITIONS):
                    print(f"✅ Match found: {comp}")
                    valid_new_players.append(player)
                    matched = True
                    break

            if not matched:
                print("⚠️ No valid competitions matched.")

        except Exception as e:
            # Best effort: a failing player page is logged and skipped
            print(f"❌ Error processing {name}: {e}")
finally:
    driver.quit()

# === Save New Players (the file is rewritten with old + new entries) ===
if valid_new_players:
    updated = players_male + valid_new_players
    with open(players_male_path, "w", encoding="utf-8") as f:
        yaml.dump(updated, f, allow_unicode=True)
    print(f"\n✅ Saved {len(valid_new_players)} new male players to {players_male_path}")
else:
    print("\n🚫 No valid male players found in this batch.")




🔍 [1/1000] Checking: jamal_aabbou
🌍 https://fbref.com/en/players/ad713dff/jamal-aabbou
⚠️ No valid competitions matched.

🔍 [2/1000] Checking: kamilla_aabel
🌍 https://fbref.com/en/players/d7ed844d/kamilla-aabel
⚠️ No valid competitions matched.

🔍 [3/1000] Checking: hicham_aaboubou
🌍 https://fbref.com/en/players/cbe904fa/hicham-aaboubou


KeyboardInterrupt: 

In [5]:
# This is the version currently in use.

# It ignores the players in players_config that have already been scraped.

# 📦 Imports
import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Configuration ===
# ChromeDriver binary handed to Selenium.
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"
# ⚠️ Slice of players_config.yaml to check in this run: [START_INDEX, END_INDEX), 0-based.
START_INDEX, END_INDEX = 10000, 10030
# Seconds to wait after each page load before the HTML is read.
SLEEP_TIME = 5

# === Paths ===
players_config_path = Path("data/meta/players_config.yaml")
players_male_path = Path("data/meta/male_players.yaml")
male_competitions_path = Path("data/meta/male_competitions.csv")

# === Load Data ===
# Full player list (every entry is read later through its "name" and "id" keys).
with players_config_path.open("r", encoding="utf-8") as config_file:
    all_players = yaml.safe_load(config_file)

# Official competition names, normalised (trimmed + lower-cased) for lookups.
male_comps = pd.read_csv(male_competitions_path)
normalized_comp_names = male_comps["name"].str.strip().str.lower()
valid_comp_names = set(normalized_comp_names)

# === Exceptions ===
# Scraped label -> official name. Both sides are used as substring patterns
# by is_valid_competition.
EXCEPTION_COMPETITIONS = {
    "Friendlies (M)": "International Friendlies (M)",
    "Europa Lg": "UEFA Europa League",
    "1. HNL": "Croatian Football League",
    "HNL": "Croatian Football League",
    "Série A": "Campeonato Brasileiro Série A",
}

def is_valid_competition(comp_name: str, valid_names: set, exceptions: dict) -> bool:
    """Tell whether a scraped competition label counts as a valid competition.

    The label is lower-cased and accepted when any of these holds:
      * it contains the substring "wcq";
      * it is a member of ``valid_names`` (expected to hold lower-cased names);
      * a key or a value of ``exceptions``, lower-cased, occurs inside it.

    Args:
        comp_name: Competition label as scraped from the page.
        valid_names: Collection of accepted names, already lower-cased.
        exceptions: Mapping of alias -> official name; both are substring patterns.

    Returns:
        True when the label is accepted, False otherwise.
    """
    label = comp_name.lower()

    if "wcq" in label or label in valid_names:
        return True

    return any(
        alias.lower() in label or official.lower() in label
        for alias, official in exceptions.items()
    )

# === Load already validated players
if players_male_path.exists():
    with open(players_male_path, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; fall back to an empty list
        players_male = yaml.safe_load(f) or []
else:
    players_male = []

# Names already present in male_players.yaml; those players are not visited again.
# NOTE(review): matching is by name only, so two different players sharing a name
# are treated as the same person — confirm whether matching by "id" is preferable.
existing_names = {p["name"] for p in players_male}
batch = all_players[START_INDEX:END_INDEX]

print(f"\n🧭 Manually scraping players from index {START_INDEX} to {END_INDEX}...")

# === Selenium Setup ===
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape block ===
valid_new_players = []

try:
    for i, player in enumerate(batch, start=START_INDEX + 1):
        name = player["name"]

        if name in existing_names:
            print(f"⏩ Already processed: {name}")
            continue

        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            # Fixed pause so the page can finish rendering before it is parsed
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Every competition link found inside a table of the profile page
            comp_tags = soup.select("table a[href*='/en/comps/']")
            competitions_found = set(tag.text.strip() for tag in comp_tags)

            matched = False
            for comp in competitions_found:
                if is_valid_competition(comp, valid_comp_names, EXCEPTION_COMPETITIONS):
                    print(f"✅ Match found: {comp}")
                    valid_new_players.append(player)
                    # Guard against the same name appearing twice in this batch
                    existing_names.add(name)
                    matched = True
                    break

            if not matched:
                print("⚠️ No valid competitions matched.")

        except Exception as e:
            # Best effort: one failing page must not abort the whole batch
            print(f"❌ Error processing {name}: {e}")
except KeyboardInterrupt:
    # A manual stop used to discard everything validated so far; keep it instead.
    print("\n⏹️ Interrupted by user; saving the players validated so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save new players ===
if valid_new_players:
    updated = players_male + valid_new_players
    with open(players_male_path, "w", encoding="utf-8") as f:
        yaml.dump(updated, f, allow_unicode=True)
    print(f"\n✅ Saved {len(valid_new_players)} new male players to {players_male_path}")
else:
    print("\n🚫 No valid male players found in this range.")





🧭 Manually scraping players from index 10000 to 10030...

🔍 [10001] Checking: david_aquino
🌍 https://fbref.com/en/players/6d79c525/david-aquino
✅ Match found: Coppa Italia

🔍 [10002] Checking: derlis_aquino
🌍 https://fbref.com/en/players/cfe18471/derlis-aquino
⚠️ No valid competitions matched.

🔍 [10003] Checking: disney_aquino
🌍 https://fbref.com/en/players/5d7d2841/disney-aquino
✅ Match found: WCQ

🔍 [10004] Checking: dylan_aquino
🌍 https://fbref.com/en/players/dababecf/dylan-aquino
⚠️ No valid competitions matched.

🔍 [10005] Checking: luis_carlos_de_aquino_guirra
🌍 https://fbref.com/en/players/0ec7045c/luis-carlos-de-aquino-guirra
⚠️ No valid competitions matched.

🔍 [10006] Checking: hugo_aquino
🌍 https://fbref.com/en/players/0448941a/hugo-aquino
⚠️ No valid competitions matched.

🔍 [10007] Checking: javier_aquino
🌍 https://fbref.com/en/players/215104cc/javier-aquino
✅ Match found: Liga MX

🔍 [10008] Checking: joel_aquino
🌍 https://fbref.com/en/players/aca05709/joel-aquino
⚠️ No 

In [6]:
# Código para obtener un csv con competiciones unique y otros datos.

# 📦 Imports
import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Config ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"  # ChromeDriver binary used by Selenium
SLEEP_TIME = 5      # seconds to wait after each page load before parsing
N_PLAYERS = 5100    # maximum number of not-yet-validated players to visit

# === Paths ===
players_config_path = Path("data/meta/players_config.yaml")
male_players_path = Path("data/meta/male_players.yaml")
output_path = Path("data/meta/unmatched_competitions_sample.csv")

# === Load YAMLs ===
# safe_load returns None for an empty document; fall back to an empty list
with open(players_config_path, "r", encoding="utf-8") as f:
    players_config = yaml.safe_load(f) or []

if male_players_path.exists():
    with open(male_players_path, "r", encoding="utf-8") as f:
        male_players = yaml.safe_load(f) or []
else:
    male_players = []

# === Filter Players Not Yet Processed ===
# Players already listed in male_players.yaml (matched by name) are left out.
processed_names = {p["name"] for p in male_players}
remaining_players = [p for p in players_config if p["name"] not in processed_names]
batch = remaining_players[:N_PLAYERS]

# === Setup Selenium ===
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape Competitions ===
all_competitions = set()

try:
    for i, player in enumerate(batch, 1):
        name = player["name"]
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        # Report progress against the real batch size, which may be below N_PLAYERS
        print(f"\n🔍 [{i}/{len(batch)}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Competition links that appear inside any table of the profile page
            comps = soup.select("table a[href*='/en/comps/']")
            for tag in comps:
                comp_name = tag.text.strip()
                if comp_name:
                    all_competitions.add(comp_name)
        except Exception as e:
            # Best effort: skip the failing player and keep going
            print(f"❌ Error on {name}: {e}")
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving the competitions collected so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save to CSV ===
df = pd.DataFrame(sorted(all_competitions), columns=["competition_name"])
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n✅ {len(df)} unique competitions saved to: {output_path}")



🔍 [1/5100] Checking: jamal_aabbou
🌍 https://fbref.com/en/players/ad713dff/jamal-aabbou

🔍 [2/5100] Checking: kamilla_aabel
🌍 https://fbref.com/en/players/d7ed844d/kamilla-aabel

🔍 [3/5100] Checking: hicham_aaboubou
🌍 https://fbref.com/en/players/cbe904fa/hicham-aaboubou

🔍 [4/5100] Checking: gustav_aabro
🌍 https://fbref.com/en/players/58c909ec/gustav-aabro

🔍 [5/5100] Checking: abubaker_aadem
🌍 https://fbref.com/en/players/12648b72/abubaker-aadem

🔍 [6/5100] Checking: anna_aahjem
🌍 https://fbref.com/en/players/8f5fa9e1/anna-aahjem

🔍 [7/5100] Checking: isak_aalberg
🌍 https://fbref.com/en/players/617c2622/isak-aalberg

🔍 [8/5100] Checking: andreas_aalbu
🌍 https://fbref.com/en/players/4fe6659a/andreas-aalbu

🔍 [9/5100] Checking: levi_aalders
🌍 https://fbref.com/en/players/34793b4e/levi-aalders

🔍 [10/5100] Checking: redouan_aalhoul
🌍 https://fbref.com/en/players/db4c2506/redouan-aalhoul

🔍 [11/5100] Checking: uriel_van_aalst
🌍 https://fbref.com/en/players/ba469ddc/uriel-van-aalst

🔍 [12

In [7]:
# Limpieza de unmatched competitions

import pandas as pd
import re

# 📂 Load the raw competition names to clean; the cleaning step below reads
# them from the "competition_name" column.
df = pd.read_csv("data/meta/unmatched_competitions_sample.csv")

# 🔧 Normalize and clean competition names
def clean_competition(name):
    """Normalise a scraped competition label, or return None when nothing useful is left.

    Steps:
      1. Missing values (None / NaN) are rejected instead of becoming the
         literal strings "None" / "nan".
      2. "Matchweek <n>" entries are rejected.
      3. Years and year ranges are removed. The range separator may be a
         hyphen, an en/em dash or a slash, so "2021-22", "2021–22" and
         "2021/22" are all stripped completely.
      4. Remaining dashes become spaces and whitespace is collapsed.

    Args:
        name: Raw competition label (any type; it is converted with ``str``).

    Returns:
        The cleaned label, or None if the entry should be dropped.
    """
    # NaN is the only float that differs from itself
    if name is None or (isinstance(name, float) and name != name):
        return None

    name = str(name).strip()

    # Remove Matchweek entries
    if re.match(r"^Matchweek\s*\d+$", name, re.IGNORECASE):
        return None

    # Remove years or year ranges like '2023', '2021–22', '2021-22', '2021/22'.
    # Matching only the en dash left a stray "22" behind for hyphenated ranges.
    name = re.sub(r"\b\d{4}(?:[-–—/]\d{2,4})?\b", "", name)

    # Remove extra symbols or leftover punctuation
    name = re.sub(r"[-–—]+", " ", name)  # convert dashes to spaces
    name = re.sub(r"\s+", " ", name)  # normalize whitespace
    name = name.strip()

    # Skip empty results
    return name if name else None

# 🧼 Clean every raw name; rejected entries come back as None
df["clean_name"] = df["competition_name"].map(clean_competition)

# ❌ Keep only the rows that still have a name after cleaning
df_clean = df.dropna(subset=["clean_name"])

# 🧹 One row per distinct cleaned name, in alphabetical order
df_final = (
    df_clean.loc[:, ["clean_name"]]
    .drop_duplicates()
    .sort_values(by="clean_name")
    .reset_index(drop=True)
)

# 💾 Write the cleaned list next to the raw sample
output_path = "data/meta/unmatched_clean_competitions.csv"
df_final.to_csv(output_path, index=False)

total_unique = len(df_final)
print(f"✅ Cleaned list saved to: {output_path}")
print(f"🔢 Total unique competitions: {total_unique}")


✅ Cleaned list saved to: data/meta/unmatched_clean_competitions.csv
🔢 Total unique competitions: 204


In [9]:
#  Código que haría la comparación y genera el CSV de ALIAS:

import pandas as pd

# 📂 Inputs: cleaned scraped names and the official male competitions
unmatched = pd.read_csv("data/meta/unmatched_clean_competitions.csv")
male_comp = pd.read_csv("data/meta/male_competitions.csv")

# 🧠 Official names (trimmed) kept in a set for constant-time membership checks
official_names = set(male_comp["name"].str.strip().unique())


def _is_official(name):
    """Exact, case-sensitive membership test against the official names."""
    return name in official_names


# 🔍 Flag the names that already exist, then keep only the ones that do not
unmatched["already_in_official"] = unmatched["clean_name"].apply(_is_official)
to_alias = unmatched[~unmatched["already_in_official"]]

# 📋 Alias template: scraped name plus an empty column to be filled in by hand
alias_df = (
    to_alias[["clean_name"]]
    .rename(columns={"clean_name": "original_name"})
    .assign(official_competition_name="")
)

# 💾 Save the template
output_path = "data/meta/competitions_to_map_alias.csv"
alias_df.to_csv(output_path, index=False)

print(f"✅ Alias file created with {len(alias_df)} competitions needing mapping at: {output_path}")


✅ Alias file created with 158 competitions needing mapping at: data/meta/competitions_to_map_alias.csv


In [None]:
# Código para obtener un csv con competiciones unique y otros datos.

# Opción manual con cambio de índice.

# 📦 Imports
import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Config ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"  # ChromeDriver binary used by Selenium
SLEEP_TIME = 5      # seconds to wait after each page load before parsing
N_PLAYERS = 500     # batch size
START_INDEX = 3000  # 👈 Set this to the position where the previous run stopped

# === Paths ===
players_config_path = Path("data/meta/players_config.yaml")
male_players_path = Path("data/meta/male_players.yaml")
output_path = Path("data/meta/unmatched_competitions_sample.csv")

# === Load YAMLs
# safe_load returns None for an empty document; fall back to an empty list
with open(players_config_path, "r", encoding="utf-8") as f:
    players_config = yaml.safe_load(f) or []

if male_players_path.exists():
    with open(male_players_path, "r", encoding="utf-8") as f:
        male_players = yaml.safe_load(f) or []
else:
    male_players = []

# === Filter Players Not Yet Processed
# NOTE(review): START_INDEX indexes the *filtered* list, so its meaning shifts
# whenever male_players.yaml gains entries between runs — confirm this is intended.
processed_names = {p["name"] for p in male_players}
remaining_players = [p for p in players_config if p["name"] not in processed_names]
batch = remaining_players[START_INDEX:START_INDEX + N_PLAYERS]

# === Setup Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape Competitions
all_competitions = set()

try:
    for i, player in enumerate(batch, START_INDEX + 1):
        name = player["name"]
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Competition links that appear inside any table of the profile page
            comps = soup.select("table a[href*='/en/comps/']")
            for tag in comps:
                comp_name = tag.text.strip()
                if comp_name:
                    all_competitions.add(comp_name)
        except Exception as e:
            # Best effort: skip the failing player and keep going
            print(f"❌ Error on {name}: {e}")
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving the competitions collected so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save to CSV
# Every batch writes to the same file: merge with what earlier batches stored
# there instead of overwriting it.
if output_path.exists():
    previous = pd.read_csv(output_path)
    if "competition_name" in previous.columns:
        all_competitions.update(previous["competition_name"].dropna().astype(str))

df = pd.DataFrame(sorted(all_competitions), columns=["competition_name"])
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n✅ {len(df)} unique competitions saved to: {output_path}")


In [None]:
# Código para obtener un csv con competiciones unique y otros datos.

# Opción automática con checkpoint

# 📦 Imports
import time
import yaml
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Config ===
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"  # ChromeDriver binary used by Selenium
SLEEP_TIME = 5      # seconds to wait after each page load before parsing
N_PLAYERS = 500     # batch size
CHECKPOINT_PATH = Path("data/meta/competition_checkpoint.txt")  # stores the next start index

# === Paths ===
players_config_path = Path("data/meta/players_config.yaml")
male_players_path = Path("data/meta/male_players.yaml")
output_path = Path("data/meta/unmatched_competitions_sample.csv")

# === Load YAMLs
# safe_load returns None for an empty document; fall back to an empty list
with open(players_config_path, "r", encoding="utf-8") as f:
    players_config = yaml.safe_load(f) or []

if male_players_path.exists():
    with open(male_players_path, "r", encoding="utf-8") as f:
        male_players = yaml.safe_load(f) or []
else:
    male_players = []

# === Load Checkpoint
if CHECKPOINT_PATH.exists():
    with open(CHECKPOINT_PATH, "r") as f:
        # An empty checkpoint file is treated as "start from the beginning"
        start_index = int(f.read().strip() or 0)
else:
    start_index = 0

print(f"⏩ Starting from player {start_index}")

# === Filter Players Not Yet Processed
# NOTE(review): the checkpoint indexes the *filtered* list, so its meaning shifts
# whenever male_players.yaml gains entries between runs — confirm this is intended.
processed_names = {p["name"] for p in male_players}
remaining_players = [p for p in players_config if p["name"] not in processed_names]
batch = remaining_players[start_index:start_index + N_PLAYERS]

# === Setup Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape Competitions
all_competitions = set()
players_visited = 0  # how far this run really got; drives the new checkpoint

try:
    for i, player in enumerate(batch, start=start_index + 1):
        name = player["name"]
        url = f"https://fbref.com/en/players/{player['id']}/{name.replace('_', '-')}"
        print(f"\n🔍 [{i}] Checking: {name}")
        print(f"🌍 {url}")

        try:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Competition links that appear inside any table of the profile page
            comps = soup.select("table a[href*='/en/comps/']")
            for tag in comps:
                comp_name = tag.text.strip()
                if comp_name:
                    all_competitions.add(comp_name)
        except Exception as e:
            # Best effort: a failing player is still counted as visited
            print(f"❌ Error on {name}: {e}")

        players_visited += 1
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving progress made so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save competitions
# Successive checkpointed batches share one output file: merge with what is
# already stored there instead of overwriting earlier batches.
if output_path.exists():
    previous = pd.read_csv(output_path)
    if "competition_name" in previous.columns:
        all_competitions.update(previous["competition_name"].dropna().astype(str))

df = pd.DataFrame(sorted(all_competitions), columns=["competition_name"])
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n✅ {len(df)} unique competitions saved to: {output_path}")

# === Save new checkpoint
# Advance only by the players actually visited, so a short or interrupted
# batch never skips players on the next run.
new_index = start_index + players_visited
CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(CHECKPOINT_PATH, "w") as f:
    f.write(str(new_index))
print(f"📝 New checkpoint saved: {new_index}")


In [None]:
# Código para obtener url de competiciones raras o con otros nombres para crear un csv y cruzar con male_competitions. Funciona perfectamente.
# Nos generará el competitions_mapped_test.csv con los nombres de las competiciones y el nombre oficial que le hemos dado.
# Luego habrá que cruzar con el csv de competiciones para ver si hay alguna que no esté en el csv de competiciones, por temas de alias o nombres raros.
# Hace un test de las 5 primeras filas.

# 📦 Imports
import pandas as pd
import time
import random
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Paths ===
input_csv = Path("data/meta/competitions_to_map_alias.csv")
output_csv = Path("data/meta/competitions_mapped_test.csv")

# === Load first 5 competitions
# Trial run: only the first five non-empty names are searched.
df = pd.read_csv(input_csv)
competitions = df["original_name"].dropna().head(5).tolist()

# === Selenium setup
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

options = Options()
# options.add_argument("--headless")  # Show browser for now
for chrome_flag in ("--disable-gpu", "--start-maximized"):
    options.add_argument(chrome_flag)

service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Gender extraction
def extract_gender(driver):
    """Read the competition gender from the page currently loaded in ``driver``.

    Looks for a ``<strong>`` element whose text is exactly "Gender" and
    inspects the text of its parent ``<p>``. Only the English label is
    matched, so pages served in another language yield "unknown".

    Args:
        driver: Selenium WebDriver positioned on the competition page.

    Returns:
        "female", "male" or "unknown" (also returned when the lookup fails).
    """
    try:
        gender_element = driver.find_element(By.XPATH, '//strong[text()="Gender"]/parent::p')
        gender_text = gender_element.text.lower()
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed while scraping.
        return "unknown"

    # "female" must be tested first because it contains the substring "male"
    if "female" in gender_text:
        return "female"
    if "male" in gender_text:
        return "male"
    return "unknown"

# === Helper to close Bing cookies overlay
def close_bing_overlay(driver):
    """Click the first "Accept" submit input on the current page, if one exists.

    Best effort: any failure is printed and otherwise ignored, and nothing is
    returned.
    """
    accept_xpath = '//input[@type="submit" and contains(@aria-label, "Accept")]'
    try:
        # Wait a little for any overlay to load
        time.sleep(1)
        candidates = driver.find_elements(By.XPATH, accept_xpath)
        if not candidates:
            return
        candidates[0].click()
        print("✅ Closed Bing overlay")
        time.sleep(2)  # Give it a moment after closing
    except Exception as e:
        print(f"⚠️ Could not close overlay: {e}")

# === Process each competition
# Local import so this cell stays self-contained: quote_plus URL-encodes the
# query (accents, "&", "/", ...) instead of only replacing spaces.
from urllib.parse import quote_plus

results = []
total = len(competitions)
# Handle of the tab used for the searches; every other tab is a result tab.
search_window = driver.current_window_handle

try:
    for i, comp in enumerate(competitions, 1):
        try:
            query = f"{comp} FBREF"
            # Progress is reported against the real number of competitions loaded
            print(f"\n🔍 [{i}/{total}] Searching: {query}")

            # Bing search
            search_url = f"https://www.bing.com/search?q={quote_plus(query)}"
            driver.get(search_url)
            time.sleep(random.uniform(5, 7))

            # Try closing overlay if appears
            close_bing_overlay(driver)

            # Find FBREF link: open the first organic result pointing to fbref.com
            results_links = driver.find_elements(By.CSS_SELECTOR, 'li.b_algo h2 a')
            fbref_url = ""
            for link in results_links:
                href = link.get_attribute("href")
                if href and "fbref.com" in href:
                    fbref_url = href
                    link.click()
                    break

            if not fbref_url:
                print(f"⚠️ No FBREF result found for '{comp}'")
                continue

            # Wait for page + switch if new tab
            time.sleep(random.uniform(6, 8))
            if len(driver.window_handles) > 1:
                driver.switch_to.window(driver.window_handles[-1])
                time.sleep(1)

            official_title = driver.title
            gender = extract_gender(driver)

            print(f"✅ {official_title} | Gender: {gender}")

            results.append({
                "original_name": comp,
                "fbref_url": driver.current_url,
                "official_title": official_title,
                "gender": gender
            })

            # Randomised pause between searches
            time.sleep(random.uniform(4, 6))

        except Exception as e:
            # Best effort: report and move on to the next competition
            print(f"❌ Error on '{comp}': {e}")
        finally:
            # Close result tabs even after an error, so a failed iteration does
            # not leave the driver focused on a stale tab for the next search.
            try:
                for handle in driver.window_handles:
                    if handle != search_window:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(search_window)
            except Exception as cleanup_error:
                print(f"⚠️ Could not restore search tab: {cleanup_error}")
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving the results collected so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save to CSV
if results:
    df_results = pd.DataFrame(results)
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df_results.to_csv(output_csv, index=False)
    print(f"\n💾 Test results saved to: {output_csv}")
    print(df_results)
else:
    print("\n⚠️ No results to save.")





🔍 [1/5] Searching: 1. HNL FBREF
✅ 2021-2022 1. HNL Stats | FBref.com | Gender: male

🔍 [2/5] Searching: 2. Bundesliga FBREF
✅ 2. Bundesliga Stats | FBref.com | Gender: male

🔍 [3/5] Searching: 3. Liga FBREF
✅ 3. Liga Stats | FBref.com | Gender: male

🔍 [4/5] Searching: A Group FBREF
✅ Serie A Stats | FBref.com | Gender: male

🔍 [5/5] Searching: A League FBREF
✅ A-League Men Estadísticas | FBref.com | Gender: unknown

💾 Test results saved to: data\meta\competitions_mapped_test.csv
   original_name                                          fbref_url  \
0         1. HNL  https://fbref.com/en/comps/63/2021-2022/2021-2...   
1  2. Bundesliga   https://fbref.com/en/comps/33/2-Bundesliga-Stats   
2        3. Liga         https://fbref.com/en/comps/59/3-Liga-Stats   
3        A Group        https://fbref.com/en/comps/11/Serie-A-Stats   
4       A League  https://fbref.com/es/comps/65/Estadisticas-de-...   

                          official_title   gender  
0     2021-2022 1. HNL Stats | FBre

In [4]:
# Scraping de competiciones masculinas raras o secundarias y obtención de URLs para añadir a un csv máster de competiciones masculinas.

# 📦 Imports
import pandas as pd
import time
import random
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# === Paths ===
input_csv = Path("data/meta/competitions_to_map_alias.csv")
output_csv = Path("data/meta/sec_male_competitions.csv")

# === Load all competitions
# Every non-empty name in the alias template is searched.
df = pd.read_csv(input_csv)
competitions = df["original_name"].dropna().tolist()

# === Selenium setup
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

options = Options()
# options.add_argument("--headless")  # Show browser for now
for chrome_flag in ("--disable-gpu", "--start-maximized"):
    options.add_argument(chrome_flag)

service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Gender extraction function
def extract_gender(driver):
    """Read the competition gender from the page currently loaded in ``driver``.

    Looks for a ``<strong>`` element whose text is exactly "Gender" and
    inspects the text of its parent ``<p>``. Only the English label is
    matched, so pages served in another language yield "unknown".

    Args:
        driver: Selenium WebDriver positioned on the competition page.

    Returns:
        "female", "male" or "unknown" (also returned when the lookup fails).
    """
    try:
        gender_element = driver.find_element(By.XPATH, '//strong[text()="Gender"]/parent::p')
        gender_text = gender_element.text.lower()
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed while scraping.
        return "unknown"

    # "female" must be tested first because it contains the substring "male"
    if "female" in gender_text:
        return "female"
    if "male" in gender_text:
        return "male"
    return "unknown"

# === Helper to close Bing cookies overlay
def close_bing_overlay(driver):
    """Click the first "Accept" submit input on the current page, if one exists.

    Best effort: any failure is printed and otherwise ignored, and nothing is
    returned.
    """
    accept_xpath = '//input[@type="submit" and contains(@aria-label, "Accept")]'
    try:
        # Short pause before looking for the overlay
        time.sleep(1)
        candidates = driver.find_elements(By.XPATH, accept_xpath)
        if not candidates:
            return
        candidates[0].click()
        print("✅ Closed Bing overlay")
        time.sleep(2)
    except Exception as e:
        print(f"⚠️ Could not close overlay: {e}")

# === Process each competition
# Local import so this cell stays self-contained: quote_plus URL-encodes the
# query (accents, "&", "/", ...) instead of only replacing spaces.
from urllib.parse import quote_plus

results = []
total = len(competitions)
# Handle of the tab used for the searches; every other tab is a result tab.
search_window = driver.current_window_handle

try:
    for i, comp in enumerate(competitions, 1):
        try:
            query = f"{comp} FBREF"
            print(f"\n🔍 [{i}/{total}] Searching: {query}")

            # Bing search
            search_url = f"https://www.bing.com/search?q={quote_plus(query)}"
            driver.get(search_url)
            time.sleep(random.uniform(5, 7))

            close_bing_overlay(driver)

            # Find FBREF link: open the first organic result pointing to fbref.com
            results_links = driver.find_elements(By.CSS_SELECTOR, 'li.b_algo h2 a')
            fbref_url = ""
            for link in results_links:
                href = link.get_attribute("href")
                if href and "fbref.com" in href:
                    fbref_url = href
                    link.click()
                    break

            if not fbref_url:
                print(f"⚠️ No FBREF result found for '{comp}'")
                continue

            # Wait for page + switch if new tab
            time.sleep(random.uniform(6, 8))
            if len(driver.window_handles) > 1:
                driver.switch_to.window(driver.window_handles[-1])
                time.sleep(1)

            official_title = driver.title
            gender = extract_gender(driver)

            print(f"✅ {official_title} | Gender: {gender}")

            # Only save if gender == male
            # NOTE(review): extract_gender matches the English "Gender" label only,
            # so results landing on a non-English FBref page are dropped here as
            # "unknown" — confirm whether those should be retried.
            if gender == "male":
                results.append({
                    "original_name": comp,
                    "fbref_url": driver.current_url,
                    "official_title": official_title,
                    "gender": gender
                })

            # Randomised pause between searches
            time.sleep(random.uniform(4, 6))

        except Exception as e:
            # Best effort: report and move on to the next competition
            print(f"❌ Error on '{comp}': {e}")
        finally:
            # Close result tabs even after an error, so a failed iteration does
            # not leave the driver focused on a stale tab for the next search.
            try:
                for handle in driver.window_handles:
                    if handle != search_window:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(search_window)
            except Exception as cleanup_error:
                print(f"⚠️ Could not restore search tab: {cleanup_error}")
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving the results collected so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Save to CSV
if results:
    df_results = pd.DataFrame(results)
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df_results.to_csv(output_csv, index=False)
    print(f"\n💾 Male competitions saved to: {output_csv}")
    print(df_results)
else:
    print("\n⚠️ No male competitions found to save.")



🔍 [1/158] Searching: 1. HNL FBREF
✅ 2021-2022 1. HNL Scores & Fixtures | FBref.com | Gender: male

🔍 [2/158] Searching: 2. Bundesliga FBREF
✅ 2. Bundesliga Statistiken | FBref.com | Gender: unknown

🔍 [3/158] Searching: 3. Liga FBREF
✅ 3. Liga Stats | FBref.com | Gender: male

🔍 [4/158] Searching: A Group FBREF
✅ Serie A Stats | FBref.com | Gender: male

🔍 [5/158] Searching: A League FBREF
✅ A-League Men Stats | FBref.com | Gender: male

🔍 [6/158] Searching: A League Men FBREF
✅ A-League Men Stats | FBref.com | Gender: male

🔍 [7/158] Searching: A League Women FBREF
✅ A-League Women Stats | FBref.com | Gender: female

🔍 [8/158] Searching: Alpha Ethniki FBREF
✅ 2001-2002 Alpha Ethniki Stats | FBref.com | Gender: male

🔍 [9/158] Searching: Apertura First stage FBREF
✅ Primera A Marcadores y partidos | FBref.com | Gender: unknown

🔍 [10/158] Searching: Asian Cup FBREF
✅ 2027 Asian Cup Stats | FBref.com | Gender: male

🔍 [11/158] Searching: Asian Cup Q FBREF
⚠️ No FBREF result found for '

In [6]:
# Crear csv master de competiciones masculinas

# 📦 Imports
import pandas as pd
from pathlib import Path

# === Paths
male_competitions_path = Path("data/meta/male_competitions.csv")
sec_male_competitions_path = Path("data/meta/sec_male_competitions.csv")
output_master_path = Path("data/meta/male_competitions_master.csv")

# === Load datasets
df_male = pd.read_csv(male_competitions_path)
df_sec = pd.read_csv(sec_male_competitions_path)

# === Normalize column names
# df_male is read through its 'name' and 'url' columns.
# df_sec columns: 'original_name', 'fbref_url', 'official_title', 'gender'.
df_sec = df_sec.rename(columns={
    "fbref_url": "url",
    "official_title": "official_name",
    "original_name": "alias_name"
})

# Filter only male in second dataset (redundant but safe)
df_sec = df_sec[df_sec["gender"] == "male"]

# === Official competitions
# Store the official name under 'competition_name', the same column the new
# entries use. Previously the official rows kept it under 'name', so the
# concatenated file had two half-empty name columns and readers of
# 'competition_name' never saw an official name.
master = df_male.copy()
if "competition_name" not in master.columns:
    master = master.rename(columns={"name": "competition_name"})

# One row per (url, alias) pair. A left merge keeps every official competition
# and all of its aliases; the former dict(zip(...)) lookup silently kept only
# one alias when several scraped names resolved to the same URL.
alias_pairs = df_sec[["url", "alias_name"]].drop_duplicates()
master = master[["competition_name", "url"]].merge(alias_pairs, on="url", how="left")
master["gender"] = "male"
master = master[["competition_name", "url", "alias_name", "gender"]]

# === New competitions not yet in the original male_competitions
existing_urls = set(df_male["url"])
new_entries = df_sec[~df_sec["url"].isin(existing_urls)].copy()

# No official name is known for these: the scraped alias doubles as the name
new_entries["competition_name"] = new_entries["alias_name"]
new_entries = new_entries[["competition_name", "url", "alias_name", "gender"]]

# === Combine
final_master = pd.concat([master, new_entries], ignore_index=True)

# === Save to master CSV
output_master_path.parent.mkdir(parents=True, exist_ok=True)
final_master.to_csv(output_master_path, index=False)
print(f"💾 Master competition file saved to: {output_master_path}")


💾 Master competition file saved to: data\meta\male_competitions_master.csv


In [None]:
# Añadir nuevos jugadores a male players, segunda pasada

# 📦 Imports
import yaml
import time
import pandas as pd
import shutil
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

# === Paths ===
players_config_path = Path("data/meta/players_config.yaml")
male_players_path = Path("data/meta/male_players.yaml")
master_competitions_path = Path("data/meta/male_competitions_master.csv")
backup_path = Path("data/meta/male_players_backup.yaml")  # Backup!

# === Load Files ===
# safe_load returns None for an empty document; fall back to an empty list
with open(players_config_path, "r", encoding="utf-8") as f:
    players_config = yaml.safe_load(f) or []

if male_players_path.exists():
    with open(male_players_path, "r", encoding="utf-8") as f:
        male_players = yaml.safe_load(f) or []
else:
    male_players = []

# Load master competitions
master_comp_df = pd.read_csv(master_competitions_path)


def _names_in(column):
    """Return the trimmed, non-null values of ``column``, or an empty set if it is absent."""
    if column not in master_comp_df.columns:
        return set()
    return set(master_comp_df[column].dropna().astype(str).str.strip())


# Create sets for fast lookup.
# Official names may sit under 'competition_name' or under 'name', depending on
# how the master file was written, so both columns are honoured.
official_comp_names = _names_in("competition_name") | _names_in("name")
alias_names = _names_in("alias_name")
all_valid_names = official_comp_names.union(alias_names)

# === Set last scraped player url
last_scraped_url = "https://fbref.com/en/players/10f81c11/matchlogs/{season}/Qudus-Akanni-Match-Logs"

# Find starting point
start_index = 0
marker_found = False

for idx, player in enumerate(players_config):
    if player.get("url_template") == last_scraped_url:
        start_index = idx + 1  # Start AFTER this player
        marker_found = True
        break

if not marker_found:
    print("⚠️ last_scraped_url not found in players_config; starting from index 0")

remaining_players = players_config[start_index:]

# Guard the preview: indexing [0] raised IndexError when nothing was left
if remaining_players:
    print(f"🚀 Starting scraping from index {start_index} ({remaining_players[0]['name']})")
else:
    print(f"🚀 No players left to scrape after index {start_index}")

# === Setup Selenium
CHROME_PATH = "C:/Windows/System32/chromedriver.exe"

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
service = Service(executable_path=CHROME_PATH)
driver = webdriver.Chrome(service=service, options=options)

# === Scrape players
new_players = []
players_added_count = 0
# Ids already stored (or added during this run); prevents duplicate entries
known_ids = {p.get("id") for p in male_players}

try:
    for idx, player in enumerate(remaining_players, start=start_index + 1):
        player_id = player["id"]
        player_name = player["name"]

        if player_id in known_ids:
            print(f"\n⏩ [{idx}/{len(players_config)}] Already in male players: {player_name}")
            continue

        print(f"\n🔍 [{idx}/{len(players_config)}] Checking {player_name}...")

        try:
            profile_url = f"https://fbref.com/en/players/{player_id}/{player_name.replace('_', '-')}"
            driver.get(profile_url)
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract competition names from player's tables
            comps_tags = soup.select("table a[href*='/en/comps/']")
            competitions_found = {tag.text.strip() for tag in comps_tags if tag.text.strip()}

            # Exact (case-sensitive) match against official names and aliases
            matched = not competitions_found.isdisjoint(all_valid_names)

            if matched:
                print(f"✅ Adding {player_name}")
                new_players.append({
                    "id": player_id,
                    "name": player_name,
                    "url_template": player["url_template"]
                })
                known_ids.add(player_id)
                players_added_count += 1
                print(f"🎯 Total players added so far: {players_added_count}")
            else:
                print(f"⚠️ No matching competition found for {player_name}")

        except Exception as e:
            # Best effort: skip the failing player and keep going
            print(f"❌ Error scraping {player_name}: {e}")
except KeyboardInterrupt:
    # Keep what was collected instead of losing it on a manual stop
    print("\n⏹️ Interrupted by user; saving the players found so far.")
finally:
    # Always release the browser, even when the loop is interrupted
    driver.quit()

# === Update male_players.yaml safely ===
if new_players:
    # 1. Create a backup (only possible when the file already exists)
    if male_players_path.exists():
        shutil.copy(male_players_path, backup_path)
        print(f"\n📦 Backup created at {backup_path}")

    # 2. Extend existing list
    male_players.extend(new_players)

    # 3. Save updated male_players.yaml
    male_players_path.parent.mkdir(parents=True, exist_ok=True)
    with open(male_players_path, "w", encoding="utf-8") as f:
        yaml.dump(male_players, f, allow_unicode=True)

    print(f"\n💾 {len(new_players)} new players added to {male_players_path}")
else:
    print("\n✅ No new players found to add.")



🚀 Starting scraping from index 3697 (remi_akanni)

🔍 Checking remi_akanni...
⚠️ No matching competition found for remi_akanni

🔍 Checking soufian_akanni...
⚠️ No matching competition found for soufian_akanni

🔍 Checking jacob_akanyirige...
✅ jacob_akanyirige will be added.

🔍 Checking akira_akao...
⚠️ No matching competition found for akira_akao

🔍 Checking ilias_akaouch...
✅ ilias_akaouch will be added.

🔍 Checking carlos_akapo...
✅ carlos_akapo will be added.

🔍 Checking javier_akapo...
⚠️ No matching competition found for javier_akapo

🔍 Checking akin_akar...
✅ akin_akar will be added.

🔍 Checking eray_akar...
⚠️ No matching competition found for eray_akar

🔍 Checking rita_akarekor...
⚠️ No matching competition found for rita_akarekor

🔍 Checking muhammed_akarslan...
⚠️ No matching competition found for muhammed_akarslan

🔍 Checking arda_akarsu...
⚠️ No matching competition found for arda_akarsu

🔍 Checking murat_akarsu...
⚠️ No matching competition found for murat_akarsu

🔍 Checkin

In [9]:
# Función para corregir el error de la falta de scrapeo de URLs en male_players.append

# 📦 Imports
import yaml
from pathlib import Path

# === Paths
# Relative path: resolved against the current working directory, so this cell
# expects the notebook to have already changed into the project root.
male_players_path = Path("data/meta/male_players.yaml")

# === Load YAML
with open(male_players_path, "r", encoding="utf-8") as f:
    # yaml.safe_load returns None for an empty document; fall back to an empty
    # list so iterating over `players` is a no-op instead of a TypeError.
    players = yaml.safe_load(f) or []

# === URL builder
def build_url_template(player_id, player_name):
    """Build the FBref match-logs URL template for one player.

    Args:
        player_id: Identifier placed in the ``/players/<id>/`` path segment.
        player_name: Underscore-separated name (e.g. ``"remi_akanni"``).

    Returns:
        A URL string that still contains a literal ``{season}`` placeholder,
        to be filled in later with ``str.format(season=...)``. Each
        underscore-separated part of the name is capitalized (first letter
        upper-case, the rest lower-case) and the parts are joined with ``-``.
    """
    name_slug = "-".join(map(str.capitalize, player_name.split("_")))
    # `{{season}}` is escaped so the placeholder survives this format call.
    pattern = "https://fbref.com/en/players/{}/matchlogs/{{season}}/{}-Match-Logs"
    return pattern.format(player_id, name_slug)

# === Fill missing url_template
# A player is treated as missing a template when the key is absent OR when it
# holds an empty value (e.g. `url_template: null` or "" in the YAML): neither
# is a usable URL, so both are rebuilt from the player's id and name.
updated = False
for player in players:
    if not player.get("url_template"):
        player["url_template"] = build_url_template(player["id"], player["name"])
        updated = True

# === Save only if something was changed
# Skipping the write when nothing changed leaves the file on disk untouched.
if updated:
    with open(male_players_path, "w", encoding="utf-8") as f:
        # allow_unicode=True keeps non-ASCII characters readable instead of escaped.
        yaml.dump(players, f, allow_unicode=True)
    print("✅ URL templates added to players without them.")
else:
    print("✅ All players already had url_template.")


✅ URL templates added to players without them.


# 🧩 Proyecto: Scraping y filtrado de jugadores masculinos (FBRef)

## ✅ Paso 1: Scraping inicial de jugadores

- Leer archivo `players_config.yaml` con todos los jugadores.
- Leer archivo `male_players.yaml` con los ya identificados como masculinos.
- Filtrar jugadores **no procesados previamente** (por nombre).
- Scrapear por bloques (ej.: 500 jugadores) y verificar si cada jugador participa en competiciones masculinas conocidas o en las excepciones.
- Guardar los jugadores válidos en `male_players.yaml`.

---

## ✅ Paso 2: Extracción de competiciones de jugadores descartados

- Filtrar jugadores **que no están** en `male_players.yaml`.
- Para cada uno, extraer todas las competiciones encontradas en su perfil.
- Generar un CSV con **nombres únicos** de competiciones (`unmatched_competitions_sample.csv`).
- Este archivo servirá para identificar:
  - Nombres incorrectos
  - Alias no controlados
  - Posibles ligas femeninas
- Aquí hemos añadido la opción de usar `competitions_to_map_alias.csv` para buscar en Google los enlaces a las ligas y así comparar las URLs.

---

## ✅ Paso 3: Segundo barrido con diccionario de excepciones

- Crear un diccionario `EXCEPTION_COMPETITIONS` con alias → nombre estándar.
- Aplicar lógica `is_valid_competition()` mejorada con fuzzy matching.
- Revisar **solo los jugadores descartados**.
- Agregar los jugadores válidos por alias o excepciones a `male_players.yaml`.

---

## ✅ Paso 4: Verificación de errores

- Revisar `male_players.yaml` para comprobar que **no se han incluido mujeres**.
  - Buscar patrones como: "Liga femenina", "Toppserien", etc.
- Eliminar los jugadores que no cumplan con las condiciones.
- Guardar un nuevo archivo: `male_players_filtered.yaml`.

---

## ✅ Paso 5: Exportación a formato final

- Leer `male_players_filtered.yaml`.
- Exportar a `male_players_final.csv` para análisis en Pandas, R, Excel, etc.
- Asegurarse de que cada fila contiene:
  - Nombre, ID, URL, país, competiciones válidas (opcionalmente)

---

## 🛠 Extras opcionales

- Añadir log de progreso (`n of total`).
- Contador de tiempo estimado por bloque.
- Registro de errores durante scraping.
- Comparativa entre `players_config` y `male_players.yaml` para debugging.

