In [1]:
from pathlib import Path
import os

# Establece la raíz del proyecto manualmente
project_root = Path("F:/JCMDataCenter/Cursos/Evolve Academy/Data Scientist IA/Futpeak") # sobremesa
#project_root = Path("C:/Users/juanm/Desktop/FUTPEAK/Futpeak") # portátil

# Cambia el directorio de trabajo actual a esa raíz
os.chdir(project_root)

print("📁 Directorio de trabajo actual:", Path.cwd())


📁 Directorio de trabajo actual: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


In [9]:
#  BLOQUE DE CÓDIGO para detectar líneas con 51 columnas

from pathlib import Path

raw_path = Path("data/raw/raw_matchlogs_ordered.csv")

# Buscar primera línea válida con 51 columnas no vacías
with open(raw_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        parts = [p.strip() for p in line.strip().split(",")]
        if len(parts) == 51 and any(p for p in parts):  # 51 columnas y alguna no vacía
            print(f"✅ Line {i} has 51 columns and content:")
            for j, val in enumerate(parts):
                print(f"Col {j+1:02d}: {val}")
            break



✅ Line 2 has 51 columns and content:
Col 01: Date
Col 02: Day
Col 03: Comp
Col 04: Round
Col 05: Venue
Col 06: Result
Col 07: Squad
Col 08: Opponent
Col 09: Start
Col 10: Pos
Col 11: Min
Col 12: season
Col 13: player_name
Col 14: player_id
Col 15: Gls
Col 16: Ast
Col 17: PK
Col 18: PKatt
Col 19: Sh
Col 20: SoT
Col 21: CrdY
Col 22: CrdR
Col 23: Fls
Col 24: Fld
Col 25: Off
Col 26: Crs
Col 27: TklW
Col 28: Int
Col 29: OG
Col 30: PKwon
Col 31: PKcon
Col 32: Match Report
Col 33: Touches
Col 34: Tkl
Col 35: Blocks
Col 36: xG
Col 37: npxG
Col 38: xAG
Col 39: SCA
Col 40: GCA
Col 41: Cmp
Col 42: Att
Col 43: Cmp%
Col 44: PrgP
Col 45: Carries
Col 46: PrgC
Col 47: Succ
Col 48: 
Col 49: 
Col 50: 
Col 51: 


In [11]:
from pathlib import Path

# === Load raw CSV as lines
raw_path = Path("data/raw/raw_matchlogs_ordered.csv")
fixed_lines = []

with open(raw_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip().strip("\n")
        parts = line.split(",")
        if len(parts) < 51:
            parts += [""] * (51 - len(parts))
        elif len(parts) > 51:
            parts = parts[:51]
        fixed_lines.append(parts)

# === Define expected columns
expected_columns = [
    "Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start", "Pos", "Min",
    "season", "player_name", "player_id", "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR",
    "Fls", "Fld", "Off", "Crs", "TklW", "Int", "OG", "PKwon", "PKcon", "Match Report", "Touches",
    "Tkl", "Blocks", "xG", "npxG", "xAG", "SCA", "GCA", "Cmp", "Att", "Cmp%", "PrgP", "Carries",
    "PrgC", "Succ", "Extra1", "Extra2", "Extra3", "Extra4"
]


# === Create DataFrame
import pandas as pd
df = pd.DataFrame(fixed_lines, columns=expected_columns)
print("✅ Raw data loaded with 51 columns aligned.")


✅ Raw data loaded with 51 columns aligned.


In [13]:
# Drop rows that only contain season, player_name, player_id and are otherwise empty
non_empty_cols = df.drop(columns=["season", "player_name", "player_id"]).notna().sum(axis=1)
df = df[non_empty_cols > 0].copy()
print(f"✅ Remaining rows after dropping non-match rows: {len(df)}")


✅ Remaining rows after dropping non-match rows: 621128


In [14]:
# Nullify stats if player did not play
no_play_mask = df["Pos"] == "On matchday squad, but did not play"

# Columnas que no se deben tocar
meta_cols = ["Date", "Day", "Comp", "Round", "Venue", "Result", "Squad", "Opponent",
             "Start", "Pos", "Min", "season", "player_name", "player_id"]

# El resto serán estadísticas a vaciar
stat_cols = [col for col in df.columns if col not in meta_cols]
df.loc[no_play_mask, stat_cols] = None

print(f"✅ Stats nulled for {no_play_mask.sum()} matches where player did not play.")


✅ Stats nulled for 0 matches where player did not play.


In [15]:
# === Rename columns to standard format
rename_dict = {
    "player_id": "Player_ID",
    "player_name": "Player_name",
    "season": "Seasons",
    "Date": "Date",
    "Day": "Day",
    "Comp": "Competition",
    "Round": "Round",
    "Venue": "Home_Away",
    "Result": "Result",
    "Squad": "Player_team",
    "Opponent": "Rival_team",
    "Start": "Start",
    "Pos": "Position",
    "Min": "Minutes",
    "Gls": "Goals",
    "Ast": "Assists",
    "PK": "Penalty_kick",
    "PKatt": "Penalty_kick_att",
    "Sh": "Shots",
    "SoT": "Shots_on_target",
    "CrdY": "Yellow_cards",
    "CrdR": "Red_cards",
    "Fls": "Fls",
    "Fld": "Fld",
    "Off": "Off",
    "Crs": "Crs",
    "TklW": "TklW",
    "Int": "Interceptions",
    "OG": "OG",
    "PKwon": "PKwon",
    "PKcon": "PKcon",
    "Touches": "Touches",
    "Tkl": "Tackles",
    "Blocks": "Blocks",
    "xG": "xG",
    "npxG": "non_penalty_xG",
    "xAG": "x_assisted_G",
    "SCA": "Shot_creating_actions",
    "GCA": "Goal_creating_actions",
    "Cmp": "Passes_completed",
    "Att": "Passes_att",
    "Cmp%": "Percent_passes",
    "PrgP": "Progressive_passes",
    "Carries": "Feet_control",
    "PrgC": "Progressive_control",
    "Succ": "Dribling_suc"
}

df.rename(columns=rename_dict, inplace=True)

# === Drop 'Match Report' column if it exists
if "Match Report" in df.columns:
    df.drop(columns=["Match Report"], inplace=True)

# === Format player name
df["Player_name"] = df["Player_name"].astype(str).str.replace("_", " ").str.title()

# === Final column order
final_columns = [
    "Player_name", "Player_ID", "Seasons", "Date", "Day", "Competition", "Round", "Home_Away",
    "Result", "Player_team", "Rival_team", "Start", "Position", "Minutes", "Goals", "Assists",
    "Penalty_kick", "Penalty_kick_att", "Shots", "Shots_on_target", "Yellow_cards", "Red_cards",
    "Fls", "Fld", "Off", "Crs", "TklW", "Interceptions", "OG", "PKwon", "PKcon", "Touches",
    "Tackles", "Blocks", "xG", "non_penalty_xG", "x_assisted_G", "Shot_creating_actions",
    "Goal_creating_actions", "Passes_completed", "Passes_att", "Percent_passes",
    "Progressive_passes", "Feet_control", "Progressive_control", "Dribling_suc"
]

# Fill missing columns with None
for col in final_columns:
    if col not in df.columns:
        df[col] = None

# Reorder
df = df[final_columns]

print(f"✅ Columns renamed, 'Match Report' dropped, and columns ordered correctly.")


✅ Columns renamed, 'Match Report' dropped, and columns ordered correctly.


In [16]:
import numpy as np

# === 1. Remove lines without valid Date (i.e., solo Season, Name, ID)
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df[df["Date"].notna() & df["Player_ID"].notna() & df["Player_name"].notna()]

# === 2. Nullify stats if player did not play
no_play_mask = df["Position"] == "On matchday squad, but did not play"
meta_cols = [
    "Player_name", "Player_ID", "Seasons", "Date", "Day", "Competition", "Round",
    "Home_Away", "Result", "Player_team", "Rival_team", "Start", "Position"
]
stat_cols = [col for col in df.columns if col not in meta_cols]
df.loc[no_play_mask, stat_cols] = np.nan

print(f"✅ Cleaned empty lines and nullified stats for matches not played.")


  df["Date"] = pd.to_datetime(df["Date"], errors="coerce")


✅ Cleaned empty lines and nullified stats for matches not played.


In [17]:
from pathlib import Path

# === Output path
clean_output = Path("data/processed/matchlogs_cleaned_test.csv")
clean_output.parent.mkdir(parents=True, exist_ok=True)

# === Sample 5 player IDs
sample_ids = df["Player_ID"].dropna().unique()[:5]
df_sample = df[df["Player_ID"].isin(sample_ids)].copy()

# === Save to CSV
df_sample.to_csv(clean_output, index=False, encoding="utf-8")
print(f"🎯 Clean CSV saved at: {clean_output}")


🎯 Clean CSV saved at: data\processed\matchlogs_cleaned_test.csv


In [19]:
from pathlib import Path
import re

input_path = Path("data/raw/raw_matchlogs_ordered.csv")
output_path = Path("data/processed/processed_matchlogs_normalized.csv")

# Leer todo el contenido
with open(input_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Usar la línea buena como encabezado
header = lines[1]
expected_cols = header.split(",")

# Prepara archivo nuevo
with open(output_path, "w", encoding="utf-8") as out:
    out.write("Player_name,Player_ID,Seasons," + ",".join(expected_cols) + "\n")

    for line in lines[2:]:
        cols = line.split(",")

        # Rellenar o cortar hasta 51 columnas exactas
        if len(cols) < 51:
            cols += [""] * (51 - len(cols))
        elif len(cols) > 51:
            cols = cols[:51]

        # Detectar name, ID y season
        name = next((c for c in cols if re.fullmatch(r"[a-zA-ZáéíóúÁÉÍÓÚñÑüÜ]{2,}_[a-zA-ZáéíóúÁÉÍÓÚñÑüÜ]{2,}", c)), "")
        pid = next((c for c in cols if re.fullmatch(r"[a-f0-9]{8}", c)), "")
        season = next((c for c in cols if re.fullmatch(r"\d{4}(-\d{4})?", c)), "")

        # Si no hay info útil, saltar
        if not any([name, pid, season]) or not any(c for c in cols if c not in [name, pid, season]):
            continue

        # Eliminar esos campos del resto para evitar duplicados
        rest = [c for c in cols if c not in [name, pid, season]]

        # Reconstruir la fila
        new_row = [name, pid, season] + rest
        new_row = new_row[:54]  # 3 campos clave + 51 datos
        out.write(",".join(new_row) + "\n")

print(f"✅ Normalized file saved at: {output_path}")



✅ Normalized file saved at: data\processed\processed_matchlogs_normalized.csv


In [22]:
# Eliminar Match Report y formatear correctamente Player_name

import pandas as pd
from pathlib import Path

# === Load normalized CSV ===
file_path = Path("data\processed\processed_matchlogs_normalized.csv")
df = pd.read_csv(file_path, dtype=str, encoding="utf-8", low_memory=False)

# === Drop 'Match Report' column if present
if "Match Report" in df.columns:
    df.drop(columns=["Match Report"], inplace=True)

# === Remove any 'Match Report' value present in the rest of the DataFrame
df.replace("Match Report", pd.NA, inplace=True)

# === Format player names: remove underscores and title-case
df["Player_name"] = df["Player_name"].fillna("").astype(str)
df["Player_name"] = df["Player_name"].str.replace("_", " ", regex=False)
df["Player_name"] = df["Player_name"].str.title()

print("✅ Match Report removed and player names formatted.")

# === Save cleaned CSV
output_path = Path("data/processed/matchlogs_no_matchreport.csv")
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"🎯 CSV saved at: {output_path}")


  df.replace("Match Report", pd.NA, inplace=True)


✅ Match Report removed and player names formatted.
🎯 CSV saved at: data\processed\matchlogs_no_matchreport.csv


In [23]:
import pandas as pd
from pathlib import Path

# === Load previous cleaned CSV ===
input_path = Path("data/processed/matchlogs_no_matchreport.csv")
df = pd.read_csv(input_path, dtype=str, encoding="utf-8", low_memory=False)

# === Drop unwanted unnamed columns
unnamed_to_drop = [col for col in df.columns if col.startswith("Unnamed")]
df.drop(columns=unnamed_to_drop, inplace=True)

# === Rename columns using existing mapping
rename_dict = {
    "Date": "Date",
    "Day": "Day",
    "Comp": "Competition",
    "Round": "Round",
    "Venue": "Home_Away",
    "Result": "Result",
    "Squad": "Player_team",
    "Opponent": "Rival_team",
    "Start": "Start",
    "Pos": "Position",
    "Min": "Minutes",
    "season": "Seasons",
    "player_name": "Player_name",
    "player_id": "Player_ID",
    "Gls": "Goals",
    "Ast": "Assists",
    "PK": "Penalty_kick",
    "PKatt": "Penalty_kick_att",
    "Sh": "Shots",
    "SoT": "Shots_on_target",
    "CrdY": "Yellow_cards",
    "CrdR": "Red_cards",
    "Fls": "Fouls_committed",
    "Fld": "Fouls_drawn",
    "Off": "Offsides",
    "Crs": "Crosses",
    "TklW": "Tackles_won",
    "Int": "Interceptions",
    "OG": "Own_goals",
    "PKwon": "Penaltys_won",
    "PKcon": "Penaltys_conceded",
    "Touches": "Touches",
    "Tkl": "Tackles",
    "Blocks": "Blocks",
    "xG": "xG",
    "npxG": "non_penalty_xG",
    "xAG": "x_assisted_G",
    "SCA": "Shot_creating_actions",
    "GCA": "Goal_creating_actions",
    "Cmp": "Passes_completed",
    "Att": "Passes_att",
    "Cmp%": "Percent_passes",
    "PrgP": "Progressive_passes",
    "Carries": "Feet_control",
    "PrgC": "Progressive_control",
    "Succ": "Dribling_suc",
    "Player_name": "Player_name",
    "Player_ID": "Player_ID",
    "Seasons": "Seasons"
}

# Rename columns (only those found in the dict)
df.rename(columns={k: v for k, v in rename_dict.items() if k in df.columns}, inplace=True)

# === Save final cleaned columns CSV
output_path = Path("data/processed/matchlogs_cleaned_columns.csv")
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"🎯 Column-cleaned CSV saved at: {output_path}")


🎯 Column-cleaned CSV saved at: data\processed\matchlogs_cleaned_columns.csv


In [24]:
# Segunda limpieza de datos

import pandas as pd
from pathlib import Path

# === Load cleaned columns CSV ===
input_path = Path("data/processed/matchlogs_cleaned_columns.csv")
df = pd.read_csv(input_path, dtype=str, encoding="utf-8")

# === Step 1: Drop rows with missing Player_name, Player_ID, Seasons, or Date
df = df[df["Player_name"].notna() & df["Player_ID"].notna() & df["Seasons"].notna() & df["Date"].notna()]

# === Step 2: Remove rows where player name appears again in later columns
def is_duplicate_name_in_row(row):
    name = row["Player_name"]
    rest_of_row = row[3:]  # Skip the first 3 (Player_name, ID, Season)
    return any(str(name).lower() in str(val).lower() for val in rest_of_row if pd.notna(val))

df = df[~df.apply(is_duplicate_name_in_row, axis=1)]

# === Step 3: Remove rows with no Competition or Player_team (likely broken)
df = df[df["Competition"].notna() & df["Player_team"].notna()]

# === Step 4: Save cleaned final version
output_path = Path("data/processed/matchlogs_cleaned_final.csv")
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"🎯 Final cleaned CSV saved to: {output_path}")


🎯 Final cleaned CSV saved to: data\processed\matchlogs_cleaned_final.csv


In [25]:
# Ver jugadores únicos por nombre e ID
players = df[["Player_name", "Player_ID"]].drop_duplicates().sort_values("Player_name")
print(players.to_string(index=False))
print(f"\n🎯 Total unique players: {len(players)}")


           Player_name Player_ID
       Aarón Escandell  67669ce7
          Aarón Martín  2f3e911a
     Abdelrafik Gérard  236cbdf9
        Abdoul Diawara  1fd66907
     Abdoulaye Sissako  3b602b26
            Adam Anson  2126e09b
        Adama Diakhaby  65585e3a
          Aden Baldwin  8637abd1
         Aderlan Silva  48371573
          Adil Azbague  6a8894f7
          Adri Montoro  63da5cf7
        Adrián Arregui  2b972b34
           Adrián Cruz  c39bc7cb
          Adrián Gómez  f6d94b97
           Adrián León  48ad8e8f
          Adrián Marín  b7f2edff
         Adrián Sporle  f5122c46
           Ager Aketxe  c5eaff5c
           Agus Alonso  5788870f
           Agus Medina  c70d9fb2
          Agustin Jara  2d638d10
        Agustín Bouzat  a98f4de5
       Agustín Cardozo  1b677f5e
        Agustín Coscia  06424feb
     Agustín Cousillas  69f08ace
         Agustín Doffo  d4eddba8
        Agustín Farías  ea8d470b
       Agustín Fontana  deb75bd4
     Agustín Marchesín  853b7c48
         A

In [26]:
# Revisar si existen y mostrar valores únicos
for col in ["Seasons.1", "player_name.1", "player_id.1"]:
    if col in df.columns:
        print(f"🔍 {col} – Valores únicos no nulos:")
        print(df[col].dropna().unique(), "\n")


🔍 Seasons.1 – Valores únicos no nulos:
['0' '1' '3' '2' '4' '5'] 



In [30]:
import numpy as np

# Comparar duplicados vs. campos reales
mask_mismatch = (
    (df["Player_name.1"].notna()) & (df["Player_name.1"] != df["Player_name"]) |
    (df["Player_ID.1"].notna()) & (df["Player_ID.1"] != df["Player_ID"]) |
    (df["Seasons.1"].notna()) & (df["Seasons.1"] != df["Seasons"])
)

print(f"⚠️ Filas con valores diferentes entre duplicados y columnas clave: {mask_mismatch.sum()}")
print(df.loc[mask_mismatch, ["Player_name", "Player_name.1", "Player_ID", "Player_ID.1", "Seasons", "Seasons.1"]].head())


⚠️ Filas con valores diferentes entre duplicados y columnas clave: 346186
         Player_name Player_name.1 Player_ID Player_ID.1    Seasons Seasons.1
15  Luciano Abecasis             0  6c510f2d           0  2014-2015         0
17  Luciano Abecasis             0  6c510f2d           0  2014-2015         0
18  Luciano Abecasis             0  6c510f2d           0  2014-2015         0
20  Luciano Abecasis             0  6c510f2d           0       2016         0
21  Luciano Abecasis             0  6c510f2d           0       2016         0


In [31]:
# Veamos ejemplos concretos donde Seasons.1 tiene valores numéricos sospechosos
suspect_rows = df[df["Seasons.1"].isin(['0', '1', '2', '3', '4', '5'])]
print(suspect_rows[["Player_name", "Seasons.1", "Goals", "Assists", "Minutes"]].head(20))


         Player_name Seasons.1 Goals Assists Minutes
15  Luciano Abecasis         0     0     NaN      90
17  Luciano Abecasis         0     0     NaN      10
18  Luciano Abecasis         0     0     NaN      14
20  Luciano Abecasis         0     0       1      90
21  Luciano Abecasis         0     0       0      90
22  Luciano Abecasis         0     0       1      90
23  Luciano Abecasis         0     0       0      90
24  Luciano Abecasis         0     0       0      90
25  Luciano Abecasis         0     0       0      85
26  Luciano Abecasis         0     0       0      90
27  Luciano Abecasis         0     0       0      90
28  Luciano Abecasis         0     0       1      90
29  Luciano Abecasis         0     0       0      90
30  Luciano Abecasis         0     0       1      90
31  Luciano Abecasis         0     0       0      90
32  Luciano Abecasis         0     0       3      90
33  Luciano Abecasis         0     0       2      90
34  Luciano Abecasis         0     0       0  

In [33]:
import pandas as pd
from pathlib import Path

# === Load previous cleaned CSV ===
input_path = Path("data/processed/matchlogs_cleaned_final.csv")
df = pd.read_csv(input_path, dtype=str, encoding="utf-8")

# === Drop redundant columns if present
cols_to_drop = ["Seasons.1", "Player_name.1", "Player_ID.1"]
df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

# === Remove rows where the player did not play
df = df[df["Position"] != "On matchday squad, but did not play"]

# === Save final cleaned version
final_path = Path("data/finaldb/matchlogs_cleaned_final.csv")
final_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(final_path, index=False, encoding="utf-8")

print(f"🎯 Final cleaned DB saved at: {final_path}")


🎯 Final cleaned DB saved at: data\finaldb\matchlogs_cleaned_final.csv


## A partir de aquí tenemos el código realmente relevante

In [2]:
from pathlib import Path
import os

# Establece la raíz del proyecto manualmente
project_root = Path("F:/JCMDataCenter/Cursos/Evolve Academy/Data Scientist IA/Futpeak") # sobremesa
#project_root = Path("C:/Users/juanm/Desktop/FUTPEAK/Futpeak") # portátil

# Cambia el directorio de trabajo actual a esa raíz
os.chdir(project_root)

print("📁 Directorio de trabajo actual:", Path.cwd())


📁 Directorio de trabajo actual: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


In [7]:
# Processing de metadata

# Crear csv con id y player name

import pandas as pd
import re
from pathlib import Path

# === Paths
input_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
output_path = Path("data/processed/cleaned_metadata.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

lines = []
with open(input_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if "fbref.com" in line]

records = []

for line in lines:
    try:
        # === 1. Player ID and Slug
        url_match = re.search(r"https://fbref\.com/en/players/([a-f0-9]{8})/([^\s/,]+)", line)
        if not url_match:
            continue
        player_id = url_match.group(1)
        slug = url_match.group(2)
        player_name = slug.replace("_", " ").title()
        url_template = url_match.group(0)

        # === 2. Full name (text after ID, before next comma)
        name_match = re.search(rf"{player_id},([^,]+)", line)
        full_name = name_match.group(1).strip() if name_match else None

        records.append({
            "Player_ID": player_id,
            "Player_name": player_name,
            "Full_name": full_name,
            "Url_template": url_template
        })
    except Exception as e:
        print(f"❌ Error on line:\n{line}\n→ {e}")

# === Save output
df = pd.DataFrame(records)
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ Step 1 completed → Basic metadata saved at: {output_path}")


✅ Step 1 completed → Basic metadata saved at: data\processed\cleaned_metadata.csv


In [8]:
# === Processing de metadata: Paso 2
# Añadir full name válido desde raw y corregir player_name

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load cleaned DataFrame
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para nombres con 3 o más palabras capitalizadas
name_pattern = re.compile(
    r"\b([A-ZÁÉÍÓÚÑ][a-záéíóúñ'’\-]+(?:\s+[A-ZÁÉÍÓÚÑ][a-záéíóúñ'’\-]+){2,})\b"
)

# === Extraer full names desde raw comparando contra player_name
clean_full_names = []

for idx, row in df.iterrows():
    player_name = row["Player_name"]

    # Buscar línea que contenga el ID del jugador
    matching_line = next((line for line in raw_lines if row["Player_ID"] in line), "")

    # Buscar candidato a nombre completo
    match = name_pattern.search(matching_line)
    candidate = match.group(1) if match else ""

    player_tokens = player_name.replace("-", " ").title().split()
    full_tokens = candidate.split()

    # Validar que al menos 2 palabras coincidan
    common = set(p.lower() for p in player_tokens) & set(f.lower() for f in full_tokens)
    if candidate and len(common) >= 2:
        clean_full_names.append(candidate)
    else:
        clean_full_names.append("")

# === Corregir player_name (sin guiones, capitalizado)
df["Player_name"] = df["Player_name"].astype(str).str.replace("-", " ", regex=False).str.title()

# === Insertar nombres completos validados
df["Full_name"] = clean_full_names

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 2 completado → Full_name actualizado en: {cleaned_path}")




✅ Paso 2 completado → Full_name actualizado en: data\processed\cleaned_metadata.csv


In [9]:
# === Processing de metadata: Paso 3
# Añadir fecha de nacimiento

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load cleaned DataFrame
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para fechas en formato YYYY-MM-DD
date_pattern = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")

# === Buscar fecha de nacimiento para cada jugador en raw_lines
birth_dates = []

for _, row in df.iterrows():
    player_id = row["Player_ID"]
    
    # Buscar la línea que contiene este ID
    matching_line = next((line for line in raw_lines if player_id in line), "")
    
    match = date_pattern.search(matching_line)
    birth_date = match.group(0) if match else ""
    birth_dates.append(birth_date)

# === Añadir columna al DataFrame
df["Birth_date"] = birth_dates

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 3 completado → Birth_date añadido a: {cleaned_path}")


✅ Paso 3 completado → Birth_date añadido a: data\processed\cleaned_metadata.csv


In [10]:
# === Processing de metadata: Paso 4
# Añadir edad (Age)

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load cleaned DataFrame
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para edad en formato NN-NNN
age_pattern = re.compile(r"\b\d{2}-\d{3}\b")

# === Buscar edad para cada jugador en raw_lines
ages = []

for _, row in df.iterrows():
    player_id = row["Player_ID"]
    
    # Buscar línea que contiene el ID
    matching_line = next((line for line in raw_lines if player_id in line), "")
    
    match = age_pattern.search(matching_line)
    age = match.group(0) if match else ""
    ages.append(age)

# === Añadir columna al DataFrame
df["Age"] = ages

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 4 completado → Age añadido a: {cleaned_path}")


✅ Paso 4 completado → Age añadido a: data\processed\cleaned_metadata.csv


In [11]:
# === Processing de metadata: Paso 5
# Añadir posición (Position)

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load existing cleaned DataFrame
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para extraer etiquetas en mayúsculas (ej. DF, MF, CB, etc.)
position_pattern = re.compile(r"\b([A-Z]{2,})\b")

positions = []
for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar línea correspondiente
    matching_line = next((line for line in raw_lines if player_id in line), "")
    
    matches = position_pattern.findall(matching_line)

    # Filtrar posibles valores válidos (puedes ajustar lógica aquí si hace falta)
    valid = sorted(set(m for m in matches if len(m) >= 2))
    
    positions.append("-".join(valid) if valid else "")

# === Añadir al DataFrame
df["Position"] = positions

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 5 completado → Position añadido correctamente a: {cleaned_path}")


✅ Paso 5 completado → Position añadido correctamente a: data\processed\cleaned_metadata.csv


In [12]:
# === Processing de metadata: Paso 6
# Añadir pierna dominante (Footed)

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load current cleaned DataFrame
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para detectar pierna dominante
footed_pattern = re.compile(r"\b(Right|Left)\b", flags=re.IGNORECASE)

footed_values = []
for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar la línea que contiene el ID del jugador
    matching_line = next((line for line in raw_lines if player_id in line), "")

    match = footed_pattern.search(matching_line)
    footed = match.group(1).capitalize() if match else ""
    footed_values.append(footed)

# === Añadir columna Footed
df["Footed"] = footed_values

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 6 completado → Footed añadido correctamente en: {cleaned_path}")



✅ Paso 6 completado → Footed añadido correctamente en: data\processed\cleaned_metadata.csv


In [13]:
# === Processing de metadata: Paso 7
# Añadir lugar de nacimiento (Birth_place)

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")

# === Load cleaned CSV
df = pd.read_csv(cleaned_path, dtype=str, encoding="utf-8").fillna("")

# === Read raw metadata lines (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para extraer 'in City, Country'
birthplace_pattern = re.compile(r'"in ([^"]+,[^"]+)"')

birth_places = []
for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar línea correspondiente
    matching_line = next((line for line in raw_lines if player_id in line), "")

    match = birthplace_pattern.search(matching_line)
    place = match.group(1).strip() if match else ""
    birth_places.append(place)

# === Añadir al DataFrame
df["Birth_place"] = birth_places

# === Guardar CSV actualizado
df.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ Paso 7 completado → Birth_place añadido correctamente en: {cleaned_path}")



✅ Paso 7 completado → Birth_place añadido correctamente en: data\processed\cleaned_metadata.csv


In [14]:
# === Processing de metadata: Paso 8
# Añadir nacionalidad (Nationality)

import pandas as pd
from pathlib import Path

# === Paths
metadata_path = Path("data/processed/cleaned_metadata.csv")
teams_path = Path("data/meta/World_Cup_Qualification_Teams.csv")
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")

# === Load data
df = pd.read_csv(metadata_path, dtype=str, encoding="utf-8").fillna("")
teams_df = pd.read_csv(teams_path, dtype=str, encoding="utf-8")

# === Lista de países válidos
country_names = set(teams_df["National Team"].dropna().str.strip())

# === Leer las líneas originales (sin header)
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = [line.strip() for line in f.readlines()][1:]

# === Buscar país para cada jugador
nationalities = []

for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar línea correspondiente
    matching_line = next((line for line in raw_lines if player_id in line), "")
    
    found = next((country for country in country_names if country in matching_line), "")
    nationalities.append(found)

# === Asignar columna y guardar
df["Nationality"] = nationalities
df.to_csv(metadata_path, index=False, encoding="utf-8")
print(f"✅ Paso 8 completado → Nationality añadida correctamente en: {metadata_path}")




✅ Paso 8 completado → Nationality añadida correctamente en: data\processed\cleaned_metadata.csv


In [15]:
# === Processing de metadata: Paso 9
# Añadir club (Club) y corregir Full_name con control total

import pandas as pd
from pathlib import Path
import re

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
df_path = Path("data/processed/cleaned_metadata.csv")

# === Cargar DataFrame existente
df = pd.read_csv(df_path, dtype=str).fillna("")

# === Cargar líneas crudas
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

# === Regex para Full_name con 3+ palabras capitalizadas
name_regex = re.compile(
    r"\b([A-ZÁÉÍÓÚÑ][a-záéíóúñü']+(?:\s+[A-ZÁÉÍÓÚÑ][a-záéíóúñü']+){2,})\b"
)

# === Procesamiento
clean_full_names = []
clubs = []

for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar línea correspondiente
    matching_line = next((line for line in raw_lines if player_id in line), "")

    # === Step 1: Full name limpio
    name_match = name_regex.search(matching_line)
    clean_name = name_match.group(1) if name_match else ""
    clean_full_names.append(clean_name)

    # === Step 2: Club
    parts = [p.strip() for p in matching_line.split(",") if p.strip()]

    known_vals = {
        row["Player_name"].lower(),
        clean_name.lower(),
        row["Player_ID"].lower(),
        row["Footed"].lower(),
        row["Birth_date"].lower(),
        row["Age"].lower(),
        row["Birth_place"].lower(),
        row["Nationality"].lower(),
    }

    best_candidate = ""
    for part in parts:
        pl = part.lower()

        if (
            not part
            or "http" in pl
            or "footed" in pl
            or "position" in pl
            or "in " in pl
            or re.fullmatch(r"[a-f0-9]{8}", pl)
            or re.fullmatch(r"\d{4}-\d{2}-\d{2}", pl)
            or re.fullmatch(r"\d{2}-\d{3}", pl)
            or pl in known_vals
            or re.fullmatch(r"[A-Z]{1,3}(-[A-Z]{1,3})+", part)  # Ej: CM-DM
        ):
            continue

        # Buscar candidato más probable
        if re.fullmatch(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*", part) and len(part) > len(best_candidate):
            best_candidate = part

    clubs.append(best_candidate)

# === Asignar columnas finales
df["Full_name"] = clean_full_names
df["Club"] = clubs

# === Guardar archivo actualizado
df.to_csv(df_path, index=False, encoding="utf-8")
print(f"✅ Paso 9 completado → Full_name corregido y Club añadido en: {df_path}")



✅ Paso 9 completado → Full_name corregido y Club añadido en: data\processed\cleaned_metadata.csv


In [16]:
# === Processing de metadata: Paso 10
# Segunda pasada para refinar Club

import pandas as pd
from pathlib import Path

# === Paths
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
df_path = Path("data/processed/cleaned_metadata.csv")

# === Cargar DataFrame y raw lines
df = pd.read_csv(df_path, dtype=str).fillna("")
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()[1:]

clubs_fixed = []

for _, row in df.iterrows():
    player_id = row["Player_ID"]

    # Buscar línea correspondiente
    matching_line = next((line for line in raw_lines if player_id in line), "").strip().replace('"', '')
    parts = [p.strip() for p in matching_line.split(",") if p.strip()]

    # Valores ya conocidos para este jugador
    known = {
        row["Player_name"].lower(),
        row["Full_name"].lower(),
        row["Footed"].lower(),
        row["Birth_date"].lower(),
        row["Age"].lower(),
        row["Birth_place"].lower(),
        row["Nationality"].lower(),
        row["Position"].lower(),
    }

    # Buscar mejor candidato a club
    best = ""
    for p in parts:
        pl = p.lower()
        if (
            not p
            or "http" in pl
            or pl in known
            or any(x in pl for x in ["position", "footed", "in "])
            or len(p) < 2
            or p.isupper()
            or p.replace("-", "").isupper()
        ):
            continue

        if p[0].isupper():
            best = p  # última candidata válida

    clubs_fixed.append(best)

df["Club"] = clubs_fixed

# === Guardar CSV final con Club corregido
df.to_csv(df_path, index=False, encoding="utf-8")
print(f"✅ Paso 10 completado → Clubs corregidos en: {df_path}")



✅ Paso 10 completado → Clubs corregidos en: data\processed\cleaned_metadata.csv


In [17]:
# === Processing de metadata: Inferir género usando gender-guesser

import pandas as pd
from pathlib import Path
import gender_guesser.detector as gender

# === Paths
csv_path = Path("data/processed/cleaned_metadata.csv")
df = pd.read_csv(csv_path, dtype=str).fillna("")

# === Crear detector
detector = gender.Detector(case_sensitive=False)

# === Extraer primer nombre
df["First_name"] = df["Player_name"].str.strip().str.split().str[0]

# === Inferir género usando la librería
def normalize_gender(name):
    try:
        raw = detector.get_gender(name)
    except:
        return "unknown"

    if raw in ["male", "mostly_male"]:
        return "male"
    elif raw in ["female", "mostly_female"]:
        return "female"
    else:
        return "unknown"

df["Gender"] = df["First_name"].apply(normalize_gender)

# === Guardar CSV actualizado
df.to_csv(csv_path, index=False, encoding="utf-8")
print(f"✅ Género inferido automáticamente y guardado en: {csv_path}")



✅ Género inferido automáticamente y guardado en: data\processed\cleaned_metadata.csv


In [18]:
# === Ver resumen de género
print("🔍 Recuento por género:")
print(df["Gender"].value_counts(dropna=False))
print("\n👀 Ejemplos de jugadores con género 'unknown':\n", df[df["Gender"] == "unknown"][["Full_name", "First_name", "Club"]].head(10))


🔍 Recuento por género:
Gender
male       14547
unknown     2104
female       876
Name: count, dtype: int64

👀 Ejemplos de jugadores con género 'unknown':
                    Full_name First_name  \
26                             Lisandro   
42                              Joaquin   
43        Yamil Rodrigo Asad      Yamil   
59   Cristian Nahuel Barrios     Nahuel   
101                              Nahuel   
151                             Lautaro   
157    Carlos Joaquín Correa    Joaquin   
170                              Braian   
172  Hernán Nicolás Da Campo     Hernan   
236       Luis Yamil Garnier      Yamil   

                                       Club  
26                                LDU Quito  
42                                           
43                                   Cuiabá  
59                         Barracas Central  
101                                Talleres  
151                        Sportivo Luqueno  
157                          Internazionale  
170

In [19]:
import time
import pandas as pd
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# === Cargar dataset
csv_path = Path("data/processed/cleaned_metadata.csv")
df = pd.read_csv(csv_path, dtype=str).fillna("")

# === Configurar navegador
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# === Asegurar columna Gender
if "Gender" not in df.columns:
    df["Gender"] = "unknown"

# === Función de detección con Bing + múltiples bloques
def infer_gender_bing(full_name):
    try:
        query = f"{full_name} futbolista"
        bing_url = f"https://www.bing.com/search?q={query.replace(' ', '+')}"
        driver.get(bing_url)
        time.sleep(3)

        snippets = []

        # Seleccionamos múltiples bloques relevantes
        selectors = [
            "div.b_entityTP",  # panel destacado
            "div.b_context",
            "div.b_caption",
            "div.b_algo",
            "div.b_snippet",
        ]

        for selector in selectors:
            try:
                el = driver.find_element(By.CSS_SELECTOR, selector)
                snippets.append(el.text.lower())
            except NoSuchElementException:
                continue

        full_text = " ".join(snippets)

        if "es una futbolista" in full_text:
            return "female"
        elif "es un futbolista" in full_text:
            return "male"
        else:
            return "unknown"

    except Exception as e:
        print(f"⚠️ Error con {full_name}: {e}")
        return "unknown"

# === Iterar por unknowns
for i, row in df[df["Gender"] == "unknown"].iterrows():
    full_name = row["Full_name"]
    if not isinstance(full_name, str) or not full_name.strip():
        continue

    try:
        gender = infer_gender_bing(full_name)
        df.at[i, "Gender"] = gender
        print(f"✅ {full_name} → {gender}")
        df.to_csv(csv_path, index=False, encoding="utf-8")
        time.sleep(1.5)
    except Exception as e:
        print(f"⚠️ Error general con {full_name}: {e}")
        continue

driver.quit()
print("🎯 Completado. Géneros actualizados en el CSV.")



✅ Yamil Rodrigo Asad → male
⚠️ Error con Cristian Nahuel Barrios: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=136.0.7103.114)
Stacktrace:
	GetHandleVerifier [0x00B7FC83+61635]
	GetHandleVerifier [0x00B7FCC4+61700]
	(No symbol) [0x009A05D3]
	(No symbol) [0x0098FE20]
	(No symbol) [0x009ADD1F]
	(No symbol) [0x00A13E8C]
	(No symbol) [0x00A2DF19]
	(No symbol) [0x00A0D096]
	(No symbol) [0x009DC840]
	(No symbol) [0x009DD6A4]
	GetHandleVerifier [0x00E045A3+2701795]
	GetHandleVerifier [0x00DFFD26+2683238]
	GetHandleVerifier [0x00E1AA6E+2793134]
	GetHandleVerifier [0x00B96945+155013]
	GetHandleVerifier [0x00B9D02D+181357]
	GetHandleVerifier [0x00B874D8+92440]
	GetHandleVerifier [0x00B87680+92864]
	GetHandleVerifier [0x00B72070+5296]
	BaseThreadInitThunk [0x74CD5D49+25]
	RtlInitializeExceptionChain [0x76FED03B+107]
	RtlGetAppContainerNamedObjectPath [0x76FECFC1+561]

✅ Cristian Nahuel Ba

KeyboardInterrupt: 

In [6]:
# Resumen final
print("\n🎯 Género final por categoría:")
print(df["Gender"].value_counts(dropna=False))



🎯 Género final por categoría:


KeyError: 'Gender'

In [5]:
# 📦 Metadata Processing Pipeline — Full Version 

import pandas as pd
import re
import time
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import gender_guesser.detector as gender

# === Paths de entrada y salida ===
raw_path = Path("data/raw/top_10_countries_players_filtered_raw_metadata.csv")
cleaned_path = Path("data/processed/cleaned_metadata.csv")
cleaned_path.parent.mkdir(parents=True, exist_ok=True)

# === Cargar CSV existente o crear uno nuevo si no existe ===
if cleaned_path.exists():
    df_cleaned = pd.read_csv(cleaned_path, dtype=str).fillna("")
else:
    df_cleaned = pd.DataFrame()

# === Leer líneas RAW ===
with open(raw_path, "r", encoding="utf-8") as f:
    raw_lines = [line.strip() for line in f if "fbref.com" in line]

# === IDs ya presentes en el archivo limpio ===
existing_ids = set(df_cleaned["Player_ID"]) if not df_cleaned.empty else set()

# === Extraer nuevos jugadores ===
new_records = []
for line in raw_lines:
    try:
        url_match = re.search(r"https://fbref\.com/en/players/([a-f0-9]{8})/([^\s/,]+)", line)
        if not url_match:
            continue
        player_id, slug = url_match.groups()
        if player_id in existing_ids:
            continue  # saltar si ya está

        player_name = slug.replace("_", " ").title()
        url_template = url_match.group(0)

        # Nombre completo si aparece
        name_match = re.search(rf"{player_id},([^,]+)", line)
        full_name = name_match.group(1).strip() if name_match else ""

        new_records.append({
            "Player_ID": player_id,
            "Player_name": player_name,
            "Full_name": full_name,
            "Url_template": url_template
        })
    except Exception as e:
        print(f"❌ Error con línea: {line}\\n→ {e}")

# === Si no hay nuevos, salir
if not new_records:
    print("✅ No hay jugadores nuevos por procesar.")
    exit()

df_new = pd.DataFrame(new_records).fillna("")

# === Paso 2: Full_name más limpio ===
name_pattern = re.compile(r"\b([A-ZÁÉÍÓÚÑ][a-záéíóúñ'\-]+(?:\s+[A-ZÁÉÍÓÚÑ][a-záéíóúñ'\-]+){2,})\b")
clean_names = []
for _, row in df_new.iterrows():
    match_line = next((l for l in raw_lines if row["Player_ID"] in l), "")
    match = name_pattern.search(match_line)
    clean_names.append(match.group(1) if match else row["Full_name"])
df_new["Full_name"] = clean_names

# === Paso 3: Birth date ===
date_pat = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
df_new["Birth_date"] = [
    date_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")).group(0)
    if date_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")) else ""
    for _, row in df_new.iterrows()
]

# === Paso 4: Edad (ej. 19-347) ===
age_pat = re.compile(r"\b\d{2}-\d{3}\b")
df_new["Age"] = [
    age_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")).group(0)
    if age_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")) else ""
    for _, row in df_new.iterrows()
]

# === Paso 5: Posición ===
pos_pat = re.compile(r"\b([A-Z]{2,})\b")
df_new["Position"] = [
    "-".join(sorted(set(pos_pat.findall(next((l for l in raw_lines if row["Player_ID"] in l), "")))))
    for _, row in df_new.iterrows()
]

# === Paso 6: Pierna dominante (Right/Left) ===
foot_pat = re.compile(r"\b(Right|Left)\b", flags=re.IGNORECASE)
df_new["Footed"] = [
    foot_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")).group(1).capitalize()
    if foot_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")) else ""
    for _, row in df_new.iterrows()
]

# === Paso 7: Lugar de nacimiento ("in City, Country") ===
birth_pat = re.compile(r'"in ([^"]+,[^"]+)"')
df_new["Birth_place"] = [
    birth_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")).group(1)
    if birth_pat.search(next((l for l in raw_lines if row["Player_ID"] in l), "")) else ""
    for _, row in df_new.iterrows()
]

# === Paso 8: Nacionalidad desde lista de países ===
teams_path = Path("data/meta/World_Cup_Qualification_Teams.csv")
teams_df = pd.read_csv(teams_path, dtype=str)
country_names = set(teams_df["National Team"].dropna().str.strip())
df_new["Nationality"] = [
    next((c for c in country_names if c in next((l for l in raw_lines if row["Player_ID"] in l), "")), "")
    for _, row in df_new.iterrows()
]

# === Paso 9: Club (heurística) ===
clubs = []
for _, row in df_new.iterrows():
    match_line = next((l for l in raw_lines if row["Player_ID"] in l), "").replace('"', '')
    parts = [p.strip() for p in match_line.split(',') if p.strip()]
    known = set(df_new.columns.str.lower())
    best = ""
    for p in parts:
        if p.lower() in known or not p or "http" in p or len(p) < 3 or p.upper() == p:
            continue
        if p[0].isupper():
            best = p
    clubs.append(best)
df_new["Club"] = clubs

# === Paso 10: Género básico ===
detector = gender.Detector(case_sensitive=False)
df_new["First_name"] = df_new["Player_name"].str.split().str[0]
df_new["Gender"] = df_new["First_name"].apply(lambda name: {
    "male": "male", "mostly_male": "male",
    "female": "female", "mostly_female": "female"
}.get(detector.get_gender(name), "unknown"))

# === Unir y guardar ===
df_final = pd.concat([df_cleaned, df_new], ignore_index=True).drop_duplicates("Player_ID")
df_final.to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"✅ {len(df_new)} jugadores añadidos. Total ahora: {len(df_final)}")

# === Extra: revisar los unknowns con Bing ===
unknowns = df_final[df_final["Gender"] == "unknown"].copy()
print(f"🎯 Jugadores con género 'unknown': {len(unknowns)}")

if not unknowns.empty:
    print("🔍 Iniciando revisión avanzada de género con Bing...")

    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    def infer_gender_bing(name):
        try:
            query = f"{name} futbolista"
            driver.get(f"https://www.bing.com/search?q={query.replace(' ', '+')}")
            time.sleep(2)
            page = driver.page_source.lower()
            if "es una futbolista" in page:
                return "female"
            elif "es un futbolista" in page:
                return "male"
        except Exception as e:
            print(f"⚠️ Error con {name}: {e}")
        return "unknown"

    total = len(unknowns)
    for i, (_, row) in enumerate(unknowns.iterrows()):
        name = row["Full_name"] or row["Player_name"]
        print(f"🔎 [{i+1}/{total}] Revisando: {name}...", flush=True)

        gender_guess = infer_gender_bing(name)
        df_final.loc[df_final["Player_ID"] == row["Player_ID"], "Gender"] = gender_guess
        print(f"   → Resultado: {gender_guess}", flush=True)
        df_final.to_csv(cleaned_path, index=False, encoding="utf-8")
        time.sleep(1.5)

    driver.quit()
    print("✅ Revisión avanzada completada y guardada.")
else:
    print("✅ No hay jugadores con género 'unknown'. Nada que revisar.")


KeyboardInterrupt: 

In [6]:
# Test processing Mbappé

import pandas as pd
from pathlib import Path

# === Rutas
input_path = Path("data/debug/mbappe_matchlogs_raw.csv")
output_path = Path("data/debug/mbappe_matchlogs_cleaned.csv")

# === Leer raw CSV de Mbappé
df_raw = pd.read_csv(input_path, dtype=str)

# === Filtrar columnas que importan (puedes ajustar según tu análisis real)
columns_to_keep = [
    "Date", "Comp", "Round", "Venue", "Result", "Squad", "Opponent", "Start", "Pos", "Min",
    "Gls", "Ast", "PK", "PKatt", "Sh", "SoT", "CrdY", "CrdR",
    "Touches", "Tkl", "Blocks", "xG", "npxG", "xAG", "SCA", "GCA",
    "Cmp", "Att", "Cmp%", "PrgP", "Carries", "PrgC", "Succ",
    "season", "player_name", "player_id"
]

# Asegurarse de que solo se mantienen las columnas válidas
columns_available = [col for col in columns_to_keep if col in df_raw.columns]
df_cleaned = df_raw[columns_available].copy()

# === Renombrar columnas como en tu estándar
rename_dict = {
    "Comp": "Competition", "Venue": "Home_Away", "Squad": "Player_team", "Opponent": "Rival_team",
    "Start": "Start", "Pos": "Position", "Min": "Minutes", "Gls": "Goals", "Ast": "Assists",
    "PK": "Penalty_kick", "PKatt": "Penalty_kick_att", "Sh": "Shots", "SoT": "Shots_on_target",
    "CrdY": "Yellow_cards", "CrdR": "Red_cards", "Tkl": "Tackles", "xG": "xG", "npxG": "non_penalty_xG",
    "xAG": "x_assisted_G", "SCA": "Shot_creating_actions", "GCA": "Goal_creating_actions",
    "Cmp": "Passes_completed", "Att": "Passes_att", "Cmp%": "Percent_passes", "PrgP": "Progressive_passes",
    "Carries": "Feet_control", "PrgC": "Progressive_control", "Succ": "Dribling_suc"
}
df_cleaned.rename(columns=rename_dict, inplace=True)

# === Orden final
core = ["player_name", "player_id", "season"]
rest = [col for col in df_cleaned.columns if col not in core]
df_cleaned = df_cleaned[core + rest]

# === Eliminar filas casi vacías
df_cleaned.dropna(thresh=6, inplace=True)  # Mantener solo filas con al menos 6 valores no vacíos

# === Eliminar duplicados por jugador + fecha
if "Date" in df_cleaned.columns:
    df_cleaned = df_cleaned.drop_duplicates(subset=["player_id", "Date"])


# === Guardar CSV limpio
output_path.parent.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ CSV limpio guardado: {output_path} | {len(df_cleaned)} filas")





✅ CSV limpio guardado: data\debug\mbappe_matchlogs_cleaned.csv | 531 filas


In [2]:
# Processing completo y espero que el final

# Processing completo y final

import pandas as pd
from pathlib import Path
import re

# === File paths
input_path = Path("data/raw/top_10_countries_matchlogs_filtered.csv")
output_path = Path("data/processed/cleaned_matchlogs.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# === Step 1: Load raw CSV
df = pd.read_csv(input_path, dtype=str, encoding="utf-8").fillna("")

# === Step 2: Normalize player_name formatting
df["player_name"] = df["player_name"].str.replace("_", " ", regex=False).str.title()

# === Step 3: Rename columns
rename_dict = {
    "player_name": "Player_name",
    "player_id": "Player_ID",
    "season": "Seasons",
    "Date": "Date", "Day": "Day", "Comp": "Competition", "Round": "Round", "Venue": "Home_Away",
    "Result": "Result", "Squad": "Player_team", "Opponent": "Rival_team", "Start": "Start",
    "Pos": "Position", "Min": "Minutes", "Gls": "Goals", "Ast": "Assists", "PK": "Penalty_kick",
    "PKatt": "Penalty_kick_att", "Sh": "Shots", "SoT": "Shots_on_target", "CrdY": "Yellow_cards",
    "CrdR": "Red_cards", "Fls": "Fouls_committed", "Fld": "Fouls_drawn", "Off": "Offsides",
    "Crs": "Crosses", "TklW": "Tackles_won", "Int": "Interceptions", "OG": "Own_goals",
    "PKwon": "Penaltys_won", "PKcon": "Penaltys_conceded", "Touches": "Touches", "Tkl": "Tackles",
    "Blocks": "Blocks", "xG": "xG", "npxG": "non_penalty_xG", "xAG": "x_assisted_G",
    "SCA": "Shot_creating_actions", "GCA": "Goal_creating_actions", "Cmp": "Passes_completed",
    "Att": "Passes_att", "Cmp%": "Percent_passes", "PrgP": "Progressive_passes",
    "Carries": "Feet_control", "PrgC": "Progressive_control", "Succ": "Dribling_suc"
}
df.rename(columns={k: v for k, v in rename_dict.items() if k in df.columns}, inplace=True)

# === Step 4: Remove unwanted columns
for col in ["Match Report", "season", "player_name", "player_id"]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# === Step 5: Clean rows (required fields must be present)
df = df[df["Player_name"].notna() & df["Player_ID"].notna() & df["Seasons"].notna()]
if "Date" in df.columns:
    df = df[df["Date"].notna()]

# === Step 6: Remove rows where the player didn’t play
if "Position" in df.columns:
    df = df[df["Position"] != "On matchday squad, but did not play"]

# ✅ Step 6.5: Drop rows where all values are empty (except core fields)
non_core = [col for col in df.columns if col not in ["Player_name", "Player_ID", "Seasons"]]
df = df[df[non_core].apply(lambda row: any(cell.strip() for cell in row), axis=1)]

# === Step 7: Remove duplicate games (player_id + date)
if "Date" in df.columns:
    df = df.drop_duplicates(subset=["Player_ID", "Date"])

# === Step 8: Clean up team names by removing country codes
for col in ["Player_team", "Rival_team"]:
    if col in df.columns:
        df[col] = df[col].str.replace(r"^[a-z]{2,3}\s+", "", regex=True)

# === Step 9: Reorder columns
core = ["Player_name", "Player_ID", "Seasons"]
rest = [c for c in df.columns if c not in core]
df = df[core + rest]

# === Step 10: Save
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"🎯 Final cleaned CSV saved at: {output_path} | Rows: {len(df)}")



🎯 Final cleaned CSV saved at: data\processed\cleaned_matchlogs.csv | Rows: 1155417


In [37]:
# Processing completo y espero que el final

# Processing completo y final JUGADORES JÓVENES

import pandas as pd
from pathlib import Path
import re

# === File paths
input_path = Path("data/raw/top_10_countries_matchlogs_young_players.csv")
output_path = Path("data/processed/future_stars_cleaned_matchlogs.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# === Step 1: Load raw CSV
df = pd.read_csv(input_path, dtype=str, encoding="utf-8").fillna("")

# === Step 2: Normalize player_name formatting
df["player_name"] = df["player_name"].str.replace("_", " ", regex=False).str.title()

# === Step 3: Rename columns
rename_dict = {
    "player_name": "Player_name",
    "player_id": "Player_ID",
    "season": "Seasons",
    "Date": "Date", "Day": "Day", "Comp": "Competition", "Round": "Round", "Venue": "Home_Away",
    "Result": "Result", "Squad": "Player_team", "Opponent": "Rival_team", "Start": "Start",
    "Pos": "Position", "Min": "Minutes", "Gls": "Goals", "Ast": "Assists", "PK": "Penalty_kick",
    "PKatt": "Penalty_kick_att", "Sh": "Shots", "SoT": "Shots_on_target", "CrdY": "Yellow_cards",
    "CrdR": "Red_cards", "Fls": "Fouls_committed", "Fld": "Fouls_drawn", "Off": "Offsides",
    "Crs": "Crosses", "TklW": "Tackles_won", "Int": "Interceptions", "OG": "Own_goals",
    "PKwon": "Penaltys_won", "PKcon": "Penaltys_conceded", "Touches": "Touches", "Tkl": "Tackles",
    "Blocks": "Blocks", "xG": "xG", "npxG": "non_penalty_xG", "xAG": "x_assisted_G",
    "SCA": "Shot_creating_actions", "GCA": "Goal_creating_actions", "Cmp": "Passes_completed",
    "Att": "Passes_att", "Cmp%": "Percent_passes", "PrgP": "Progressive_passes",
    "Carries": "Feet_control", "PrgC": "Progressive_control", "Succ": "Dribling_suc"
}
df.rename(columns={k: v for k, v in rename_dict.items() if k in df.columns}, inplace=True)

# === Step 4: Remove unwanted columns
for col in ["Match Report", "season", "player_name", "player_id"]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# === Step 5: Clean rows (required fields must be present)
df = df[df["Player_name"].notna() & df["Player_ID"].notna() & df["Seasons"].notna()]
if "Date" in df.columns:
    df = df[df["Date"].notna()]

# === Step 6: Remove rows where the player didn’t play
if "Position" in df.columns:
    df = df[df["Position"] != "On matchday squad, but did not play"]

# ✅ Step 6.5: Drop rows where all values are empty (except core fields)
non_core = [col for col in df.columns if col not in ["Player_name", "Player_ID", "Seasons"]]
df = df[df[non_core].apply(lambda row: any(cell.strip() for cell in row), axis=1)]

# === Step 7: Remove duplicate games (player_id + date)
if "Date" in df.columns:
    df = df.drop_duplicates(subset=["Player_ID", "Date"])

# === Step 8: Clean up team names by removing country codes
for col in ["Player_team", "Rival_team"]:
    if col in df.columns:
        df[col] = df[col].str.replace(r"^[a-z]{2,3}\s+", "", regex=True)

# === Step 9: Reorder columns
core = ["Player_name", "Player_ID", "Seasons"]
rest = [c for c in df.columns if c not in core]
df = df[core + rest]

# === Step 10: Save
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"🎯 Final cleaned CSV saved at: {output_path} | Rows: {len(df)}")


🎯 Final cleaned CSV saved at: data\processed\future_stars_cleaned_matchlogs.csv | Rows: 1053


In [40]:
from pathlib import Path
import pandas as pd
import re

def process_metadata_file(raw_path: Path, cleaned_path: Path, teams_path: Path = Path("data/meta/World_Cup_Qualification_Teams.csv")):
    raw_lines = [line.strip() for line in raw_path.read_text(encoding="utf-8").splitlines()[1:]]

    # === Paso 1: ID, slug, nombre base
    records = []
    for line in raw_lines:
        url_match = re.search(r"https://fbref\.com/en/players/([a-f0-9]{8})/([^\s/,]+)", line)
        if not url_match:
            continue
        player_id = url_match.group(1)
        slug = url_match.group(2)
        player_name = slug.replace("_", " ").title()
        url_template = url_match.group(0)

        name_match = re.search(rf"{player_id},([^,]+)", line)
        full_name = name_match.group(1).strip() if name_match else ""

        records.append({
            "Player_ID": player_id,
            "Player_name": player_name,
            "Full_name": full_name,
            "Url_template": url_template
        })

    df = pd.DataFrame(records).fillna("")

    # === Paso 2: Validar Full_name
    name_pattern = re.compile(r"\b([A-ZÁÉÍÓÚÑ][a-záéíóúñü'’\-]+(?:\s+[A-ZÁÉÍÓÚÑ][a-záéíóúñü'’\-]+){2,})\b")
    clean_full_names = []
    for _, row in df.iterrows():
        match_line = next((line for line in raw_lines if row["Player_ID"] in line), "")
        match = name_pattern.search(match_line)
        candidate = match.group(1) if match else ""

        player_tokens = row["Player_name"].replace("-", " ").title().split()
        full_tokens = candidate.split()
        common = set(p.lower() for p in player_tokens) & set(f.lower() for f in full_tokens)

        clean_full_names.append(candidate if candidate and len(common) >= 2 else "")
    df["Player_name"] = df["Player_name"].str.replace("-", " ", regex=False).str.title()
    df["Full_name"] = clean_full_names

    # === Paso 3: Birth_date
    date_pattern = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
    df["Birth_date"] = [
        date_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")).group(0)
        if date_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")) else ""
        for _, row in df.iterrows()
    ]

    # === Paso 4: Age
    age_pattern = re.compile(r"\b\d{2}-\d{3}\b")
    df["Age"] = [
        age_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")).group(0)
        if age_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")) else ""
        for _, row in df.iterrows()
    ]

    # === Paso 5: Position
    pos_pattern = re.compile(r"\b([A-Z]{2,})\b")
    df["Position"] = [
        "-".join(sorted(set(m for m in pos_pattern.findall(
            next((line for line in raw_lines if row["Player_ID"] in line), "")
        ) if len(m) >= 2))) or ""
        for _, row in df.iterrows()
    ]

    # === Paso 6: Footed
    footed_pattern = re.compile(r"\b(Right|Left)\b", flags=re.IGNORECASE)
    df["Footed"] = [
        (footed_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")) or re.match("", "")).group(0).capitalize()
        if footed_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")) else ""
        for _, row in df.iterrows()
    ]

    # === Paso 7: Birth_place
    birth_place_pattern = re.compile(r'"in ([^"]+,[^"]+)"')
    df["Birth_place"] = [
        birth_place_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")).group(1).strip()
        if birth_place_pattern.search(next((line for line in raw_lines if row["Player_ID"] in line), "")) else ""
        for _, row in df.iterrows()
    ]

    # === Paso 8: Nationality
    teams_df = pd.read_csv(teams_path, dtype=str)
    countries = set(teams_df["National Team"].dropna().str.strip())

    df["Nationality"] = [
        next((country for country in countries if country in next((line for line in raw_lines if row["Player_ID"] in line), "")), "")
        for _, row in df.iterrows()
    ]

    # === Paso 9-10: Club (refinado)
    clubs = []
    for _, row in df.iterrows():
        line = next((l for l in raw_lines if row["Player_ID"] in l), "").replace('"', '')
        parts = [p.strip() for p in line.split(",") if p.strip()]
        known = {str(row[k]).lower() for k in ["Player_name", "Full_name", "Footed", "Birth_date", "Age", "Birth_place", "Nationality", "Position"]}
        best = ""
        for p in parts:
            pl = p.lower()
            if (
                not p or "http" in pl or pl in known or len(p) < 2 or
                "position" in pl or "footed" in pl or "in " in pl or
                p.isupper() or p.replace("-", "").isupper()
            ):
                continue
            if p[0].isupper():
                best = p
        clubs.append(best)
    df["Club"] = clubs

    # === Paso FINAL: Asignar Gender = male
    df["Gender"] = "male"

    # === Guardar CSV final
    cleaned_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(cleaned_path, index=False, encoding="utf-8")
    print(f"✅ Metadata procesada y guardada en: {cleaned_path}")

    return df


In [41]:
from pathlib import Path

raw_path = Path("data/raw/future_stars_raw_metadata.csv")
cleaned_path = Path("data/processed/future_stars_cleaned_metadata.csv")

df_cleaned = process_metadata_file(raw_path, cleaned_path)



✅ Metadata procesada y guardada en: data\processed\future_stars_cleaned_metadata.csv


In [None]:
# Faltaría hacer la normalización del tema de las competiciones. Creo que la mejor manera sería usando los IDs de las URLs y obteniendo el último nombre oficial de la competición recogido en FBREF. (Ej: Primera Div = Liga Argentina)

In [42]:
import pandas as pd
from pathlib import Path

# === Load cleaned matchlogs
path = Path("data/processed/cleaned_matchlogs.csv")
df = pd.read_csv(path, dtype=str).fillna("")

# === Filtrar por jugador
player_id = "81442ecb"
df_acuna = df[df["Player_ID"] == player_id].copy()

# === Filtrar partidos jugados (Min > 0)
df_acuna["Minutes"] = pd.to_numeric(df_acuna["Minutes"], errors="coerce").fillna(0)
df_played = df_acuna[df_acuna["Minutes"] > 0]

# === Contar
total_matches = len(df_played)
national_team_matches = len(df_played[df_played["Player_team"] == "Argentina"])
club_matches = total_matches - national_team_matches

print(f"🎯 Marcos Acuña (ID: {player_id})")
print(f"   Total matches played: {total_matches}")
print(f"   ➤ With Argentina: {national_team_matches}")
print(f"   ➤ With Clubs: {club_matches}")


🎯 Marcos Acuña (ID: 81442ecb)
   Total matches played: 431
   ➤ With Argentina: 59
   ➤ With Clubs: 372


In [4]:
import pandas as pd
from pathlib import Path

# === Load cleaned matchlogs
path = Path("data/processed/cleaned_matchlogs.csv")
df = pd.read_csv(path, dtype=str).fillna("")

# === Filtrar por Player_ID
player_id = "81442ecb"
df_acuna = df[df["Player_ID"] == player_id].copy()

# Convertir columnas numéricas
df_acuna["Goals"] = pd.to_numeric(df_acuna["Goals"], errors="coerce").fillna(0)
df_acuna["Assists"] = pd.to_numeric(df_acuna["Assists"], errors="coerce").fillna(0)

# === Calcular totales
total_goals = int(df_acuna["Goals"].sum())
total_assists = int(df_acuna["Assists"].sum())

print(f"⚽ Marcos Acuña (ID {player_id}) → Goals: {total_goals} | Assists: {total_assists}")


⚽ Marcos Acuña (ID 81442ecb) → Goals: 30 | Assists: 57


In [5]:
import pandas as pd

# Cargar el CSV limpio
df = pd.read_csv("data/processed/cleaned_matchlogs.csv")

# === Filtrar por Player_ID
player_id = "42fd9c7f"
df_mbappe = df[df["Player_ID"] == player_id].copy()

# Convertir columnas numéricas
df_mbappe["Goals"] = pd.to_numeric(df_mbappe["Goals"], errors="coerce").fillna(0)
df_mbappe["Assists"] = pd.to_numeric(df_mbappe["Assists"], errors="coerce").fillna(0)

# === Calcular totales
total_goals = int(df_mbappe["Goals"].sum())
total_assists = int(df_mbappe["Assists"].sum())


print(f"⚽ Kylian Mbappé (ID {player_id}) → Goals: {total_goals} | Assists: {total_assists}")

⚽ Kylian Mbappé (ID 42fd9c7f) → Goals: 0 | Assists: 0
