In [1]:
# === Celda 0 · localizar raíz del repo, fijar CWD y sys.path ===
import sys, os
from pathlib import Path

def pick_project_root() -> Path:
    """Locate the repository root by probing for the ``src/whoscored_viz`` folder.

    Candidates are tried in this order: the resolved current working
    directory, each of its ancestors (nearest first), and finally two fixed
    fallback locations (a machine-specific absolute path and the equivalent
    folder under the current user's home directory).

    Returns:
        The first candidate for which ``<candidate>/src/whoscored_viz`` exists.

    Raises:
        RuntimeError: If none of the candidates contains ``src/whoscored_viz``.
    """
    start = Path.cwd().resolve()
    fallbacks = (
        Path(r"C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored"),
        Path.home() / "OneDrive" / "Escritorio" / "Proyecto WhoScored",
    )
    search_order = (start, *start.parents, *fallbacks)

    # next() with a default yields the first hit without an explicit loop/return.
    root = next(
        (folder for folder in search_order if (folder / "src" / "whoscored_viz").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("No encuentro la raíz del repo; ajusta la ruta en 'candidates'.")
    return root

# Resolve the repository root and make it the working directory.
# NOTE: os.chdir changes the process-wide CWD for the rest of the kernel session.
PROJECT_ROOT = pick_project_root()
os.chdir(PROJECT_ROOT)
# Put the root at the front of sys.path (only if it is not already there) so
# that the `src.whoscored_viz` import below can be resolved.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT =", PROJECT_ROOT)

# Import the centralized path definitions
import importlib
from src.whoscored_viz import paths
# Re-execute the module right after importing it — presumably so edits made to
# paths.py since an earlier import in this kernel are picked up (TODO confirm).
# On a fresh kernel this runs the module body twice.
importlib.reload(paths)
print("BASE_DATA_DIR   =", paths.BASE_DATA_DIR)
print("MATCHCENTER_DIR =", paths.MATCHCENTER_DIR)
print("FIXTURES_DIR    =", paths.FIXTURES_DIR)

# Hard stop: refuse to continue if the MatchCenter directory's path text
# contains "notebooks" (case-insensitive substring check); the offending path
# is used as the assertion message.
assert "notebooks" not in str(paths.MATCHCENTER_DIR).lower(), paths.MATCHCENTER_DIR

PROJECT_ROOT = C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored
[paths.py] PROJECT_ROOT: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored
[paths.py] BASE_DATA_DIR: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data
[paths.py] PROJECT_ROOT: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored
[paths.py] BASE_DATA_DIR: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data
BASE_DATA_DIR   = C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data
MATCHCENTER_DIR = C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter
FIXTURES_DIR    = C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\fixtures


In [2]:
from pathlib import Path

# Competition / season labels: they tag the index file name and select the
# fixtures CSV sub-folder. Adjust here to build a different index.
COMP_SLUG, SEASON_SLUG = "laliga", "2025-2026"

# Absolute MatchCenter base (.../data/raw/matchcenter). Note that the on-disk
# layout below it includes 'MatchCenter/Competition/Season'.
MC_ROOT = paths.MATCHCENTER_DIR
FIXT_CSV = paths.FIXTURES_DIR.joinpath(
    "DataFixtures", COMP_SLUG, SEASON_SLUG, "finished_matches.csv"
)

# Output location of the index; the folder is created if it does not exist yet.
INDEX_DIR = paths.BASE_DATA_DIR.joinpath("processed", "index")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_CSV = INDEX_DIR.joinpath(f"ws_matches_{COMP_SLUG}_{SEASON_SLUG}.csv")

print("MatchCenter base:", MC_ROOT)
print("Fixtures CSV    :", FIXT_CSV)
print("Salida índice   :", INDEX_CSV)

MatchCenter base: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter
Fixtures CSV    : C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\fixtures\DataFixtures\laliga\2025-2026\finished_matches.csv
Salida índice   : C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\processed\index\ws_matches_laliga_2025-2026.csv


In [3]:
import json, re
import pandas as pd
from pathlib import Path

def safe_read_json(p: Path) -> dict:
    """Best-effort load of a JSON object from ``p``; never raises for a bad file.

    Args:
        p: Path of the JSON file to read.

    Returns:
        The decoded JSON object. An empty dict is returned, and a
        ``RuntimeWarning`` is emitted, when the file cannot be read or decoded
        or when its top-level value is not a JSON object, so callers can
        always use ``.get`` on the result.
    """
    import warnings  # function-scope import keeps this helper self-contained

    try:
        # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM, which
        # json.loads would otherwise reject as invalid input.
        payload = json.loads(p.read_text(encoding="utf-8-sig"))
    except Exception as exc:
        # Still best-effort (one broken file must not abort a whole scan), but
        # no longer silent: say which file was skipped and why.
        warnings.warn(f"Could not read JSON from {p}: {exc!r}", RuntimeWarning, stacklevel=2)
        return {}

    if not isinstance(payload, dict):
        # Valid JSON whose top level is a list/number/string/null does not
        # honour the declared ``-> dict`` contract.
        warnings.warn(
            f"Ignoring {p}: top-level JSON value is {type(payload).__name__}, not an object",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}
    return payload

def parse_from_path(manifest_path: Path) -> dict:
    """Derive match metadata from the location of a ``manifest.json`` file.

    Expected layout (seen from the manifest)::

        .../matchcenter/MatchCenter/Competition/Season/<match_slug>/normalized/manifest.json

    Args:
        manifest_path: Path of the manifest file.

    Returns:
        A dict with these keys:

        - ``match_dir``: the ``<match_slug>`` directory, as a string.
        - ``match_slug``: the name of that directory,
          e.g. ``20250815_Girona_vs_Rayo_Vallecano_1913916``.
        - ``match_id_from_path``: the trailing ``_<digits>`` of the slug, or
          ``None`` when the slug does not end that way.
        - ``comp_slug_hint`` / ``season_slug_hint``: the names of the two
          directories above the match directory. If those folders are
          literally called "Competition" and "Season" they carry no real
          information; they are kept in case real names are used later.
    """
    # Climb four levels: normalized -> <match_slug> -> Season -> Competition.
    levels = []
    current = manifest_path
    for _ in range(4):
        current = current.parent
        levels.append(current)
    _normalized_dir, match_dir, season_dir, comp_dir = levels

    slug = match_dir.name

    # The match id is the run of digits after the last underscore of the slug.
    id_match = re.search(r'_(\d+)$', slug)
    if id_match:
        id_from_slug = id_match.group(1)
    else:
        id_from_slug = None

    return dict(
        match_dir=str(match_dir),
        match_slug=slug,
        match_id_from_path=id_from_slug,
        comp_slug_hint=comp_dir.name,
        season_slug_hint=season_dir.name,
    )

def build_row(manifest_path: Path) -> dict:
    """Build one index row for the match described by ``manifest_path``.

    Values come from the manifest JSON (read with ``safe_read_json``) and are
    complemented with hints parsed from the file's location
    (``parse_from_path``).

    Args:
        manifest_path: Path of a match's ``manifest.json``.

    Returns:
        A dict with the keys ``match_id``, ``comp_slug``, ``season_slug``,
        ``match_date``, ``start_time``, ``home_name``, ``away_name``,
        ``score_home``, ``score_away``, ``match_centre_url``, ``match_dir``
        and ``match_slug``, in that order.
    """
    manifest = safe_read_json(manifest_path)
    path_info = parse_from_path(manifest_path)

    # Manifest id wins; otherwise the id parsed from the folder name; else "".
    match_id = str(manifest.get("match_id") or path_info.get("match_id_from_path") or "")

    # Use the stored URL when present; otherwise build one from the id, if any.
    url = manifest.get("match_centre_url")
    if not url:
        url = f"https://es.whoscored.com/Matches/{match_id}/Live" if match_id else None

    # Competition / season: manifest first, then the notebook-level setting,
    # then the folder-name hint.
    comp_slug = (
        manifest.get("competition_slug")
        or manifest.get("competition")
        or COMP_SLUG
        or path_info.get("comp_slug_hint")
    )
    season_slug = (
        manifest.get("season_slug")
        or manifest.get("season")
        or SEASON_SLUG
        or path_info.get("season_slug_hint")
    )

    # Assemble in the exact key order used for the index columns.
    row = {"match_id": match_id, "comp_slug": comp_slug, "season_slug": season_slug}
    for key in ("match_date", "start_time", "home_name", "away_name"):
        row[key] = manifest.get(key)
    for key in ("score_home", "score_away"):
        # Non-numeric scores are coerced to NaN instead of raising.
        row[key] = pd.to_numeric(manifest.get(key), errors="coerce")
    row["match_centre_url"] = url
    row["match_dir"] = path_info["match_dir"]
    row["match_slug"] = path_info["match_slug"]
    return row

In [4]:
from tqdm import tqdm

# Look exactly where the manifests live: .../normalized/manifest.json
manifests = sorted(MC_ROOT.rglob("normalized/manifest.json"))
print(f"Encontrados {len(manifests)} manifests")
if len(manifests) == 0:
    raise SystemExit("No se encontraron manifests. Revisa la ruta base MATCHCENTER_DIR y la estructura.")

# One row per manifest, then a single DataFrame construction.
rows = []
for manifest_file in tqdm(manifests):
    rows.append(build_row(manifest_file))
df_idx = pd.DataFrame(rows)
print("Filas base:", len(df_idx))

# Enrich with the fixtures CSV when it exists; manifest values take precedence
# and fixtures only fill the gaps.
if FIXT_CSV.exists():
    wanted_cols = ("match_id", "match_date", "start_time", "match_round", "match_centre_url",
                   "home_name", "away_name", "score_home", "score_away")
    fillable_cols = ("match_date", "start_time", "home_name", "away_name",
                     "score_home", "score_away", "match_centre_url", "match_round")

    fix = pd.read_csv(FIXT_CSV, dtype={"match_id": str})
    fix = fix.loc[:, [name for name in wanted_cols if name in fix.columns]].copy()
    fix["match_id"] = fix["match_id"].astype(str)

    df_idx = (
        df_idx
        .assign(match_id=df_idx["match_id"].astype(str))
        .merge(fix, on="match_id", how="left", suffixes=("", "_fx"))
    )

    # Columns present on both sides arrive with a "_fx" suffix on the fixtures copy.
    merged_away = []
    for base_col in fillable_cols:
        fx_col = f"{base_col}_fx"
        if fx_col not in df_idx.columns:
            continue
        df_idx[base_col] = df_idx[base_col].combine_first(df_idx[fx_col])
        merged_away.append(fx_col)
    df_idx = df_idx.drop(columns=merged_away)

display(df_idx.head(10))

Encontrados 41 manifests


100%|██████████| 41/41 [00:00<00:00, 2246.05it/s]

Filas base: 41



  df_idx[col] = df_idx[col].combine_first(df_idx[cfx])
  df_idx[col] = df_idx[col].combine_first(df_idx[cfx])


Unnamed: 0,match_id,comp_slug,season_slug,match_date,start_time,home_name,away_name,score_home,score_away,match_centre_url,match_dir,match_slug
0,1913916,laliga,2025-2026,2025-08-15,19:00,Girona,Rayo Vallecano,1,3,https://es.whoscored.com/Matches/1913916/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250815_Girona_vs_Rayo_Vallecano_1913916
1,1913892,laliga,2025-2026,2025-08-15,21:30,Villarreal,Real Oviedo,2,0,https://es.whoscored.com/Matches/1913892/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250815_Villarreal_vs_Real_Oviedo_1913892
2,1913913,laliga,2025-2026,2025-08-16,21:30,Deportivo Alaves,Levante,2,1,https://es.whoscored.com/Matches/1913913/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250816_Deportivo_Alaves_vs_Levante_1913913
3,1913918,laliga,2025-2026,2025-08-16,19:30,Mallorca,Barcelona,0,3,https://es.whoscored.com/Matches/1913918/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250816_Mallorca_vs_Barcelona_1913918
4,1913889,laliga,2025-2026,2025-08-16,21:30,Valencia,Real Sociedad,1,1,https://es.whoscored.com/Matches/1913889/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250816_Valencia_vs_Real_Sociedad_1913889
5,1913914,laliga,2025-2026,2025-08-17,19:30,Athletic Club,Sevilla,3,2,https://es.whoscored.com/Matches/1913914/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250817_Athletic_Club_vs_Sevilla_1913914
6,1913915,laliga,2025-2026,2025-08-17,17:00,Celta Vigo,Getafe,0,2,https://es.whoscored.com/Matches/1913915/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250817_Celta_Vigo_vs_Getafe_1913915
7,1913917,laliga,2025-2026,2025-08-17,21:30,Espanyol,Atletico Madrid,2,1,https://es.whoscored.com/Matches/1913917/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250817_Espanyol_vs_Atletico_1913917
8,1913883,laliga,2025-2026,2025-08-18,21:00,Elche,Real Betis,1,1,https://es.whoscored.com/Matches/1913883/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250818_Elche_vs_Real_Betis_1913883
9,1913886,laliga,2025-2026,2025-08-19,21:00,Real Madrid,Osasuna,1,0,https://es.whoscored.com/Matches/1913886/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250819_Real_Madrid_vs_Osasuna_1913886


In [5]:
# Orden por fecha/hora si existen
def to_datetime_safe(d, t):
    """Combine a date and an optional time into a ``pandas.Timestamp``.

    Args:
        d: Date value; it is converted to text before parsing.
        t: Optional time value, appended to the date after a single space.
            Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``) and other
            falsy values (e.g. ``""``) are ignored, so the date is parsed on
            its own.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when ``d`` is missing, when the
        text cannot be parsed, or when any unexpected error occurs.
    """
    try:
        if pd.isna(d):
            return pd.NaT
        text = str(d)
        # Check for "missing" BEFORE truthiness: bool(pd.NA) raises TypeError,
        # which used to be swallowed below and discarded a valid date.
        if not pd.isna(t) and t:
            text = f"{text} {t}"
        return pd.to_datetime(text, errors="coerce")
    except Exception:
        # Deliberately best-effort: an unparseable row sorts last as NaT
        # instead of aborting the whole index build.
        return pd.NaT

import pandas as pd
if {"match_date", "start_time"} <= set(df_idx.columns):
    # Temporary sort key built from date + time; unparseable rows sort last.
    kickoff = [
        to_datetime_safe(day, clock)
        for day, clock in zip(df_idx["match_date"], df_idx["start_time"])
    ]
    df_idx = (
        df_idx
        .assign(dt=kickoff)
        .sort_values(["dt", "match_id"], na_position="last")
        .drop(columns=["dt"], errors="ignore")
    )

# De-duplicate on match_id, keeping the last occurrence in the current order.
if "match_id" in df_idx.columns:
    df_idx = (
        df_idx
        .assign(match_id=df_idx["match_id"].astype(str))
        .drop_duplicates(subset=["match_id"], keep="last")
    )

print("Filas tras limpiar:", len(df_idx))
display(df_idx.tail(10))

# Write to a sibling temporary file first, then swap it into place, so a
# failed write does not leave a truncated index behind.
tmp = INDEX_CSV.with_suffix(".tmp.csv")
df_idx.to_csv(tmp, index=False, encoding="utf-8-sig")
tmp.replace(INDEX_CSV)

print("[OK] Índice escrito en:", INDEX_CSV)

Filas tras limpiar: 41


Unnamed: 0,match_id,comp_slug,season_slug,match_date,start_time,home_name,away_name,score_home,score_away,match_centre_url,match_dir,match_slug
31,1913929,laliga,2025-2026,2025-09-12,21:00,Sevilla,Elche,2,2,https://es.whoscored.com/Matches/1913929/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250912_Sevilla_vs_Elche_1913929
34,1913925,laliga,2025-2026,2025-09-13,14:00,Getafe,Real Oviedo,2,0,https://es.whoscored.com/Matches/1913925/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250913_Getafe_vs_Real_Oviedo_1913925
35,1913930,laliga,2025-2026,2025-09-13,16:15,Real Sociedad,Real Madrid,1,2,https://es.whoscored.com/Matches/1913930/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250913_Real_Sociedad_vs_Real_Madrid_1913930
32,1913927,laliga,2025-2026,2025-09-13,18:30,Athletic Club,Deportivo Alaves,0,1,https://es.whoscored.com/Matches/1913927/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250913_Athletic_Club_vs_Deportivo_Alaves_191...
33,1913921,laliga,2025-2026,2025-09-13,21:00,Atletico Madrid,Villarreal,2,0,https://es.whoscored.com/Matches/1913921/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250913_Atletico_vs_Villarreal_1913921
37,1913923,laliga,2025-2026,2025-09-14,14:00,Celta Vigo,Girona,1,1,https://es.whoscored.com/Matches/1913923/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250914_Celta_Vigo_vs_Girona_1913923
38,1913928,laliga,2025-2026,2025-09-14,16:15,Levante,Real Betis,2,2,https://es.whoscored.com/Matches/1913928/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250914_Levante_vs_Real_Betis_1913928
39,1913926,laliga,2025-2026,2025-09-14,18:30,Osasuna,Rayo Vallecano,2,0,https://es.whoscored.com/Matches/1913926/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250914_Osasuna_vs_Rayo_Vallecano_1913926
36,1913922,laliga,2025-2026,2025-09-14,21:00,Barcelona,Valencia,6,0,https://es.whoscored.com/Matches/1913922/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250914_Barcelona_vs_Valencia_1913922
40,1913924,laliga,2025-2026,2025-09-15,21:00,Espanyol,Mallorca,3,2,https://es.whoscored.com/Matches/1913924/Live,C:\Users\manue\OneDrive\Escritorio\Proyecto Wh...,20250915_Espanyol_vs_Mallorca_1913924


[OK] Índice escrito en: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\processed\index\ws_matches_laliga_2025-2026.csv


In [6]:
import pandas as pd
from pathlib import Path

# Re-read the index that was just written and run basic sanity checks on it.
df_chk = pd.read_csv(INDEX_CSV, dtype={"match_id": str})
print("Total en índice:", len(df_chk))
print("Duplicados match_id:", df_chk.duplicated("match_id").sum())

# Every match_dir recorded in the index should still exist on disk.
missing_dirs = []
for folder in df_chk["match_dir"]:
    if not Path(folder).exists():
        missing_dirs.append(folder)
print("Carpetas que no existen:", len(missing_dirs))
if len(missing_dirs) > 0:
    display(missing_dirs[:5])

# Null counts for the key columns (columns absent from the index are skipped).
key_columns = ("match_id", "match_date", "home_name", "away_name", "match_centre_url", "match_dir")
for column in (name for name in key_columns if name in df_chk.columns):
    print(f"Nulos en {column}:", df_chk[column].isna().sum())

Total en índice: 41
Duplicados match_id: 0
Carpetas que no existen: 0
Nulos en match_id: 0
Nulos en match_date: 0
Nulos en home_name: 0
Nulos en away_name: 0
Nulos en match_centre_url: 0
Nulos en match_dir: 0
