In [1]:
import sys
from pathlib import Path

def find_project_root(markers=("src", ".env"), max_hops=7):
    """Locate the project root by walking up from the current working directory.

    Starting at ``Path.cwd()``, each directory is checked for the presence of
    any of ``markers``; the first directory containing one is returned.

    Args:
        markers: Names of files or directories whose presence identifies the
            root. A single name may also be passed as a plain string.
        max_hops: Maximum number of directories to inspect, counting the
            starting directory itself.

    Returns:
        The first inspected directory (a ``Path``) that contains a marker.

    Raises:
        RuntimeError: If no marker is found within ``max_hops`` directories
            or before the top of the filesystem is reached.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(markers, str):
        markers = (markers,)

    start = Path.cwd()
    p = start
    for _ in range(max_hops):
        if any((p / marker).exists() for marker in markers):
            return p
        # At the filesystem root the parent is the directory itself; stop
        # instead of re-checking the same directory on every remaining hop.
        if p.parent == p:
            break
        p = p.parent
    raise RuntimeError(f"No se encontró la raíz del proyecto desde {start}")

# Resolve the project root and make sure it sits at the front of sys.path.
PROJECT_ROOT = find_project_root()
root_entry = str(PROJECT_ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"[OK] Raíz del proyecto: {PROJECT_ROOT}")
print(f"[OK] sys.path actualizado")

[OK] Raíz del proyecto: c:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored
[OK] sys.path actualizado


In [2]:
import pandas as pd
from src.whoscored_viz.paths import BASE_DATA_DIR

# Collect every finished_matches.csv found under the raw fixtures tree.
fixtures_root = BASE_DATA_DIR / "raw" / "fixtures" / "DataFixtures"
csv_paths = sorted(fixtures_root.rglob("finished_matches.csv"))

if csv_paths:
    # sorted() orders the paths lexicographically, so this takes the last
    # path in that order (not necessarily the most recently written file).
    fixtures_csv = csv_paths[-1]
    print(f"✅ Fixtures CSV encontrado: {fixtures_csv}")

    df_fixtures = pd.read_csv(fixtures_csv)
    print(f"📊 Total partidos en CSV: {len(df_fixtures)}")
    display(df_fixtures.head())
else:
    # Nothing found: tell the reader where we looked and what to run first.
    print(
        "❌ No se encontró ningún finished_matches.csv.",
        f"Verifica que existe: {fixtures_root}",
        "¿Ejecutaste el notebook 01_descarga_fixtures.ipynb primero?",
        sep="\n",
    )

[paths.py] PROJECT_ROOT: c:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored
[paths.py] BASE_DATA_DIR: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data
✅ Fixtures CSV encontrado: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\fixtures\DataFixtures\laliga\2025-2026\finished_matches.csv
📊 Total partidos en CSV: 31


Unnamed: 0,match_date,start_time,home_name,away_name,match_id,match_centre_url,score_home,score_away,is_finished
0,2025-08-15,19:00,Girona,Rayo Vallecano,1913916,https://es.whoscored.com/Matches/1913916/Live,1,3,True
1,2025-08-15,21:30,Villarreal,Real Oviedo,1913892,https://es.whoscored.com/Matches/1913892/Live,2,0,True
2,2025-08-16,19:30,Mallorca,Barcelona,1913918,https://es.whoscored.com/Matches/1913918/Live,0,3,True
3,2025-08-16,21:30,Deportivo Alaves,Levante,1913913,https://es.whoscored.com/Matches/1913913/Live,2,1,True
4,2025-08-16,21:30,Valencia,Real Sociedad,1913889,https://es.whoscored.com/Matches/1913889/Live,1,1,True


In [3]:
from src.whoscored_viz.whoscored_matchcenter import process_one_match
from src.whoscored_viz.paths import MATCHCENTER_DIR

# Smoke test: run the MatchCenter pipeline on one fixture before the full batch.
if 'df_fixtures' in locals() and not df_fixtures.empty:
    # The first row of the fixtures table serves as the trial match.
    test_match = df_fixtures.iloc[0]
    match_url = test_match["match_centre_url"]
    match_id = test_match["match_id"]

    print(f"🎯 Probando con partido:")
    print(f"   Match ID: {match_id}")
    print(f"   Partido: {test_match['home_name']} vs {test_match['away_name']}")
    print(f"   URL: {match_url}")
    print(f"   Guardando en: {MATCHCENTER_DIR}")

    # The reporting below stays inside the try on purpose: a malformed result
    # is reported through the same handler as a failed run.
    try:
        result = process_one_match(
            url=match_url,
            out_root=MATCHCENTER_DIR,
            use_selenium=True,
            headless=False,  # browser left visible for debugging
        )

        print(f"\n✅ ¡Partido procesado exitosamente!")
        print(f"📁 Carpeta creada: {result['out_dir']}")
        print(f"📋 Manifest: {result['manifest']['tables']}")

        # Count the files written anywhere under the match folder, by extension.
        out_path = Path(result['out_dir'])
        csv_files = [*out_path.rglob("*.csv")]
        json_files = [*out_path.rglob("*.json")]

        print(f"\n📊 Archivos generados:")
        print(f"   🗂️ CSVs: {len(csv_files)}")
        print(f"   📄 JSONs: {len(json_files)}")

        if csv_files:
            print(f"\n📋 CSVs creados:")
            for f in sorted(csv_files):
                print(f"   - {f.name} ({f.stat().st_size} bytes)")

    except Exception as e:
        print(f"❌ Error procesando partido: {e}")
        import traceback
        traceback.print_exc()
else:
    print("❌ No hay partidos disponibles. Ejecuta la celda anterior primero.")

🎯 Probando con partido:
   Match ID: 1913916
   Partido: Girona vs Rayo Vallecano
   URL: https://es.whoscored.com/Matches/1913916/Live
   Guardando en: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter

✅ ¡Partido procesado exitosamente!
📁 Carpeta creada: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250815_Girona_vs_Rayo_Vallecano_1913916
📋 Manifest: {'match_meta': {'rows': 1, 'json': 'match_meta.json', 'csv': 'match_meta.csv', 'json_sha1': 'fc34b75c08eca5a647fd828bdbda06cc763080a3', 'csv_sha1': 'dacd2d37e41329270c420c88e933a5c2394e6a02'}, 'players': {'rows': 44, 'json': 'players.json', 'csv': 'players.csv', 'json_sha1': '6fc338f260d12ed3151701d9488608113505959c', 'csv_sha1': '647536a001ac8763f4f58aa3327185f1c276f8d7'}, 'events': {'rows': 1486, 'json': 'events.json', 'csv': 'events.csv', 'json_sha1': '6c567fcbf81eb0819a96ff83e48c81239e709790', 'csv_sha1': 'e1612228c691e925cdc44695c75030510a70b2bd'}

In [4]:
# Inspect the folder layout written by the single-match run.
if 'result' in locals():
    base_path = Path(result['out_dir'])

    print(f"📁 Estructura creada en: {base_path}")

    # Same listing for both output folders: entry name, plus byte size for files.
    for header, subdir in (("   📁 normalized/", "normalized"), ("   📁 csv/", "csv")):
        print(header)
        for f in sorted((base_path / subdir).glob("*")):
            size = f.stat().st_size if f.is_file() else ""
            print(f"      📄 {f.name} {size}")

    # Preview two of the generated tables when they are present.
    meta_csv = base_path / "csv" / "match_meta.csv"
    if meta_csv.exists():
        print(f"\n📊 Ejemplo - Metadatos del partido:")
        df_meta = pd.read_csv(meta_csv)
        display(df_meta)

    shots_csv = base_path / "csv" / "events_shots.csv"
    if shots_csv.exists():
        print(f"\n⚽ Ejemplo - Tiros del partido:")
        df_shots = pd.read_csv(shots_csv)
        print(f"Total tiros: {len(df_shots)}")
        if not df_shots.empty:
            display(df_shots.head())

📁 Estructura creada en: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250815_Girona_vs_Rayo_Vallecano_1913916
   📁 normalized/
      📄 event_types.json 5701
      📄 events.json 1472417
      📄 events_defensive.json 150042
      📄 events_gk_actions.json 11619
      📄 events_passes.json 1222230
      📄 events_shots.json 41530
      📄 formations_timeline.json 2608
      📄 formations_timeline_scored.json 3411
      📄 manifest.json 3495
      📄 match_meta.json 486
      📄 payload.json 2515631
      📄 player_positions_timeline.json 31068
      📄 players.json 15049
      📄 score_timeline.json 609
   📁 csv/
      📄 events.csv 607519
      📄 events_defensive.csv 60598
      📄 events_gk_actions.csv 3845
      📄 events_passes.csv 499620
      📄 events_shots.csv 18964
      📄 formations_timeline.csv 602
      📄 formations_timeline_scored.csv 739
      📄 match_meta.csv 328
      📄 player_positions_timeline.csv 6020
      📄 players.csv 36

Unnamed: 0,match_id,home_team_id,home_name,home_manager,away_team_id,away_name,away_manager,venue,attendance,referee,start_time,elapsed,score,ht_score,ft_score,status_code
0,1913916,2783,Girona,Míchel Sánchez,64,Rayo Vallecano,Iñigo Pérez,Municipal de Montilivi,12403,Javier Alberola Rojas,2025-08-15T19:00:00,F,1 : 3,0 : 3,1 : 3,6



⚽ Ejemplo - Tiros del partido:
Total tiros: 23


Unnamed: 0,match_id,eventId,minute,second,expandedMinute,period,teamId,playerId,x,y,endX,endY,typeName,shot_outcome,related_pass_eventId,goal_mouth_y,goal_mouth_z,q_length,q_angle,qualifiers
0,1913916,116,10,6.0,10,1,64,80758.0,89.2,47.7,,,MissedShots,Missed,,45.7,69.4,,,"[{""type"":{""value"":15,""displayName"":""Head""}},{""..."
1,1913916,131,12,2.0,12,1,64,448514.0,71.6,41.1,,,MissedShots,Missed,,66.1,29.2,,,"[{""type"":{""value"":73,""displayName"":""MissLeft""}..."
2,1913916,164,16,44.0,16,1,64,386230.0,92.5,44.3,,,MissedShots,Missed,,38.5,13.9,,,"[{""type"":{""value"":72,""displayName"":""LeftFoot""}..."
3,1913916,166,17,9.0,17,1,64,386230.0,96.3,44.5,,,Goal,Goal,,48.7,1.3,,,"[{""type"":{""value"":328,""displayName"":""FirstTouc..."
4,1913916,176,19,12.0,19,1,64,135345.0,97.1,49.4,,,Goal,Goal,,49.2,18.4,,,"[{""type"":{""value"":29,""displayName"":""Assisted""}..."


In [5]:
from src.whoscored_viz.whoscored_matchcenter import process_from_csv, _build_driver
import time

# Batch run over every fixture in the CSV, sharing a single browser driver.
if 'fixtures_csv' in locals():
    print(f"📋 Procesando partidos desde: {fixtures_csv}")
    print(f"📊 Total partidos a procesar: {len(df_fixtures)}")

    # The driver is created once, outside the try, so the finally clause only
    # ever runs with a driver that actually exists.
    print("🚀 Iniciando driver de navegación...")
    driver = _build_driver(headless=False)  # visible, to monitor progress

    try:
        # Throttling settings as described by the original notes: a 2-4 s
        # pause between matches, a 15 s cooldown every 5 matches, no limit.
        batch_options = {
            "csv_file": fixtures_csv,
            "out_root": MATCHCENTER_DIR,
            "pause_range": (2.0, 4.0),
            "cooldown_every": 5,
            "cooldown_secs": 15,
            "limit": None,
        }
        results = process_from_csv(driver=driver, **batch_options)

        print(f"\n🎉 ¡Procesamiento completado!")
        print(f"✅ Partidos procesados exitosamente: {len(results)}")
        print(f"❌ Partidos con errores: {len(df_fixtures) - len(results)}")

    except KeyboardInterrupt:
        print("\n⏹️ Procesamiento interrumpido por el usuario")
    except Exception as e:
        print(f"\n❌ Error durante el procesamiento: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the browser no matter how the batch ended.
        driver.quit()
        print("🔒 Driver cerrado")
else:
    print("❌ No hay CSV de fixtures cargado. Ejecuta las celdas anteriores.")

📋 Procesando partidos desde: C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\fixtures\DataFixtures\laliga\2025-2026\finished_matches.csv
📊 Total partidos a procesar: 31
🚀 Iniciando driver de navegación...
✅ OK [1/31] match_id=1913916 → C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250815_Girona_vs_Rayo_Vallecano_1913916
✅ OK [2/31] match_id=1913892 → C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250815_Villarreal_vs_Real_Oviedo_1913892
✅ OK [3/31] match_id=1913918 → C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250816_Mallorca_vs_Barcelona_1913918
✅ OK [4/31] match_id=1913913 → C:\Users\manue\OneDrive\Escritorio\Proyecto WhoScored\data\raw\matchcenter\MatchCenter\Competition\Season\20250816_Deportivo_Alaves_vs_Levante_1913913
✅ OK [5/31] match_id=1913889 → C:\Users\manue\OneDrive\Escritor

In [6]:
# Audit what the batch run left on disk.
matchcenter_base = MATCHCENTER_DIR / "MatchCenter"

if matchcenter_base.exists():
    # Walk three directory levels (competition / season / match, going by the
    # loop variable names) and keep every match directory. iterdir() yields
    # entries in arbitrary order, so sort once to make the sample folder and
    # any later slicing of match_folders reproducible.
    match_folders = sorted(
        match_dir
        for comp_dir in matchcenter_base.iterdir() if comp_dir.is_dir()
        for season_dir in comp_dir.iterdir() if season_dir.is_dir()
        for match_dir in season_dir.iterdir() if match_dir.is_dir()
    )

    print(f"📁 Total carpetas de partidos creadas: {len(match_folders)}")

    # The success rate needs the fixtures table loaded by an earlier cell.
    # Guard against it being missing or empty instead of failing with
    # NameError / ZeroDivisionError.
    n_fixtures = len(df_fixtures) if 'df_fixtures' in locals() else 0
    if n_fixtures:
        print(f"📊 Partidos en CSV original: {n_fixtures}")
        print(f"📈 Tasa de éxito: {len(match_folders)/n_fixtures*100:.1f}%")
    else:
        print("⚠️ No hay partidos en el CSV de fixtures; no se puede calcular la tasa de éxito.")

    # Show a few examples (the list is already sorted).
    print(f"\n📂 Ejemplos de carpetas creadas:")
    for i, folder in enumerate(match_folders[:5]):
        print(f"   {i+1}. {folder.name}")

    if len(match_folders) > 5:
        print(f"   ... y {len(match_folders)-5} más")

    # Summarise the files of one match as a representative example.
    if match_folders:
        sample_folder = match_folders[0]
        sample_csv_dir = sample_folder / "csv"
        sample_json_dir = sample_folder / "normalized"
        csv_files = list(sample_csv_dir.glob("*.csv")) if sample_csv_dir.exists() else []
        json_files = list(sample_json_dir.glob("*.json")) if sample_json_dir.exists() else []

        print(f"\n📋 Archivos por partido (ejemplo: {sample_folder.name}):")
        print(f"   📄 CSVs: {len(csv_files)}")
        print(f"   📄 JSONs: {len(json_files)}")

        if csv_files:
            print(f"   📊 Tipos de datos disponibles:")
            for f in sorted(csv_files):
                size_mb = f.stat().st_size / 1024 / 1024
                print(f"      - {f.stem}: {size_mb:.2f} MB")
else:
    print("❌ No se encontraron datos procesados en:", matchcenter_base)

📁 Total carpetas de partidos creadas: 31
📊 Partidos en CSV original: 31
📈 Tasa de éxito: 100.0%

📂 Ejemplos de carpetas creadas:
   1. 20250815_Girona_vs_Rayo_Vallecano_1913916
   2. 20250815_Villarreal_vs_Real_Oviedo_1913892
   3. 20250816_Deportivo_Alaves_vs_Levante_1913913
   4. 20250816_Mallorca_vs_Barcelona_1913918
   5. 20250816_Valencia_vs_Real_Sociedad_1913889
   ... y 26 más

📋 Archivos por partido (ejemplo: 20250815_Girona_vs_Rayo_Vallecano_1913916):
   📄 CSVs: 11
   📄 JSONs: 14
   📊 Tipos de datos disponibles:
      - events: 0.58 MB
      - events_defensive: 0.06 MB
      - events_gk_actions: 0.00 MB
      - events_passes: 0.48 MB
      - events_shots: 0.02 MB
      - formations_timeline: 0.00 MB
      - formations_timeline_scored: 0.00 MB
      - match_meta: 0.00 MB
      - player_positions_timeline: 0.01 MB
      - players: 0.00 MB
      - score_timeline: 0.00 MB


In [8]:
# Quick sanity check on the extracted data: row counts per match and overall.
if 'match_folders' in locals() and match_folders:
    print("🔍 Análisis rápido de datos extraídos:\n")

    # Sample at most 31 match folders (fewer when less are available).
    sample_size = min(31, len(match_folders))
    sample_folders = match_folders[:sample_size]

    total_events = total_shots = total_passes = 0

    for folder in sample_folders:
        csv_dir = folder / "csv"
        if csv_dir.exists():
            # Each table is optional; only the files that exist are counted.
            events_file = csv_dir / "events.csv"
            if events_file.exists():
                df_events = pd.read_csv(events_file)
                total_events += len(df_events)

            # Shots are the only table also reported per match.
            shots_file = csv_dir / "events_shots.csv"
            if shots_file.exists():
                df_shots = pd.read_csv(shots_file)
                total_shots += len(df_shots)
                print(f"⚽ {folder.name}: {len(df_shots)} tiros")

            passes_file = csv_dir / "events_passes.csv"
            if passes_file.exists():
                df_passes = pd.read_csv(passes_file)
                total_passes += len(df_passes)

    # NOTE(review): averages divide by sample_size even when a folder was
    # skipped or lacked a file — confirm that this is the intended denominator.
    print(f"\n📊 Estadísticas de muestra ({sample_size} partidos):")
    print(f"   🎯 Total eventos: {total_events:,}")
    print(f"   ⚽ Total tiros: {total_shots:,}")
    print(f"   ⚪ Total pases: {total_passes:,}")
    print(f"   📈 Promedio eventos por partido: {total_events/sample_size:.0f}")
    print(f"   📈 Promedio tiros por partido: {total_shots/sample_size:.0f}")
    print(f"   📈 Promedio pases por partido: {total_passes/sample_size:.0f}")

🔍 Análisis rápido de datos extraídos:

⚽ 20250815_Girona_vs_Rayo_Vallecano_1913916: 23 tiros
⚽ 20250815_Villarreal_vs_Real_Oviedo_1913892: 30 tiros
⚽ 20250816_Deportivo_Alaves_vs_Levante_1913913: 22 tiros
⚽ 20250816_Mallorca_vs_Barcelona_1913918: 28 tiros
⚽ 20250816_Valencia_vs_Real_Sociedad_1913889: 31 tiros
⚽ 20250817_Athletic_Club_vs_Sevilla_1913914: 26 tiros
⚽ 20250817_Celta_Vigo_vs_Getafe_1913915: 21 tiros
⚽ 20250817_Espanyol_vs_Atletico_1913917: 24 tiros
⚽ 20250818_Elche_vs_Real_Betis_1913883: 18 tiros
⚽ 20250819_Real_Madrid_vs_Osasuna_1913886: 20 tiros
⚽ 20250822_Real_Betis_vs_Deportivo_Alaves_1913885: 27 tiros
⚽ 20250823_Atletico_vs_Elche_1913898: 19 tiros
⚽ 20250823_Levante_vs_Barcelona_1913888: 35 tiros
⚽ 20250823_Mallorca_vs_Celta_Vigo_1913891: 20 tiros
⚽ 20250824_Osasuna_vs_Valencia_1913900: 20 tiros
⚽ 20250824_Real_Oviedo_vs_Real_Madrid_1913882: 32 tiros
⚽ 20250824_Real_Sociedad_vs_Espanyol_1913894: 29 tiros
⚽ 20250824_Villarreal_vs_Girona_1913901: 24 tiros
⚽ 20250825_Athl