In [1]:
## 1 · Imports y rutas globales
#Carga de librerías y definición de carpetas base.
import warnings, re, math, random
from pathlib import Path
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import xgboost as xgb
from sklearn.metrics import log_loss, accuracy_score
import optuna

# ---- Rutas base ----
ROOT_CANDIDATES = [Path("data"), Path("/data"), Path("/mnt/data"), Path(".")]
DATA_DIR  = next((p for p in ROOT_CANDIDATES if (p / "atp_players.csv").exists()), None)
assert DATA_DIR, "No se encontraron los CSV de ATP en las rutas candidatas."

PLAYERS_CSV = DATA_DIR / "atp_players.csv"
DRAW_CSV    = DATA_DIR / "ao_2025_draw.csv"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## 2 · Carga y limpieza inicial de partidos ATP
def load_matches(pattern: str = "atp_matches_*.csv", roots = ROOT_CANDIDATES) -> pd.DataFrame:
    #Une todos los CSV año‑a‑año en un único DataFrame ordenado por fecha.
    dfs = []
    for root in roots:
        for fp in sorted(root.glob(pattern)):
            dfs.append(pd.read_csv(fp, parse_dates=["tourney_date"]))
    if not dfs:
        raise FileNotFoundError(f"No se hallaron archivos {pattern} en {roots}")
    df = pd.concat(dfs, ignore_index=True)
    # Sólo columnas esenciales para Elo + XGB
    keep = ["tourney_date", "surface", "winner_id", "loser_id"]
    return df[keep].sort_values("tourney_date").reset_index(drop=True)

matches = load_matches()
orig_rows = len(matches)
# Descartamos filas sin superficie declarada
matches = matches.dropna(subset=["surface"]).reset_index(drop=True)
print(f"Matches cargados: {orig_rows:,}  → tras limpiar NaNs en 'surface': {len(matches):,}")
print(matches.surface.value_counts(dropna=False).head())
f"Matches cargados: {matches.shape[0]:,}"


Matches cargados: 194,996  → tras limpiar NaNs en 'surface': 192,006
surface
Hard      78287
Clay      69418
Grass     23401
Carpet    20900
Name: count, dtype: int64


'Matches cargados: 192,006'

In [3]:
## 3 · Cálculo de Elo dinámico y sensible a superficie
def compute_elo_features(df: pd.DataFrame, *, k: int = 32, alpha: float = 0.30):
    #Devuelve un DataFrame con las columnas pre‑partido `a_elo`, `b_elo`, `delta_elo`.

    #overall[pid]**   : Elo genérico  (Hard+Clay+Grass+Carpet)
    #surface[pid][s]**: Elo específico de la superficie *s*
    #alpha**          : Peso relativo del Elo por superficie (0 = ignora superficie).

    overall   = defaultdict(lambda: 1500.0)
    surfaces  = defaultdict(lambda: {s: 1500.0 for s in ["Hard", "Clay", "Grass", "Carpet"]})
    records   = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Elo loop"):
        w, l  = int(row.winner_id), int(row.loser_id)
        surf  = (row.surface or "Hard").title()

        w_r = (1-alpha)*overall[w] + alpha*surfaces[w][surf]
        l_r = (1-alpha)*overall[l] + alpha*surfaces[l][surf]
        exp_w = 1 / (1 + 10**((l_r - w_r)/400))

        records.append({"a_elo": w_r, "b_elo": l_r, "delta_elo": w_r - l_r})

        delta = k * (1 - exp_w)
        overall[w]  += delta; overall[l]  -= delta
        surfaces[w][surf] += delta; surfaces[l][surf] -= delta

    return pd.DataFrame(records), overall, surfaces

elo_feats, final_overall, final_surface = compute_elo_features(matches)


Elo loop: 100%|██████████| 192006/192006 [00:08<00:00, 21612.96it/s]


In [4]:
## 4 · Dataset balanceado (ganador vs perdedor)
# Positivos: ganador como lado A ─────────
X_pos = elo_feats.copy()
y_pos = np.ones(len(X_pos), dtype=np.int8)

# Negativos: perdedor como lado A ────────
X_neg = X_pos.copy()
X_neg[["a_elo", "b_elo"]] = X_neg[["b_elo", "a_elo"]].values
X_neg["delta_elo"] *= -1
y_neg = np.zeros(len(X_neg), dtype=np.int8)

X_full = pd.concat([X_pos, X_neg], ignore_index=True)
y_full = np.concatenate([y_pos, y_neg])


In [5]:
## 5 · Split cronológico train / val / test
# Dos fechas por match (porque espejamos)
dates_full = pd.concat([matches.tourney_date]*2, ignore_index=True)

train = dates_full.dt.year <= 2022
val   = dates_full.dt.year == 2023
test  = dates_full.dt.year == 2024

X_train, y_train = X_full[train], y_full[train]
X_val,   y_val   = X_full[val],   y_full[val]
X_test,  y_test  = X_full[test],  y_full[test]

print("Distribución:")
for name, y in zip(["Train", "Val", "Test"], [y_train, y_val, y_test]):
    print(f" · {name}: {len(y):,} obs – Pos={y.mean():.3f}")


Distribución:
 · Train: 371,994 obs – Pos=0.500
 · Val: 5,866 obs – Pos=0.500
 · Test: 6,152 obs – Pos=0.500


In [6]:
## 6 · Búsqueda de hiperparámetros con Optuna
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)

def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha":  trial.suggest_float("alpha",  1e-8, 10.0, log=True),
    }
    booster = xgb.train(params, dtrain, num_boost_round=500,
                        evals=[(dval, "val")],
                        early_stopping_rounds=50, verbose_eval=False)
    return booster.best_score

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40, show_progress_bar=True)
print("Mejor pérdida log:", study.best_value, "\nParams:", study.best_params)


[I 2025-06-30 19:36:14,742] A new study created in memory with name: no-name-d5d6031d-3007-45e6-8493-9e8f208e9c61
Best trial: 0. Best value: 0.623134:   2%|▎         | 1/40 [00:04<02:42,  4.17s/it]

[I 2025-06-30 19:36:18,912] Trial 0 finished with value: 0.6231336489553505 and parameters: {'eta': 0.010577367847204571, 'max_depth': 7, 'subsample': 0.755631492651708, 'colsample_bytree': 0.6385503892015613, 'lambda': 0.011834788147411911, 'alpha': 0.0006466760806552321}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:   5%|▌         | 2/40 [00:05<01:38,  2.59s/it]

[I 2025-06-30 19:36:20,392] Trial 1 finished with value: 0.6234946026293097 and parameters: {'eta': 0.017791136706963664, 'max_depth': 6, 'subsample': 0.7490979086990577, 'colsample_bytree': 0.701578082708999, 'lambda': 6.135356436009329e-05, 'alpha': 8.262089700651654e-06}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:   8%|▊         | 3/40 [00:06<01:03,  1.72s/it]

[I 2025-06-30 19:36:21,073] Trial 2 finished with value: 0.6241131326329107 and parameters: {'eta': 0.04889447645443499, 'max_depth': 5, 'subsample': 0.9327786231654154, 'colsample_bytree': 0.9604714247737565, 'lambda': 0.9271755968992232, 'alpha': 3.2200142784042317e-07}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:  10%|█         | 4/40 [00:07<00:56,  1.58s/it]

[I 2025-06-30 19:36:22,441] Trial 3 finished with value: 0.6246764583394383 and parameters: {'eta': 0.017875002155614057, 'max_depth': 4, 'subsample': 0.5058174057170661, 'colsample_bytree': 0.876364393807813, 'lambda': 2.1617711855310548e-05, 'alpha': 2.0117015601464883e-06}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:  12%|█▎        | 5/40 [00:08<00:39,  1.13s/it]

[I 2025-06-30 19:36:22,783] Trial 4 finished with value: 0.6254330614522643 and parameters: {'eta': 0.24373766041833997, 'max_depth': 4, 'subsample': 0.9104923375360554, 'colsample_bytree': 0.9531464347052059, 'lambda': 1.1992558768474943e-08, 'alpha': 0.023145945970675944}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:  15%|█▌        | 6/40 [00:08<00:30,  1.10it/s]

[I 2025-06-30 19:36:23,256] Trial 5 finished with value: 0.6231633961245804 and parameters: {'eta': 0.2700511594635894, 'max_depth': 6, 'subsample': 0.8926115734771927, 'colsample_bytree': 0.6486519857770368, 'lambda': 4.109830708208631e-05, 'alpha': 0.00011542400819573893}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:  18%|█▊        | 7/40 [00:09<00:28,  1.15it/s]

[I 2025-06-30 19:36:24,048] Trial 6 finished with value: 0.6240333504695413 and parameters: {'eta': 0.03384833863808831, 'max_depth': 5, 'subsample': 0.9126660440468708, 'colsample_bytree': 0.7547891857705418, 'lambda': 0.7055872181315628, 'alpha': 2.3883411127862324e-07}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 0. Best value: 0.623134:  20%|██        | 8/40 [00:10<00:33,  1.04s/it]

[I 2025-06-30 19:36:25,453] Trial 7 finished with value: 0.6246850634812343 and parameters: {'eta': 0.019842541481152845, 'max_depth': 4, 'subsample': 0.912912221595443, 'colsample_bytree': 0.72471185570037, 'lambda': 0.056176485054533244, 'alpha': 1.99525740524112e-07}. Best is trial 0 with value: 0.6231336489553505.


Best trial: 8. Best value: 0.623003:  22%|██▎       | 9/40 [00:11<00:25,  1.19it/s]

[I 2025-06-30 19:36:25,845] Trial 8 finished with value: 0.6230029599836996 and parameters: {'eta': 0.25367626412785665, 'max_depth': 4, 'subsample': 0.5781201393971191, 'colsample_bytree': 0.5619500863691878, 'lambda': 8.229636526186415e-06, 'alpha': 0.0002821566476217457}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  25%|██▌       | 10/40 [00:11<00:20,  1.43it/s]

[I 2025-06-30 19:36:26,233] Trial 9 finished with value: 0.6231267559629453 and parameters: {'eta': 0.19052573095456626, 'max_depth': 3, 'subsample': 0.727963129879677, 'colsample_bytree': 0.558706814749727, 'lambda': 1.0196984944818914e-08, 'alpha': 2.709552404872238e-07}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  28%|██▊       | 11/40 [00:12<00:21,  1.38it/s]

[I 2025-06-30 19:36:27,024] Trial 10 finished with value: 0.6231636722758539 and parameters: {'eta': 0.11208941795924246, 'max_depth': 8, 'subsample': 0.5266110624263485, 'colsample_bytree': 0.526521915390028, 'lambda': 7.288190987791678e-07, 'alpha': 0.24017519282974972}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  30%|███       | 12/40 [00:12<00:18,  1.53it/s]

[I 2025-06-30 19:36:27,515] Trial 11 finished with value: 0.6234545278068577 and parameters: {'eta': 0.11624570590729126, 'max_depth': 3, 'subsample': 0.6348326141298183, 'colsample_bytree': 0.5067050276791092, 'lambda': 3.51360220646453e-08, 'alpha': 1.2356692002290674e-08}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  32%|███▎      | 13/40 [00:13<00:16,  1.68it/s]

[I 2025-06-30 19:36:27,969] Trial 12 finished with value: 0.6233687929141005 and parameters: {'eta': 0.13504041370502892, 'max_depth': 3, 'subsample': 0.6524336352824107, 'colsample_bytree': 0.5814718117064583, 'lambda': 7.618230215170223e-07, 'alpha': 0.0015868769515310415}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  35%|███▌      | 14/40 [00:13<00:13,  1.86it/s]

[I 2025-06-30 19:36:28,376] Trial 13 finished with value: 0.6232419007613701 and parameters: {'eta': 0.16893148152104523, 'max_depth': 3, 'subsample': 0.6426800663750328, 'colsample_bytree': 0.5844275476532738, 'lambda': 7.922828146356139e-07, 'alpha': 2.5619747321743456e-05}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  38%|███▊      | 15/40 [00:14<00:14,  1.70it/s]

[I 2025-06-30 19:36:29,078] Trial 14 finished with value: 0.6248930958177459 and parameters: {'eta': 0.0812791903028225, 'max_depth': 4, 'subsample': 0.7870537367424765, 'colsample_bytree': 0.8120140567863883, 'lambda': 0.0013050391128662838, 'alpha': 1.770442698022684}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 8. Best value: 0.623003:  40%|████      | 16/40 [00:14<00:12,  1.91it/s]

[I 2025-06-30 19:36:29,456] Trial 15 finished with value: 0.6232756240515022 and parameters: {'eta': 0.29323652976632186, 'max_depth': 3, 'subsample': 0.5791282109755838, 'colsample_bytree': 0.5793728187420049, 'lambda': 2.6095927697825567e-06, 'alpha': 0.015347139650924836}. Best is trial 8 with value: 0.6230029599836996.


Best trial: 16. Best value: 0.62286:  42%|████▎     | 17/40 [00:15<00:12,  1.90it/s]

[I 2025-06-30 19:36:29,986] Trial 16 finished with value: 0.6228601773667343 and parameters: {'eta': 0.17616222987701521, 'max_depth': 5, 'subsample': 0.8255219787624801, 'colsample_bytree': 0.6558652906010073, 'lambda': 1.0496841026327265e-07, 'alpha': 1.4855838283516526e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  45%|████▌     | 18/40 [00:15<00:12,  1.71it/s]

[I 2025-06-30 19:36:30,711] Trial 17 finished with value: 0.6228971835981548 and parameters: {'eta': 0.07188415692999209, 'max_depth': 5, 'subsample': 0.8426336741157305, 'colsample_bytree': 0.6394849195820018, 'lambda': 8.961997085634323e-08, 'alpha': 1.090448670694334e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  48%|████▊     | 19/40 [00:16<00:13,  1.55it/s]

[I 2025-06-30 19:36:31,491] Trial 18 finished with value: 0.6228885907378775 and parameters: {'eta': 0.07449655241858977, 'max_depth': 6, 'subsample': 0.993999331393316, 'colsample_bytree': 0.660304091011116, 'lambda': 8.35832925988945e-08, 'alpha': 1.6035938958597174e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  50%|█████     | 20/40 [00:17<00:13,  1.44it/s]

[I 2025-06-30 19:36:32,304] Trial 19 finished with value: 0.6236854692291693 and parameters: {'eta': 0.04269016160808837, 'max_depth': 7, 'subsample': 0.9980342068515016, 'colsample_bytree': 0.8058527296183566, 'lambda': 0.0005367715298804097, 'alpha': 5.173472484318241e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  52%|█████▎    | 21/40 [00:18<00:12,  1.52it/s]

[I 2025-06-30 19:36:32,880] Trial 20 finished with value: 0.6234010752548323 and parameters: {'eta': 0.07506177166379523, 'max_depth': 7, 'subsample': 0.9855016053709444, 'colsample_bytree': 0.6837828852921335, 'lambda': 7.952086579070442e-08, 'alpha': 2.802614089976028e-06}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  55%|█████▌    | 22/40 [00:18<00:12,  1.44it/s]

[I 2025-06-30 19:36:33,658] Trial 21 finished with value: 0.6228934482160411 and parameters: {'eta': 0.06778536669794256, 'max_depth': 5, 'subsample': 0.8287428129019694, 'colsample_bytree': 0.6446112368974899, 'lambda': 7.376498045272415e-08, 'alpha': 1.5169884152979235e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 16. Best value: 0.62286:  57%|█████▊    | 23/40 [00:19<00:12,  1.33it/s]

[I 2025-06-30 19:36:34,536] Trial 22 finished with value: 0.6236217011400063 and parameters: {'eta': 0.03201008504289336, 'max_depth': 6, 'subsample': 0.837540583443853, 'colsample_bytree': 0.768978727473211, 'lambda': 1.4576824612434804e-07, 'alpha': 3.83453734554924e-08}. Best is trial 16 with value: 0.6228601773667343.


Best trial: 23. Best value: 0.6228:  60%|██████    | 24/40 [00:20<00:12,  1.31it/s] 

[I 2025-06-30 19:36:35,340] Trial 23 finished with value: 0.6227997689639042 and parameters: {'eta': 0.06288443475800931, 'max_depth': 5, 'subsample': 0.8165827644114687, 'colsample_bytree': 0.6581166516704186, 'lambda': 3.251122027728748e-07, 'alpha': 1.3572176372666346e-06}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  62%|██████▎   | 25/40 [00:21<00:10,  1.46it/s]

[I 2025-06-30 19:36:35,832] Trial 24 finished with value: 0.623752630055662 and parameters: {'eta': 0.08925349081670002, 'max_depth': 6, 'subsample': 0.7000233525634748, 'colsample_bytree': 0.6816756398996757, 'lambda': 3.4546782343906755e-06, 'alpha': 3.7097463295520534e-06}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  65%|██████▌   | 26/40 [00:21<00:08,  1.62it/s]

[I 2025-06-30 19:36:36,290] Trial 25 finished with value: 0.622879723095844 and parameters: {'eta': 0.15847484449206722, 'max_depth': 5, 'subsample': 0.7978265085424293, 'colsample_bytree': 0.6145405001151313, 'lambda': 9.593782497902392, 'alpha': 6.789442755949599e-07}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  68%|██████▊   | 27/40 [00:22<00:07,  1.71it/s]

[I 2025-06-30 19:36:36,806] Trial 26 finished with value: 0.6228358986855133 and parameters: {'eta': 0.16535884065040107, 'max_depth': 5, 'subsample': 0.8014263998911803, 'colsample_bytree': 0.6023047977517848, 'lambda': 0.0931828368701895, 'alpha': 3.369019068786163e-05}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  70%|███████   | 28/40 [00:22<00:06,  1.84it/s]

[I 2025-06-30 19:36:37,246] Trial 27 finished with value: 0.6241460477433491 and parameters: {'eta': 0.10220236815610341, 'max_depth': 5, 'subsample': 0.8616076629756654, 'colsample_bytree': 0.7143550245843634, 'lambda': 0.004765759085495133, 'alpha': 2.1097300315021025e-05}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  72%|███████▎  | 29/40 [00:22<00:05,  1.93it/s]

[I 2025-06-30 19:36:37,706] Trial 28 finished with value: 0.6228946210178897 and parameters: {'eta': 0.2000049730238454, 'max_depth': 5, 'subsample': 0.7866909632686119, 'colsample_bytree': 0.6031228774358031, 'lambda': 0.00019571523361569784, 'alpha': 3.251822015471554e-05}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  75%|███████▌  | 30/40 [00:26<00:14,  1.45s/it]

[I 2025-06-30 19:36:41,341] Trial 29 finished with value: 0.623250178520895 and parameters: {'eta': 0.011451788570789797, 'max_depth': 8, 'subsample': 0.7155719718716156, 'colsample_bytree': 0.6128369786031975, 'lambda': 0.04706806388601163, 'alpha': 0.00056578873628196}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  78%|███████▊  | 31/40 [00:27<00:10,  1.16s/it]

[I 2025-06-30 19:36:41,820] Trial 30 finished with value: 0.6229435777081449 and parameters: {'eta': 0.1401073899346341, 'max_depth': 4, 'subsample': 0.8761800000106078, 'colsample_bytree': 0.54501371176124, 'lambda': 0.007576671697813373, 'alpha': 1.5535019723599606e-06}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  80%|████████  | 32/40 [00:27<00:07,  1.08it/s]

[I 2025-06-30 19:36:42,206] Trial 31 finished with value: 0.6245989145370034 and parameters: {'eta': 0.1530065082382226, 'max_depth': 5, 'subsample': 0.792980222452006, 'colsample_bytree': 0.6707006356233014, 'lambda': 3.919368273431066, 'alpha': 7.312043556594228e-07}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  82%|████████▎ | 33/40 [00:27<00:05,  1.27it/s]

[I 2025-06-30 19:36:42,665] Trial 32 finished with value: 0.6228482805743707 and parameters: {'eta': 0.1985705004175035, 'max_depth': 5, 'subsample': 0.8121325544948331, 'colsample_bytree': 0.61869359014443, 'lambda': 0.26207753316413634, 'alpha': 8.148963701920019e-08}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  85%|████████▌ | 34/40 [00:28<00:04,  1.41it/s]

[I 2025-06-30 19:36:43,194] Trial 33 finished with value: 0.6230304060472722 and parameters: {'eta': 0.20582943887769725, 'max_depth': 6, 'subsample': 0.756450527556863, 'colsample_bytree': 0.6295988489771527, 'lambda': 0.11475507715040956, 'alpha': 6.353594481973346e-08}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  88%|████████▊ | 35/40 [00:29<00:03,  1.43it/s]

[I 2025-06-30 19:36:43,875] Trial 34 finished with value: 0.6233577337427185 and parameters: {'eta': 0.05334069062376282, 'max_depth': 7, 'subsample': 0.7607484648531062, 'colsample_bytree': 0.7273013517313898, 'lambda': 0.44222635328073334, 'alpha': 9.824658910366024e-06}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  90%|█████████ | 36/40 [00:29<00:02,  1.69it/s]

[I 2025-06-30 19:36:44,207] Trial 35 finished with value: 0.6247562201972447 and parameters: {'eta': 0.22625152805569146, 'max_depth': 4, 'subsample': 0.8150133694533057, 'colsample_bytree': 0.6969493639563694, 'lambda': 0.13610283530076397, 'alpha': 9.55299608250909e-08}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  92%|█████████▎| 37/40 [00:29<00:01,  1.82it/s]

[I 2025-06-30 19:36:44,665] Trial 36 finished with value: 0.6243798325426986 and parameters: {'eta': 0.0984833215603649, 'max_depth': 5, 'subsample': 0.9415594178648826, 'colsample_bytree': 0.7907200881238768, 'lambda': 0.00015403993676263883, 'alpha': 0.00010238360218346105}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  95%|█████████▌| 38/40 [00:30<00:01,  1.90it/s]

[I 2025-06-30 19:36:45,134] Trial 37 finished with value: 0.6244280038371637 and parameters: {'eta': 0.12418702371316383, 'max_depth': 5, 'subsample': 0.8631984641543338, 'colsample_bytree': 0.8555333395050068, 'lambda': 0.020414836439049327, 'alpha': 7.367324298877198e-07}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228:  98%|█████████▊| 39/40 [00:32<00:00,  1.08it/s]

[I 2025-06-30 19:36:47,002] Trial 38 finished with value: 0.6230398848445808 and parameters: {'eta': 0.02186639527630752, 'max_depth': 6, 'subsample': 0.7654524776815766, 'colsample_bytree': 0.6074843815297966, 'lambda': 0.002565090684352852, 'alpha': 1.1468876276269137e-07}. Best is trial 23 with value: 0.6227997689639042.


Best trial: 23. Best value: 0.6228: 100%|██████████| 40/40 [00:32<00:00,  1.22it/s]

[I 2025-06-30 19:36:47,407] Trial 39 finished with value: 0.6248268244163782 and parameters: {'eta': 0.17774034131727173, 'max_depth': 4, 'subsample': 0.6800443272570065, 'colsample_bytree': 0.9876067045087694, 'lambda': 3.047998321747781, 'alpha': 0.0022502346612625944}. Best is trial 23 with value: 0.6227997689639042.
Mejor pérdida log: 0.6227997689639042 
Params: {'eta': 0.06288443475800931, 'max_depth': 5, 'subsample': 0.8165827644114687, 'colsample_bytree': 0.6581166516704186, 'lambda': 3.251122027728748e-07, 'alpha': 1.3572176372666346e-06}





In [7]:
## 7 · Modelo final y evaluación en test
best_params = {**study.best_params,
               "objective": "binary:logistic",
               "eval_metric": "logloss"}
model = xgb.train(best_params, xgb.DMatrix(X_train, label=y_train),
                  num_boost_round=study.best_trial.number)

y_pred = model.predict(xgb.DMatrix(X_test))
print("LogLoss test:", log_loss(y_test, y_pred))
print("Accuracy  :", accuracy_score(y_test, (y_pred>=0.5).astype(int)))


LogLoss test: 0.6386164832782223
Accuracy  : 0.642555266579974


In [8]:
## 8 · Carga del cuadro AO 2025 y utilidades
def load_draw_with_ids(path_draw: Path, *, players_csv=PLAYERS_CSV) -> pd.DataFrame:

    #Devuelve DataFrame con `player1_id`, `player2_id`.
    #Acepta columnas `PLAYER_1/PLAYER_2` con nombre+país.

    draw = pd.read_csv(path_draw)

    # ↓ Normaliza nombres y extrae IDs ↓
    def clean(name):
        return re.sub(r"\s*\([^)]*\)", "", str(name)).strip().lower()

    if {"player1_id", "player2_id"}.issubset(draw.columns):
        return draw[["player1_id", "player2_id"]].astype(int)

    # Detecta columnas de nombres (case‑insensitive)
    name_cols = [c for c in draw.columns if c.lower() in {"player1", "player_1", "player2", "player_2", "name1", "name2", "player", "player_"}]
    if len(name_cols) >= 2:
        p1, p2 = name_cols[:2]
        players = pd.read_csv(players_csv)
        name_map = {clean(f"{r.name_first} {r.name_last}"): r.player_id
                    for r in players.itertuples()}
        p1_ids = draw[p1].apply(lambda x: name_map.get(clean(x), np.nan))
        p2_ids = draw[p2].apply(lambda x: name_map.get(clean(x), np.nan))
        if p1_ids.isna().any() or p2_ids.isna().any():
            missing = draw.loc[p1_ids.isna() | p2_ids.isna(), [p1, p2]]
            raise ValueError(f"No se encontraron IDs para:\n{missing.head()}")
        return pd.DataFrame({"player1_id": p1_ids.astype(int), "player2_id": p2_ids.astype(int)})

    raise ValueError("Formato de draw no reconocido → columnas presentes:", draw.columns)

# Cargar draw listo para usar
SURFACE = "Hard"
draw_df = load_draw_with_ids(DRAW_CSV)
print("Draw cargado", draw_df.shape)


Draw cargado (64, 2)


  players = pd.read_csv(players_csv)


In [9]:
## 9 · Simulación ronda por ronda
ALPHA, K = 0.30, 32
current       = draw_df.copy()
round_num     = 1
round_results = []  # lista de DF por ronda

# Etiquetas human‑friendly para cada ronda (32‑draw → ajusta si cambia)
round_labels = {
    1: "1st Round", 2: "2nd Round", 3: "3rd Round", 4: "4th Round",
    5: "QF", 6: "SF", 7: "Final"
}

while True:
    # --- Elo previo a cada match ---
    a_elo = current.player1_id.apply(lambda pid: (1-ALPHA)*final_overall.get(pid,1500)+ALPHA*final_surface.get(pid,{}).get(SURFACE,1500))
    b_elo = current.player2_id.apply(lambda pid: (1-ALPHA)*final_overall.get(pid,1500)+ALPHA*final_surface.get(pid,{}).get(SURFACE,1500))
    dmat  = xgb.DMatrix(pd.DataFrame({"a_elo":a_elo, "b_elo":b_elo, "delta_elo":a_elo-b_elo}))

    proba   = model.predict(dmat)
    winners = np.where(proba>=0.5, current.player1_id, current.player2_id)

    # Guarda resultado ronda
        # ---- Guarda resultado de la ronda ----
    round_df = current.copy()
    round_df["winner_id"] = winners
    round_df["round"]    = round_labels.get(round_num, f"R{round_num}")
    round_results.append(round_df)

    # Actualiza Elo con un K reducido (menor volatilidad que hist.)
    for pid_w, pid_l in zip(winners, np.where(proba>=0.5, current.player2_id, current.player1_id)):
        delta_k = K * (1 - 1/(1+10**(((final_overall[pid_l]-final_overall[pid_w])/400))))
        final_overall[pid_w] += delta_k; final_overall[pid_l] -= delta_k
        final_surface[pid_w][SURFACE] += delta_k; final_surface[pid_l][SURFACE] -= delta_k

    if len(winners) == 1:
        champion_id = int(winners[0]); break
    current = pd.DataFrame({"player1_id": winners[::2], "player2_id": winners[1::2]})
    round_num += 1

print(f"🏆  Predicted champion ID: {champion_id}")


🏆  Predicted champion ID: 206173


In [10]:
## 10 Reporte de cada ronda con nombres
players = pd.read_csv(PLAYERS_CSV, low_memory=False).set_index("player_id")
name = lambda pid: players.loc[pid, ["name_first", "name_last"]].str.cat(sep=" ")

for rnd_df in round_results:
    out = rnd_df.copy()
    out["player1_name"] = out.player1_id.map(name)
    out["player2_name"] = out.player2_id.map(name)
    out["winner_name" ] = out.winner_id .map(name)

    display(
        out[["player1_id", "player1_name", "player2_id", "player2_name", "winner_id", "winner_name"]]
        .style.set_caption(out["round"].iloc[0])
        .hide(axis="index")
    )


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,111797,Nicolas Jarry,206173,Jannik Sinner
209262,Tristan Schoolkate,106121,Taro Daniel,106121,Taro Daniel
106218,Marcos Giron,105870,Yannick Hanfmann,106218,Marcos Giron
144869,Tomas Martin Etcheverry,207925,Flavio Cobolli,207925,Flavio Cobolli
128034,Hubert Hurkacz,134868,Tallon Griekspoor,128034,Hubert Hurkacz
200175,Miomir Kecmanovic,105583,Dusan Lajovic,200175,Miomir Kecmanovic
126610,Matteo Berrettini,111815,Cameron Norrie,126610,Matteo Berrettini
111190,Zhizhen Zhang,208029,Holger Rune,208029,Holger Rune
126774,Stefanos Tsitsipas,210506,Alex Michelsen,126774,Stefanos Tsitsipas
210317,James Mccabe,212021,Martin Landaluce,212021,Martin Landaluce


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,106121,Taro Daniel,206173,Jannik Sinner
106218,Marcos Giron,207925,Flavio Cobolli,207925,Flavio Cobolli
128034,Hubert Hurkacz,200175,Miomir Kecmanovic,128034,Hubert Hurkacz
126610,Matteo Berrettini,208029,Holger Rune,208029,Holger Rune
126774,Stefanos Tsitsipas,212021,Martin Landaluce,126774,Stefanos Tsitsipas
209113,Gabriel Diallo,111575,Karen Khachanov,111575,Karen Khachanov
202103,Francisco Cerundolo,200267,Zizou Bergs,202103,Francisco Cerundolo
105948,Federico Coria,200282,Alex De Minaur,200282,Alex De Minaur
126203,Taylor Fritz,106432,Borna Coric,126203,Taylor Fritz
207681,Francisco Comesana,104792,Gael Monfils,104792,Gael Monfils


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,207925,Flavio Cobolli,206173,Jannik Sinner
128034,Hubert Hurkacz,208029,Holger Rune,208029,Holger Rune
126774,Stefanos Tsitsipas,111575,Karen Khachanov,111575,Karen Khachanov
202103,Francisco Cerundolo,200282,Alex De Minaur,200282,Alex De Minaur
126203,Taylor Fritz,104792,Gael Monfils,126203,Taylor Fritz
210097,Ben Shelton,133430,Denis Shapovalov,210097,Ben Shelton
126094,Andrey Rublev,126207,Frances Tiafoe,126207,Frances Tiafoe
200615,Alexei Popyrin,106421,Daniil Medvedev,106421,Daniil Medvedev
104925,Novak Djokovic,207830,Tomas Machac,104925,Novak Djokovic
208103,Jiri Lehecka,105777,Grigor Dimitrov,105777,Grigor Dimitrov


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,208029,Holger Rune,206173,Jannik Sinner
111575,Karen Khachanov,200282,Alex De Minaur,200282,Alex De Minaur
126203,Taylor Fritz,210097,Ben Shelton,126203,Taylor Fritz
126207,Frances Tiafoe,106421,Daniil Medvedev,106421,Daniil Medvedev
104925,Novak Djokovic,105777,Grigor Dimitrov,104925,Novak Djokovic
207733,Jack Draper,207989,Carlos Alcaraz,207989,Carlos Alcaraz
200000,Felix Auger Aliassime,126205,Tommy Paul,126205,Tommy Paul
200005,Ugo Humbert,100644,Alexander Zverev,100644,Alexander Zverev


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,200282,Alex De Minaur,206173,Jannik Sinner
126203,Taylor Fritz,106421,Daniil Medvedev,126203,Taylor Fritz
104925,Novak Djokovic,207989,Carlos Alcaraz,104925,Novak Djokovic
126205,Tommy Paul,100644,Alexander Zverev,100644,Alexander Zverev


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,126203,Taylor Fritz,206173,Jannik Sinner
104925,Novak Djokovic,100644,Alexander Zverev,104925,Novak Djokovic


player1_id,player1_name,player2_id,player2_name,winner_id,winner_name
206173,Jannik Sinner,104925,Novak Djokovic,206173,Jannik Sinner


In [11]:
## 11 · Validación con ao_2025_results.csv
RESULTS_CSV = DATA_DIR / "ao_2025_results.csv"
assert RESULTS_CSV.exists(), f"❌ No se encontró {RESULTS_CSV}"

real_raw = pd.read_csv(RESULTS_CSV)
players  = pd.read_csv(PLAYERS_CSV, low_memory=False)

# ---- Helper: nombre → id ----
name2id = {}
for r in players.itertuples(index=False):
    if pd.notna(r.name_first) and pd.notna(r.name_last):
        key = f"{str(r.name_first).lower()} {str(r.name_last).lower()}"
        name2id[key] = r.player_id

get_id = lambda s: name2id.get(str(s).lower().strip(), np.nan)

# ---- Normalizar real_df ----
if {"player1_id", "player2_id", "winner_id"}.issubset(real_raw.columns):
    real_df = real_raw.copy()
else:
    assert {"player1", "player2", "winner"}.issubset(real_raw.columns), "CSV debe tener columnas de IDs o de nombres"
    real_df = real_raw.copy()
    real_df["player1_id"] = real_df.player1.map(get_id)
    real_df["player2_id"] = real_df.player2.map(get_id)
    real_df["winner_id"]  = real_df.winner .map(get_id)
    real_df = real_df.dropna(subset=["player1_id","player2_id","winner_id"])

# ---- Predicciones simuladas ----
sim_df = pd.concat(round_results, ignore_index=True).rename(columns={"winner_id":"pred_winner"})

# ---- Unordered pair key ----
for df in (real_df, sim_df):
    df["pair_key"] = df.apply(lambda r: tuple(sorted([int(r.player1_id), int(r.player2_id)])), axis=1)

merge_keys = ["pair_key"]
if "round" in real_df.columns:
    merge_keys.append("round")

merged = real_df.merge(sim_df, on=merge_keys, how="inner")

coverage  = len(merged) / len(real_df)
accuracy  = (merged.pred_winner == merged.winner_id).mean()
print(f"🔍 Partidos en dataset real: {len(real_df)}")
print(f"🔗 Partidos cubiertos por simulación: {len(merged)}  ({coverage:.1%})")

print(f"🎯 Precisión (sobre los cubiertos): {accuracy:.2%}")

if "round" in merged.columns:
    acc_round = merged.groupby("round").apply(lambda g: (g.pred_winner==g.winner_id).mean())
    print("📊 Precisión por ronda:")
    display(acc_round.to_frame("accuracy"))

# --- Campeón ---
champion_real = (
    real_df.loc[real_df.get("round", pd.Series()).eq("Final"), "winner_id"].iloc[0]
    if "round" in real_df.columns and (real_df["round"] == "Final").any()
    else real_df.winner_id.iloc[-1]
)
print(f"🏆 Campeón real coincide con predicción: {champion_real == champion_id}")


🔍 Partidos en dataset real: 127
🔗 Partidos cubiertos por simulación: 93  (73.2%)
🎯 Precisión (sobre los cubiertos): 68.82%
📊 Precisión por ronda:


  acc_round = merged.groupby("round").apply(lambda g: (g.pred_winner==g.winner_id).mean())


Unnamed: 0_level_0,accuracy
round,Unnamed: 1_level_1
1st Round,0.703125
2nd Round,0.555556
3rd Round,0.75
4th Round,1.0
QF,1.0
SF,0.0


🏆 Campeón real coincide con predicción: True
