# 🎾 Fase 1: Modelo Base de Predicción de Tenis

Este notebook cubre todo el proceso de la Fase 1:
1. Obtención y exploración de datos (TML-Database)
2. Limpieza y preparación
3. Feature engineering
4. Entrenamiento de modelos
5. Predicción y cálculo de EV
6. Análisis de resultados

**Datos**: TML-Database (actualizado hasta 2025-11-23)

## 📦 1. Imports y Configuración

In [None]:
# Imports b√°sicos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuraci√≥n de visualizaci√≥n
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Configuraci√≥n de pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("‚úÖ Imports completados")

## 📥 2. Carga de Datos

Cargamos los datos ya procesados de TML-Database

In [None]:
# Read the updated TML match export into the raw frame
df_raw = pd.read_csv('datos/raw/atp_matches_raw_updated.csv')

# One-shot summary: row count, span of the (still unparsed) date column, column count
print(
    f"üìä Total de partidos: {df_raw.shape[0]:,}",
    f"üìÖ Rango de fechas: {df_raw['tourney_date'].min()} - {df_raw['tourney_date'].max()}",
    f"\nüìã Columnas: {df_raw.shape[1]}",
    sep="\n",
)

# Last expression of the cell: rich display of the first rows
df_raw.head()

## 🔍 3. Exploración de Datos

In [None]:
# Banner followed by the dtype / non-null summary of the raw frame
print("üìä INFORMACI√ìN DEL DATASET", "=" * 60, sep="\n")
df_raw.info()

In [None]:
# Null count per column, and the same figure as a percentage of all rows
missing = df_raw.isna().sum()
missing_pct = missing.div(len(df_raw)).mul(100)

# Side-by-side table, worst columns first
missing_df = pd.DataFrame({'Valores Faltantes': missing, 'Porcentaje': missing_pct})
missing_df = missing_df.sort_values('Valores Faltantes', ascending=False)

print("‚ùì TOP 10 COLUMNAS CON VALORES FALTANTES")
# Only columns that actually have gaps, limited to the first ten
print(missing_df.loc[lambda tabla: tabla['Valores Faltantes'] > 0].head(10))

In [None]:
# Surface distribution: bar chart (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Matches per surface, most frequent first
surface_counts = df_raw['surface'].value_counts()

# Choose each colour by surface NAME rather than by position in the
# frequency-sorted index, so clay is always brown, grass green and hard blue
# regardless of which surface happens to be most common. Any other surface
# present in the raw data falls back to grey instead of recycling a colour.
surface_palette = {'Clay': '#8B4513', 'Grass': '#90EE90', 'Hard': '#4169E1'}
surface_colors = [surface_palette.get(nombre, '#A9A9A9') for nombre in surface_counts.index]

# Bar chart
axes[0].bar(surface_counts.index, surface_counts.values, color=surface_colors)
axes[0].set_title('Distribuci√≥n por Superficie', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Superficie')
axes[0].set_ylabel('N√∫mero de Partidos')
axes[0].grid(axis='y', alpha=0.3)

# Pie chart (same colours, same order as the bars)
axes[1].pie(surface_counts.values, labels=surface_counts.index, autopct='%1.1f%%',
            colors=surface_colors, startangle=90)
axes[1].set_title('Proporci√≥n por Superficie', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nüìä Partidos por superficie:")
print(surface_counts)

In [None]:
# Ranking distributions: winners on the left panel, losers on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (rank column, panel title, histogram colour, median-line colour), left to right
rank_panels = [
    ('winner_rank', 'Distribuci√≥n de Rankings - Ganadores', 'green', 'red'),
    ('loser_rank', 'Distribuci√≥n de Rankings - Perdedores', 'red', 'blue'),
]
for rank_ax, (rank_col, rank_title, hist_color, median_color) in zip(axes, rank_panels):
    rank_median = df_raw[rank_col].median()
    # Histogram of the non-null rankings
    rank_ax.hist(df_raw[rank_col].dropna(), bins=50, color=hist_color, alpha=0.7, edgecolor='black')
    rank_ax.set_title(rank_title, fontsize=14, fontweight='bold')
    rank_ax.set_xlabel('Ranking ATP')
    rank_ax.set_ylabel('Frecuencia')
    # Dashed vertical marker at the median, labelled in the legend
    rank_ax.axvline(rank_median, color=median_color, linestyle='--',
                    label=f'Mediana: {rank_median:.0f}')
    rank_ax.legend()
    rank_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the same two columns
print("\nüèÜ Estad√≠sticas de Rankings:")
print(f"Ganador - Media: {df_raw['winner_rank'].mean():.1f}, Mediana: {df_raw['winner_rank'].median():.1f}")
print(f"Perdedor - Media: {df_raw['loser_rank'].mean():.1f}, Mediana: {df_raw['loser_rank'].median():.1f}")

In [None]:
# Matches per year
# Parse the YYYYMMDD tourney dates; values that do not match become NaT.
df_raw['tourney_date'] = pd.to_datetime(df_raw['tourney_date'], format='%Y%m%d', errors='coerce')
df_raw['year'] = df_raw['tourney_date'].dt.year

# errors='coerce' hides bad dates, so report how many rows were affected
# instead of dropping them from the chart without a trace.
fechas_invalidas = int(df_raw['tourney_date'].isna().sum())
if fechas_invalidas:
    print(f"Fechas de torneo no interpretables (NaT): {fechas_invalidas:,}")

# ASCII-only identifier (the accented name is fragile across encodings).
# Rows without a year are dropped and the year is cast to int so the index
# reads 2025 rather than 2025.0 when NaT values are present.
partidos_por_anio = df_raw['year'].dropna().astype(int).value_counts().sort_index()

plt.figure(figsize=(12, 6))
plt.bar(partidos_por_anio.index, partidos_por_anio.values, color='steelblue', edgecolor='black')
plt.title('Partidos por A√±o', fontsize=16, fontweight='bold')
plt.xlabel('A√±o')
plt.ylabel('N√∫mero de Partidos')
plt.grid(axis='y', alpha=0.3)

# Highlight 2025 by drawing its bar again in a different colour
if 2025 in partidos_por_anio.index:
    plt.bar(2025, partidos_por_anio.loc[2025], color='orange', edgecolor='black', label='2025 (Actual)')
    plt.legend()

plt.tight_layout()
plt.show()

print("\nüìÖ Partidos por a√±o:")
print(partidos_por_anio)

## 🧹 4. Limpieza de Datos

In [None]:
# Load the already-cleaned matches and parse the tournament date column
df_clean = pd.read_csv('datos/processed/atp_matches_clean.csv').assign(
    tourney_date=lambda tabla: pd.to_datetime(tabla['tourney_date'])
)

# How many rows survived the cleaning, and how many were dropped relative to the raw frame
print(f"üìä Partidos despu√©s de limpieza: {len(df_clean):,}")
print(f"üìâ Partidos eliminados: {len(df_raw) - len(df_clean):,} ({(len(df_raw) - len(df_clean))/len(df_raw)*100:.1f}%)")

# Checklist of the cleaning rules, printed as a single block
print("\n".join([
    "\n‚úÖ Datos limpios:",
    "   - Sin rankings nulos",
    "   - Rankings <= 500",
    "   - Superficies principales (Hard, Clay, Grass)",
    "   - Sin walkovers",
    "   - Ordenados temporalmente",
]))

## 🔧 5. Feature Engineering

In [None]:
# Load the engineered dataset and parse its date column
df_features = pd.read_csv('datos/processed/dataset_con_features.csv').assign(
    fecha=lambda tabla: pd.to_datetime(tabla['fecha'])
)

print(f"üìä Dataset con features: {len(df_features):,} muestras")
print("\nüìã Features creadas:")

# Model inputs, in the column order used for training and prediction
feature_cols = [
    'jugador_rank', 'oponente_rank', 'rank_diff', 'rank_ratio',
    'jugador_top10', 'oponente_top10', 'jugador_top50', 'oponente_top50',
    'surface_hard', 'surface_clay', 'surface_grass'
]
# Numbered listing, one feature per line
print("\n".join(f"   {posicion}. {nombre}" for posicion, nombre in enumerate(feature_cols, 1)))

# Target class balance
print("\n‚úÖ Balance de clases:")
print(df_features['resultado'].value_counts())

In [None]:
# Distribution of the main features on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: rank_diff, with a marker at 0 (both players ranked equally)
axes[0, 0].hist(df_features['rank_diff'], bins=50, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Distribuci√≥n: Diferencia de Ranking', fontweight='bold')
axes[0, 0].set_xlabel('rank_diff (oponente - jugador)')
axes[0, 0].axvline(0, color='red', linestyle='--', label='Igual ranking')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Top-right: rank_ratio, clipped to [0, 5] so the long tail does not flatten the plot
axes[0, 1].hist(df_features['rank_ratio'].clip(0, 5), bins=50, color='lightcoral', edgecolor='black')
axes[0, 1].set_title('Distribuci√≥n: Ratio de Ranking', fontweight='bold')
axes[0, 1].set_xlabel('rank_ratio (jugador / oponente)')
axes[0, 1].axvline(1, color='red', linestyle='--', label='Igual ranking')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# Bottom-left: number of rows where each side is a top-10 player
top10_counts = df_features[['jugador_top10', 'oponente_top10']].sum()
axes[1, 0].bar(['Jugador Top 10', 'Oponente Top 10'], top10_counts.values,
               color=['green', 'orange'], edgecolor='black')
axes[1, 0].set_title('Jugadores Top 10', fontweight='bold')
axes[1, 0].set_ylabel('Cantidad')
axes[1, 0].grid(axis='y', alpha=0.3)

# Bottom-right: rows per surface, from the one-hot columns.
# The colour list follows the bar order (Hard, Clay, Grass): hard = blue,
# clay = brown, grass = green. The previous order painted Hard brown,
# Clay green and Grass blue.
surface_counts = df_features[['surface_hard', 'surface_clay', 'surface_grass']].sum()
axes[1, 1].bar(['Hard', 'Clay', 'Grass'], surface_counts.values,
               color=['#4169E1', '#8B4513', '#90EE90'], edgecolor='black')
axes[1, 1].set_title('Distribuci√≥n de Superficies', fontweight='bold')
axes[1, 1].set_ylabel('Cantidad')
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 🤖 6. Entrenamiento de Modelos

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# Full feature matrix and target (in the dataset's original row order)
X = df_features[feature_cols]
y = df_features['resultado']

# Chronological 80/20 split: oldest rows train the model, newest rows test it
df_sorted = df_features.sort_values('fecha').reset_index(drop=True)
split_idx = int(len(df_sorted) * 0.8)
train_rows, test_rows = df_sorted.iloc[:split_idx], df_sorted.iloc[split_idx:]

X_train, y_train = train_rows[feature_cols], train_rows['resultado']
X_test, y_test = test_rows[feature_cols], test_rows['resultado']

print(f"üìä Training set: {len(X_train):,} muestras")
print(f"üìä Test set: {len(X_test):,} muestras")
print(f"üìÖ Test set fechas: {test_rows['fecha'].min().date()} - {test_rows['fecha'].max().date()}")

In [None]:
# Fit the Random Forest baseline
print("üå≤ Entrenando Random Forest...")
rf_hyperparams = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_hyperparams)
rf_model.fit(X_train, y_train)

# Hard predictions on both splits, plus the second predict_proba column on the test split
y_pred_rf_train, y_pred_rf_test = (rf_model.predict(particion) for particion in (X_train, X_test))
y_pred_rf_proba = rf_model.predict_proba(X_test)[:, 1]

# Accuracy on train and test, AUC on test
acc_rf_train = accuracy_score(y_train, y_pred_rf_train)
acc_rf_test = accuracy_score(y_test, y_pred_rf_test)
auc_rf = roc_auc_score(y_test, y_pred_rf_proba)

print("\n‚úÖ Random Forest entrenado")
print(f"   Accuracy Train: {acc_rf_train*100:.2f}%")
print(f"   Accuracy Test:  {acc_rf_test*100:.2f}%")
print(f"   AUC-ROC:        {auc_rf:.4f}")

In [None]:
# Fit the Logistic Regression baseline
print("üìà Entrenando Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Hard predictions on both splits, plus the second predict_proba column on the test split
y_pred_lr_train, y_pred_lr_test = (lr_model.predict(particion) for particion in (X_train, X_test))
y_pred_lr_proba = lr_model.predict_proba(X_test)[:, 1]

# Accuracy on train and test, AUC on test
acc_lr_train = accuracy_score(y_train, y_pred_lr_train)
acc_lr_test = accuracy_score(y_test, y_pred_lr_test)
auc_lr = roc_auc_score(y_test, y_pred_lr_proba)

print("\n‚úÖ Logistic Regression entrenado")
print(f"   Accuracy Train: {acc_lr_train*100:.2f}%")
print(f"   Accuracy Test:  {acc_lr_test*100:.2f}%")
print(f"   AUC-ROC:        {auc_lr:.4f}")

In [None]:
# Model comparison: accuracy bars (left) and ROC curves (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy comparison
models = ['Random Forest', 'Logistic Regression']
train_scores = [acc_rf_train*100, acc_lr_train*100]
test_scores = [acc_rf_test*100, acc_lr_test*100]

x = np.arange(len(models))
width = 0.35

axes[0].bar(x - width/2, train_scores, width, label='Train', color='skyblue', edgecolor='black')
axes[0].bar(x + width/2, test_scores, width, label='Test', color='lightcoral', edgecolor='black')
axes[0].set_ylabel('Accuracy (%)')
axes[0].set_title('Comparaci√≥n de Accuracy', fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(models)
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# Keep the zoomed 60-70% window as the default view, but widen it whenever a
# score falls outside: a fixed [60, 70] range clips bars above 70% and hides
# bars below 60% entirely. Limits are clamped to the valid 0-100% range.
all_scores = train_scores + test_scores
y_floor = max(0, min(60, min(all_scores) - 2))
y_ceil = min(100, max(70, max(all_scores) + 2))
axes[0].set_ylim([y_floor, y_ceil])

# ROC curves on the test split
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_lr_proba)

axes[1].plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC={auc_rf:.3f})', linewidth=2)
axes[1].plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC={auc_lr:.3f})', linewidth=2)
# Diagonal reference line
axes[1].plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate')
axes[1].set_title('Curvas ROC', fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices for both models on the test split, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm_rf = confusion_matrix(y_test, y_pred_rf_test)
cm_lr = confusion_matrix(y_test, y_pred_lr_test)

# (matrix, colour map, panel title), left to right
cm_panels = [
    (cm_rf, 'Blues', 'Confusion Matrix - Random Forest'),
    (cm_lr, 'Reds', 'Confusion Matrix - Logistic Regression'),
]
for cm_ax, (cm_values, cm_cmap, cm_title) in zip(axes, cm_panels):
    sns.heatmap(cm_values, annot=True, fmt='d', cmap=cm_cmap, ax=cm_ax,
                xticklabels=['Perdedor', 'Ganador'],
                yticklabels=['Perdedor', 'Ganador'])
    cm_ax.set_title(cm_title, fontweight='bold')
    cm_ax.set_ylabel('Real')
    cm_ax.set_xlabel('Predicho')

plt.tight_layout()
plt.show()

In [None]:
# Random Forest feature importances, largest first
feature_importance = pd.DataFrame(
    list(zip(feature_cols, rf_model.feature_importances_)),
    columns=['feature', 'importance'],
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'],
         color='steelblue', edgecolor='black')
plt.xlabel('Importancia', fontsize=12)
plt.title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
# Flip the y axis so the first (most important) row is drawn at the top
importance_ax = plt.gca()
importance_ax.invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nüéØ Feature Importance:")
print(feature_importance)

## 🎯 7. Predicción y Cálculo de EV

In [None]:
# Funci√≥n para predecir un partido
def predecir_partido(jugador_rank, oponente_rank, superficie, cuota, modelo=None, umbral_ev=0.03):
    """
    Predict one match from the player's point of view and compute the bet's EV.

    Parameters
    ----------
    jugador_rank : int
        Ranking of the player being backed (must be >= 1).
    oponente_rank : int
        Ranking of the opponent (must be >= 1).
    superficie : str
        'Hard', 'Clay' or 'Grass'; case and surrounding whitespace are ignored.
    cuota : float
        Decimal odds offered for the player (must be >= 1).
    modelo : fitted classifier exposing ``predict_proba``, optional
        Defaults to the notebook-level ``rf_model``, looked up at call time so
        a retrained model is picked up without re-running this cell.
    umbral_ev : float, optional
        Minimum expected value (as a fraction of the stake) required to
        recommend the bet. Defaults to 0.03.

    Returns
    -------
    dict
        ``prob_modelo``, ``prob_implicita``, ``cuota``, ``ev``, ``ev_pct``,
        ``decision`` (display string) and ``apostar`` (bool, same decision
        without string parsing).

    Raises
    ------
    ValueError
        If a ranking is below 1, the odds are below 1, or the surface is not
        one of the three the features encode.
    """
    # Reject inputs that would divide by zero or produce meaningless features
    if not (jugador_rank >= 1 and oponente_rank >= 1):
        raise ValueError(
            f"Los rankings deben ser >= 1 (jugador={jugador_rank}, oponente={oponente_rank})"
        )
    if not cuota >= 1:
        raise ValueError(f"La cuota decimal debe ser >= 1 (cuota={cuota})")

    # Normalise the surface; an unknown value would otherwise become an
    # all-zero one-hot vector and be scored silently
    superficie_norm = str(superficie).strip().capitalize()
    if superficie_norm not in ('Hard', 'Clay', 'Grass'):
        raise ValueError(
            f"Superficie desconocida: {superficie!r} (use 'Hard', 'Clay' o 'Grass')"
        )

    # Late binding: a default of `modelo=rf_model` would freeze whichever
    # object existed when this cell was executed
    if modelo is None:
        modelo = rf_model

    # Build the feature row (same keys as feature_cols)
    features = {
        'jugador_rank': jugador_rank,
        'oponente_rank': oponente_rank,
        'rank_diff': oponente_rank - jugador_rank,
        'rank_ratio': jugador_rank / oponente_rank,
        'jugador_top10': 1 if jugador_rank <= 10 else 0,
        'oponente_top10': 1 if oponente_rank <= 10 else 0,
        'jugador_top50': 1 if jugador_rank <= 50 else 0,
        'oponente_top50': 1 if oponente_rank <= 50 else 0,
        'surface_hard': 1 if superficie_norm == 'Hard' else 0,
        'surface_clay': 1 if superficie_norm == 'Clay' else 0,
        'surface_grass': 1 if superficie_norm == 'Grass' else 0
    }

    # Select the columns in the order given by feature_cols
    X_pred = pd.DataFrame([features])[feature_cols]

    # Read the probability of class 1 from the column the model assigns to it;
    # fall back to column 1 when the model does not expose classes_
    clases = list(getattr(modelo, 'classes_', [0, 1]))
    col_positiva = clases.index(1) if 1 in clases else 1
    prob_jugador = modelo.predict_proba(X_pred)[0][col_positiva]

    # Expected value per unit staked, and the probability implied by the odds
    ev = (prob_jugador * cuota) - 1
    prob_implicita = 1 / cuota
    apostar = bool(ev > umbral_ev)

    return {
        'prob_modelo': prob_jugador,
        'prob_implicita': prob_implicita,
        'cuota': cuota,
        'ev': ev,
        'ev_pct': ev * 100,
        'decision': 'APOSTAR ‚úÖ' if apostar else 'NO APOSTAR ‚ùå',
        'apostar': apostar
    }

# Print a confirmation message after the function definition above
print("‚úÖ Funci√≥n de predicci√≥n lista")

In [None]:
# Example 1: Alcaraz vs Sinner
print("üìå EJEMPLO 1: Carlos Alcaraz (#3) vs Jannik Sinner (#1) en Hard")
print("=" * 70)

resultado1 = predecir_partido(
    jugador_rank=3,
    oponente_rank=1,
    superficie='Hard',
    cuota=2.10
)

print(f"Probabilidad modelo:    {resultado1['prob_modelo']*100:.1f}%")
print(f"Probabilidad impl√≠cita: {resultado1['prob_implicita']*100:.1f}%")
print(f"Cuota:                  @{resultado1['cuota']:.2f}")
print(f"Expected Value:         {resultado1['ev_pct']:+.2f}%")
print(f"\nüéØ DECISI√ìN: {resultado1['decision']}")

# Show the stake breakdown only for a positive recommendation. A substring
# test ('APOSTAR' in ...) also matches 'NO APOSTAR ...', so it was always
# true; the positive decision is the one that STARTS with 'APOSTAR'.
if resultado1['decision'].startswith('APOSTAR'):
    print(f"\nüíµ Con 10‚Ç¨:")
    print(f"   Ganancia esperada: {10 * resultado1['ev']:.2f}‚Ç¨")
    print(f"   Si ganas: {10 * resultado1['cuota']:.2f}‚Ç¨")

In [None]:
# Example 2: Djokovic vs Rune
print("üìå EJEMPLO 2: Novak Djokovic (#7) vs Holger Rune (#13) en Clay", "=" * 70, sep="\n")

resultado2 = predecir_partido(jugador_rank=7, oponente_rank=13, superficie='Clay', cuota=1.55)

# Report model probability, odds-implied probability, odds, EV and the decision
print(
    f"Probabilidad modelo:    {resultado2['prob_modelo']*100:.1f}%",
    f"Probabilidad impl√≠cita: {resultado2['prob_implicita']*100:.1f}%",
    f"Cuota:                  @{resultado2['cuota']:.2f}",
    f"Expected Value:         {resultado2['ev_pct']:+.2f}%",
    f"\nüéØ DECISI√ìN: {resultado2['decision']}",
    sep="\n",
)

## 📊 8. Análisis de Resultados

In [None]:
# Calibration analysis: predicted probabilities vs observed frequencies
from sklearn.calibration import calibration_curve

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Ten-bin calibration curves on the test split for each model
prob_true_rf, prob_pred_rf = calibration_curve(y_test, y_pred_rf_proba, n_bins=10)
prob_true_lr, prob_pred_lr = calibration_curve(y_test, y_pred_lr_proba, n_bins=10)

# (predicted bins, observed bins, extra line kwargs, panel title), left to right
calibration_panels = [
    (prob_pred_rf, prob_true_rf, {'label': 'Random Forest'},
     'Curva de Calibraci√≥n - Random Forest'),
    (prob_pred_lr, prob_true_lr, {'label': 'Logistic Regression', 'color': 'orange'},
     'Curva de Calibraci√≥n - Logistic Regression'),
]
for cal_ax, (cal_pred, cal_true, cal_kwargs, cal_title) in zip(axes, calibration_panels):
    cal_ax.plot(cal_pred, cal_true, marker='o', linewidth=2, **cal_kwargs)
    # Diagonal = perfect calibration
    cal_ax.plot([0, 1], [0, 1], 'k--', label='Perfectamente calibrado')
    cal_ax.set_xlabel('Probabilidad Predicha')
    cal_ax.set_ylabel('Probabilidad Real')
    cal_ax.set_title(cal_title, fontweight='bold')
    cal_ax.legend()
    cal_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("üí° Nota: Una buena calibraci√≥n significa que las probabilidades predichas")
print("   reflejan las probabilidades reales (l√≠nea cercana a la diagonal)")

In [None]:
# Histograms of the predicted test-set probabilities, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (probabilities, bar colour, panel title), left to right
proba_panels = [
    (y_pred_rf_proba, 'skyblue', 'Distribuci√≥n de Probabilidades - Random Forest'),
    (y_pred_lr_proba, 'lightcoral', 'Distribuci√≥n de Probabilidades - Logistic Regression'),
]
for proba_ax, (proba_values, proba_color, proba_title) in zip(axes, proba_panels):
    proba_ax.hist(proba_values, bins=30, color=proba_color, edgecolor='black', alpha=0.7)
    proba_ax.set_xlabel('Probabilidad Predicha')
    proba_ax.set_ylabel('Frecuencia')
    proba_ax.set_title(proba_title, fontweight='bold')
    # Dashed marker at 0.5
    proba_ax.axvline(0.5, color='red', linestyle='--', label='Umbral 50%')
    proba_ax.legend()
    proba_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 📝 9. Resumen y Conclusiones

In [None]:
# Shared pieces of the report: separator line, 2025 match count, pass/fail marker
separador = "=" * 70
partidos_2025 = int((df_clean['tourney_date'].dt.year == 2025).sum())
marca = lambda cumple: '‚úÖ' if cumple else '‚ùå'

print(separador)
print("üìä RESUMEN DE LA FASE 1")
print(separador)

# Data section
print("\n‚úÖ DATOS:")
print(f"   Total de partidos: {len(df_clean):,}")
print(f"   Muestras de entrenamiento: {len(df_features):,}")
print(f"   √öltima fecha: {df_clean['tourney_date'].max().date()}")
print(f"   Partidos de 2025: {partidos_2025:,}")

# Model section: test accuracy and AUC for each model
print("\n‚úÖ MODELOS:")
print("   Random Forest:")
print(f"      - Accuracy Test: {acc_rf_test*100:.2f}%")
print(f"      - AUC-ROC: {auc_rf:.4f}")
print("   Logistic Regression:")
print(f"      - Accuracy Test: {acc_lr_test*100:.2f}%")
print(f"      - AUC-ROC: {auc_lr:.4f}")

# First five rows of the importance table, as percentages
print("\n‚úÖ FEATURES M√ÅS IMPORTANTES:")
top_features = feature_importance.head(5)
for nombre_feature, peso in zip(top_features['feature'], top_features['importance']):
    print(f"   {nombre_feature}: {peso*100:.2f}%")

# Success criteria: dataset size, Random Forest test accuracy, presence of 2025 data
print("\n‚úÖ CRITERIOS DE √âXITO:")
print(f"   {marca(len(df_clean) > 1000)} Dataset > 1000 partidos: {len(df_clean):,}")
print(f"   {marca(acc_rf_test > 0.58)} Accuracy > 58%: {acc_rf_test*100:.2f}%")
print(f"   {marca(partidos_2025 > 0)} Datos de 2025: {partidos_2025:,} partidos")

print("\nüéØ PR√ìXIMOS PASOS:")
print("   1. Usar el sistema para predicciones reales")
print("   2. Validar con resultados de partidos actuales")
print("   3. Continuar con FASE 2: Calibraci√≥n")

print("\n" + separador)
print("‚úÖ FASE 1 COMPLETADA EXITOSAMENTE")
print(separador)