In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
from src.feature_engineering import add_pythagorean_expectation, EloRatingSystem, add_elo_ratings, add_rolling_win_percentage

print("--- Fase 5: Entrenamiento con Características de Nivel Experto ---")

# 1. Load the expert dataset.
# The path is relative to the current working directory (one level up, then data/).
csv_path = os.path.join('..', 'data', 'historical_games_expert.csv')
# game_date is parsed as a datetime column; the games are sorted by it further down.
df = pd.read_csv(csv_path, parse_dates=['game_date'])
# Report the row count only: df.shape is a (rows, columns) tuple, which does not
# match the "filas" (rows) wording of the message.
print(f"Dataset experto cargado con {df.shape[0]} filas.")

# 2. Apply the feature engineering from the earlier phases.
df_final = add_pythagorean_expectation(df)
df_final = add_rolling_win_percentage(df_final)

# Train the Elo rating system on the recorded results.
elo_sys = EloRatingSystem()
# target == 1 credits the home team with the win; any other value credits the visitor.
home_won = df_final['target'] == 1
# Column-wise selection instead of a Python-level lambda per row (apply(axis=1)).
df_final['winner'] = df_final['h_team_name'].where(home_won, df_final['v_team_name'])
df_final['loser'] = df_final['v_team_name'].where(home_won, df_final['h_team_name'])

# Feed the results to the Elo system in game_date order. Iterating over the two
# columns directly avoids building a Series per row, as iterrows() does.
games_by_date = df_final.sort_values('game_date')
for winner, loser in zip(games_by_date['winner'], games_by_date['loser']):
    elo_sys.update_ratings(winner, loser)

# NOTE(review): the ratings above have been updated with every game in the dataset
# before add_elo_ratings runs. If add_elo_ratings reads the final ratings from
# elo_sys, elo_diff would carry information from games played after each row
# (look-ahead leakage) - confirm against src.feature_engineering.
df_final = add_elo_ratings(df_final, elo_sys)

# 3. Differential features for the starting pitchers (home value minus visitor value).
df_final['era_diff'] = df_final['h_pitcher_era'].sub(df_final['v_pitcher_era'])
df_final['pitcher_win_diff'] = df_final['h_pitcher_wins'].sub(df_final['v_pitcher_wins'])

# 4. Assemble the final model inputs.
features_final = [
    # Best-performing features from the earlier phases.
    'pythag_diff',
    'elo_diff',
    'win_pct_roll_diff',
    # New pitcher-based features.
    'era_diff',
    'pitcher_win_diff',
]
X = df_final[features_final]
y = df_final['target']

print("\nCaracterísticas finales para el modelo:")
print(list(X.columns))

# Hold out 20% of the rows for evaluation; random_state fixes the split for repeatability.
# NOTE(review): this is a random split over dated games (see game_date); consider
# whether a chronological split would give a more realistic estimate - confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Train with the best configuration found so far (out-of-the-box or tuned).
# The out-of-the-box configuration is used here; it previously scored 64.11%.
model_final = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
)
model_final.fit(X_train, y_train)
print("¡Modelo final entrenado!")

# Score the held-out rows.
predictions_final = model_final.predict(X_test)
accuracy_final = accuracy_score(y_test, predictions_final)

# Side-by-side report against the accuracy recorded for the previous version (v4).
separator = "=" * 60
print("\n" + separator)
print("  PRECISIÓN ANTERIOR (v4): 0.6411")
print(f"  PRECISIÓN FINAL (con datos de lanzador): {accuracy_final:.4f}")
print(separator)