In [1]:
import numpy as np
import random
import optuna
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from optuna.visualization import plot_optimization_history
import os
import json
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed shared by set_seeds(), the Optuna TPE sampler and XGBClassifier(random_state=...).
SEED = 1234
# Metric read from the "True" (trend-change) row of the classification report;
# can be switched to "precision", "recall" or "f1-score".
SCORE = "f1-score"
# Row label under which this model's metrics are written to the comparison CSV.
model_name = 'XGBClassifier'

In [3]:
def set_seeds(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    # Same order as before: stdlib generator first, then NumPy.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
def _trend_change_flags(y_test, y_pred):
    """Flag, for every pair of consecutive samples, whether the label changes.

    Args:
        y_test: True labels (array-like).
        y_pred: Predicted labels (array-like), same length as ``y_test``.

    Returns:
        tuple: ``(changed_true, changed_pred)``, two boolean arrays with one
        element fewer than the inputs; element ``i`` is True when
        ``y[i] != y[i + 1]``.

    Raises:
        ValueError: If the two inputs do not have the same number of samples.
    """
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same length, got "
            f"{actual.shape[0]} and {predicted.shape[0]}"
        )
    # Comparing each element with its successor drops the last sample, which
    # has no successor to compare against.
    return actual[:-1] != actual[1:], predicted[:-1] != predicted[1:]


def _trend_change_report(y_test, y_pred, **report_kwargs):
    """Run ``classification_report`` on the trend-change flags.

    ``labels=[False, True]`` keeps both rows in the report even when one of the
    classes never occurs, so callers can always index ``report["True"]``.
    ``zero_division=0`` scores undefined metrics as 0 without emitting warnings.
    Extra keyword arguments are forwarded to ``classification_report``.
    """
    changed_true, changed_pred = _trend_change_flags(y_test, y_pred)
    return classification_report(
        changed_true,
        changed_pred,
        labels=[False, True],
        zero_division=0,
        **report_kwargs,
    )


def get_trend_changes_report_dict(y_test: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Build the trend-change classification report as a dictionary.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` computed on the
        "label changed between consecutive samples" flags; the per-class
        entries are keyed "False" and "True".

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    return _trend_change_report(y_test, y_pred, digits=4, output_dict=True)


def trend_changes_score(y_test: np.ndarray, y_pred: np.ndarray) -> str:
    """
    Build the trend-change classification report as printable text.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        str: Text report (4 decimal digits) for the "label changed between
        consecutive samples" flags.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    return _trend_change_report(y_test, y_pred, digits=4)


def trend_changes_true(y_test: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Score how well the predictions reproduce the true trend changes.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        float: The metric named by the module-level ``SCORE`` constant, taken
        from the "True" (trend changed) row of the report.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    report = _trend_change_report(y_test, y_pred, output_dict=True)
    return report["True"][SCORE]

In [5]:
# Load the cleaned training split; features are every column except the
# date and the target.
train = pd.read_csv(
    "../../../data/post_cleaning/training_set.csv",
    parse_dates=["date"],
)
y_train = train["target_trend"].values
X_train = train.drop(columns=["date", "target_trend"]).values

# Same treatment for the validation split.
val = pd.read_csv(
    "../../../data/post_cleaning/validation_set.csv",
    parse_dates=["date"],
)
y_val = val["target_trend"].values
X_val = val.drop(columns=["date", "target_trend"]).values

In [6]:
# Encode the trend labels (-1, 0, 1) as (0, 1, 2) for XGBClassifier, which
# expects consecutive integer class ids starting at 0.
cls_map = {-1: 0, 0: 1, 1: 2}
# Inverse mapping, to translate encoded predictions back to the original labels.
inv_map = {encoded: original for original, encoded in cls_map.items()}
# Index the dict directly (rather than using .get) so that an unexpected label
# raises KeyError instead of silently turning into None.
y_train_m = np.vectorize(cls_map.__getitem__)(y_train)
y_val_m = np.vectorize(cls_map.__getitem__)(y_val)

In [7]:
# Optuna para hiperparámetros
def objective(trial):
    """Optuna objective: fit an XGBClassifier and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The trend-change metric from ``trend_changes_true`` on the validation
        split; the study is expected to maximize it.
    """
    # Reset the global seeds so every trial starts from the same RNG state.
    set_seeds(SEED)

    # Search space. The suggest_* calls run in the same order as the keys.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 2.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    classifier = XGBClassifier(
        **sampled,
        random_state=SEED,
        eval_metric="mlogloss",
    )
    classifier.fit(X_train, y_train_m)
    predictions = classifier.predict(X_val)
    return trend_changes_true(y_val_m, predictions)

In [8]:
# Create the study and time the whole Optuna search.
# The timer starts before the study is created, so setup is included.
t0 = time.perf_counter()

# Re-seed the global RNGs; the TPE sampler receives the same seed explicitly.
set_seeds(SEED)
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED)
)
study.optimize(objective, n_trials=200)
# Wall-clock duration and trial count are reused later in the metrics export.
opt_duration_sec = time.perf_counter() - t0
n_trials_run = len(study.trials)

print(f"Optuna duró {opt_duration_sec:.2f}s en {n_trials_run} trials")

[I 2025-09-17 17:01:19,207] A new study created in memory with name: no-name-fdd40a9f-da84-4bda-8913-93636fd19b8b


[I 2025-09-17 17:01:23,072] Trial 0 finished with value: 0.46511627906976744 and parameters: {'learning_rate': 0.01774894524799066, 'max_depth': 11, 'n_estimators': 275, 'subsample': 0.9141434334855076, 'colsample_bytree': 0.8899879040594018, 'reg_lambda': 0.5451852105652832, 'reg_alpha': 0.5529285102861934, 'min_child_weight': 9}. Best is trial 0 with value: 0.46511627906976744.
[I 2025-09-17 17:01:24,271] Trial 1 finished with value: 0.5147058823529411 and parameters: {'learning_rate': 0.17642821458092559, 'max_depth': 15, 'n_estimators': 243, 'subsample': 0.8003980502093835, 'colsample_bytree': 0.8417314675860681, 'reg_lambda': 1.4254040539658004, 'reg_alpha': 0.7405015095807899, 'min_child_weight': 6}. Best is trial 1 with value: 0.5147058823529411.
[I 2025-09-17 17:01:24,935] Trial 2 finished with value: 0.556390977443609 and parameters: {'learning_rate': 0.045136334609244884, 'max_depth': 2, 'n_estimators': 409, 'subsample': 0.9530564762544467, 'colsample_bytree': 0.6824429919506

Optuna duró 213.37s en 200 trials


In [9]:
# Visualize the optimization history of the study
plot_optimization_history(study)

In [10]:
# Report the best hyperparameters found by the study and the value of the
# optimized metric (named by SCORE).
print("Mejores hiperparámetros encontrados:")
print(study.best_params)
print(f"Mejor score de {SCORE}: {study.best_value:.4f}")

Mejores hiperparámetros encontrados:
{'learning_rate': 0.06995473347554425, 'max_depth': 4, 'n_estimators': 196, 'subsample': 0.998201463423508, 'colsample_bytree': 0.8357610386634241, 'reg_lambda': 0.7790830479987098, 'reg_alpha': 0.4593921006238936, 'min_child_weight': 6}
Mejor score de f1-score: 0.5692


In [11]:
# Append this run's best hyperparameters and score to the JSON history file.
history = []
if os.path.exists("best_hyperparams.json"):
    try:
        with open("best_hyperparams.json", "r") as f:
            history = json.load(f)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: an unreadable file
        # means the history starts over (best-effort, as before).
        history = []

# A file holding valid JSON that is not a list (e.g. a single object) would
# make the append below fail, so treat it like an unreadable history.
if not isinstance(history, list):
    history = []

# Store the parameters and the score together in one record.
history.append({
    "params": study.best_params,
    "value": study.best_value
})

with open("best_hyperparams.json", "w") as f:
    json.dump(history, f, indent=2)

CARGAR HIPERPARÁMETROS DESDE JSON

In [12]:
# Load the history of best hyperparameters and their scores
with open("best_hyperparams.json", "r") as f:
    history = json.load(f)

# Pick the last (most recent) entry
best_params = history[-1]["params"]
best_value = history[-1]["value"]

# List every stored entry:
for i, entry in enumerate(history):
    print(f"Hiperparámetros #{i+1}: {entry['params']}, Valor: {entry['value']}")

# To pick a specific entry instead (by index):
# best_params = history[indice_que_quieras]["params"]
# best_value = history[indice_que_quieras]["value"]

Hiperparámetros #1: {'learning_rate': 0.06995473347554425, 'max_depth': 4, 'n_estimators': 196, 'subsample': 0.998201463423508, 'colsample_bytree': 0.8357610386634241, 'reg_lambda': 0.7790830479987098, 'reg_alpha': 0.4593921006238936, 'min_child_weight': 6}, Valor: 0.5692307692307692


In [13]:
# Train the final model with the hyperparameters selected in the previous
# cell (`best_params`, loaded from best_hyperparams.json).
# `best_params` is intentionally not reassigned from `study.best_params` here:
# doing so discarded whatever entry had been chosen from the JSON history.
# After a full run the most recent history entry is the study's best trial,
# so the default result is unchanged.
set_seeds(SEED)
final_model = XGBClassifier(
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    reg_lambda=best_params["reg_lambda"],
    reg_alpha=best_params["reg_alpha"],
    min_child_weight=best_params["min_child_weight"],
    random_state=SEED,
    eval_metric="mlogloss"
)
final_model.fit(X_train, y_train_m)
# Persist the fitted model next to the notebook.
final_model.save_model("XGBClassifier.json")


In [19]:
# Predict the encoded labels for the validation set and print the
# trend-change report for those predictions.
y_val_pred_m = final_model.predict(X_val)
print("\nXGBClassifier trend_changes_score:\n", trend_changes_score(y_val_m, y_val_pred_m))


XGBClassifier trend_changes_score:
               precision    recall  f1-score   support

       False     0.8439    0.9132    0.8772       219
        True     0.6607    0.5000    0.5692        74

    accuracy                         0.8089       293
   macro avg     0.7523    0.7066    0.7232       293
weighted avg     0.7976    0.8089    0.7994       293



In [20]:
# Full report: precision, recall and F1 per class, computed on the encoded
# labels (0, 1, 2).
report = classification_report(y_val_m, y_val_pred_m, digits=4)
print("XGBClassifier Report:\n", report)

XGBClassifier Report:
               precision    recall  f1-score   support

           0     0.7593    0.8913    0.8200        92
           1     0.3333    0.1224    0.1791        49
           2     0.8036    0.8824    0.8411       153

    accuracy                         0.7585       294
   macro avg     0.6321    0.6320    0.6134       294
weighted avg     0.7113    0.7585    0.7242       294



In [21]:
# --- Export and compare model metrics (validation set) ---

# 1. Output file shared by all models; this model's row is labelled `model_name`.
output_file = '../../../score_models/model_comparison_metrics.csv'

# 2. Standard classification report (macro averages).
# y_val_m and y_val_pred_m are both on the encoded (0, 1, 2) scale.
report_dict = classification_report(y_val_m, y_val_pred_m, output_dict=True, zero_division=0)
precision = report_dict['macro avg']['precision']
recall = report_dict['macro avg']['recall']
f1_score = report_dict['macro avg']['f1-score']


# 3. Trend-change report: metrics of the "True" (trend changed) class.
report = get_trend_changes_report_dict(y_val_m, y_val_pred_m)
trend_change_precision = report['True']['precision']
trend_change_recall = report['True']['recall']
trend_change_f1_score = report['True']['f1-score']

# 4. Collect the metrics for this model's row.
new_metrics = {
    'precision': precision,
    'recall': recall,
    'f1_score': f1_score,
    'trend_change_precision': trend_change_precision,
    'trend_change_recall': trend_change_recall,
    'trend_change_f1_score': trend_change_f1_score,
    "optuna_duration_sec": opt_duration_sec,
    "n_trials": n_trials_run
}

# 5. Load, update and save the comparison table.
try:
    comparison_df = pd.read_csv(output_file, index_col='model')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable file yet: start the table with the current model only.
    comparison_df = pd.DataFrame([new_metrics], index=[model_name])
else:
    # A CSV written by an older version may lack some metric columns; add them
    # first, because row assignment aligns on existing columns and would
    # otherwise silently drop those metrics.
    comparison_df = comparison_df.reindex(
        columns=comparison_df.columns.union(list(new_metrics), sort=False)
    )
    # Update the existing row or append a new one for the current model.
    comparison_df.loc[model_name] = pd.Series(new_metrics)

# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
comparison_df.to_csv(output_file, index_label='model')