In [3]:
import numpy as np
import random
import optuna
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score
import json
import os
from optuna.visualization import plot_optimization_history

In [4]:
# Seed passed to the RNG helper, the estimator's random_state and the Optuna sampler.
SEED = 1234
# Metric of the "True" (trend-change) class that the search maximizes.
# Can be changed to "precision", "recall" or "f1-score".
SCORE = "f1-score"

In [5]:
def set_seeds(seed):
    """Seed Python's ``random`` module and NumPy's legacy global RNG with ``seed``."""
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

In [6]:
def _trend_change_flags(y_test, y_pred):
    """
    Flag, for each series, the positions whose label differs from the next one.

    Returns a pair of boolean arrays ``(actual_changes, predicted_changes)``,
    each one element shorter than the inputs because the last observation
    has no successor to compare against.

    Raises:
        ValueError: if the inputs differ in length or hold fewer than two
            observations (no consecutive pair exists).
    """
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            f"y_test and y_pred must have the same length, got {len(actual)} and {len(predicted)}"
        )
    if len(actual) < 2:
        raise ValueError("at least two observations are needed to detect a trend change")
    return actual[:-1] != actual[1:], predicted[:-1] != predicted[1:]


def trend_changes_score(y_test: np.ndarray, y_pred: np.ndarray) -> str:
    """
    Build a text classification report on trend changes.

    A trend change is a position whose label differs from the following one.
    The change flags of ``y_test`` are the ground truth and those of
    ``y_pred`` are the predictions.

    Args:
        y_test: true labels, in chronological order.
        y_pred: predicted labels aligned with ``y_test``.

    Returns:
        The report produced by ``classification_report`` (4 digits), as a string.

    Raises:
        ValueError: if the inputs differ in length or have fewer than two elements.
    """
    actual_changes, predicted_changes = _trend_change_flags(y_test, y_pred)
    return classification_report(
        actual_changes,
        predicted_changes,
        digits=4,
        # Same numbers as the default, but without undefined-metric warnings;
        # keeps this function consistent with trend_changes_true.
        zero_division=0,
    )


def trend_changes_true(y_test: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Score the detection of trend changes with a single number.

    Computes the same trend-change flags as ``trend_changes_score`` and
    returns the metric named by the module-level ``SCORE`` constant for the
    "True" (a change happened) class.

    Args:
        y_test: true labels, in chronological order.
        y_pred: predicted labels aligned with ``y_test``.

    Returns:
        The selected metric of the "True" class; 0 when it is undefined.

    Raises:
        ValueError: if the inputs differ in length or have fewer than two elements.
    """
    actual_changes, predicted_changes = _trend_change_flags(y_test, y_pred)
    report = classification_report(
        actual_changes,
        predicted_changes,
        # Force both classes into the report so the "True" entry exists even
        # when neither series contains a change (otherwise a KeyError).
        labels=[False, True],
        output_dict=True,
        zero_division=0,
    )
    return report["True"][SCORE]

In [7]:
# Load the training and validation sets.
DATA_DIR = "../../../data"
NON_FEATURE_COLUMNS = ["date", "target_trend"]

train = pd.read_csv(f"{DATA_DIR}/training_set.csv", parse_dates=["date"])
val = pd.read_csv(f"{DATA_DIR}/validation_set.csv", parse_dates=["date"])

# Select the features of both splits with the training column order.
# `.values` discards column names, so a validation file with reordered
# columns would otherwise feed features into the wrong positions silently;
# a missing feature now fails loudly with a KeyError.
feature_columns = train.columns.drop(NON_FEATURE_COLUMNS)

X_train = train[feature_columns].values
y_train = train["target_trend"].values
X_val = val[feature_columns].values
y_val = val["target_trend"].values

In [8]:
def objective(trial):
    """
    Optuna objective for the gradient-boosting search.

    Samples one hyperparameter configuration from ``trial``, fits a
    ``GradientBoostingClassifier`` on the training data and returns the
    trend-change score of its predictions on the validation data.
    """
    set_seeds(SEED)

    # Parameters are suggested in a fixed order.
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 400),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    classifier = GradientBoostingClassifier(random_state=SEED, **hyperparams)
    classifier.fit(X_train, y_train)
    return trend_changes_true(y_val, classifier.predict(X_val))

In [9]:
# Run the search: maximize the objective over 100 trials with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(sampler=tpe_sampler, direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-08-20 19:55:36,936] A new study created in memory with name: no-name-86b5c830-b9be-4929-8e30-a88783492b45
[I 2025-08-20 19:55:39,970] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 117, 'learning_rate': 0.06447363728960653, 'max_depth': 5, 'subsample': 0.9141434334855076, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.0.
[I 2025-08-20 19:55:46,560] Trial 1 finished with value: 0.041666666666666664 and parameters: {'n_estimators': 147, 'learning_rate': 0.11047391773518671, 'max_depth': 10, 'subsample': 0.9503730538968379, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.041666666666666664.
[I 2025-08-20 19:55:53,465] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 289, 'learning_rate': 0.08457591696604633, 'max_depth': 5, 'subsample': 0.82447847442625, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.041666666666666664.
[I 2025-08-20 19:56:00,889] Trial 3

In [10]:
# Plot the optimization history of the study
plot_optimization_history(study)

In [11]:
# Report the best trial of the study and keep its parameters for the final model.
print("Mejores hiperparámetros encontrados:")
best_params = study.best_params
print(best_params)
best_score = study.best_value
print(f"Mejor score de {SCORE}: {best_score:.4f}")

Mejores hiperparámetros encontrados:
{'n_estimators': 377, 'learning_rate': 0.07038214571143755, 'max_depth': 5, 'subsample': 0.9154920571762982, 'min_samples_split': 4, 'min_samples_leaf': 6}
Mejor score de f1-score: 0.1600


In [12]:
# Append the best hyperparameters and their score to the JSON run history.
HISTORY_PATH = "best_hyperparams.json"

try:
    with open(HISTORY_PATH, "r", encoding="utf-8") as f:
        history = json.load(f)
except FileNotFoundError:
    # First run: there is no history yet.
    history = []
except ValueError as exc:
    # json.JSONDecodeError is a ValueError. The file is rewritten below, so
    # report that the unreadable history is being dropped instead of
    # discarding it silently.
    print(f"Ignoring unreadable {HISTORY_PATH}: {exc}")
    history = []

if not isinstance(history, list):
    # Valid JSON that is not a list (e.g. a single entry object) cannot be
    # appended to; keep the existing content as the first history entry.
    history = [history]

# Store parameters and score together in a single entry.
history.append({
    "params": study.best_params,
    "value": study.best_value
})

# Write the updated history back to disk.
with open(HISTORY_PATH, "w", encoding="utf-8") as f:
    json.dump(history, f, indent=2)

In [16]:
# Optional cell, currently disabled: reload hyperparameters from the saved
# history instead of using the ones found by the study above.
# NOTE(review): the output saved under this cell and the final-model report
# (True-class F1 0.3256) appear to come from a run with this code enabled,
# not from the study's best trial (0.1600) -- confirm which `best_params`
# the final model is meant to use.
# # Load the history of hyperparameters and scores
# import json

# with open("best_hyperparams.json", "r") as f:
#     history = json.load(f)

# # Pick the last (most recent) entry
# best_params = history[-1]["params"]
# best_value = history[-1]["value"]

# # To list every entry:
# for i, entry in enumerate(history):
#     print(f"Hiperparámetros #{i+1}: {entry['params']}, Valor: {entry['value']}")

# # To pick a specific entry (by index):
# # best_params = history[indice_que_quieras]["params"]
# # best_value = history[indice_que_quieras]["value"]

Hiperparámetros #1: {'n_estimators': 324, 'learning_rate': 0.029233637937789516, 'max_depth': 4, 'subsample': 0.6813368996054391, 'min_samples_split': 10, 'min_samples_leaf': 9}, Valor: 0.32558139534883723


In [17]:
# Fit the final model with the selected hyperparameters and report how well
# it detects trend changes on the validation set.
set_seeds(SEED)
tuned_names = (
    "n_estimators",
    "learning_rate",
    "max_depth",
    "subsample",
    "min_samples_split",
    "min_samples_leaf",
)
final_model = GradientBoostingClassifier(
    random_state=SEED,
    **{name: best_params[name] for name in tuned_names},
)
final_model.fit(X_train, y_train)

# Validation predictions, reused by the full report afterwards.
y_pred = final_model.predict(X_val)
trend_report = trend_changes_score(y_val, y_pred)
print("Gradient Boosting Classifier Trend Changes Score:\n", trend_report)

Gradient Boosting Classifier Trend Changes Score:
               precision    recall  f1-score   support

       False     0.9029    0.8304    0.8651       112
        True     0.2692    0.4118    0.3256        17

    accuracy                         0.7752       129
   macro avg     0.5861    0.6211    0.5953       129
weighted avg     0.8194    0.7752    0.7940       129



In [15]:
# Full per-class report (precision, recall, F1) of the raw trend labels,
# followed by the balanced accuracy.
report = classification_report(y_val, y_pred, digits=4)
print("Gradient Boosting Classifier Report:\n", report)

balanced_acc = balanced_accuracy_score(y_val, y_pred)
print("Balanced accuracy:", balanced_acc)

Gradient Boosting Classifier Report:
               precision    recall  f1-score   support

          -1     0.5349    0.9583    0.6866        24
           0     0.3333    0.3333    0.3333        18
           1     0.9710    0.7614    0.8535        88

    accuracy                         0.7385       130
   macro avg     0.6131    0.6843    0.6245       130
weighted avg     0.8022    0.7385    0.7507       130

Balanced accuracy: 0.6843434343434344
