In [1]:
import numpy as np
import random
import optuna
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed passed to set_seeds(), to the estimators' random_state and to the Optuna sampler.
SEED = 1234
# Metric read from the "True" row of the classification report by trend_changes_true().
# Can be changed to "precision", "recall" or "f1-score".
SCORE = "f1-score"

In [3]:
def set_seeds(seed):
    """
    Seed the global random number generators used in this notebook.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    # Same order as before: Python's stdlib RNG first, then NumPy's global RNG.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [4]:
def trend_change_flags(y_test: np.ndarray, y_pred: np.ndarray):
    """
    Flag, for every pair of consecutive samples, whether the label changes.

    Shared by ``trend_changes_score`` and ``trend_changes_true`` so both
    evaluate exactly the same notion of "trend change".

    Args:
        y_test (np.ndarray): True labels, in sequence order.
        y_pred (np.ndarray): Predicted labels, aligned with ``y_test``.

    Returns:
        tuple[np.ndarray, np.ndarray]: Two boolean arrays of length
        ``len(y_test) - 1``. Element ``i`` is True when label ``i`` differs
        from label ``i + 1`` in the true / predicted sequence respectively.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` have different lengths.
    """
    true_labels = np.asarray(y_test)
    pred_labels = np.asarray(y_pred)
    if len(true_labels) != len(pred_labels):
        raise ValueError(
            f"y_test and y_pred must have the same length, "
            f"got {len(true_labels)} and {len(pred_labels)}"
        )
    # Compare each label with its successor; the last sample has no successor,
    # so the result is one element shorter than the input.
    changed_true = true_labels[:-1] != true_labels[1:]
    changed_pred = pred_labels[:-1] != pred_labels[1:]
    return changed_true, changed_pred


def trend_changes_score(y_test: np.ndarray, y_pred: np.ndarray) -> str:
    """
    Build a text classification report for trend-change detection.

    The labels are first converted to "did the label change between
    consecutive samples?" flags; the report then compares the flags derived
    from the true labels against those derived from the predictions.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        str: The formatted ``classification_report`` (4 decimal digits).

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` have different lengths.
    """
    changed_true, changed_pred = trend_change_flags(y_test, y_pred)
    # zero_division=0 matches trend_changes_true and avoids warnings when a
    # class is never predicted.
    return classification_report(changed_true, changed_pred, digits=4, zero_division=0)


def trend_changes_true(y_test: np.ndarray, y_pred: np.ndarray, metric: "str | None" = None) -> float:
    """
    Score how well the predictions reproduce the true trend changes.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.
        metric (str | None): Key to read from the "True" (trend changed) row of
            the classification report, e.g. "precision", "recall" or
            "f1-score". Defaults to the module-level ``SCORE``.

    Returns:
        float: The requested metric for the "True" class, or 0.0 when neither
        the true nor the predicted sequence contains any trend change.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` have different lengths.
        KeyError: If ``metric`` is not a key of the report row.
    """
    changed_true, changed_pred = trend_change_flags(y_test, y_pred)
    report = classification_report(
        changed_true,
        changed_pred,
        output_dict=True,
        zero_division=0,
    )
    # The "True" row is absent when no trend change occurs in either sequence;
    # report 0.0 (consistent with zero_division=0) instead of raising KeyError.
    positive_row = report.get("True")
    if positive_row is None:
        return 0.0
    return positive_row[SCORE if metric is None else metric]

In [5]:
# Load the training and validation splits, parsing the "date" column as datetimes.
train = pd.read_csv("../../../data/training_set.csv", parse_dates=["date"])
val = pd.read_csv("../../../data/validation_set.csv", parse_dates=["date"])

# Features: every column except "date" and the label. Target: "target_trend".
X_train, y_train = (
    train.drop(columns=["date", "target_trend"]).values,
    train["target_trend"].values,
)
X_val, y_val = (
    val.drop(columns=["date", "target_trend"]).values,
    val["target_trend"].values,
)

In [6]:
# Faster, more scalable variant: HistGradientBoostingClassifier,
# fitted here with hand-picked hyperparameters and scored on the validation set.
set_seeds(SEED)
model = HistGradientBoostingClassifier(
    random_state=SEED,
    max_iter=400,         # equivalent to n_estimators
    max_depth=6,          # somewhat larger depths tend to work well here
    learning_rate=0.05,
).fit(X_train, y_train)   # fit() returns the fitted estimator itself
y_pred = model.predict(X_val)
print("\nHistGradientBoostingClassifier trend_changes_score:\n", trend_changes_score(y_val, y_pred))


HistGradientBoostingClassifier trend_changes_score:
               precision    recall  f1-score   support

       False     0.8878    0.7768    0.8286       112
        True     0.1935    0.3529    0.2500        17

    accuracy                         0.7209       129
   macro avg     0.5407    0.5649    0.5393       129
weighted avg     0.7963    0.7209    0.7523       129



In [7]:
def objective(trial):
    """
    Optuna objective: fit one candidate model and score it on the validation set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value of ``trend_changes_true`` for the candidate's predictions on
        ``X_val`` (reads ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
        ``SEED`` from the notebook's global scope).
    """
    set_seeds(SEED)

    # Dict entries are evaluated top to bottom, so the parameters are suggested
    # in the same order as before.
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "max_iter": trial.suggest_int("max_iter", 100, 500),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "l2_regularization": trial.suggest_float("l2_regularization", 0.0, 2.0),
        "max_bins": trial.suggest_int("max_bins", 128, 255),
    }

    candidate = HistGradientBoostingClassifier(random_state=SEED, **hyperparams)
    candidate.fit(X_train, y_train)
    val_predictions = candidate.predict(X_val)
    return trend_changes_true(y_val, val_predictions)

In [8]:
# Seeded TPE search that maximizes the value returned by `objective` over 100 trials.
set_seeds(SEED)
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction="maximize",
)
study.optimize(objective, n_trials=100)

[I 2025-08-17 22:48:31,516] A new study created in memory with name: no-name-282efbb5-913e-481c-ac69-49a2ca6b0998
[I 2025-08-17 22:48:32,741] Trial 0 finished with value: 0.13636363636363635 and parameters: {'learning_rate': 0.01774894524799066, 'max_depth': 11, 'max_iter': 275, 'min_samples_leaf': 16, 'l2_regularization': 1.559951616237607, 'max_bins': 162}. Best is trial 0 with value: 0.13636363636363635.
[I 2025-08-17 22:48:34,924] Trial 1 finished with value: 0.13636363636363635 and parameters: {'learning_rate': 0.022892239910869177, 'max_depth': 14, 'max_iter': 484, 'min_samples_leaf': 18, 'l2_regularization': 0.7156345399157333, 'max_bins': 192}. Best is trial 0 with value: 0.13636363636363635.
[I 2025-08-17 22:48:36,326] Trial 2 finished with value: 0.2916666666666667 and parameters: {'learning_rate': 0.07748288441159408, 'max_depth': 12, 'max_iter': 248, 'min_samples_leaf': 12, 'l2_regularization': 1.0061663306156194, 'max_bins': 129}. Best is trial 2 with value: 0.291666666666

In [9]:
# Report the best hyperparameters and the best objective value found by the study.
print(
    "Mejores hiperparámetros encontrados:",
    study.best_params,
    f"Mejor score de trend change: {study.best_value:.4f}",
    sep="\n",
)

Mejores hiperparámetros encontrados:
{'learning_rate': 0.11852258842059535, 'max_depth': 4, 'max_iter': 443, 'min_samples_leaf': 10, 'l2_regularization': 1.4427582668352443, 'max_bins': 182}
Mejor score de trend change: 0.3404


In [10]:
# Train the final model with the best hyperparameters found by the study.
# NOTE(review): the report below is computed on the same validation set that
# `objective` used to select these hyperparameters, so it is likely optimistic;
# consider confirming it on a separate held-out set.
set_seeds(SEED)
best_params = study.best_params
# best_params holds exactly the six hyperparameters suggested in `objective`.
final_model = HistGradientBoostingClassifier(random_state=SEED, **best_params)
final_model.fit(X_train, y_train)
y_pred_final = final_model.predict(X_val)
final_report = trend_changes_score(y_val, y_pred_final)
print("\nHistGradientBoostingClassifier FINAL trend_changes_score:\n", final_report)


HistGradientBoostingClassifier FINAL trend_changes_score:
               precision    recall  f1-score   support

       False     0.9091    0.8036    0.8531       112
        True     0.2667    0.4706    0.3404        17

    accuracy                         0.7597       129
   macro avg     0.5879    0.6371    0.5968       129
weighted avg     0.8244    0.7597    0.7855       129

