In [26]:
import numpy as np
import random
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import classification_report, balanced_accuracy_score
import json
import os
from optuna.visualization import plot_optimization_history

In [27]:
SEED = 1234
 # Se puede cambiar a "precision" o "recall" o "f1-score"
SCORE = "f1-score"

In [28]:
def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)

Score Trend Changes Score

In [29]:
def trend_changes_score(y_test: np.array, y_pred: np.array) -> float:
    """
    Calculate the trend changes score based on the test and predicted values.
    
    Args:
        y_test (np.array): True labels.
        y_pred (np.array): Predicted labels.
        
    Returns:
        float: The trend changes score.
    """
    y_df = pd.DataFrame([y_test, y_pred]).T
    y_df.columns = ["y_test", "y_pred"]
    y_df["y_test_shifted"] = y_df["y_test"].shift(-1)
    y_df["is_changed_trend_test"] = y_df["y_test"] != y_df["y_test_shifted"]
    y_df["y_predict_shifted"] = y_df["y_pred"].shift(-1)
    y_df["is_changed_trend_predict"] = y_df["y_pred"] != y_df["y_predict_shifted"]
    return classification_report(y_df["is_changed_trend_test"][:-1], y_df["is_changed_trend_predict"][:-1], digits=4)

def trend_changes_true(y_test: np.array, y_pred: np.array) -> float:
    """
    Calculate the trend changes score based on the test and predicted values.
    
    Args:
        y_test (np.array): True labels.
        y_pred (np.array): Predicted labels.
        
    Returns:
        float: The trend changes score.
    """
    y_df = pd.DataFrame([y_test, y_pred]).T
    y_df.columns = ["y_test", "y_pred"]
    y_df["y_test_shifted"] = y_df["y_test"].shift(-1)
    y_df["is_changed_trend_test"] = y_df["y_test"] != y_df["y_test_shifted"]
    y_df["y_predict_shifted"] = y_df["y_pred"].shift(-1)
    y_df["is_changed_trend_predict"] = y_df["y_pred"] != y_df["y_predict_shifted"]
    report = classification_report(
        y_df["is_changed_trend_test"][:-1],
        y_df["is_changed_trend_predict"][:-1],
        output_dict=True,
        zero_division=0
    )
    return report["True"][SCORE]

In [30]:
# Cargar datos
train = pd.read_csv("../../../data/training_set.csv", parse_dates=["date"])
val = pd.read_csv("../../../data/validation_set.csv", parse_dates=["date"])
test_set = pd.read_csv("../../../data/test_set.csv", parse_dates=['date'])
X_train = train.drop(columns=["date", "target_trend"]).values
y_train = train["target_trend"].values
X_val = val.drop(columns=["date", "target_trend"]).values
y_val = val["target_trend"].values
X_test = test_set.drop(columns=['target_trend','date']).values
y_test = test_set['target_trend'].values

In [31]:
# Mapear clases (-1,0,1) -> (0,1,2) para LightGBM
cls_map = {-1:0, 0:1, 1:2}
inv_map = {v:k for k,v in cls_map.items()}
y_train_m = np.vectorize(cls_map.get)(y_train)
y_val_m   = np.vectorize(cls_map.get)(y_val)

# Pesos de clase (inverso de la frecuencia)
from collections import Counter
cnt = Counter(y_train_m)
total = len(y_train_m)
class_weight = {c: total/(len(cnt)*n) for c,n in cnt.items()}

lgb_train = lgb.Dataset(X_train, label=y_train_m, weight=[class_weight[c] for c in y_train_m])
lgb_val   = lgb.Dataset(X_val,   label=y_val_m,   weight=[class_weight[c] for c in y_val_m], reference=lgb_train)

params = dict(
    objective="multiclass",
    num_class=3,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    min_data_in_leaf=20,
    metric=["multi_logloss","multi_error"],
    verbosity=-1,
    seed=42
)

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=["train","val"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=100)
    ]
)

# Predicciones validation
y_val_prob = model.predict(X_val, num_iteration=model.best_iteration)
y_val_pred_m = y_val_prob.argmax(axis=1)
y_val_pred = np.vectorize(inv_map.get)(y_val_pred_m)
print("LightGBM report trend change score:\n",
    trend_changes_score(y_val, y_val_pred))

Training until validation scores don't improve for 100 rounds
[100]	train's multi_logloss: 0.0481828	train's multi_error: 0	val's multi_logloss: 0.436619	val's multi_error: 0.1565
Early stopping, best iteration is:
[7]	train's multi_logloss: 0.727406	train's multi_error: 0.118679	val's multi_logloss: 0.771903	val's multi_error: 0.103731
LightGBM report trend change score:
               precision    recall  f1-score   support

       False     0.8684    0.8839    0.8761       112
        True     0.1333    0.1176    0.1250        17

    accuracy                         0.7829       129
   macro avg     0.5009    0.5008    0.5006       129
weighted avg     0.7715    0.7829    0.7771       129



In [32]:
def objective(trial):
    set_seeds(SEED)  # Fijar semilla antes de cada trial
    # Hiperparámetros a optimizar
    param = {
        "objective": "multiclass",
        "num_class": 3,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 15, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "metric": ["multi_logloss"],
        "verbosity": -1,
        "seed": SEED
    }

    # Dataset y pesos igual que antes
    lgb_train = lgb.Dataset(X_train, label=y_train_m, weight=[class_weight[c] for c in y_train_m])
    lgb_val = lgb.Dataset(X_val, label=y_val_m, weight=[class_weight[c] for c in y_val_m], reference=lgb_train)

    model = lgb.train(
        param,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        valid_names=["val"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False)
        ]
    )

    # Predicción y métrica personalizada
    y_val_prob = model.predict(X_val, num_iteration=model.best_iteration)
    y_val_pred_m = y_val_prob.argmax(axis=1)
    y_val_pred = np.vectorize(inv_map.get)(y_val_pred_m)
    #score = trend_changes_true(y_val, y_val_pred)  # <- Métrica anterior
    score = balanced_accuracy_score(y_val, y_val_pred) # <- Métrica actual

    return score  # Optuna maximiza este valor

In [33]:
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED)
)
study.optimize(objective, n_trials=1000)

[I 2025-08-21 09:31:54,572] A new study created in memory with name: no-name-99ddbd48-e962-45e8-8909-4bbd4d345654
[I 2025-08-21 09:31:57,127] Trial 0 finished with value: 0.7676767676767676 and parameters: {'learning_rate': 0.010134344267417028, 'num_leaves': 68, 'max_depth': 7, 'feature_fraction': 0.9141434334855076, 'bagging_fraction': 0.9119903232475214, 'bagging_freq': 3, 'min_data_in_leaf': 35}. Best is trial 0 with value: 0.7676767676767676.
[I 2025-08-21 09:31:57,552] Trial 1 finished with value: 0.7567340067340068 and parameters: {'learning_rate': 0.09629801567849108, 'num_leaves': 97, 'max_depth': 11, 'feature_fraction': 0.7431269079831466, 'bagging_fraction': 0.8003980502093835, 'bagging_freq': 7, 'min_data_in_leaf': 74}. Best is trial 0 with value: 0.7676767676767676.
[I 2025-08-21 09:31:58,021] Trial 2 finished with value: 0.779040404040404 and parameters: {'learning_rate': 0.019594488244047626, 'num_leaves': 63, 'max_depth': 8, 'feature_fraction': 0.6055073798362729, 'bagg

In [34]:
# Visualizar el historial de optimización
plot_optimization_history(study)

In [48]:
print("Mejores hiperparámetros encontrados:")
print(study.best_params)
print(f"Mejor score de {SCORE}: {study.best_value:.4f}")
best_params = study.best_params

Mejores hiperparámetros encontrados:
{'learning_rate': 0.13741545795925508, 'num_leaves': 58, 'max_depth': 3, 'feature_fraction': 0.900247421340637, 'bagging_fraction': 0.9374180242245818, 'bagging_freq': 2, 'min_data_in_leaf': 19}
Mejor score de f1-score: 0.8363


In [36]:
# # Guardar los mejores hiperparámetros y su valor
# history = []
# if os.path.exists("best_hyperparams.json"):
#     try:
#         with open("best_hyperparams.json", "r") as f:
#             history = json.load(f)
#     except (json.JSONDecodeError, ValueError):
#         history = []

# # Guardar ambos en un solo diccionario
# history.append({
#     "params": study.best_params,
#     "value": study.best_value
# })
# # Guardar el historial de hiperparámetros
# with open("best_hyperparams.json", "w") as f:
#     json.dump(history, f, indent=2)

In [42]:
# Cargar historial de hiperparámetros y valores
import json

with open("best_hyperparams.json", "r") as f:
    history = json.load(f)

# Escoger el último (más reciente)
best_params = history[-1]["params"]
best_value = history[-1]["value"]

# Si quieres ver todos:
for i, entry in enumerate(history):
    print(f"Hiperparámetros #{i+1}: {entry['params']}, Valor: {entry['value']}")

# Si quieres escoger uno específico (por índice):
# best_params = history[indice_que_quieras]["params"]
# best_value = history[indice_que_quieras]["value"]

Hiperparámetros #1: {'learning_rate': 0.13537033248173344, 'num_leaves': 64, 'max_depth': 4, 'feature_fraction': 0.7559289078996488, 'bagging_fraction': 0.7331629719452519, 'bagging_freq': 8, 'min_data_in_leaf': 25}, Valor: 0.5


In [49]:
# Entrenar modelo final con los mejores hiperparámetros de Optuna
set_seeds(SEED)
final_params = best_params.copy()
final_params.update({
    "objective": "multiclass",
    "num_class": 3,
    "metric": ["multi_logloss"],  # Igual que en Optuna
    "verbosity": -1,
    "seed": SEED
})

final_model = lgb.train(
    final_params,
    lgb_train,
    num_boost_round=1000,  # Igual que en Optuna
    valid_sets=[lgb_val],  # Igual que en Optuna
    valid_names=["val"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=True)  # Igual que en Optuna
    ]
)

# Predicciones con el modelo final
y_val_prob_final = final_model.predict(X_val, num_iteration=final_model.best_iteration)
y_val_pred_m_final = y_val_prob_final.argmax(axis=1)
y_val_pred_final = np.vectorize(inv_map.get)(y_val_pred_m_final)
print("LightGBM FINAL report trend change score:\n",
      trend_changes_score(y_val, y_val_pred_final))

[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.


LightGBMError: Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.

In [45]:
# Obtener predicciones
y_pred_test = np.argmax(final_model.predict(X_test), axis=1)
print("LightGBM Trend Changes Score:\n", trend_changes_score(y_test, y_pred_test))

LightGBM Trend Changes Score:
               precision    recall  f1-score   support

       False     0.9310    0.9076    0.9191       119
        True     0.2143    0.2727    0.2400        11

    accuracy                         0.8538       130
   macro avg     0.5727    0.5901    0.5796       130
weighted avg     0.8704    0.8538    0.8617       130



In [None]:
y_pred_train = np.argmax(final_model.predict(X_train), axis=1)
print("LightGBM Trend Changes Score:\n", trend_changes_score(y_train, y_pred_train))

LightGBM Trend Changes Score:
               precision    recall  f1-score   support

       False     0.9625    0.9643    0.9634       532
        True     0.7361    0.7260    0.7310        73

    accuracy                         0.9355       605
   macro avg     0.8493    0.8452    0.8472       605
weighted avg     0.9352    0.9355    0.9353       605



In [46]:
# Reporte completo: precisión, recall y F1 por clase
report = classification_report(y_val, y_val_pred_final, digits=4)
print("LightGBM Report:\n", report)
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_val_pred_final))

LightGBM Report:
               precision    recall  f1-score   support

          -1     0.7500    1.0000    0.8571        24
           0     0.5000    0.3889    0.4375        18
           1     0.9524    0.9091    0.9302        88

    accuracy                         0.8538       130
   macro avg     0.7341    0.7660    0.7416       130
weighted avg     0.8524    0.8538    0.8485       130

Balanced accuracy: 0.765993265993266
