In [4]:
import os
import json
import gc
import re
import random
import numpy as np
import pandas as pd
import optuna
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, balanced_accuracy_score

In [5]:
# ========= Seeds =========
# One seed value applied to Python's `random`, NumPy and TensorFlow in this cell.
SEED = 43
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so assigning it
# here presumably only reaches child processes — confirm if hash determinism matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
# Metric key read from the "True" (trend changed) entry of the classification report
# by trend_changes_true(); that value is what the Optuna objective returns.
SCORE = "f1-score"

In [7]:
def get_trend_changes_report_dict(y_test: np.array, y_pred: np.array) -> dict:
    """
    Build the trend-change classification report as a dictionary.

    A position is flagged as a "trend change" when its label differs from the
    label of the next position. The flags derived from the true labels are
    compared against the flags derived from the predicted labels. The last
    position has no successor and is excluded from the comparison.

    Args:
        y_test (np.array): True labels, in sequence order.
        y_pred (np.array): Predicted labels, in the same order.

    Returns:
        dict: Output of ``classification_report(..., output_dict=True)`` for the
        boolean trend-change flags (undefined metrics are reported as 0).
    """
    labels = pd.DataFrame([y_test, y_pred]).T
    labels.columns = ["y_test", "y_pred"]
    # Compare each row with the following one; the final row is compared
    # against NaN, so it is dropped before scoring.
    changed = labels.ne(labels.shift(-1)).iloc[:-1]
    return classification_report(
        changed["y_test"],
        changed["y_pred"],
        digits=4,
        output_dict=True,
        zero_division=0,
    )

def trend_changes_score(y_test: np.array, y_pred: np.array) -> str:
    """
    Return the printable trend-change classification report.

    A position is flagged as a "trend change" when its label differs from the
    label of the next position; true-label flags are scored against
    predicted-label flags. The last position has no successor and is excluded.

    Args:
        y_test (np.array): True labels, in sequence order.
        y_pred (np.array): Predicted labels, in the same order.

    Returns:
        str: Text report produced by ``classification_report`` with 4 digits.
    """
    labels = pd.DataFrame([y_test, y_pred]).T
    labels.columns = ["y_test", "y_pred"]
    # Final row is compared against NaN after the shift, so it is dropped.
    changed = labels.ne(labels.shift(-1)).iloc[:-1]
    return classification_report(
        changed["y_test"],
        changed["y_pred"],
        digits=4,
        # Same setting as the other trend-change helpers: undefined metrics are
        # reported as 0 without emitting a warning on every call.
        zero_division=0,
    )

def trend_changes_true(y_test: np.array, y_pred: np.array) -> float:
    """
    Score how well predicted trend changes match the true trend changes.

    A position is flagged as a "trend change" when its label differs from the
    label of the next position. The function returns the ``SCORE`` metric of the
    "True" (changed) class from the classification report of those flags.

    Args:
        y_test (np.array): True labels, in sequence order.
        y_pred (np.array): Predicted labels, in the same order.

    Returns:
        float: ``report["True"][SCORE]``; 0.0 when neither series contains any
        trend change, because the report then has no "True" entry.
    """
    labels = pd.DataFrame([y_test, y_pred]).T
    labels.columns = ["y_test", "y_pred"]
    # Final row is compared against NaN after the shift, so it is dropped.
    changed = labels.ne(labels.shift(-1)).iloc[:-1]
    report = classification_report(
        changed["y_test"],
        changed["y_pred"],
        output_dict=True,
        zero_division=0,
    )
    # The report only lists classes present in the data; without any change in
    # either series there is no "True" row, so score it as 0 instead of failing.
    if "True" not in report:
        return 0.0
    return report["True"][SCORE]

In [12]:
# ========= Utils: rebuild sequences (d-4..d0) =========
# Matches a trailing "_d<int>" step suffix, e.g. "close_d-4" or "close_d0".
step_pat = re.compile(r"_d(-?\d+)$")

def infer_steps_and_bases(df_cols):
    """
    Infer the time steps and base feature names from step-suffixed column names.

    Args:
        df_cols: Iterable of column names. Names ending in ``_d<int>`` are
            treated as "<base>_d<step>"; all other names are ignored.

    Returns:
        tuple: ``(steps, bases_oldest)`` where ``steps`` is the sorted list of
        unique integer steps and ``bases_oldest`` lists the base names of the
        columns belonging to the oldest (smallest) step, in input order.

    Raises:
        ValueError: If no column carries a ``_d<int>`` suffix.
    """
    # Single pass: keep (base, step) for every suffixed column. The base is cut
    # at the match position so a base that itself contains "_d<int>" stays intact.
    suffixed = []
    for col in df_cols:
        match = step_pat.search(col)
        if match:
            suffixed.append((col[:match.start()], int(match.group(1))))
    if not suffixed:
        raise ValueError("No se encontraron columnas con sufijo '_d<paso>'")

    steps = sorted({step for _, step in suffixed})
    oldest = steps[0]
    # Feature order is defined by the columns of the oldest step.
    bases_oldest = [base for base, step in suffixed if step == oldest]
    return steps, bases_oldest

def reindex_sequence_columns(df):
    """
    List the step-suffixed columns of ``df`` in step-major order.

    Columns are ordered by step (oldest to most recent) and, within each step,
    by the base-feature order returned by ``infer_steps_and_bases``.

    Args:
        df: DataFrame whose sequence columns are named "<base>_d<step>".

    Returns:
        tuple: ``(ordered, steps, bases)`` — the ordered column names, the
        sorted steps and the base feature names.

    Raises:
        ValueError: If a "<base>_d<step>" combination is missing from ``df``.
    """
    steps, bases = infer_steps_and_bases(df.columns)
    ordered = []
    for st in steps:  # keeps temporal order: oldest -> most recent
        for b in bases:
            name = f"{b}_d{st}"
            if name not in df.columns:
                raise ValueError(f"Falta columna esperada: {name}")
            ordered.append(name)
    return ordered, steps, bases

def to_sequence_array(df):
    """
    Convert the flat step-suffixed columns of ``df`` into a 3D float32 array.

    Args:
        df: DataFrame accepted by ``reindex_sequence_columns``.

    Returns:
        tuple: ``(X, steps, bases)`` where ``X`` has shape
        ``(rows, len(steps), len(bases))`` and ``steps`` / ``bases`` are passed
        through from ``reindex_sequence_columns``.
    """
    column_order, steps, bases = reindex_sequence_columns(df)
    flat = df[column_order].values.astype(np.float32)
    n_rows, n_flat = flat.shape
    n_steps, n_feats = len(steps), len(bases)
    # Columns are step-major, so a row splits into n_steps groups of n_feats values.
    assert n_flat == n_steps * n_feats, f"Esperado {n_steps*n_feats} columnas, recibido {n_flat}"
    return flat.reshape(n_rows, n_steps, n_feats), steps, bases

# ========= Data loading =========
DATA_DIR = "../../../data/post_cleaning"
# Read the three splits in order (train, validation, test), parsing "date" as datetime.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name), parse_dates=["date"])
    for csv_name in ("training_set.csv", "validation_set.csv", "test_set.csv")
)

# 3D inputs; steps/bases are taken from the training split.
X_train, steps, bases = to_sequence_array(train_df)
X_val = to_sequence_array(val_df)[0]
X_test = to_sequence_array(test_df)[0]

# Labels are shifted by +1 (intended to land in {0, 1, 2};
# presumably target_trend is in {-1, 0, 1} — TODO confirm).
y_train, y_val, y_test = (
    (split["target_trend"].values + 1).astype(np.int32)
    for split in (train_df, val_df, test_df)
)

print(f"X_train: {X_train.shape} (samples, timesteps, features)")
print(f"X_val:   {X_val.shape}")
print(f"X_test:  {X_test.shape}")
print(f"Timesteps={len(steps)}, Features/step={len(bases)} -> steps={steps}")

# ========= Model definition =========
def build_model(trial, timesteps, features):
    """
    Build and compile an LSTM classifier from the hyperparameters of ``trial``.

    Suggested parameters, in order: ``lstm_layers`` (1-2), ``units1``,
    ``units2`` (only when ``lstm_layers == 2``), ``dense_units``, ``dropout``,
    ``learning_rate`` (log scale) and ``optimizer`` ("adam" or "rmsprop").

    Args:
        trial: Optuna trial (or ``FixedTrial``) providing the ``suggest_*`` API.
        timesteps: Number of time steps per input sample.
        features: Number of features per time step.

    Returns:
        A compiled Keras ``Sequential`` model that outputs 3 logits and is
        trained with sparse categorical cross-entropy (``from_logits=True``).
    """
    lstm_layers = trial.suggest_int("lstm_layers", 1, 2)
    units1 = trial.suggest_int("units1", 32, 256, step=32)
    # Conditional hyperparameter: 'units2' is only suggested (and therefore only
    # recorded on the trial) when a second LSTM layer is actually built.
    units2 = trial.suggest_int("units2", 32, 256, step=32) if lstm_layers == 2 else None
    dense_units = trial.suggest_int("dense_units", 16, 128, step=16)
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "rmsprop"])
    optimizer_cls = {
        "adam": tf.keras.optimizers.Adam,
        "rmsprop": tf.keras.optimizers.RMSprop,
    }[optimizer_name]
    optimizer = optimizer_cls(learning_rate=learning_rate)

    model = Sequential()
    model.add(Input(shape=(timesteps, features)))
    if lstm_layers == 2:
        # The first layer must emit the full sequence so the second LSTM can consume it.
        model.add(LSTM(units1, return_sequences=True, dropout=dropout))
        model.add(LSTM(units2, dropout=dropout))
    else:
        model.add(LSTM(units1, dropout=dropout))

    model.add(Dropout(dropout))
    model.add(Dense(dense_units, activation="relu"))
    model.add(Dropout(dropout))
    model.add(Dense(3, activation="linear"))  # logits

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        metrics=["accuracy"]
    )
    return model

# ========= Optuna objective =========
def objective(trial):
    """
    Train one candidate model and return its validation trend-change score.

    Builds the model from ``trial``, additionally suggests ``batch_size`` and
    the early-stopping ``patience``, fits on the training split while
    monitoring ``val_loss``, and scores the validation predictions with
    ``trend_changes_true``.

    Args:
        trial: Optuna trial supplying the hyperparameters.

    Returns:
        The value of ``trend_changes_true(y_val, y_val_pred)``.
    """
    tf.keras.backend.clear_session()
    n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
    model = build_model(trial, n_timesteps, n_features)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    patience = trial.suggest_int("patience", 8, 15)

    stopper = EarlyStopping(
        monitor="val_loss",
        patience=patience,
        restore_best_weights=True,
        verbose=0,
    )

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=80,
        batch_size=batch_size,
        callbacks=[stopper],
        verbose=0,
    )

    # Predicted class = index of the largest logit; scored with the custom metric.
    y_val_pred = np.argmax(model.predict(X_val, verbose=0), axis=1)
    score = trend_changes_true(y_val, y_val_pred)

    # Release the Keras graph state and the model before the next trial.
    tf.keras.backend.clear_session()
    del model
    gc.collect()

    return score




X_train: (1374, 5, 12) (samples, timesteps, features)
X_val:   (294, 5, 12)
X_test:  (295, 5, 12)
Timesteps=5, Features/step=12 -> steps=[-4, -3, -2, -1, 0]


In [None]:
# ========= Run Optuna =========
# Maximizes the value returned by objective(); the TPE sampler is constructed with SEED.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=SEED))
# Each trial trains a model inside objective(), so this cell runs 150 training jobs.
study.optimize(objective, n_trials=150)
print("Mejores hiperparámetros:", study.best_params)
print(f"Mejor score {SCORE}: {study.best_value:.4f}")


In [None]:
# --- Visualize Optuna results (this cell only plots; nothing is saved here) ---
# Imported in this cell because no earlier cell brings the name into scope.
from optuna.visualization import plot_optimization_history

plot_optimization_history(study)

In [None]:
# Report the best trial of the study and keep its parameters in `best_params`.
print("Mejores hiperparámetros encontrados:")
best_params = study.best_params
print(best_params)
print(f"Mejor score de '{SCORE}': {study.best_value:.4f}")

Mejores hiperparámetros encontrados:
{'units1': 64, 'dense_units': 112, 'dense_activation': 'tanh', 'dropout': 0.1382843186220748, 'learning_rate': 0.00010768041003832457, 'optimizer': 'adamw', 'batch_size': 128, 'patience': 14}
Mejor score de 'f1-score': 0.3596


In [None]:
# --- New cell: hyperparameter analysis ---
from optuna.visualization import plot_param_importances, plot_slice

# 1. Hyperparameter importance plot
# Shows which hyperparameters had the largest impact on the score.
param_importances = plot_param_importances(study)
param_importances.show()

# 2. Slice plot
# Shows how the score varies across the values of each hyperparameter.
# Useful for spotting the "good ranges" of each parameter.
slice_plot = plot_slice(study)
slice_plot.show()

In [9]:
# --- Load hyperparameters for the final model ---

# Load the stored hyperparameter records. The list gets its own name so it is
# not confused with the Keras History object that the training cell binds to `history`.
with open("best_hyperparams_lstm1.json", "r", encoding="utf-8") as f:
    hyperparam_history = json.load(f)
# NOTE(review): takes the LAST stored record — assumes that entry holds the best
# hyperparameters; confirm against whatever wrote the file.
best_params = hyperparam_history[-1]["params"]

In [13]:
# ========= Final training with the best hyperparameters =========

# --- 1. Clear the Keras session and re-apply the seeds for reproducibility ---
tf.keras.backend.clear_session()
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- 2. Rebuild the model with the best hyperparameters ---
# NOTE(review): FixedTrial raises ValueError for any parameter that build_model()
# suggests but that is missing from best_params (e.g. 'lstm_layers') — verify the
# stored parameters were produced with the current search space.
final_model = build_model(
    trial=optuna.trial.FixedTrial(best_params),
    timesteps=X_train.shape[1],
    features=X_train.shape[2]
)
final_model.summary()

# --- 3. Configure and run the training ---
# patience / batch_size fall back to 10 / 64 when absent from best_params.
early_stop = EarlyStopping(
    monitor="val_loss", 
    patience=best_params.get("patience", 10), 
    restore_best_weights=True, 
    verbose=1
)

history = final_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=120,  # Generous upper bound on epochs; EarlyStopping ends training earlier
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop],
    verbose=1
)


ValueError: The value of the parameter 'lstm_layers' is not found. Please set it at the construction of the FixedTrial object.

In [None]:

# ========= Evaluation =========
# For each split: predicted class = argmax over the logits, then print the
# trend-change report, the balanced accuracy and (val/test only) the per-class report.
# zero_division=0 keeps the reported numbers (undefined metrics already show as 0)
# and avoids the UndefinedMetricWarning that sklearn raises for undefined metrics.

# Validation
val_logits = final_model.predict(X_val, verbose=0)
y_val_pred = np.argmax(val_logits, axis=1)
print("\nLSTM Trend Changes (Validation):\n", trend_changes_score(y_val, y_val_pred))
print("Balanced accuracy (val):", balanced_accuracy_score(y_val, y_val_pred))
print("Classification report (val):\n", classification_report(y_val, y_val_pred, digits=4, zero_division=0))

# Test
test_logits = final_model.predict(X_test, verbose=0)
y_test_pred = np.argmax(test_logits, axis=1)
print("\nLSTM Trend Changes (Test):\n", trend_changes_score(y_test, y_test_pred))
print("Balanced accuracy (test):", balanced_accuracy_score(y_test, y_test_pred))
print("Classification report (test):\n", classification_report(y_test, y_test_pred, digits=4, zero_division=0))

# Train
train_logits = final_model.predict(X_train, verbose=0)
y_train_pred = np.argmax(train_logits, axis=1)
print("\nLSTM Trend Changes (Train):\n", trend_changes_score(y_train, y_train_pred))
print("Balanced accuracy (train):", balanced_accuracy_score(y_train, y_train_pred))

# ========= Save model =========
final_model.save("lstm_best_model.keras")
print("\nModelo guardado en lstm_best_model.keras")

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 46: early stopping

LSTM Trend Changes (Validation):
               precision    recall  f1-score   support

       False     0.9016    0.8494    0.8748       259
        True     0.2041    0.2941    0.2410        34

    accuracy                         0.7850       293
   macro avg     0.5529    0.5718    0.5579       293
weighted avg     0.8207    0.7850    0.8012       293

Balanced accuracy (val):

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



LSTM Trend Changes (Train):
               precision    recall  f1-score   support

       False     0.9011    0.8671    0.8838      1219
        True     0.1900    0.2468    0.2147       154

    accuracy                         0.7975      1373
   macro avg     0.5456    0.5569    0.5492      1373
weighted avg     0.8213    0.7975    0.8087      1373

Balanced accuracy (train): 0.6018199889428474

Modelo guardado en lstm_best_model.keras


In [None]:
# # --- Exportar y Comparar Métricas de Modelos (Validación) ---

# # 1. Definir el nombre del modelo actual y el archivo de salida
# model_name = 'LSTM'
# output_file = '../../../score_models/model_comparison_metrics.csv'

# # 2. Compute the standard classification report
# # y_val and y_val_pred are on the same scale (0, 1, 2)
# report_dict = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
# precision = report_dict['macro avg']['precision']
# recall = report_dict['macro avg']['recall']
# f1_score = report_dict['macro avg']['f1-score']


# # 3. Calcular el reporte de cambio de tendencia
# report = get_trend_changes_report_dict(y_val, y_val_pred)
# trend_change_precision = report['True']['precision']
# trend_change_recall = report['True']['recall']
# trend_change_f1_score = report['True']['f1-score']

# # 4. Organizar las nuevas métricas
# new_metrics = {
#     'precision': precision,
#     'recall': recall,
#     'f1_score': f1_score,
#     'trend_change_precision': trend_change_precision,
#     'trend_change_recall': trend_change_recall,
#     'trend_change_f1_score': trend_change_f1_score
# }

# # 5. Cargar, actualizar y guardar el DataFrame de comparación
# try:
#     # Intentar cargar el archivo existente
#     comparison_df = pd.read_csv(output_file, index_col='model')
#     # Si existe, actualizar o añadir la fila para el modelo actual
#     comparison_df.loc[model_name] = new_metrics
# except FileNotFoundError:
#     # Si no existe, crear un DataFrame nuevo directamente con los datos actuales
#     comparison_df = pd.DataFrame([new_metrics], index=[model_name])

# # Guardar el DataFrame actualizado en el CSV
# comparison_df.to_csv(output_file, index_label='model')