In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import optuna   

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed shared by set_seeds, the Optuna sampler and the classifier's random_state.
SEED = 1234
# Metric read from the "True" row of the trend-change classification report.
# Can be changed to "precision", "recall" or "f1-score".
SCORE = "f1-score"

In [3]:
def set_seeds(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

Métrica: Trend Changes Score

In [4]:
def _trend_change_flags(y_test, y_pred):
    """Mark, for each pair of consecutive samples, whether the label changed.

    Element ``i`` of each returned array is True when ``y[i] != y[i + 1]``,
    so both arrays are one element shorter than the inputs.

    Args:
        y_test: True labels, in sequence order.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        A pair ``(changed_true, changed_pred)`` of boolean arrays.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    true_labels = np.asarray(y_test)
    pred_labels = np.asarray(y_pred)
    if len(true_labels) != len(pred_labels):
        # Mismatched inputs would otherwise be compared position-by-position
        # against padding, producing a meaningless report.
        raise ValueError(
            f"y_test and y_pred must have the same length, "
            f"got {len(true_labels)} and {len(pred_labels)}"
        )
    changed_true = true_labels[1:] != true_labels[:-1]
    changed_pred = pred_labels[1:] != pred_labels[:-1]
    return changed_true, changed_pred


def trend_changes_score(y_test: np.ndarray, y_pred: np.ndarray) -> str:
    """
    Build the text classification report for trend-change detection.

    A "trend change" is a position where a label differs from the next one.
    The report compares the changes present in the true labels against the
    changes present in the predicted labels.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        str: The formatted report (4 decimal digits) with one row per class
        ("False" = no change, "True" = change) plus the averages.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    changed_true, changed_pred = _trend_change_flags(y_test, y_pred)
    # zero_division=0 matches trend_changes_true: undefined metrics are shown
    # as 0 without emitting a warning.
    return classification_report(changed_true, changed_pred, digits=4, zero_division=0)


def trend_changes_true(y_test: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Return the metric selected by ``SCORE`` for the "True" (trend changed) class.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.

    Returns:
        float: The ``SCORE`` entry of the "True" row of the trend-change
        classification report, or 0.0 when neither the true nor the predicted
        labels contain any trend change (the report then has no "True" row).

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    changed_true, changed_pred = _trend_change_flags(y_test, y_pred)
    report = classification_report(
        changed_true,
        changed_pred,
        output_dict=True,
        zero_division=0,
    )
    if "True" not in report:
        # No change observed or predicted: score it as 0 instead of raising
        # KeyError, so a degenerate model does not abort the whole search.
        return 0.0
    return float(report["True"][SCORE])

Carga de datos

In [5]:
# Load the training and validation splits, parsing "date" as datetimes.
train = pd.read_csv("../../../data/training_set.csv", parse_dates=["date"])
val = pd.read_csv("../../../data/validation_set.csv", parse_dates=["date"])

# Every column except the date and the label is used as a feature.
non_feature_cols = ["date", "target_trend"]
X_train, y_train = train.drop(columns=non_feature_cols).to_numpy(), train["target_trend"].to_numpy()
X_val, y_val = val.drop(columns=non_feature_cols).to_numpy(), val["target_trend"].to_numpy()

IMPORTANTE: Hay que sumar 1 a la columna objetivo (target_trend) porque la etiqueta -1 no funciona en la función de pérdida (loss).

In [6]:
# Shift the labels up by one (presumably mapping -1 to 0 -- confirm against the data).
# They are recomputed from the DataFrame column rather than updated with `+=`:
# an in-place add would shift the labels again on every re-run of this cell and
# would operate directly on the array backing `train` / `val`.
y_train = train["target_trend"].to_numpy() + 1
y_val = val["target_trend"].to_numpy() + 1

Integración de la métrica de cambios de tendencia (trend_changes_true) en la función objetivo de Optuna

In [8]:
def objective(trial):
    """Optuna objective: fit a scaled logistic regression and score it on validation data.

    Samples the penalty type, the regularization strength ``C`` and, for the
    elastic-net penalty only, ``l1_ratio``. The pipeline is fitted on
    ``X_train`` / ``y_train`` and its predictions on ``X_val`` are scored with
    ``trend_changes_true``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The value returned by ``trend_changes_true`` for the validation predictions.
    """
    set_seeds(SEED)  # re-seed the global RNGs before every trial

    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    C = trial.suggest_float("C", 1e-3, 50, log=True)
    # l1_ratio is only sampled when the elastic-net penalty is selected.
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9) if penalty == "elasticnet" else None

    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        l1_ratio=l1_ratio,
        solver="saga",
        multi_class="multinomial",
        max_iter=800,
        random_state=SEED,
    )
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("clf", classifier)])

    val_predictions = pipeline.fit(X_train, y_train).predict(X_val)
    return trend_changes_true(y_val, val_predictions)

Ejecución del estudio con Optuna

In [9]:
# Seed the global RNGs, then run 300 trials with a seeded TPE sampler,
# maximizing the value returned by `objective`.
set_seeds(SEED)
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(sampler=tpe_sampler, direction="maximize")
study.optimize(objective, n_trials=300)

[I 2025-08-17 22:55:58,992] A new study created in memory with name: no-name-4fe27a20-7b12-41fb-975b-2206bba4f92a
[I 2025-08-17 22:55:59,595] Trial 0 finished with value: 0.32558139534883723 and parameters: {'penalty': 'l2', 'C': 4.9020352232191025}. Best is trial 0 with value: 0.32558139534883723.
[I 2025-08-17 22:56:00,250] Trial 1 finished with value: 0.27906976744186046 and parameters: {'penalty': 'l1', 'C': 5.8610214330700705}. Best is trial 0 with value: 0.32558139534883723.
[I 2025-08-17 22:56:00,556] Trial 2 finished with value: 0.17647058823529413 and parameters: {'penalty': 'l1', 'C': 0.22602738844381381}. Best is trial 0 with value: 0.32558139534883723.
[I 2025-08-17 22:56:00,688] Trial 3 finished with value: 0.358974358974359 and parameters: {'penalty': 'l2', 'C': 0.43355484601640604}. Best is trial 3 with value: 0.358974358974359.
[I 2025-08-17 22:56:01,358] Trial 4 finished with value: 0.32558139534883723 and parameters: {'penalty': 'elasticnet', 'C': 14.044441691902366, 

In [10]:
# Report the best trial's hyperparameters and the objective value it reached.
print("Mejores hiperparámetros encontrados:", study.best_params, sep="\n")
print(f"Mejor score de {SCORE}: {study.best_value:.4f}")

Mejores hiperparámetros encontrados:
{'penalty': 'l2', 'C': 0.047383310511121914}
Mejor score de f1-score: 0.4242


In [11]:
# Train the final model with the best hyperparameters found by the study.
set_seeds(SEED)
best_params = study.best_params

# "l1_ratio" is only part of the trial parameters for the elastic-net penalty.
if best_params["penalty"] == "elasticnet":
    final_l1_ratio = best_params["l1_ratio"]
else:
    final_l1_ratio = None

final_clf = LogisticRegression(
    penalty=best_params["penalty"],
    C=best_params["C"],
    l1_ratio=final_l1_ratio,
    solver="saga",
    multi_class="multinomial",
    max_iter=800,
    random_state=SEED,
)
final_model = Pipeline(steps=[("scaler", StandardScaler()), ("clf", final_clf)])
final_model.fit(X_train, y_train)

# Score the refitted model on the validation split (the same split the study optimized on).
y_pred = final_model.predict(X_val)
print("Trend Change F1 Score:\n", trend_changes_score(y_val, y_pred))

Trend Change F1 Score:
               precision    recall  f1-score   support

       False     0.9115    0.9196    0.9156       112
        True     0.4375    0.4118    0.4242        17

    accuracy                         0.8527       129
   macro avg     0.6745    0.6657    0.6699       129
weighted avg     0.8490    0.8527    0.8508       129

