In [1]:
import numpy as np
import seaborn as sns
import optuna
import plotly
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import fbeta_score, f1_score, make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, average_precision_score, auc, mean_squared_error, r2_score

from utils.data_loader import load_split, prepare_features_target
from utils.learning_curve import learning_curve_with_resampling
from models.manage_models import save_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Scorer wrapping fbeta_score with beta=2 (F2), i.e. recall is weighted more
# heavily than precision. Used as the `scoring` argument during cross-validation.
f2_scorer = make_scorer(fbeta_score, beta=2)

In [3]:
# Load the 'train' split from dataset/splits, then derive the feature matrix
# and target with 'Fault' passed as the target column.
# NOTE(review): both helpers live in utils.data_loader and are not visible
# here — presumably X_train excludes the 'Fault' column; confirm.
train_df = load_split('train', data_dir='dataset/splits')
X_train, y_train = prepare_features_target(train_df, target_col='Fault')

In [4]:
def objective(trial):
    """Score one logistic-regression configuration proposed by Optuna.

    The trial picks a resampling strategy, the regularisation strength ``C``,
    the class weighting, and a solver. The penalty is then chosen from the
    options searched for that solver group:

    * lbfgs / newton-cg / sag / newton-cholesky -> ``penalty_g1`` (l2 only)
    * liblinear                                 -> ``penalty_g2`` (l1 or l2)
    * saga                                      -> fixed ``elasticnet`` plus
      a sampled ``l1_ratio``

    The resulting scaler -> (optional resampler) -> classifier pipeline is
    evaluated with 5-fold stratified cross-validation on the notebook-level
    ``X_train`` / ``y_train`` using ``f2_scorer``.

    Args:
        trial: Object exposing ``suggest_categorical(name, choices)``.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # The suggest_* calls are made in a fixed order; keep it stable.
    resampling = trial.suggest_categorical("sampling", ["none", "smote", "under"])
    reg_strength = trial.suggest_categorical("C", [0.01, 0.1, 1, 5, 10])
    weighting = trial.suggest_categorical("class_weight", [None, "balanced"])
    solver_name = trial.suggest_categorical(
        "solver",
        ["lbfgs", "newton-cg", "sag", "newton-cholesky", "liblinear", "saga"],
    )

    # Penalty search space depends on the solver group.
    l2_only_solvers = ("lbfgs", "newton-cg", "sag", "newton-cholesky")
    mixing = None
    if solver_name in l2_only_solvers:
        penalty_kind = trial.suggest_categorical("penalty_g1", ["l2"])
    elif solver_name == "liblinear":
        penalty_kind = trial.suggest_categorical("penalty_g2", ["l1", "l2"])
    elif solver_name == "saga":
        penalty_kind = "elasticnet"
        mixing = trial.suggest_categorical("l1_ratio", [0, 0.5, 1])

    # Assemble the pipeline; the resampler (if any) sits between the scaler
    # and the classifier.
    resampler_classes = {"smote": SMOTE, "under": RandomUnderSampler}
    stages = [('scaler', StandardScaler())]
    if resampling in resampler_classes:
        stages.append(('resample', resampler_classes[resampling](random_state=42)))
    stages.append((
        'clf',
        LogisticRegression(
            C=reg_strength,
            penalty=penalty_kind,
            solver=solver_name,
            class_weight=weighting,
            l1_ratio=mixing,
            max_iter=2000,
            random_state=42,
        ),
    ))

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        ImbPipeline(stages), X_train, y_train,
        cv=folds, scoring=f2_scorer, n_jobs=-1,
    )
    return fold_scores.mean()

In [5]:
# Run the hyperparameter search, maximising the value returned by `objective`.
# The sampler is seeded so that a fresh "Restart & Run All" proposes the same
# sequence of trials and reports the same best configuration; without a seed
# every run explores a different path through the search space.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# 100 trials are run here; around 50 is usually enough for logistic regression.
study.optimize(objective, n_trials=100)

print(f"Mejor score (f2_scorer): {study.best_value}")
print(f"Mejores parámetros: {study.best_params}")

[I 2025-12-21 11:01:50,247] A new study created in memory with name: no-name-c0840b00-1f24-4953-96e3-d7bc7f0982fe
[I 2025-12-21 11:01:54,363] Trial 0 finished with value: 0.807194978218579 and parameters: {'sampling': 'smote', 'C': 0.1, 'class_weight': None, 'solver': 'newton-cholesky', 'penalty_g1': 'l2'}. Best is trial 0 with value: 0.807194978218579.
[I 2025-12-21 11:01:57,028] Trial 1 finished with value: 0.7204247210060224 and parameters: {'sampling': 'none', 'C': 5, 'class_weight': None, 'solver': 'saga', 'l1_ratio': 1}. Best is trial 0 with value: 0.807194978218579.
[I 2025-12-21 11:01:59,437] Trial 2 finished with value: 0.8183421730816063 and parameters: {'sampling': 'under', 'C': 5, 'class_weight': None, 'solver': 'saga', 'l1_ratio': 1}. Best is trial 2 with value: 0.8183421730816063.
[I 2025-12-21 11:02:02,423] Trial 3 finished with value: 0.8165030334753816 and parameters: {'sampling': 'smote', 'C': 5, 'class_weight': 'balanced', 'solver': 'liblinear', 'penalty_g2': 'l2'}. 

Mejor score (f2_scorer): 0.8183482297800058
Mejores parámetros: {'sampling': 'under', 'C': 10, 'class_weight': None, 'solver': 'saga', 'l1_ratio': 1}


In [6]:
# Export the study's trials to a DataFrame (one row per trial) for inspection.
df = study.trials_dataframe()

In [7]:
# Collapse the solver-specific penalty columns into a single column.
# A `params_penalty_g*` column only exists if at least one trial sampled that
# solver group, so on a short run either may be absent; index only the ones
# that are present instead of raising KeyError.
unified_penalty = None
for penalty_col in ('params_penalty_g1', 'params_penalty_g2'):
    if penalty_col not in df.columns:
        continue
    if unified_penalty is None:
        unified_penalty = df[penalty_col]
    else:
        # Earlier column wins; later ones only fill the gaps.
        unified_penalty = unified_penalty.fillna(df[penalty_col])
# If neither column exists this assigns an all-missing (None) column.
df['params_penalty_unified'] = unified_penalty
# saga trials never sample a penalty: the objective hard-codes 'elasticnet'.
df.loc[df['params_solver'] == 'saga', 'params_penalty_unified'] = 'elasticnet'

In [8]:
# Preview the first rows of the trials table, including the derived
# params_penalty_unified column.
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_class_weight,params_l1_ratio,params_penalty_g1,params_penalty_g2,params_sampling,params_solver,state,params_penalty_unified
0,0,0.807195,2025-12-21 11:01:50.248294,2025-12-21 11:01:54.363500,0 days 00:00:04.115206,0.1,,,l2,,smote,newton-cholesky,COMPLETE,l2
1,1,0.720425,2025-12-21 11:01:54.365287,2025-12-21 11:01:57.027904,0 days 00:00:02.662617,5.0,,1.0,,,none,saga,COMPLETE,elasticnet
2,2,0.818342,2025-12-21 11:01:57.029977,2025-12-21 11:01:59.437836,0 days 00:00:02.407859,5.0,,1.0,,,under,saga,COMPLETE,elasticnet
3,3,0.816503,2025-12-21 11:01:59.439938,2025-12-21 11:02:02.423817,0 days 00:00:02.983879,5.0,balanced,,,l2,smote,liblinear,COMPLETE,l2
4,4,0.815808,2025-12-21 11:02:02.425769,2025-12-21 11:02:02.801465,0 days 00:00:00.375696,1.0,balanced,,l2,,under,newton-cg,COMPLETE,l2
