#  REGRESIÓN LOGÍSTICA 
 ***
 <code> **AJUSTE DE HIPERPARÁMETROS** </code>


[Regresión logística (documentación oficial)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [1]:
# Importamos las librerías necesarias

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import FunctionTransformer

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from scipy.stats import uniform

### Importación de los datos

Para entrenar este modelo vamos a partir de dos conjuntos de datos: uno con todas las variables y otro con las variables seleccionadas en la etapa anterior

In [2]:
# Df con todas las variables
DF_ALL = pd.read_csv('datosEntrenamientoModelosFinal.csv')

# Df con selección de variables
DF = pd.read_csv('datosEntrenamientoRL.csv')

In [3]:
DF_ALL.head()

Unnamed: 0,Bestseller,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,PriceFormat,BookInterest1M,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
0,0.0,329.0,1.0,0.51,0.4,0,19.99,1.0,paperback,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,269.0,2.0,0.61,0.54,1,3.99,2.0,ebook,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,2335.0,1.0,0.72,0.57,1,20.99,7.0,ebook,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,40.0,1.0,0.83,0.35,0,25.0,1.0,hardcover,0.0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,189.0,1.0,0.59,0.26,0,15.0,4.0,paperback,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
DF.head()

Unnamed: 0,Bestseller,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,BookInterest1M,Rating20Days,HasTwitter,HasWikipedia,...,Whodunit,Witches,Womens,World War I,World War II,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,PriceFormat
0,0.0,0.51,0.4,0,19.99,1.0,0.0,3.7,1.0,0.0,...,0,0,0,0,0,0,0,0,0,paperback
1,0.0,0.61,0.54,1,3.99,2.0,0.0,3.89,1.0,1.0,...,0,0,0,0,0,0,0,0,0,ebook
2,0.0,0.72,0.57,1,20.99,7.0,0.0,4.45,0.0,1.0,...,0,0,0,0,0,1,0,0,0,ebook
3,0.0,0.83,0.35,0,25.0,1.0,0.0,4.3,1.0,1.0,...,0,0,0,0,0,1,0,0,0,hardcover
4,0.0,0.59,0.26,0,15.0,4.0,0.0,3.8,0.0,1.0,...,0,0,0,0,0,0,0,0,0,paperback


### SEPARACIÓN EN TRAIN Y TEST

In [5]:
# Semilla
SEED = 22

# Proporción del conjunto de test
TEST_SIZE = 0.3

# Número de folds para la validación cruzada
CV_FOLDS = 5

**Con selección de variables**

In [6]:
y = DF["Bestseller"]
X = DF.iloc[:, 1:]

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2583, 276), (1107, 276), (2583,), (1107,))

**Con todas las variables**

In [7]:
y = DF_ALL["Bestseller"]
X = DF_ALL.iloc[:, 1:]

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_ALL, X_test_ALL, y_train_ALL, y_test_ALL = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=SEED)
X_train_ALL.shape, X_test_ALL.shape, y_train_ALL.shape, y_test_ALL.shape

((2583, 344), (1107, 344), (2583,), (1107,))

### Creación de KFolds

Estrategia de [validación cruzada](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html) con k folds

In [8]:
# Inicializamos el objeto KFold
kf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)

In [9]:
def codificarPriceFormat(df):
    return pd.get_dummies(df, columns=['PriceFormat'], dtype=int)

### Métricas de evaluación

In [10]:
# Función para calcular la sensibilidad
def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tp / (tp + fn)

# Función para calcular la especificidad
def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

# Convertir las funciones en funciones de puntuación para usar en RandomizedSearchCV
sensitivity_scorer = make_scorer(sensitivity)
specificity_scorer = make_scorer(specificity)

METRICS = {'balanced_accuracy': 'balanced_accuracy',
           'sensitivity': sensitivity_scorer,
           'specificity': specificity_scorer}

### Preparación del entorno de MLFlow

In [None]:
!mlflow ui --port 8080 --backend-store-uri sqlite:///mlruns.db

[2024-04-16 19:22:20 +0200] [16273] [INFO] Starting gunicorn 21.2.0
[2024-04-16 19:22:20 +0200] [16273] [INFO] Listening at: http://127.0.0.1:8080 (16273)
[2024-04-16 19:22:20 +0200] [16273] [INFO] Using worker: sync
[2024-04-16 19:22:20 +0200] [16274] [INFO] Booting worker with pid: 16274
[2024-04-16 19:22:21 +0200] [16275] [INFO] Booting worker with pid: 16275
[2024-04-16 19:22:21 +0200] [16276] [INFO] Booting worker with pid: 16276
[2024-04-16 19:22:21 +0200] [16277] [INFO] Booting worker with pid: 16277


In [11]:
# Sets the sqlite db as the MLFLOW_TRACKING_URI 
os.environ['MLFLOW_TRACKING_URI'] = 'sqlite:///mlruns.db'

# WARNING: TO SEE THE LOCAL SERVER YOU HAVE TO CHOOSE THE CORRECT BACKEND STORE AS FOLLOWS:
# mlflow ui --port 8080 --backend-store-uri sqlite:///mlruns.db

# mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Para imprimir los experimentos que están en la base de datos

# Establecer la URI de seguimiento
mlflow.set_tracking_uri('sqlite:///mlruns.db')

# Obtener todos los experimentos
experiment_ids = mlflow.search_runs().experiment_id.unique()

# Imprimir los experimentos
for exp_id in experiment_ids:
    print(exp_id)
    
# Defino el experimento el que guardaré todas las ejecuciones
mlflow.set_experiment(experiment_name = 'Regresión logística')

<Experiment: artifact_location='/Users/javimartinfuentes/Documents/GitHub/NOVELLA/src/modelos/regresionLogistica/mlruns/2', creation_time=1713216632112, experiment_id='2', last_update_time=1713216632112, lifecycle_stage='active', name='Regresión logística', tags={}>

# MODELO BASELINE

Creamos un modelo base para después comparar con los otros modelos que entrenemos con más técnicas de procesado y transformaciones. 

Modelo baseline:
* Parámetros por defecto
* Todas las variables
* Sin transformaciones
* Sin SMOTE-NC

In [13]:
DF_BASE = DF_ALL.copy()
DF_BASE = codificarPriceFormat(DF_BASE)

In [14]:
# Dividimos los datos en conjuntos de entrenamiento y prueba
X_BASE = DF_BASE.drop('Bestseller', axis=1)
y_BASE = DF_BASE['Bestseller']

X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_BASE, y_BASE, test_size=TEST_SIZE, stratify=y, random_state=SEED)

In [15]:
RL = LogisticRegression(random_state = SEED)

In [16]:
scores = cross_validate(RL, X_base_train, y_base_train, scoring=METRICS, cv=CV_FOLDS,
                        return_train_score=True, verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.6s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or

In [17]:
scores

{'fit_time': array([0.07637882, 0.07318711, 0.09041381, 0.08406401, 0.09269881]),
 'score_time': array([0.00465822, 0.0048871 , 0.00485301, 0.00461793, 0.00478792]),
 'test_balanced_accuracy': array([0.67151451, 0.712546  , 0.64359558, 0.64655778, 0.65744165]),
 'train_balanced_accuracy': array([0.68918203, 0.6703777 , 0.67652367, 0.67274444, 0.69469373]),
 'test_sensitivity': array([0.35897436, 0.46153846, 0.30769231, 0.32051282, 0.33766234]),
 'train_sensitivity': array([0.40514469, 0.36012862, 0.37299035, 0.36655949, 0.40705128]),
 'test_specificity': array([0.98405467, 0.96355353, 0.97949886, 0.97260274, 0.97722096]),
 'train_specificity': array([0.97321937, 0.98062678, 0.98005698, 0.97892938, 0.98233618])}

In [18]:
# Registramos los resultados en MlFlow
with mlflow.start_run():
    
    # Métricas
    m = ["balanced_accuracy", "sensitivity", "specificity"]

    for metric in m:
        
        for fold in range(len(scores[f"train_{metric}"])):
            
            # Obtenemos las métricas de cada fold
            train_fold_metric = scores[f"train_{metric}"][fold]
            test_fold_metric = scores[f"test_{metric}"][fold]
            
            # Log the metric for each fold
            mlflow.log_metric(f"train_{metric}_fold_{fold+1}", train_fold_metric)
            mlflow.log_metric(f"test_{metric}_fold_{fold+1}", test_fold_metric)
            
        # Calculamos la media de los valores
        train_mean = np.mean(scores[f"train_{metric}"])
        test_mean = np.mean(scores[f"test_{metric}"])

        # Log the mean values for train and test sets
        mlflow.log_metric(f"train_{metric}_mean", train_mean)
        mlflow.log_metric(f"test_{metric}_mean", test_mean)

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "BASELINE")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    RL.fit(X_base_train, y_base_train)
    signature = infer_signature(X_base_train, RL.predict(X_base_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=RL,
        artifact_path="rl_model",
        signature=signature,
        input_example=X_base_train,
        registered_model_name="BASELINE",
    )


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Registered model 'BASELINE' already exists. Creating a new version of this model...
Created version '2' of model 'BASELINE'.


### Escalado de variables

In [12]:
data_scaled = DF.copy()
X_scaled = data_scaled.drop('Bestseller', axis=1)
y_scaled = data_scaled['Bestseller']

# Dividimos en train y test
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size=TEST_SIZE, stratify=y, random_state=SEED)

In [13]:
# Inicializamos RobustScaler
scaler = RobustScaler()

# Solo lo aplicamos a las variables numéricas

variables_numericas = ['RedPerc', 'BluePerc', 'Price', 'WordsTitle', 'BookInterest1M',
                     'Rating20Days', 'PrevBestSellAuthor']

# Aplicamos el RobustScaler a los datos de entrenamiento y test
X_scaled_train[variables_numericas] = scaler.fit_transform(X_scaled_train[variables_numericas])
X_scaled_test[variables_numericas] = scaler.transform(X_scaled_test[variables_numericas])

# CON SMOTE-NC

### Creación del pipeline

Creamos un pipeline con las operaciones que se deben aplicar a cada fold en el entrenamiento:
* Oversampling (SMOTENC)
* Redondear variables enteras
* Transformación variables categóricas con un valor único
* Clasificador (MLP)

In [21]:
def redondearVariables(X):
    variablesRedondeo = ["WordsTitle"]
    # Itera sobre las columnas especificadas y redondea sus valores
    for v in variablesRedondeo:
        X[v] = np.round(X[v])
    return X

In [22]:
# Columnas de los géneros
columnas_generos = X_scaled_train.columns[10:-1]

# Columnas categóricas
categoricalColumns = ["BelongsSaga", "PriceFormat", "HasTwitter", "HasWikipedia"] + list(columnas_generos)

In [23]:
smote = SMOTENC(categorical_features = categoricalColumns, random_state = SEED)

# Definimos el clasificador 
RL = LogisticRegression(random_state = SEED)

# Definimos el transformador para codificar la variable categórica 'PriceFormat'
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['PriceFormat'])
], remainder='passthrough')

# Definimos el transformador de la función para redondear
transformador_funcion = FunctionTransformer(func=redondearVariables)

# Construimos el pipeline
pipeline = Pipeline([
    ('smote', smote),
    ('redondear_variables', transformador_funcion),
    ('encoder', column_transformer),
    ('classifier', RL)
])


### 2.1 Grid Search

In [24]:
# Definir los hiperparámetros a ajustar
param_grid = {
    'classifier__solver': ['saga'],
    'classifier__penalty': ['elasticnet'],
    'classifier__max_iter': [2000],
    'classifier__C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0], 
    'classifier__l1_ratio': np.linspace(0.0, 1.0, 10),
    'classifier__fit_intercept': [True, False]
}

# Inicializo GridSearch
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kf,
                           scoring=METRICS, refit = "balanced_accuracy", return_train_score=True, n_jobs=-1, error_score="raise")

grid_search.fit(X_scaled_train, y_scaled_train)

# Resultados
cv_results = grid_search.cv_results_
best_params = grid_search.best_params_























In [25]:
best_params

{'classifier__C': 0.01,
 'classifier__fit_intercept': False,
 'classifier__l1_ratio': 0.3333333333333333,
 'classifier__max_iter': 2000,
 'classifier__penalty': 'elasticnet',
 'classifier__solver': 'saga'}

In [26]:
# Convertimos los resultados de la validación cruzada en un dataframe
df_results = pd.DataFrame(grid_search.cv_results_)

# Filtrar la fila con los mejores parámetros
filtered_row = df_results.loc[
    (df_results['param_classifier__C'] == best_params['classifier__C']) &
    (df_results['param_classifier__l1_ratio'] == best_params['classifier__l1_ratio']) &
    (df_results['param_classifier__max_iter'] == best_params['classifier__max_iter']) &
    (df_results['param_classifier__penalty'] == best_params['classifier__penalty']) &
    (df_results['param_classifier__solver'] == best_params['classifier__solver']) &
    (df_results['param_classifier__fit_intercept'] == best_params['classifier__fit_intercept'])
]

index_row = filtered_row.index[0]

# Registro los resultados en MLFlow
with mlflow.start_run():

    # Almaceno los valores de los hiperparámetros
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    # Registra las métricas de cada fold para cada métrica
    for metric in METRICS:
        
        M = metric.replace(" ", "_")
        
        # Media
        
        mlflow.log_metric(f"mean_train_{M}", df_results[f"mean_train_{M}"][index_row])
        mlflow.log_metric(f"mean_test_{M}", df_results[f"mean_test_{M}"][index_row])

        # Desviación típica
        mlflow.log_metric(f"std_train_{M}", df_results[f"std_train_{M}"][index_row])
        mlflow.log_metric(f"std_test_{M}", df_results[f"std_test_{M}"][index_row])

        for i in range(CV_FOLDS):

            # Resultados de entrenamiento en cada fold
            mlflow.log_metric(f"train_{M}fold{i}", df_results[f"split{i}_train_{M}"][index_row])
            # Resultados de validación en cada fold
            mlflow.log_metric(f"test_{M}fold{i}", df_results[f"split{i}_test_{M}"][index_row])

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "RL_SMOTE_GRID_SEARCH")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    signature = infer_signature(X_scaled_train, grid_search.best_estimator_.predict(X_scaled_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=grid_search,
        artifact_path="rf_model",
        signature=signature,
        input_example=X_train,
        registered_model_name="RL_SMOTE_GRID_SEARCH",
    )

Successfully registered model 'RL_SMOTE_GRID_SEARCH'.
Created version '1' of model 'RL_SMOTE_GRID_SEARCH'.


### 2.2 Random Search

In [27]:
# Definir los hiperparámetros a ajustar

param_grid = {
    'classifier__solver': ['saga'],
    'classifier__penalty': ['elasticnet'],
    'classifier__max_iter': [2000],
    'classifier__C': uniform(loc=0.0001, scale=9999.9999),  # Distribución uniforme entre 0.0001 y 10000,
    'classifier__l1_ratio': uniform(loc=0.0, scale=1.0),  # Distribución uniforme entre 0.0 y 1.0,
    'classifier__fit_intercept': [True, False]
}

In [28]:
# Definir la búsqueda aleatoria
random_search = RandomizedSearchCV(
    estimator=pipeline, param_distributions=param_grid, 
    n_iter=200, cv=kf, 
    scoring= METRICS, 
    refit = "balanced_accuracy",
    return_train_score=True
)

random_search.fit(X_scaled_train, y_scaled_train)

# Resultados
cv_results = random_search.cv_results_
best_params = random_search.best_params_




















































In [29]:
best_params

{'classifier__C': 108.09882506507694,
 'classifier__fit_intercept': True,
 'classifier__l1_ratio': 0.24631986131794537,
 'classifier__max_iter': 2000,
 'classifier__penalty': 'elasticnet',
 'classifier__solver': 'saga'}

In [30]:
# Convertimos los resultados de la validación cruzada en un dataframe
df_results = pd.DataFrame(cv_results)

# Filtrar la fila con los mejores parámetros
filtered_row = df_results.loc[
    (df_results['param_classifier__C'] == best_params['classifier__C']) &
    (df_results['param_classifier__l1_ratio'] == best_params['classifier__l1_ratio']) &
    (df_results['param_classifier__max_iter'] == best_params['classifier__max_iter']) &
    (df_results['param_classifier__penalty'] == best_params['classifier__penalty']) &
    (df_results['param_classifier__solver'] == best_params['classifier__solver']) &
    (df_results['param_classifier__fit_intercept'] == best_params['classifier__fit_intercept'])
]

index_row = filtered_row.index[0]

# Registro los resultados en MLFlow
with mlflow.start_run():

    # Almaceno los valores de los hiperparámetros
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    # Registra las métricas de cada fold para cada métrica
    for metric in METRICS:
        
        M = metric.replace(" ", "_")
        
        # Media
        
        mlflow.log_metric(f"mean_train_{M}", df_results[f"mean_train_{M}"][index_row])
        mlflow.log_metric(f"mean_test_{M}", df_results[f"mean_test_{M}"][index_row])

        # Desviación típica
        mlflow.log_metric(f"std_train_{M}", df_results[f"std_train_{M}"][index_row])
        mlflow.log_metric(f"std_test_{M}", df_results[f"std_test_{M}"][index_row])

        for i in range(CV_FOLDS):

            # Resultados de entrenamiento en cada fold
            mlflow.log_metric(f"train_{M}fold{i}", df_results[f"split{i}_train_{M}"][index_row])
            # Resultados de validación en cada fold
            mlflow.log_metric(f"test_{M}fold{i}", df_results[f"split{i}_test_{M}"][index_row])

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "RL_SMOTE_RANDOM_SEARCH")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    signature = infer_signature(X_scaled_train, random_search.best_estimator_.predict(X_scaled_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=random_search,
        artifact_path="rf_model",
        signature=signature,
        input_example=X_scaled_train,
        registered_model_name="RL_SMOTE_RANDOM_SEARCH",
    )

Successfully registered model 'RL_SMOTE_RANDOM_SEARCH'.
Created version '1' of model 'RL_SMOTE_RANDOM_SEARCH'.


# SIN SMOTE-NC

In [14]:
X_scaled_train = codificarPriceFormat(X_scaled_train)

In [15]:
# Definir el modelo - LogisticRegression
RL = LogisticRegression(random_state = SEED)

### 2.1 Grid Search

In [33]:
# Definir los hiperparámetros a ajustar
param_grid = {
    'solver': ['saga'],
    'penalty': ['elasticnet'],
    'max_iter': [2000],
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0], 
    'l1_ratio': np.linspace(0.0, 1.0, 10),
    'fit_intercept': [True, False]
}

# Inicializo GridSearch
grid_search = GridSearchCV(estimator=RL, param_grid=param_grid, cv=kf,
                           scoring=METRICS, refit = "balanced_accuracy", return_train_score=True, n_jobs=-1, error_score="raise")

grid_search.fit(X_scaled_train, y_scaled_train)

# Resultados
cv_results = grid_search.cv_results_
best_params = grid_search.best_params_























In [34]:
best_params

{'C': 1000.0,
 'fit_intercept': False,
 'l1_ratio': 0.0,
 'max_iter': 2000,
 'penalty': 'elasticnet',
 'solver': 'saga'}

In [35]:
# Convertimos los resultados de la validación cruzada en un dataframe
df_results = pd.DataFrame(grid_search.cv_results_)

# Filtrar la fila con los mejores parámetros
filtered_row = df_results.loc[
    (df_results['param_C'] == best_params['C']) &
    (df_results['param_l1_ratio'] == best_params['l1_ratio']) &
    (df_results['param_max_iter'] == best_params['max_iter']) &
    (df_results['param_penalty'] == best_params['penalty']) &
    (df_results['param_solver'] == best_params['solver']) &
    (df_results['param_fit_intercept'] == best_params['fit_intercept'])
]


index_row = filtered_row.index[0]

# Registro los resultados en MLFlow
with mlflow.start_run():

    # Almaceno los valores de los hiperparámetros
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    # Registra las métricas de cada fold para cada métrica
    for metric in METRICS:
        
        M = metric.replace(" ", "_")
        
        # Media
        
        mlflow.log_metric(f"mean_train_{M}", df_results[f"mean_train_{M}"][index_row])
        mlflow.log_metric(f"mean_test_{M}", df_results[f"mean_test_{M}"][index_row])

        # Desviación típica
        mlflow.log_metric(f"std_train_{M}", df_results[f"std_train_{M}"][index_row])
        mlflow.log_metric(f"std_test_{M}", df_results[f"std_test_{M}"][index_row])

        for i in range(CV_FOLDS):

            # Resultados de entrenamiento en cada fold
            mlflow.log_metric(f"train_{M}fold{i}", df_results[f"split{i}_train_{M}"][index_row])
            # Resultados de validación en cada fold
            mlflow.log_metric(f"test_{M}fold{i}", df_results[f"split{i}_test_{M}"][index_row])

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "RL_SIN_SMOTE_GRID_SEARCH")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    signature = infer_signature(X_scaled_train, grid_search.best_estimator_.predict(X_scaled_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=grid_search,
        artifact_path="rf_model",
        signature=signature,
        input_example=X_scaled_train,
        registered_model_name="RL_SIN_SMOTE_GRID_SEARCH",
    )

Successfully registered model 'RL_SIN_SMOTE_GRID_SEARCH'.
Created version '1' of model 'RL_SIN_SMOTE_GRID_SEARCH'.


# TODAS LAS VARIABLES

# CON SMOTE-NC

In [40]:
data_scaled = DF_ALL.copy()
X_scaled = data_scaled.drop('Bestseller', axis=1)
y_scaled = data_scaled['Bestseller']

# Dividimos en train y test
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size=TEST_SIZE, stratify=y, random_state=SEED)



In [41]:
X_scaled_train.describe()

Unnamed: 0,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,BookInterest1M,Rating20Days,HasTwitter,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
count,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,...,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0
mean,356.7259,1.793651,0.481882,0.426237,0.376694,18.295823,3.293457,213.70151,4.118339,0.693767,...,0.000387,0.001549,0.028649,0.000387,0.178475,0.005807,0.037553,0.011614,0.000387,0.000387
std,113.935181,3.218545,0.230875,0.204655,0.484651,5.675363,1.634205,426.949548,0.373288,0.461017,...,0.019676,0.039329,0.16685,0.019676,0.382986,0.075998,0.19015,0.107163,0.019676,0.019676
min,11.0,0.0,0.01,0.02,0.0,0.99,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,304.0,1.0,0.29,0.26,0.0,14.99,2.0,0.0,3.88,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,351.0,1.0,0.46,0.4,0.0,17.35,3.0,100.0,4.15,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,400.0,1.0,0.67,0.58,1.0,21.23,4.0,180.0,4.38,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2335.0,58.0,0.99,0.94,1.0,59.95,14.0,2911.0,5.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
# Inicializamos RobustScaler
scaler = RobustScaler()

# Solo lo aplicamos a las variables numéricas

variables_numericas = ['NumPages', 'SagaNumber', 'RedPerc', 'BluePerc', 'Price', 'WordsTitle', 'BookInterest1M',
                     'Rating20Days', 'PrevBestSellAuthor']

# Aplicamos el RobustScaler a los datos de entrenamiento y test
X_scaled_train[variables_numericas] = scaler.fit_transform(X_scaled_train[variables_numericas])
X_scaled_test[variables_numericas] = scaler.transform(X_scaled_test[variables_numericas])

In [43]:
X_scaled_train.describe()

Unnamed: 0,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,BookInterest1M,Rating20Days,HasTwitter,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
count,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,...,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0,2583.0
mean,0.059645,0.793651,0.057583,0.08199,0.376694,0.151574,0.146729,0.631675,-0.063322,0.693767,...,0.000387,0.001549,0.028649,0.000387,0.178475,0.005807,0.037553,0.011614,0.000387,0.000387
std,1.186825,3.218545,0.607565,0.639546,0.484651,0.909513,0.817103,2.371942,0.746577,0.461017,...,0.019676,0.039329,0.16685,0.019676,0.382986,0.075998,0.19015,0.107163,0.019676,0.019676
min,-3.541667,-1.0,-1.184211,-1.1875,0.0,-2.621795,-1.0,-0.555556,-4.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.489583,0.0,-0.447368,-0.4375,0.0,-0.378205,-0.5,-0.555556,-0.54,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.510417,0.0,0.552632,0.5625,1.0,0.621795,0.5,0.444444,0.46,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,20.666667,57.0,1.394737,1.6875,1.0,6.826923,5.5,15.616667,1.7,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
list(X_scaled_train.columns)

['NumPages',
 'SagaNumber',
 'RedPerc',
 'BluePerc',
 'BelongsSaga',
 'Price',
 'WordsTitle',
 'PriceFormat',
 'BookInterest1M',
 'Rating20Days',
 'HasTwitter',
 'HasWikipedia',
 'PrevBestSellAuthor',
 '19th Century',
 '20th Century',
 'Abuse',
 'Action',
 'Adoption',
 'Adult',
 'Adult Fiction',
 'Adventure',
 'Africa',
 'African American',
 'African American Romance',
 'African Literature',
 'Aliens',
 'Alternate History',
 'Amateur Sleuth',
 'Amazon',
 'American',
 'American History',
 'Americana',
 'Amish',
 'Angels',
 'Animals',
 'Anthologies',
 'Apocalyptic',
 'Apple',
 'Art',
 'Arthurian',
 'Artificial Intelligence',
 'Asia',
 'Asian Literature',
 'Audiobook',
 'Australia',
 'Autistic Spectrum Disorder',
 'Autobiography',
 'BDSM',
 'Banned Books',
 'Baseball',
 'Basketball',
 'Biography',
 'Biography Memoir',
 'Biology',
 'Boarding School',
 'Book Club',
 'Books About Books',
 'Botswana',
 'Boys Love',
 'British Literature',
 'Buddhism',
 'Buisness',
 'Bulgaria',
 'Bulgarian Lite

### Creación del pipeline

Creamos un pipeline con las operaciones que se deben aplicar a cada fold en el entrenamiento:
* Oversampling (SMOTENC)
* Redondear variables enteras
* Transformación variables categóricas con un valor único
* Clasificador (MLP)

In [45]:
def redondearVariables(X):
    variablesRedondeo = ["WordsTitle", "NumPages", "SagaNumber", "PrevBestSellAuthor"]
    # Itera sobre las columnas especificadas y redondea sus valores
    for v in variablesRedondeo:
        X[v] = np.round(X[v])
    return X

In [46]:
# Columnas de los géneros
columnas_generos = X_scaled_train.columns[13:]

# Columnas categóricas
categoricalColumns = ["BelongsSaga", "PriceFormat", "HasTwitter", "HasWikipedia"] + list(columnas_generos)

In [47]:
smote = SMOTENC(categorical_features = categoricalColumns, random_state = SEED)

# Definimos el clasificador 
RL = LogisticRegression(random_state = SEED)

# Definimos el transformador para codificar la variable categórica 'PriceFormat'
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['PriceFormat'])
], remainder='passthrough')

# Definimos el transformador de la función para redondear
transformador_funcion = FunctionTransformer(func=redondearVariables)

# Construimos el pipeline
pipeline = Pipeline([
    ('smote', smote),
    ('redondear_variables', transformador_funcion),
    ('encoder', column_transformer),
    ('classifier', RL)
])


### 2.1 Grid Search

In [48]:
# Definir los hiperparámetros a ajustar
param_grid = {
    'classifier__solver': ['saga'],
    'classifier__penalty': ['elasticnet'],
    'classifier__max_iter': [2000],
    'classifier__C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0], 
    'classifier__l1_ratio': np.linspace(0.0, 1.0, 10),
    'classifier__fit_intercept': [True, False]
}

# Inicializo GridSearch
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kf,
                           scoring=METRICS, refit = "balanced_accuracy", return_train_score=True, n_jobs=-1, error_score="raise")

grid_search.fit(X_scaled_train, y_scaled_train)

# Resultados
cv_results = grid_search.cv_results_
best_params = grid_search.best_params_



























In [49]:
best_params

{'classifier__C': 0.01,
 'classifier__fit_intercept': False,
 'classifier__l1_ratio': 0.3333333333333333,
 'classifier__max_iter': 2000,
 'classifier__penalty': 'elasticnet',
 'classifier__solver': 'saga'}

In [50]:
# Convertimos los resultados de la validación cruzada en un dataframe
df_results = pd.DataFrame(grid_search.cv_results_)

# Filtrar la fila con los mejores parámetros
filtered_row = df_results.loc[
    (df_results['param_classifier__C'] == best_params['classifier__C']) &
    (df_results['param_classifier__l1_ratio'] == best_params['classifier__l1_ratio']) &
    (df_results['param_classifier__max_iter'] == best_params['classifier__max_iter']) &
    (df_results['param_classifier__penalty'] == best_params['classifier__penalty']) &
    (df_results['param_classifier__solver'] == best_params['classifier__solver']) &
    (df_results['param_classifier__fit_intercept'] == best_params['classifier__fit_intercept'])
]

index_row = filtered_row.index[0]

# Registro los resultados en MLFlow
with mlflow.start_run():

    # Almaceno los valores de los hiperparámetros
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    # Registra las métricas de cada fold para cada métrica
    for metric in METRICS:
        
        M = metric.replace(" ", "_")
        
        # Media
        
        mlflow.log_metric(f"mean_train_{M}", df_results[f"mean_train_{M}"][index_row])
        mlflow.log_metric(f"mean_test_{M}", df_results[f"mean_test_{M}"][index_row])

        # Desviación típica
        mlflow.log_metric(f"std_train_{M}", df_results[f"std_train_{M}"][index_row])
        mlflow.log_metric(f"std_test_{M}", df_results[f"std_test_{M}"][index_row])

        for i in range(CV_FOLDS):

            # Resultados de entrenamiento en cada fold
            mlflow.log_metric(f"train_{M}fold{i}", df_results[f"split{i}_train_{M}"][index_row])
            # Resultados de validación en cada fold
            mlflow.log_metric(f"test_{M}fold{i}", df_results[f"split{i}_test_{M}"][index_row])

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "RL_SMOTE_GRID_SEARCH_ALL_VARIABLES")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    signature = infer_signature(X_scaled_train, grid_search.best_estimator_.predict(X_scaled_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=grid_search,
        artifact_path="rf_model",
        signature=signature,
        input_example=X_scaled_train,
        registered_model_name="RL_SMOTE_GRID_SEARCH_ALL_VARIABLES",
    )

Successfully registered model 'RL_SMOTE_GRID_SEARCH_ALL_VARIABLES'.
Created version '1' of model 'RL_SMOTE_GRID_SEARCH_ALL_VARIABLES'.


### 2.2 Random Search

In [51]:
# Definir los hiperparámetros a ajustar
from scipy.stats import uniform
param_grid = {
    'classifier__solver': ['saga'],
    'classifier__penalty': ['elasticnet'],
    'classifier__max_iter': [2000],
    'classifier__C': uniform(loc=0.0001, scale=9999.9999),  # Distribución uniforme entre 0.0001 y 10000,
    'classifier__l1_ratio': uniform(loc=0.0, scale=1.0),  # Distribución uniforme entre 0.0 y 1.0,
    'classifier__fit_intercept': [True, False]
}

In [52]:
# Definir la búsqueda aleatoria
random_search = RandomizedSearchCV(
    estimator=pipeline, param_distributions=param_grid, 
    n_iter=200, cv=kf, 
    scoring= METRICS, 
    refit = "balanced_accuracy",
    return_train_score=True, n_jobs = -1
)

random_search.fit(X_scaled_train, y_scaled_train)

# Resultados
cv_results = random_search.cv_results_
best_params = random_search.best_params_




















































In [53]:
best_params

{'classifier__C': 5062.3961261115055,
 'classifier__fit_intercept': True,
 'classifier__l1_ratio': 0.48106295569295743,
 'classifier__max_iter': 2000,
 'classifier__penalty': 'elasticnet',
 'classifier__solver': 'saga'}

In [54]:
# Convertimos los resultados de la validación cruzada en un dataframe
df_results = pd.DataFrame(cv_results)

# Filtrar la fila con los mejores parámetros
filtered_row = df_results.loc[
    (df_results['param_classifier__C'] == best_params['classifier__C']) &
    (df_results['param_classifier__l1_ratio'] == best_params['classifier__l1_ratio']) &
    (df_results['param_classifier__max_iter'] == best_params['classifier__max_iter']) &
    (df_results['param_classifier__penalty'] == best_params['classifier__penalty']) &
    (df_results['param_classifier__solver'] == best_params['classifier__solver']) &
    (df_results['param_classifier__fit_intercept'] == best_params['classifier__fit_intercept'])
]

index_row = filtered_row.index[0]

# Registro los resultados en MLFlow
with mlflow.start_run():

    # Almaceno los valores de los hiperparámetros
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    # Registra las métricas de cada fold para cada métrica
    for metric in METRICS:
        
        M = metric.replace(" ", "_")
        
        # Media
        
        mlflow.log_metric(f"mean_train_{M}", df_results[f"mean_train_{M}"][index_row])
        mlflow.log_metric(f"mean_test_{M}", df_results[f"mean_test_{M}"][index_row])

        # Desviación típica
        mlflow.log_metric(f"std_train_{M}", df_results[f"std_train_{M}"][index_row])
        mlflow.log_metric(f"std_test_{M}", df_results[f"std_test_{M}"][index_row])

        for i in range(CV_FOLDS):

            # Resultados de entrenamiento en cada fold
            mlflow.log_metric(f"train_{M}fold{i}", df_results[f"split{i}_train_{M}"][index_row])
            # Resultados de validación en cada fold
            mlflow.log_metric(f"test_{M}fold{i}", df_results[f"split{i}_test_{M}"][index_row])

    # Establece una etiqueta que describe el propósito de esta ejecución
    mlflow.set_tag("TIPO", "RL_SMOTE_RANDOM_SEARCH_ALL_VARIABLES")

    # Infiere el signature del modelo, que describe el tipo de entrada y salida del modelo
    signature = infer_signature(X_scaled_train, random_search.best_estimator_.predict(X_scaled_train))

    # Registra el modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=random_search,
        artifact_path="rf_model",
        signature=signature,
        input_example=X_scaled_train,
        registered_model_name="RL_SMOTE_RANDOM_SEARCH_ALL_VARIABLES",
    )

Successfully registered model 'RL_SMOTE_RANDOM_SEARCH_ALL_VARIABLES'.
Created version '1' of model 'RL_SMOTE_RANDOM_SEARCH_ALL_VARIABLES'.
