# Evaluación de modelos candidatos

## Carga y preparación datos test

In [13]:
import pandas as pd

In [14]:
data = pd.read_csv('/Users/maria/Dropbox/UCM/PD1/pruebas-proyecto/datosEntrenamientoModelosFinal.csv')

In [15]:
data.head()

Unnamed: 0,Bestseller,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,PriceFormat,BookInterest1M,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
0,0.0,329.0,1.0,0.51,0.4,0,19.99,1.0,paperback,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,269.0,2.0,0.61,0.54,1,3.99,2.0,ebook,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,2335.0,1.0,0.72,0.57,1,20.99,7.0,ebook,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,40.0,1.0,0.83,0.35,0,25.0,1.0,hardcover,0.0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,189.0,1.0,0.59,0.26,0,15.0,4.0,paperback,0.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Semilla
SEED = 22

# Proporción del conjunto de test
TEST_SIZE = 0.3

Separamos el conjunto de datos sin escalar en train y test

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = data.drop('Bestseller', axis=1)
y = data['Bestseller']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=SEED)

In [22]:
X_test.head()

Unnamed: 0,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,PriceFormat,BookInterest1M,Rating20Days,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
2720,300.0,2.0,0.49,0.28,1,19.79,4.0,ebook,100.0,4.25,...,0,0,0,0,0,0,0,0,0,0
2496,304.0,1.0,0.72,0.15,0,17.99,2.0,hardcover,0.0,4.26,...,0,0,0,0,1,0,0,0,0,0
2519,128.0,1.0,0.98,0.54,0,16.0,1.0,paperback,793.0,3.82,...,0,0,0,0,0,0,0,0,0,0
3352,336.0,1.0,0.52,0.56,0,14.79,5.0,paperback,100.0,4.06,...,0,0,0,0,0,0,0,0,0,0
2182,336.0,1.0,0.38,0.3,0,24.7,3.0,hardcover,0.0,3.94,...,0,0,0,0,1,0,0,0,0,0


### Escalado de variables
Escalamos las variables y dividimos en train y test

In [19]:
from sklearn.preprocessing import RobustScaler

In [24]:
data_scaled = data.copy()
X_scaled = data_scaled.drop('Bestseller', axis=1)
y_scaled = data_scaled['Bestseller']

# Dividimos en train y test
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size=TEST_SIZE, stratify=y, random_state=SEED)

# Inicializamos RobustScaler
scaler = RobustScaler()

# Solo lo aplicamos a las variables numéricas
# Incluimos 'Zombies' que no hay que escalar porque el RobustScaler ignora la primera columna de la lista 
# (hemos hecho pruebas)
variables_numericas = ['Zombies', 'SagaNumber', 'NumPages', 'RedPerc', 'BluePerc', 'Price', 'WordsTitle', 'BookInterest1M',
                     'Rating20Days', 'PrevBestSellAuthor']

# Aplicamos el RobustScaler a los datos de entrenamiento y test
X_scaled_train[variables_numericas] = scaler.fit_transform(X_scaled_train[variables_numericas])
X_scaled_test[variables_numericas] = scaler.transform(X_scaled_test[variables_numericas])

In [26]:
X_scaled_test.head()

Unnamed: 0,NumPages,SagaNumber,RedPerc,BluePerc,BelongsSaga,Price,WordsTitle,PriceFormat,BookInterest1M,Rating20Days,...,World History,World War I,World War II,Writing,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Young Adult Science Fiction,Zombies
2720,-0.53125,1.0,0.078947,-0.375,1,0.391026,0.5,ebook,0.0,0.2,...,0,0,0,0,0,0,0,0,0,0.0
2496,-0.489583,0.0,0.684211,-0.78125,0,0.102564,-0.5,hardcover,-0.555556,0.22,...,0,0,0,0,1,0,0,0,0,0.0
2519,-2.322917,0.0,1.368421,0.4375,0,-0.216346,-1.0,paperback,3.85,-0.66,...,0,0,0,0,0,0,0,0,0,0.0
3352,-0.15625,0.0,0.157895,0.5,0,-0.410256,1.0,paperback,0.0,-0.18,...,0,0,0,0,0,0,0,0,0,0.0
2182,-0.15625,0.0,-0.210526,-0.3125,0,1.177885,0.0,hardcover,-0.555556,-0.42,...,0,0,0,0,1,0,0,0,0,0.0


## Recuperación de modelos de mlflow

In [29]:
import mlflow.pyfunc
import mlflow.tracking
import os
import urllib.parse
import sqlite3

In [36]:
import mlflow.pyfunc

def load_model_from_db(run_id):
    """
    Load a model from MLflow using the given run ID.

    Args:
    - run_id (str): The ID of the run containing the model.

    Returns:
    - model: The loaded MLflow model.
    """
    # Construct the model URI using the run ID
    model_uri = f"runs:/{run_id}/model"

    # Load the model from MLflow
    loaded_model = mlflow.pyfunc.load_model(model_uri)

    return loaded_model

# Example usage:
run_id = 'a6e5a4cdffc94b2097820ba4e545f13d'
MLP_model = load_model_from_db(run_id)

NotADirectoryError: [Errno 20] Not a directory: '/Users/maria/Dropbox/UCM/PD1/pruebas-proyecto/MLP_runs.db/.trash'

**MLP Model**

## Análisis balanced_accuracy

## Análisis especificidad y sensibilidad

## Análisis por segmentos (géneros literarios)

## Análisis de las variables de cada modelo