In [9]:
import awswrangler as wr

import mlflow

# For this to work, every one of our scripts must export the following environment variables.
# NOTE(review): the access key and secret are hardcoded in the notebook; presumably these are
# local-development defaults (the endpoints point at localhost) -- confirm they are never reused elsewhere.
# Do not append inline comments to the %env lines: the text would become part of the value.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


# Búsqueda de mejor modelo e hiperparámetros

Dado nuestro dataset, el cual ya pasó por el proceso de ETL y se encuentra en nuestro bucket de S3, vamos a buscar cuál sería el mejor modelo y qué hiperparámetros usar.

La búsqueda de hiperparámetros la haremos usando Optuna y el tracking será realizado mediante MLflow.


In [10]:
# URL of the MLflow tracking server used by this notebook.
mlflow_server = "http://localhost:5000"

# Point the mlflow client at that server before any experiment/run is created.
mlflow.set_tracking_uri(mlflow_server)

In [11]:
# Load the train and test splits for the study from the S3 bucket.
# NOTE: the test split IS loaded here, but the hyperparameter search below only receives
# X_train / y_train; X_test / y_test are used solely for the final evaluation of the
# selected model, which keeps the search itself free of data leakage.
X_train =  wr.s3.read_csv("s3://data/final/train/water_X_train.csv")
y_train =  wr.s3.read_csv("s3://data/final/train/water_y_train.csv")

X_test =  wr.s3.read_csv("s3://data/final/test/water_X_test.csv")
y_test =  wr.s3.read_csv("s3://data/final/test/water_y_test.csv")

In [12]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,-0.800591,-1.510326,-0.27448,-1.398306,-0.384074,-0.14519,0.401201,1.12754,0.38507
1,0.85437,-1.017271,1.466335,0.360593,-0.458307,-0.267888,-1.828221,0.076059,0.49195
2,-0.062761,-2.005646,-0.530256,-0.497127,-0.32801,1.216102,0.338255,0.562725,1.97075
3,-0.608615,-1.125973,-0.26282,1.843514,-0.654414,0.9748,0.784255,-1.525841,-0.215729
4,0.196108,1.142426,-0.744293,0.456643,0.823439,-0.406695,0.290145,-0.10325,-0.731019


## Correlación de features con la variable objetivo


In [13]:
from plots import plot_correlation_with_target, plot_information_gain_with_target

In [14]:
# Since MLflow is used for tracking, displaying the plots here is pointless: the returned
# figures are kept in variables and logged as artifacts of the parent run later on.
correlation_plot = plot_correlation_with_target(X_train, y_train, target_col="Potability")
information_gain_plot = plot_information_gain_with_target(X_train, y_train)

## Arrancamos a experimentar

In [15]:
import datetime
import optuna

from mlflow.models import infer_signature
from mlflow_aux import get_or_create_experiment

from optuna_aux import champion_callback, objective

from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, make_scorer

# Optuna is rather verbose; restrict its logging to error messages only.
optuna.logging.set_verbosity(optuna.logging.ERROR)

Se crea el experimento en MLflow.

In [16]:
# Create the experiment. Presumably `get_or_create_experiment` reuses an existing experiment
# with this name -- see mlflow_aux to confirm. The id is printed for reference.
experiment_id = get_or_create_experiment("Water Quality")
print(experiment_id)

# Timestamped name for the parent run of the hyperparameter search.
# The format string previously contained a stray '"', which appended a literal double quote
# to every run name.
run_name_parent = "best_hyperparam_" + datetime.datetime.today().strftime("%Y/%m/%d-%H:%M:%S")

1


Usamos Optuna

Agregamos el accuracy como métrica a considerar; de hecho, también medimos el champion por accuracy, aunque mantenemos el registro del F1-score en MLflow.

In [17]:
# Bind the scorer under a private alias so this cell never depends on the global name
# `f1_score` still being the sklearn function: that name is rebound to a float below, which
# previously made a second execution of this cell fail with "'float' object is not callable".
from sklearn.metrics import f1_score as compute_f1_score

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Initialise the Optuna study; the objective value is maximised.
    study = optuna.create_study(direction="maximize")

    # Run the hyperparameter-optimisation trials. Each trial is executed as its own run,
    # nested under this parent run.
    # `champion_callback` controls which progress messages are shown.
    # See the documentation of `objective` and `champion_callback` in optuna_aux for details.
    study.optimize(lambda trial: objective(trial, X_train, y_train, experiment_id), n_trials=250, callbacks=[champion_callback])

    # Once the search has finished, store the best parameters in the parent run.
    best_params = study.best_params
    mlflow.log_params(best_params)
    # NOTE(review): the metric is named "best_train_f1", but the value is whatever `objective`
    # returns; confirm in optuna_aux that this is really an F1 score and not accuracy.
    mlflow.log_metric("best_train_f1", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Water Quality",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Rebuild the winning classifier from the best parameters. Any classifier name other than
    # the SVC variants and the decision tree falls through to the random forest.
    if best_params["classifier"] == "SVC_linear":
        model = SVC(C=best_params["svc_c"], kernel='linear', gamma='scale')
    elif best_params["classifier"] == "SVC_poly":
        model = SVC(C=best_params["svc_c"], kernel='poly',
                    gamma='scale', degree=best_params["svc_poly_degree"])
    elif best_params["classifier"] == "SVC_rbf":
        model = SVC(C=best_params["svc_c"], kernel='rbf', gamma='scale')
    elif best_params["classifier"] == "DecisionTreeClassifier":
        model = DecisionTreeClassifier(max_depth=best_params["tree_max_depth"])
    else:
        model = RandomForestClassifier(max_depth=best_params["rf_max_depth"],
                                       n_estimators=best_params["rf_n_estimators"])

    # Train the selected model on the full training split.
    model = model.fit(X_train, y_train.to_numpy().ravel())

    # Evaluate on the held-out test split and log the results.
    y_true_test = y_test.to_numpy().ravel()
    y_pred = model.predict(X_test)
    test_f1 = compute_f1_score(y_true_test, y_pred)
    # Backward compatibility: the model-registration cell reads the test F1 from the global
    # name `f1_score`, so keep publishing the value under that name as before.
    f1_score = test_f1
    accuracy = accuracy_score(y_true_test, y_pred)
    mlflow.log_metric("test_f1", test_f1)
    mlflow.log_metric("accuracy", accuracy)

    # Log the correlation and information-gain figures as artifacts.
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")
    mlflow.log_figure(figure=information_gain_plot, artifact_file="information_gain_plot.png")

    # Save the model artifact, with a signature inferred from the training data.
    artifact_path = "model"

    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="water_quality_model_dev",
        metadata={"model_data_version": 1}
    )

    # Location of the saved model in MLflow; used by the cells that load and register it.
    model_uri = mlflow.get_artifact_uri(artifact_path)


Initial trial 0 achieved value: 0.675621346844833
Trial 6 achieved value: 0.6764351828854338 with  0.1203% improvement
Trial 14 achieved value: 0.6768425148609938 with  0.0602% improvement
Trial 21 achieved value: 0.6768425148609939 with  0.0000% improvement
Trial 25 achieved value: 0.6784701869421953 with  0.2399% improvement
Trial 26 achieved value: 0.681318199129038 with  0.4180% improvement
Trial 27 achieved value: 0.6874281787624393 with  0.8888% improvement
Trial 32 achieved value: 0.6894606990876426 with  0.2948% improvement
Trial 86 achieved value: 0.6898696868842416 with  0.0593% improvement
Trial 110 achieved value: 0.6906860066564006 with  0.1182% improvement


Successfully registered model 'water_quality_model_dev'.
2024/04/28 21:20:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: water_quality_model_dev, version 1
Created version '1' of model 'water_quality_model_dev'.


## Testeando el modelo

Una vez que el modelo fue entrenado, podemos levantarlo y testearlo de una forma agnóstica a donde está guardado.

In [18]:
# Load the logged sklearn model back through its MLflow artifact URI.
loaded = mlflow.sklearn.load_model(model_uri)

Downloading artifacts: 100%|██████████| 9/9 [00:00<00:00, 815.13it/s]


In [19]:
import numpy as np
# One hand-written sample: a flat list of feature values.
test_data = [
    -1.41456982, -0.90755244, -0.26, 0.56886629, 2.52275163,
    -0.91573277, 0.39324227, 0.13926734, 1.38534189,
]
# The model expects a 2-D input, so present the sample as a single row.
single_row = np.array(test_data).reshape([1, -1])
loaded.predict(single_row)



array([1.])

## Registramos el modelo 

Realizamos el registro del modelo en MLflow. En este registro se pone el modelo productivo que luego se usará para servir en formato on-line.

In [20]:
from mlflow import MlflowClient

from mlflow.exceptions import MlflowException

client = MlflowClient()
name = "water_quality_model_prod"
desc = "This classifier predict water potability (1 = potable)"

# Create the production registered model. On a re-run the model already exists and the
# registry raises RESOURCE_ALREADY_EXISTS; tolerate exactly that case so the cell stays
# re-runnable, and let every other error propagate.
try:
    client.create_registered_model(name=name, description=desc)
except MlflowException as exc:
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise

# Store the model hyperparameters as tags on the model version, plus the model class name
# and the test F1. `f1_score` is expected to hold the float computed in the training cell.
tags = model.get_params()
tags["model"] = type(model).__name__
tags["f1-score"] = f1_score

# Save the model version.
# NOTE(review): the run id is taken from the third-to-last path segment of `model_uri`,
# which assumes a URI shaped like ".../<run_id>/artifacts/model" -- confirm for the
# configured artifact store.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Give this version the "champion" alias so the on-line serving process can load it by alias.
client.set_registered_model_alias(name, "champion", result.version)

2024/04/28 21:20:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: water_quality_model_prod, version 1
