# Table of Contents

* [Notebook Scenario](#1)
* [Using MLflowUtils class created for the context of Use Case - Alianza Caoba](#2)
* [Demo Steps](#3)
* [Conclusions](#4)
* [References](#5)

# Notebook Scenario
> Nota: Este Jupyter Notebook en la mayor medida esta en Ingles. Se encuentra en español las partes del contexto y habra comentarios con spanglish. ;)

**Customer Challenge**
The Transportation Superintendence has a portal for receiving transportation-related PQRs. However, they receive around 3000 PQRs per month, and the response is handled manually, causing a delay of up to 2.5 months in resolving and closing cases.

**Proposed solution**
The proposed solution is a classification model that allows determining the criticality level of each PQR, enabling prioritized responses for each case. 🚀🔍

# Using MLflowUtils class created for the context of Use Case - Alianza Caoba

In [4]:
#mlflow_utils.py

import json
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from pathlib import Path

class MLFlowUtils:
    """
    :Date: 27-07-2023
    :Version: 0.1
    :Author: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
    :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
    """
    def __init__(self, config_name = None):
        config_path = Path("../config/experiment_config.json")
        with open(config_path, 'r', encoding='utf-8') as file:
            self.config = json.load(file)[config_name]

        self.params = self.config.get("params", {})
        self.tags = self.config.get("tags", {})

        #self.experiment_name = self.config.get("experiment_name")
        #mlflow.set_experiment(self.experiment_name)

    def log_params(self):
        """Registra los parámetros en MLflow"""
        mlflow.log_params(self.params)

    def log_tags(self):
        """Registra los tags en MLflow"""
        mlflow.set_tags(self.tags)

    def log_model(self, model, artifact_path, signature=None):
        if signature:
            mlflow.sklearn.log_model(model, artifact_path, signature=signature)
        else:
            mlflow.sklearn.log_model(model, artifact_path)

    def log_metrics(self, metrics):
        for key, value in metrics.items():
            mlflow.log_metric(key, value)

    def log_artifact(self, artifact_path):
        mlflow.log_artifact(artifact_path)

    def end_run(self):
        mlflow.end_run()

    def load_model(self, run_id, artifact_path):
        """Carga un modelo desde MLflow basado en run_id y artifact_path."""
        logged_model = f"runs:/{run_id}/{artifact_path}"
        return mlflow.sklearn.load_model(logged_model)

    def register_model(self, run_id, artifact_path, model_name):
        """Registra el modelo en MLflow."""
        model_uri = f"runs:/{run_id}/{artifact_path}"
        mlflow.register_model(model_uri, model_name)

        # Devuelve la última versión del modelo registrado
        client = MlflowClient()
        model_version_details = client.get_latest_versions(model_name, stages=["None"])
        if model_version_details:
            return model_version_details[0].version
        return None

    def set_model_stage(self, model_name, version, stage):
        client = MlflowClient()
        client.transition_model_version_stage(name=model_name, version=version, stage=stage)


In [5]:
#dir(MLFlowUtils)


# Step 3.1 Setting up MLflow URI and Experiment Name in order to track from Kmeans-pocc-02-transporte 

In [6]:
#Modelo_KMeans.py

#General Libraries
import os
import sys
import numpy as np
import pandas as pd
import json
import joblib
from pathlib import Path
from datetime import datetime

#Analytics Libraries
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

#MLflow libraries
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature
from mlflow.pyfunc import PythonModel

def cargar_datos():
    """Carga los datos desde un archivo CSV."""
    camino = Path('../data/raw/pqr_with_variables_for_Kmeans.csv')
    data = pd.read_csv(camino)
    return data

def procesar_datos(data):
    """Procesa los datos para el modelo KMeans."""
    data['FECHA_RADICADO'] = pd.to_datetime(data['FECHA_RADICADO'], errors='coerce', format='%Y-%m-%d')
    hoy = datetime.now()
    data['dias'] = (hoy - data['FECHA_RADICADO']).dt.days
    X = data[['Cantidad Bigramas', 'Cantidad Trigramas', 'Topico', 'Cantidad Verbos', 'Cantidad Adjetivos', 'Cantidad Adverbios', 'Cantidad Sustantivos', 'Cantidad_Palabras', 'Promedio_Palabras', 'DIVERSIDAD_LEXICA']]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

def entrenar_kmeans(X, n_clusters):
    """Entrena el modelo KMeans con un número determinado de clusters."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    return kmeans, labels

def asignar_criticidad(cluster):
    """Asigna una criticidad basada en el cluster."""
    if cluster == 0:
        return "alta"
    elif cluster == 1:
        return "media"
    elif cluster == 2:
        return "baja"
    else:
        return "muy baja"

def guardar_datos(data):
    """Guarda los datos con la criticidad en un archivo CSV."""
    data.to_csv(Path("../data/analytics/dataPredicciones.csv"), index=False)

def guardar_modelo_kmeans(modelo, path_modelo):
    """Guarda el modelo en un archivo .pkl."""
    joblib.dump(modelo, path_modelo)

if __name__ == "__main__":
    url_uri = "http://localhost:5000"
    experiment_name = "puj-202301-poc-02-sde-transporte-test1"

    # Setting tracking uri in localhost
    mlflow.set_tracking_uri(url_uri)

    print(f'url_uri used is: {url_uri}')
    print(f'The experiment to configure is: {experiment_name}')

    # Setting experiment name in localhost URI
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
    print("Creation timestamp: {}".format(experiment.creation_time))


url_uri used is: http://localhost:5000
The experiment to configure is: puj-202301-poc-02-sde-transporte-test1
Experiment_id: 461238389451589685
Artifact Location: mlflow-artifacts:/461238389451589685
Tags: {}
Lifecycle_stage: active
Creation timestamp: 1697824893719


# Step 3.2: Setting up tags from config/experiment_config using in-house component MLflowUtils

In [7]:
#Using MLFLowUtils module to set tags and params

tracker = MLFlowUtils(config_name="Modelo_KMeans")

# Loading and processing data
data = cargar_datos()
X_scaled = procesar_datos(data)


In [8]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24358 entries, 0 to 24357
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   #                     24358 non-null  int64         
 1   DESCRIPCION_HECHOS    24352 non-null  object        
 2   RADICADO ENTRADA      24358 non-null  int64         
 3   Modo                  10179 non-null  object        
 4   Motivo                10179 non-null  object        
 5   MEDIO_RECEPCION       24358 non-null  object        
 6   TIPO_PQR              24343 non-null  object        
 7   FECHA_RADICADO        24358 non-null  datetime64[ns]
 8   Textos Lematizados    24357 non-null  object        
 9   Verbos                24358 non-null  object        
 10  Adjetivos             24358 non-null  object        
 11  Adverbios             24358 non-null  object        
 12  Sustantivos           24358 non-null  object        
 13  Bigramas        

In [9]:
# Print a portion of the array
print(f"Portion of the X_scaled that is type: {type(X_scaled)}")
print(X_scaled[:1, :2])  # Prints the first row and first two columns of the array


Portion of the X_scaled that is type: <class 'numpy.ndarray'>
[[-0.89334889 -0.90238409]]


# Step 4: Running MLflow over data to tracked best model of Kmeans

In [10]:
clusters = 5
random_state = 42
for k in range(2, clusters):
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Tracking Parameters
        mlflow.log_param("n_clusters", k)
        mlflow.log_param("random_state", random_state)

        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(X_scaled)
        labels = kmeans.labels_

        # Calculate and log metrics
        inertia = kmeans.inertia_
        silhouette_avg = silhouette_score(X_scaled, labels)
        dunn_score = davies_bouldin_score(X_scaled, labels)

        # Tracking Metrics
        mlflow.log_metric("Inertia", inertia)
        mlflow.log_metric("Silhouette_Score", silhouette_avg)
        mlflow.log_metric("Dunn_Index", dunn_score)

        tracker.log_tags()

        #Log model using infer_signature
        signature = infer_signature(pd.DataFrame(X_scaled), kmeans.predict(X_scaled))
        mlflow.sklearn.log_model(kmeans, "kmeans_model_"+str(k), signature=signature)

#Note: You can use mlflow.autolog()
#Read more in this link: https://mathdatasimplified.com/2023/10/20/streamlinw-experiment-logs-with-mlflows-auto-logging/

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


>Note: You can use mlflow.autolog()

>Read more in this link: [Streamline Experiment Logs with MLFlow's Auto-Logging](https://mathdatasimplified.com/2023/10/20/streamlinw-experiment-logs-with-mlflows-auto-logging/)

In [11]:
tracker.tags

{'client_name': 'Superintendencia de Transporte',
 'commercial_sector': 'Transporte',
 'model_programming_language': 'Python',
 'planguage_version': 'python3.10',
 'type_of_model': 'Modelo de Clusterización',
 'compute': 'local',
 'model_name': 'kmeans',
 'repo_url_origin': 'https://gitlab.com/CAOBA-Central/consultorias/pruebas-concepto-sde/pocc-02-sde-transporte.git',
 'dataset_url_raw_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/raw?ref_type=heads',
 'dataset_url_analytics_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/analytics?ref_type=heads'}

# Step 5.1: Establishing Model Registry

In [12]:
# # Registrar el modelo
model_name = "kmeans_model_4"
run_id = "5f787482048f4aac98afd8eaf6a6f463"
registered_model_name="kmeans_model_4_V1"

# Get the run object using the run_id
run_info = mlflow.get_run(run_id)

# Retrieve the artifact path for the run
artifact_uri = run_info.info.artifact_uri
model_uri=f"runs:/{run_id}/{model_name}"

#print(run_info)
#print(artifact_uri)
print(model_uri)


runs:/5f787482048f4aac98afd8eaf6a6f463/kmeans_model_4


In [13]:
# Register the model using the artifact path which is "runs:/{run_id}/{model_name}"
registered_model = mlflow.register_model(
model_uri=f"runs:/{run_id}/{model_name}",
name=registered_model_name,
await_registration_for=300  # Optional: Wait for up to 5 minutes (300 seconds) for registration completion
)

print(f"Model registered: {registered_model}")


Successfully registered model 'kmeans_model_4_V1'.
2023/10/20 13:08:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: kmeans_model_4_V1, version 1


Model registered: <ModelVersion: aliases=[], creation_timestamp=1697825284660, current_stage='None', description='', last_updated_timestamp=1697825284660, name='kmeans_model_4_V1', run_id='5f787482048f4aac98afd8eaf6a6f463', run_link='', source='mlflow-artifacts:/461238389451589685/5f787482048f4aac98afd8eaf6a6f463/artifacts/kmeans_model_4', status='READY', status_message='', tags={}, user_id='', version='1'>


Created version '1' of model 'kmeans_model_4_V1'.


# Step 5.2: Changing the stage of the registered model



In [14]:
# To which register model, I want to do stage change
registered_model_name="kmeans_model_4_V1"


#Get registered model information to use to transition
client = MlflowClient()
model_details = client.get_registered_model(registered_model_name)
latest_version = max([int(v.version) for v in model_details.latest_versions])
print(latest_version)

# Transition the latest version of the model to 'Staging'
client.transition_model_version_stage(
    name=registered_model_name,
    version=latest_version,
    stage="Staging"
)


1


<ModelVersion: aliases=[], creation_timestamp=1697825284660, current_stage='Staging', description='', last_updated_timestamp=1697825348138, name='kmeans_model_4_V1', run_id='5f787482048f4aac98afd8eaf6a6f463', run_link='', source='mlflow-artifacts:/461238389451589685/5f787482048f4aac98afd8eaf6a6f463/artifacts/kmeans_model_4', status='READY', status_message='', tags={}, user_id='', version='1'>

# Step 6.1: Consuming Model

In [18]:
# Use the run_id to define the logged_model
logged_model = f"runs:/{run_id}/{model_name}"

print(f"Logged Model to Load is: {logged_model}")

# Load the model with load_model function from mlflow.sklearn
loaded_model = mlflow.sklearn.load_model(logged_model)


Logged Model to Load is: runs:/5f787482048f4aac98afd8eaf6a6f463/kmeans_model_4


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
data_for_predictions = Path('../data/raw/pqr_with_variables_for_predictions.csv')
data_for_predictions = pd.read_csv(data_for_predictions)
X_scaled_for_predictions = procesar_datos(data_for_predictions)

# Make predictions on a pandas DataFrame
predictions = loaded_model.predict(pd.DataFrame(X_scaled_for_predictions))

# Añadir la criticidad al dataset
data_for_predictions['cluster'] = loaded_model.labels_
data_for_predictions['Criticidad'] = data_for_predictions['cluster'].apply(asignar_criticidad)
data_for_predictions['Predicciones'] = predictions

vars_to_look = ['TIPO_PQR','Topico','TipoPQRClasificado','cluster','Criticidad']
data_for_predictions[vars_to_look]


Unnamed: 0,TIPO_PQR,Topico,TipoPQRClasificado,cluster,Criticidad
0,DENUNCIA,3,Denuncia,2,baja
1,QUEJA,1,Felicitacion,1,media
2,PETICION,0,Peticion,1,media
3,PETICION,1,Felicitacion,1,media
4,DENUNCIA,3,Denuncia,1,media
...,...,...,...,...,...
24353,SOLICITUD,2,Reclamo y Queja,3,muy baja
24354,PETICION,0,Peticion,0,alta
24355,QUEJA,3,Denuncia,2,baja
24356,RECLAMO,1,Felicitacion,2,baja


In [32]:
#data_for_predictions[vars_to_look].groupby(['TipoPQRClasificado','Criticidad']).size()


cluster
0    9231
1    6961
2    6867
3    1299
dtype: int64

# Step 6.2: Tracking artifacts


In [20]:
#Data with variables for Kmeans
local_artifact_path1 = "../data/raw/pqr_with_variables_for_Kmeans.csv"

#Data with variables for predictions
local_artifact_path2 = "../data/raw/pqr_with_variables_for_predictions.csv"

#Data with predictions
local_artifact_path3 = "../data/analytics/pqr_with_variables_with_predictions.csv"
data_for_predictions.to_csv(local_artifact_path3)


In [21]:
with mlflow.start_run(run_id=run_id):

    # Log the artifact to the current run
    tracker.log_artifact(local_artifact_path1)
    tracker.log_artifact(local_artifact_path2)
    tracker.log_artifact(local_artifact_path3)


In [22]:
tracker.end_run()

# Conclusions

- The needs of analytics teams evolve, and MLflow is a great tool to streamline the experiment traceability process for the business. 📊🔍

- There is high cost-effectiveness in initiating MLOps processes with MLflow. 💰✅
- It is essential to establish governance in MLOps projects within analytics teams. 🏛️🤝

# References

- https://docs.aws.amazon.com/wellarchitected/latest/machine-learning-lens/best-practices-by-ml-lifecycle-phase.html
- https://media.giphy.com/media/MB139ObkGalArgZOvz/giphy.gif
- https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning
- https://mlflow.org/docs/latest/quickstart_mlops.html#
- https://crunchingthedata.com/cs01-mlflow-tracking/
- https://mlflow.org/docs/latest/tracking.html#concepts