# Table of Contents

* [Introduction](#1)
* [Workspace Preparation](#2)
* [Data Preparation](#3)
* [Getting x_train, x_test, y_train, y_test](#4)
* [MLFlow workspace preparation and Use](#5)
* [Conclusions](#6)
* [References](#7)

# Introduction

> Nota: Este Jupyter Notebook en la mayor medida esta en Ingles. Se encuentra en español las partes del contexto y habra comentarios con spanglish. ;)

### Escenario del Notebook

- XX

## Using MLflowUtils class created for the context of Use Case - Alianza Caoba

In [1]:
import json
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from pathlib import Path

class MLFlowUtils:

    def __init__(self, config_name = None):
        config_path = Path("../config/experiment_config.json")
        with open(config_path, 'r', encoding='utf-8') as file:
            self.config = json.load(file)[config_name]

        self.params = self.config.get("params", {})
        self.tags = self.config.get("tags", {})

        #self.experiment_name = self.config.get("experiment_name")
        #mlflow.set_experiment(self.experiment_name)

    def log_params(self):
        """Registra los parámetros en MLflow"""
        mlflow.log_params(self.params)

    def log_tags(self):
        """Registra los tags en MLflow"""
        mlflow.set_tags(self.tags)

    def log_model(self, model, artifact_path, signature=None):
        if signature:
            mlflow.sklearn.log_model(model, artifact_path, signature=signature)
        else:
            mlflow.sklearn.log_model(model, artifact_path)

    def log_metrics(self, metrics):
        for key, value in metrics.items():
            mlflow.log_metric(key, value)

    def log_artifact(self, artifact_path):
        mlflow.log_artifact(artifact_path)

    def end_run(self):
        mlflow.end_run()

    def load_model(self, run_id, artifact_path):
        """Carga un modelo desde MLflow basado en run_id y artifact_path."""
        logged_model = f"runs:/{run_id}/{artifact_path}"
        return mlflow.sklearn.load_model(logged_model)

    def register_model(self, run_id, artifact_path, model_name):
        """Registra el modelo en MLflow."""
        model_uri = f"runs:/{run_id}/{artifact_path}"
        mlflow.register_model(model_uri, model_name)

        # Devuelve la última versión del modelo registrado
        client = MlflowClient()
        model_version_details = client.get_latest_versions(model_name, stages=["None"])
        if model_version_details:
            return model_version_details[0].version
        return None

    def set_model_stage(self, model_name, version, stage):
        client = MlflowClient()
        client.transition_model_version_stage(name=model_name, version=version, stage=stage)



* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
#dir(MLFlowUtils)


## Step 3.1 Setting up MLflow URI and Experiment Name in order to track from Kmeans-pocc-02-transporte 

In [23]:
#Modelo_KMeans.py

#General Libraries
import os
import sys
import numpy as np
import pandas as pd
import json
import joblib
from pathlib import Path
from datetime import datetime

#Analytics Libraries
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

#MLflow libraries
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature
from mlflow.pyfunc import PythonModel

def cargar_datos():
    """Carga los datos desde un archivo CSV."""
    camino = Path('../data/raw/pqr_with_variables_for_Kmeans.csv')
    data = pd.read_csv(camino)
    return data

def procesar_datos(data):
    """Procesa los datos para el modelo KMeans."""
    data['FECHA_RADICADO'] = pd.to_datetime(data['FECHA_RADICADO'], errors='coerce', format='%Y-%m-%d')
    hoy = datetime.now()
    data['dias'] = (hoy - data['FECHA_RADICADO']).dt.days
    X = data[['Cantidad Bigramas', 'Cantidad Trigramas', 'Topico', 'Cantidad Verbos', 'Cantidad Adjetivos', 'Cantidad Adverbios', 'Cantidad Sustantivos', 'Cantidad_Palabras', 'Promedio_Palabras', 'DIVERSIDAD_LEXICA']]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

def entrenar_kmeans(X, n_clusters):
    """Entrena el modelo KMeans con un número determinado de clusters."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    return kmeans, labels

def asignar_criticidad(cluster):
    """Asigna una criticidad basada en el cluster."""
    if cluster == 0:
        return "alta"
    elif cluster == 1:
        return "media"
    elif cluster == 2:
        return "baja"
    else:
        return "muy baja"

def guardar_datos(data):
    """Guarda los datos con la criticidad en un archivo CSV."""
    data.to_csv(Path("../data/analytics/dataPredicciones.csv"), index=False)

def guardar_modelo_kmeans(modelo, path_modelo):
    """Guarda el modelo en un archivo .pkl."""
    joblib.dump(modelo, path_modelo)

if __name__ == "__main__":
    url_uri = "http://localhost:5000"
    experiment_name = "puj-202301-poc-02-sde-transporte-test1"

    # Setting tracking uri in localhost
    mlflow.set_tracking_uri(url_uri)

    print(f'url_uri used is: {url_uri}')
    print(f'The experiment to configure is: {experiment_name}')

    # Setting experiment name in localhost URI
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
    print("Creation timestamp: {}".format(experiment.creation_time))


url_uri used is: http://localhost:5000
The experiment to configure is: puj-202301-poc-02-sde-transporte-test1
Experiment_id: 995934375049530325
Artifact Location: mlflow-artifacts:/995934375049530325
Tags: {}
Lifecycle_stage: active
Creation timestamp: 1697663395720


## Step 3: Setting up tags of experiment using in-house component MLflowUtils

In [7]:
#Using MLFLowUtils module to set tags and params

tracker = MLFlowUtils(config_name="Modelo_KMeans")

# Loading and processing data
data = cargar_datos()
X_scaled = procesar_datos(data)


In [9]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24358 entries, 0 to 24357
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   #                     24358 non-null  int64         
 1   DESCRIPCION_HECHOS    24352 non-null  object        
 2   RADICADO ENTRADA      24358 non-null  int64         
 3   Modo                  10179 non-null  object        
 4   Motivo                10179 non-null  object        
 5   MEDIO_RECEPCION       24358 non-null  object        
 6   TIPO_PQR              24343 non-null  object        
 7   FECHA_RADICADO        24358 non-null  datetime64[ns]
 8   Textos Lematizados    24357 non-null  object        
 9   Verbos                24358 non-null  object        
 10  Adjetivos             24358 non-null  object        
 11  Adverbios             24358 non-null  object        
 12  Sustantivos           24358 non-null  object        
 13  Bigramas        

In [17]:
# Print a portion of the array
print(f"Portion of the X_scaled that is type: {type(X_scaled)}")
print(X_scaled[:1, :2])  # Prints the first row and first two columns of the array


Portion of the X_scaled that is type: <class 'numpy.ndarray'>
[[-0.89334889 -0.90238409]]


## Step 4: Running MLflow over data to tracked best model of Kmeans

In [18]:
clusters = 5
random_state = 42
for k in range(2, clusters):
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Tracking Parameters
        mlflow.log_param("n_clusters", k)
        mlflow.log_param("random_state", random_state)

        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(X_scaled)
        labels = kmeans.labels_

        # Calculate and log metrics
        inertia = kmeans.inertia_
        silhouette_avg = silhouette_score(X_scaled, labels)
        dunn_score = davies_bouldin_score(X_scaled, labels)

        # Tracking Metrics
        mlflow.log_metric("Inertia", inertia)
        mlflow.log_metric("Silhouette_Score", silhouette_avg)
        mlflow.log_metric("Dunn_Index", dunn_score)

        tracker.log_tags()

        #Log model using infer_signature
        signature = infer_signature(pd.DataFrame(X_scaled), kmeans.predict(X_scaled))
        mlflow.sklearn.log_model(kmeans, "kmeans_model_"+str(k), signature=signature)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [24]:
tracker.tags


{'client_name': 'Superintendencia de Transporte',
 'commercial_sector': 'Transporte',
 'model_programming_language': 'Python',
 'planguage_version': 'python3.10',
 'type_of_model': 'Modelo de Clusterización',
 'compute': 'local',
 'model_name': 'kmeans',
 'repo_url_origin': 'https://gitlab.com/CAOBA-Central/consultorias/pruebas-concepto-sde/pocc-02-sde-transporte.git',
 'dataset_url_raw_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/raw?ref_type=heads',
 'dataset_url_analytics_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/analytics?ref_type=heads'}

## Step 5.1: Establishing Model Registry

In [25]:
# # Registrar el modelo
model_name = "kmeans_model_4"
run_id = "257d0daeb7d3432a88fd4694062734c3"
registered_model_name="kmeans_model_4_V1"

# Get the run object using the run_id
run_info = mlflow.get_run(run_id)

# Retrieve the artifact path for the run
artifact_uri = run_info.info.artifact_uri
model_uri=f"runs:/{run_id}/{model_name}"

print(run_info)
print(artifact_uri)
print(model_uri)

# Register the model using the artifact path
registered_model = mlflow.register_model(
model_uri=f"runs:/{run_id}/{model_name}",
name=registered_model_name,
await_registration_for=300  # Optional: Wait for up to 5 minutes (300 seconds) for registration completion
)

print(f"Model registered: {registered_model}")


Successfully registered model 'kmeans_model_4_V1'.
2023/10/18 16:40:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: kmeans_model_4_V1, version 1


<Run: data=<RunData: metrics={'Dunn_Index': 1.7993000319863348,
 'Inertia': 149272.37728304684,
 'Silhouette_Score': 0.17681422936050237}, params={'n_clusters': '4', 'random_state': '42'}, tags={'client_name': 'Superintendencia de Transporte',
 'commercial_sector': 'Transporte',
 'compute': 'local',
 'dataset_url_analytics_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/analytics?ref_type=heads',
 'dataset_url_raw_p1': 'https://gitlab.com/CAOBA-Central/productos-caoba/datalab/analitica-como-servicio/miscellaneous/mlflow/mlflow-pocc-sde/mlflow-pocc-02-transporte/-/tree/develop/demo2/data/raw?ref_type=heads',
 'mlflow.log-model.history': '[{"run_id": "257d0daeb7d3432a88fd4694062734c3", '
                             '"artifact_path": "kmeans_model_4", '
                             '"utc_time_created": "2023-10-18 '
                             '21:27:44.167986"

Created version '1' of model 'kmeans_model_4_V1'.


Name: 01B4_MFlow_LR_train
Experiment_id: 24
Artifact Location: s3://mlflow-artifact-store-awscday/24
Tags: {}
Lifecycle_stage: active


## Step 5.2: Changing the stage of the registered model

# Conclusions

# References

# Other key functions and Info

In [248]:
def assert_experiment_names_equal(experiments, expected_names):
    actual_names = [e.name for e in experiments if e.name != "Default"]
    assert actual_names == expected_names, (actual_names, expected_names)

search_name='02B_MFlow_LR_train'
# Search for experiments with full_name
experiments = mlflow.search_experiments(filter_string="name = '02B_MFlow_LR_train'")
#assert_experiment_names_equal(experiments, [search_name])

print(f'%% Experiments by the name: {search_name} are: {len(experiments)}')
print(f'%% Making loop over experiments list \n')
for element in experiments:
    print("Name: {}".format(element.name))
    print("Experiment_id: {}".format(element.experiment_id))
    print("Artifact Location: {}".format(element.artifact_location))
    print("Tags: {}".format(element.tags))
    print("Lifecycle_stage: {}".format(element.lifecycle_stage))
    print("Creation timestamp: {}".format(element.creation_time))


%% Experiments by the name: 02B_MFlow_LR_train are: 0
%% Making loop over experiments list 



In [249]:
import mlflow

# Specify the name of the experiment you want to check
experiment_name = "02B2_MFlow_LR_train"

# Get the experiment by name
experiment = mlflow.get_experiment_by_name(experiment_name)

# Check if the experiment exists and if it is active
if experiment is not None and experiment.lifecycle_stage == "active":
    print("The experiment is active.")
else:
    print("The experiment is either not found or not active.")


The experiment is active.


In [243]:
# # Construct a Pandas DataFrame using iris flower data from a web URL
# dataset_source_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# df = pd.read_csv(dataset_source_url)
# # Construct an MLflow PandasDataset from the Pandas DataFrame, and specify the web URL
# # as the source
# dataset = mlflow.data.pandas_dataset.from_pandas(df, source=dataset_source_url)

# with mlflow.start_run(experiment_id=experiment_id):
#     # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
#     # dataset is used for model training
#     mlflow.log_input(dataset, context="training")

# # Retrieve the run, including dataset information
# run = mlflow.get_run(mlflow.last_active_run().info.run_id)
# dataset_info = run.inputs.dataset_inputs[0].dataset
# print(f"Dataset name: {dataset_info.name}")
# print(f"Dataset digest: {dataset_info.digest}")
# print(f"Dataset profile: {dataset_info.profile}")
# print(f"Dataset schema: {dataset_info.schema}")


In [245]:
# import mlflow

# from sklearn.model_selection import train_test_split
# from sklearn.datasets import load_diabetes
# from sklearn.ensemble import RandomForestRegressor

# mlflow.autolog()

# db = load_diabetes()
# X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# # Create and train models.
# rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
# rf.fit(X_train, y_train)

# # Use the model to make predictions on the test dataset.
# predictions = rf.predict(X_test)
# autolog_run = mlflow.last_active_run()

# mlflow.end_run()


In [246]:
#client = mlflow.MlflowClient()
#data = client.get_run(mlflow.active_run().info.run_id).data


In [None]:
# experiment_name_formlflow="01B3_MFlow_LR_train"
# #Set an experiment name, which must be unique and case sensitive
# experiment_id = mlflow.set_experiment(
#     experiment_name_formlflow
# )

# print(type(experiment_id),experiment_id.experiment_id)

# from mlflow import MlflowClient

# # Create an experiment with a name that is unique and case sensitive.
# client = MlflowClient()

# experiment_id=experiment_id.experiment_id
# # Fetch experiment metadata information
# experiment = client.get_experiment(experiment_id)
# print("Name: {}".format(experiment.name))
# print("Experiment_id: {}".format(experiment.experiment_id))
# print("Artifact Location: {}".format(experiment.artifact_location))
# print("Tags: {}".format(experiment.tags))
# print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))


# import json
# import plotly.express as px
# import mlflow
# import requests

# ### prepare sample files to log
# # test data
# df = px.data.iris()

# # sample CSV file
# df.to_csv("1_data_sample.csv")

# # sample pandas HTML file
# df.to_html("2_data_sample.html")

# # sample image
# r = requests.get("https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png")
# with open("3_image_sample.png", 'wb') as f:
#     f.write(r.content)

# # sample gif
# r = requests.get("https://media1.giphy.com/media/bU3YVJAAXckCI/giphy.gif")
# with open("4_gif_sample.gif", 'wb') as f:
#     f.write(r.content)

# # sample plotly plot - HTML
# fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species", marginal_y="rug", marginal_x="histogram")
# fig.write_html("5_plot_sample.html")

# # sample geojson
# with open("6_map_sample.geojson", "w+") as f:
#     data = requests.get("https://gist.githubusercontent.com/wavded/1200773/raw/e122cf709898c09758aecfef349964a8d73a83f3/sample.json").json()
#     f.write(json.dumps(data))

# ### log files to mlflow experiment
# with mlflow.start_run(experiment_id=experiment_id, run_name="file_display") as run:

#     mlflow.log_param("parameter","test")
#     mlflow.log_metric("the_answer",42.0)

#     mlflow.log_artifact("./1_data_sample.csv")
#     mlflow.log_artifact("./2_data_sample.html")
#     mlflow.log_artifact("./3_image_sample.png")
#     mlflow.log_artifact("./4_gif_sample.gif")
#     mlflow.log_artifact("./5_plot_sample.html")
#     mlflow.log_artifact("./6_map_sample.geojson")
