# Sample

1. Model Registration
2. Model Tracking 
3. Auto Logging


In [1]:
# pip install --upgrade mlflow

# mlflow tracking: Setup new experiment

In [2]:
import mlflow
from mlflow.models import infer_signature
import mlflow.sklearn

# MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("DT_FCM_Cluster_Model")

<Experiment: artifact_location='mlflow-artifacts:/322767974938960504', creation_time=1713984498197, experiment_id='322767974938960504', last_update_time=1713984498197, lifecycle_stage='active', name='DT_FCM_Cluster_Model', tags={}>

# Model

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

# Auto logging
mlflow.autolog()

# Data loading
path = "https://raw.githubusercontent.com/McGill-MMA-EnterpriseAnalytics/Customer-Personality-Analysis-2.0/main/Data/Clustered%20Data/Clustered_Data(FCM).csv?token=GHSAT0AAAAAACL3VCJAJEBP7YLR57PARZ2CZRJTHGA"
df = pd.read_csv(path)
X = df.drop(['ID', 'Cluster'], axis=1)  # Drop the ID and Clusters column
y = df['Cluster']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model parameters
params = {"random_state": 100}

# Decision Tree Classifier
clf = DecisionTreeClassifier(**params)
clf.fit(X_train_scaled, y_train)

# Predictions and accuracy
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
accuracy = clf.score(X_test_scaled, y_test)

2024/04/24 19:28:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/04/24 19:28:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2024/04/24 19:28:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '65adc5f09307429eaedd76df0cba33f6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


              precision    recall  f1-score   support

           0       0.95      0.96      0.95       180
           1       0.92      0.92      0.92       146
           2       0.86      0.84      0.85       121

    accuracy                           0.91       447
   macro avg       0.91      0.91      0.91       447
weighted avg       0.91      0.91      0.91       447



In [4]:
with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.set_tag("Training Info", "Basic DT model for GMM Cluster Interpretation")
    signature = infer_signature(X_train_scaled, y_pred)
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="DT_GMM_Interpre_model",
        signature=signature,
        input_example=X_train_scaled[0:1],
        registered_model_name="DT_GMM_Cluster_Model"
    )

Registered model 'DT_GMM_Cluster_Model' already exists. Creating a new version of this model...
2024/04/24 19:29:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DT_GMM_Cluster_Model, version 6
Created version '6' of model 'DT_GMM_Cluster_Model'.


In [5]:
# Model retrieval and prediction check
loaded_model = mlflow.pyfunc.load_model(f"models:/DT_GMM_Cluster_Model/Production")
predictions = loaded_model.predict(X_test_scaled)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/04/24 19:29:02 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [6]:
# Creating a result DataFrame
result = pd.DataFrame(X_test_scaled, columns=X.columns)
result["actual_class"] = y_test
result["predicted_class"] = predictions

result.head()

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,actual_class,predicted_class
0,0.232143,0.294693,0.0,0.5,0.959596,0.240617,0.0,0.077146,0.050193,0.098859,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0
1,0.5,0.381941,0.0,0.5,0.010101,0.253351,0.0,0.109049,0.374517,0.653992,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0
2,0.428571,0.494189,0.0,0.0,0.353535,0.323727,0.371859,0.065545,0.65251,0.140684,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,1
3,0.732143,0.216618,0.5,0.0,0.080808,0.007373,0.025126,0.012181,0.046332,0.019011,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0
4,0.642857,0.142927,0.5,0.0,0.070707,0.009383,0.035176,0.0058,0.042471,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0
