# productionize customer segment prediction model

1. Model Registration
2. Model Tracking 
3. Auto Logging


In [49]:
pip install --upgrade mlflow

Note: you may need to restart the kernel to use updated packages.


# mlflow tracking: Setup new experiment

In [50]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

# MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Customer_Segment_Prediction_Model")

<Experiment: artifact_location='mlflow-artifacts:/991329419849384240', creation_time=1714001602979, experiment_id='991329419849384240', last_update_time=1714001602979, lifecycle_stage='active', name='Customer_Segment_Prediction_Model', tags={}>

# Model

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
import mlflow
from mlflow.models import infer_signature

# Enable auto logging
mlflow.xgboost.autolog()

# Data loading
path = "https://raw.githubusercontent.com/McGill-MMA-EnterpriseAnalytics/Customer-Personality-Analysis-2.0/main/Data/Clustered%20Data/Clustered_Data(FCM).csv?token=GHSAT0AAAAAACL3VCJAJEBP7YLR57PARZ2CZRJTHGA"
df = pd.read_csv(path)
X = df.drop(['ID', 'Cluster'], axis=1)  # Drop the ID and Clusters column
y = df['Cluster']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model parameters for grid search
params = {
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [10, 50],
    "subsample": [0.5, 0.7, 1.0]
}

# Initialize the classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', random_state=42)  # Updated model initialization

# Setup GridSearchCV
grid_search = GridSearchCV(xgb_clf, param_grid=params, scoring='accuracy', cv=3, verbose=1)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best estimator
best_clf = grid_search.best_estimator_

# Predictions and accuracy
y_pred = best_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)

# Logging the best parameters and accuracy
mlflow.log_params(grid_search.best_params_)
mlflow.log_metric("accuracy", accuracy)

# Log the model with signature
signature = infer_signature(X_train_scaled, y_test.values.reshape(-1,1)) 
mlflow.xgboost.log_model(best_clf, "model", signature=signature)

print("Best parameters found: ", grid_search.best_params_)
print("Accuracy: ", accuracy)

2024/04/24 19:43:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6d6154182c894394830bf9ea7b169c8c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 24 candidates, totalling 72 fits


2024/04/24 19:43:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


              precision    recall  f1-score   support

           0       0.96      0.97      0.97       180
           1       0.98      0.95      0.97       146
           2       0.91      0.93      0.92       121

    accuracy                           0.95       447
   macro avg       0.95      0.95      0.95       447
weighted avg       0.95      0.95      0.95       447

Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.5}
Accuracy:  0.9530201342281879


In [52]:
import mlflow.sklearn

with mlflow.start_run():
    # Log parameters and metrics
    mlflow.log_params(params)
    mlflow.log_metric("accuracy", accuracy)

    # Set a tag describing the run
    mlflow.set_tag("Training Info", "Model for Customer Segment Prediction")

    # Infer signature of the input and output of the model
    signature = infer_signature(X_train_scaled, y_pred)
    
    # Log the model and register it in the MLflow model registry under the name "Customer_Segment_Prediction_Model"
    mlflow.sklearn.log_model(
        sk_model=best_clf,
        artifact_path="model",
        registered_model_name="Customer_Segment_Prediction_Model"
    )


Exception: Run with UUID c83e08b06dbc4957acdfacf78c435ead is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [56]:
# Model retrieval and prediction check
loaded_model = mlflow.pyfunc.load_model(f"models:/Customer_Segment_Prediction_Model/Production")
input_data = X_test_scaled[0:1]
predictions = loaded_model.predict(input_data)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [58]:
print("Model predictions: ", predictions)

Model predictions:  [2]
