# Productionize Customer Segment Prediction Model

1. Model Registration
2. Model Tracking 
3. Auto Logging


In [66]:
pip install --upgrade mlflow

Note: you may need to restart the kernel to use updated packages.


# MLflow tracking: Set up a new experiment

In [67]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

# MLflow tracking: point the client at the tracking server and select the
# experiment that subsequent runs are recorded under.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Customer_Segment_Prediction_Model"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/991329419849384240', creation_time=1714001602979, experiment_id='991329419849384240', last_update_time=1714001602979, lifecycle_stage='active', name='Customer_Segment_Prediction_Model', tags={}>

# Model Training

In [74]:
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from mlflow.models import infer_signature
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Enable auto logging
mlflow.xgboost.autolog()

# Data loading
# The CSV location can be overridden through the CUSTOMER_SEGMENT_DATA
# environment variable; when it is unset the original local path is used.
DEFAULT_DATA_PATH = "/Users/kellyliu/Documents/GitHub/Customer-Personality-Analysis-2.0/Data/Clustered Data/Clustered_Data(GMM).csv"
path = os.environ.get("CUSTOMER_SEGMENT_DATA", DEFAULT_DATA_PATH)
df = pd.read_csv(path)
X = df.drop(['ID', 'Cluster'], axis=1)  # Drop the ID and Cluster columns
y = df['Cluster']

# Split the data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply to the test split
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model parameters for grid search
params = {
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [10, 50],
    "subsample": [0.5, 0.7, 1.0]
}

# Initialize the classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', random_state=42)

# Setup GridSearchCV (2-fold cross-validation, selecting on accuracy)
grid_search = GridSearchCV(xgb_clf, param_grid=params, scoring='accuracy', cv=2, verbose=1)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best estimator
best_clf = grid_search.best_estimator_

# Predictions and accuracy on the held-out test split
y_pred = best_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)

# Infer the signature from inputs and the predictions the model produced for
# those same rows, so the recorded output schema matches what predict() returns.
signature = infer_signature(X_test_scaled, y_pred)

# Log inside an explicit run so the run is ended when the block exits instead
# of being left active for later cells.
# NOTE(review): start_run() raises if another run is still active in this
# kernel; call mlflow.end_run() first when re-running interactively.
with mlflow.start_run():
    # Logging the best parameters and accuracy
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("accuracy", accuracy)

    # Log the model with signature
    mlflow.xgboost.log_model(best_clf, "model", signature=signature)

print("Best parameters found: ", grid_search.best_params_)
print("Accuracy: ", accuracy)

# Model Logging and Registering

In [71]:
# Toggle for this cell: set to "REGISTRY" to log the tuned model in a new run
# and register it in the MLflow Model Registry. Any other value skips the cell.
option = "NONE"

if option == "REGISTRY":
    with mlflow.start_run():
        # Log the hyperparameters of the selected model (not the search grid)
        # together with its test accuracy.
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", accuracy)

        # Set a tag describing the run
        mlflow.set_tag("Training Info", "Model for Customer Segment Prediction")

        # Infer the signature from the test inputs and the predictions made
        # for those same rows.
        signature = infer_signature(X_test_scaled, y_pred)

        # Log the model with its signature and register it in the MLflow model
        # registry under the name "Customer_Segment_Prediction_Model"
        mlflow.sklearn.log_model(
            sk_model=best_clf,
            artifact_path="model",
            signature=signature,
            registered_model_name="Customer_Segment_Prediction_Model"
        )


In [73]:
# Model retrieval and prediction check:
# load the registered model from its "Staging" URI through the generic pyfunc
# interface and score the first scaled test row.
staging_model_uri = "models:/Customer_Segment_Prediction_Model/Staging"
loaded_model = mlflow.pyfunc.load_model(staging_model_uri)

input_data = X_test_scaled[:1]
predictions = loaded_model.predict(input_data)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Show what the registry-loaded model predicted for the sample row.
prediction_label = "Model predictions: "
print(prediction_label, predictions)

Model predictions:  [2]


In [77]:
# Install the Kubernetes CLI with Homebrew, then print the kubectl version.
# Both lines are shell escapes, so `brew` and `kubectl` must be resolvable on
# the PATH of the shell that the kernel spawns.
!brew install kubernetes-cli
!kubectl version

zsh:1: command not found: brew


zsh:1: command not found: kubectl
