# Model Registry

## Dependencies

In [None]:
import pandas as pd
import numpy as np

import time
from snowflake.ml.registry import Registry
from snowflake.snowpark.context import get_active_session

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

## Data and Model setup

In [None]:
# Attach to the notebook's active Snowflake session and select the database and
# schema so the unqualified table name below resolves.
session = get_active_session()

session.use_database("ML")
session.use_schema("RETAIL_STORE")

# Source table with the id columns dropped.
df_model_data = session.table('model_data').drop("CUSTOMER_ID", "OFFER_PRODUCT_ID")

LABEL_COLS = ["REPEATER_INT"]
FEATURE_COLS = [col for col in df_model_data.columns if col not in LABEL_COLS]

print(f"Feature Columns: {FEATURE_COLS}")

# Materialise features and label with a SINGLE query. Calling to_pandas() once
# for X and once for y issues two independent queries without an ORDER BY, so
# the two results are not guaranteed to come back in the same row order and
# labels could end up paired with the wrong feature rows.
model_pdf = df_model_data.to_pandas()

X = model_pdf.drop(columns=LABEL_COLS)
y = model_pdf[LABEL_COLS].values.ravel()  # 1-D label array, as scikit-learn expects

# 80/20 train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random-forest hyperparameters (the name suggests they were selected for
# accuracy — TODO confirm where they came from).
params_accuracy = {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}

# Seed the forest as well as the split: without random_state the bootstrap and
# feature sub-sampling differ on every run, so the reported metrics (and the
# model that gets logged) would not be reproducible.
model = RandomForestClassifier(**params_accuracy, random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Score the hold-out predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy:.4f}')

# Recall uses scikit-learn's default settings for recall_score.
recall = recall_score(y_true=y_test, y_pred=predictions)
print(f'Recall: {recall:.4f}')

## Registry

Store the model by initialising a registry in the target database and schema, then logging the model to it.

In [None]:
# Open a registry handle on ML.RETAIL_STORE using the notebook's active session.
sp_session = get_active_session()

reg = Registry(
    session=sp_session,
    database_name="ML",
    schema_name="RETAIL_STORE",
)

In [None]:
# Log the fitted model to the registry as version "v5".
# The evaluation metrics computed earlier are attached at log time so the
# version carries them from the moment it exists; float() keeps the stored
# values plain Python numbers.
# NOTE(review): the version name is hard-coded, so re-running this cell after
# "v5" has been created is expected to fail — bump the name or delete the old
# version first (confirm against the registry's behaviour).
mv = reg.log_model(model,
                   model_name="RandomForestClassifier",
                   version_name="v5",
                   conda_dependencies=["scikit-learn"],
                   comment="RandomForestClassifier-balanced",
                   metrics={"accuracy": float(accuracy), "recall": float(recall)},
                   sample_input_data=X_train.head(5))

Logging (storing) the model takes about 55 seconds.

In [None]:
# Fetch the registered model by name; the cell's last expression displays the
# result of show_versions().
m = reg.get_model(model_name="RandomForestClassifier")
m.show_versions()

In [None]:
# Look up version "v5" once and keep the handle; the previous bare
# m.version("v5") call discarded its result and only repeated the lookup.
v5 = m.version("v5")
print(v5.comment)

In [None]:
# Attach the evaluation scores to the v5 model version, one metric per call.
for metric_name, metric_value in (("accuracy", accuracy), ("recall", recall)):
    v5.set_metric(metric_name, metric_value)

In [None]:
# Display the metrics stored on the v5 model version (the cell's last
# expression is rendered as its output).
v5.show_metrics()

## Inference on stored models

In [None]:
# Inference through the registry: resolve the stored model version and call its
# "predict" function via ModelVersion.run, timing the lookup and the prediction
# separately.
sp_session = get_active_session()

reg = Registry(session=sp_session, database_name="ML", schema_name="RETAIL_STORE")
# predict
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the wall clock is adjusted.
start_time = time.perf_counter()

m = reg.get_model("RandomForestClassifier")
# Bind the registry handle to `mv`, NOT `model`: `model` is the fitted
# scikit-learn estimator from the training cell, and overwriting it would make
# a later re-run of the fit/log cells operate on a registry handle instead.
# NOTE(review): this reads version "v4" while the cells above log "v5" —
# confirm that v4 is the version intended for this benchmark.
mv = m.version("v4")

end_time = time.perf_counter()
load_time = end_time - start_time
print(f"Load time: {load_time}")

start_time = time.perf_counter()

# NOTE(review): this rebinds `predictions` (previously the local estimator's
# output) to whatever ModelVersion.run returns.
predictions = mv.run(X_test, function_name="predict")

end_time = time.perf_counter()
prediction_time = end_time - start_time
total_time = prediction_time + load_time
print(f"Prediction time: {prediction_time}")
print(f"Total time {total_time}")

In [None]:
# Inference with a locally loaded model: pull the stored model object out of
# the registry with ModelVersion.load and call predict() on it directly,
# timing the load and the prediction separately.
sp_session = get_active_session()

reg = Registry(session=sp_session, database_name="ML", schema_name="RETAIL_STORE")
# predict

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the wall clock is adjusted.
start_time = time.perf_counter()

m = reg.get_model("RandomForestClassifier")
# NOTE(review): this reads version "v4" while the cells above log "v5" —
# confirm that v4 is the version intended for this benchmark.
mv = m.version("v4")

clf = mv.load(force=True) # Requires exact same model, force = true

end_time = time.perf_counter()
load_time = end_time - start_time
print(f"Load time: {load_time}")

start_time = time.perf_counter()

predictions = clf.predict(X_test)

end_time = time.perf_counter()
prediction_time = end_time - start_time
# Compute the total for THIS cell. Previously `total_time` was only printed
# here, so it showed the stale value left over from the ModelVersion.run cell
# (or raised NameError on a fresh kernel).
total_time = prediction_time + load_time
print(f"Prediction time {prediction_time}")
print(f"Total time {total_time}")