In [0]:
# 1. Read the widget values that locate the Iris table
catalog, schema, dataset = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "schema", "dataset")
)

# Three-part table name: <catalog>.<schema>.<dataset>
path = ".".join((catalog, schema, dataset))
print(f"Iris Dataset Path: {path}")

In [0]:
# Databricks notebook source
# Setup: libraries, MLflow experiment/registry, and config
import os
import uuid
from dataclasses import dataclass

import pandas as pd
import numpy as np

import mlflow
from mlflow import sklearn as mlflow_sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Databricks feature engineering
from databricks.feature_engineering import FeatureEngineeringClient

# Config
@dataclass
class Config:
    """Settings shared by the cells of this notebook.

    The defaults are built from the ``path``, ``catalog`` and ``schema``
    variables, so those must already be defined when this class is created.

    Attributes:
        source_table: Unity Catalog table with the Iris data.
        primary_key: Primary-key column present in the table.
        target_col: Name of the target (label) column.
        registered_model: Registered model name in Unity Catalog.
        experiment_path: MLflow experiment path (a workspace path, so it
            shows up in the UI).
        f1_threshold: Minimum F1 score required for production.
    """

    source_table: str = path
    primary_key: str = "id"
    target_col: str = "species"
    registered_model: str = f"{catalog}.{schema}.iris_rf_classifier"
    experiment_path: str = "/Shared/MLflow Experiments/iris_feature_store_demo"
    f1_threshold: float = 0.80

CFG = Config()

# The registered model uses a three-level Unity Catalog name
# (<catalog>.<schema>.<model>), which only the Unity Catalog registry can
# resolve. Set it explicitly instead of relying on the workspace default.
mlflow.set_registry_uri("databricks-uc")

# Point MLflow to the desired experiment; set_experiment returns the active
# Experiment, so no second lookup by name is needed.
experiment = mlflow.set_experiment(CFG.experiment_path)

# Initialize Feature Engineering client
fe = FeatureEngineeringClient()

print(f"Experiment set to: {experiment.experiment_id}")
print(f"Registered model: {CFG.registered_model}")



In [0]:
# Build the Iris train/validation/test sets through the Feature Engineering client
from pyspark.sql import functions as F
from pyspark.sql import types as T
from databricks.feature_engineering import FeatureLookup

# The source table itself is used as the feature table; this cell does not
# create a separate feature table.
feature_table_name = CFG.source_table

# Note: In real projects, you might build features with transformations;
# here the numeric columns are passed through unchanged.
numeric_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

selected_columns = [CFG.primary_key, *numeric_cols, CFG.target_col]
source_df = spark.read.table(CFG.source_table).select(*selected_columns)

# Look the numeric features up by primary key in the feature table
feature_lookups = [
    FeatureLookup(
        table_name=feature_table_name,
        lookup_key=CFG.primary_key,
        feature_names=numeric_cols,
    )
]

# Only the key and the label are passed in; the features come from the lookup
labels_df = source_df.select(CFG.primary_key, CFG.target_col)
training_set = fe.create_training_set(
    df=labels_df,
    feature_lookups=feature_lookups,
    label=CFG.target_col,
)

training_pd = training_set.load_df().toPandas()

X = training_pd[numeric_cols].astype(float)
y = training_pd[CFG.target_col].astype(str)

# Stratified split: 70% train, then the remaining 30% halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Shapes - train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

In [0]:
# Train a random forest with MLflow autologging, register the model, and
# give the version created by this run the "challenger" alias.
import json
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

# Autolog records params and metrics. The model is logged exactly once, by the
# explicit log_model call below (with signature, input example and registration),
# so autolog's own model logging is disabled to avoid logging it twice to the
# same "model" artifact path.
mlflow.sklearn.autolog(log_models=False)
run_params = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": 42,
}

with mlflow.start_run(run_name="iris_rf_training") as run:
    clf = RandomForestClassifier(**run_params)
    clf.fit(X_train, y_train)

    # Evaluate on validation
    val_preds = clf.predict(X_val)
    f1_val = f1_score(y_val, val_preds, average="weighted")
    mlflow.log_metric("f1_val", float(f1_val))
    mlflow.log_dict({"numeric_features": list(X.columns)}, "features.json")

    # Final test metrics
    test_preds = clf.predict(X_test)
    f1_test = f1_score(y_test, test_preds, average="weighted")
    mlflow.log_metric("f1_test", float(f1_test))

    # Prepare example and signature
    input_example = X_train.iloc[:3]
    signature = infer_signature(X_train, clf.predict(X_train))

    # Log and register the model
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name=CFG.registered_model,
    )

    challenger_model_uri = f"runs:/{run.info.run_id}/model"
    print("Validation F1:", f1_val, " Test F1:", f1_test)
    print("Model URI:", challenger_model_uri)

# Take the run id from the run opened above rather than from "whatever run
# was active last", so it is always the training run of this cell.
challenger_run_id = run.info.run_id
challenger_run = mlflow.get_run(challenger_run_id)
print("challenger Run:", challenger_run_id)

# Assign the "challenger" alias to the model version created by THIS run.
# Picking the highest version number overall could tag a version registered
# by a concurrent run, so the versions are filtered by run id first.
client = MlflowClient()
versions = [
    mv
    for mv in client.search_model_versions(f"name='{CFG.registered_model}'")
    if mv.run_id == challenger_run_id
]
if not versions:
    raise RuntimeError(
        f"No versions of registered model '{CFG.registered_model}' found for run {challenger_run_id}."
    )
# Sort by version integer descending (normally there is exactly one match)
versions_sorted = sorted(versions, key=lambda mv: int(mv.version), reverse=True)
most_recent = versions_sorted[0]
client.set_registered_model_alias(
    name=CFG.registered_model,
    alias="challenger",
    version=most_recent.version
)

In [0]:
# Compare the latest registered model version with the current "champion"
# and move the alias only when the latest version is good enough and better.
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient

mlflow_client = MlflowClient()
model_name = CFG.registered_model
production_alias = "champion"


def _f1_from_metrics(metrics):
    """Return the run's "f1_val" metric, falling back to "f1"; None if neither is logged.

    Uses an explicit None check so that a legitimate score of 0.0 is not
    mistaken for a missing metric.
    """
    f1 = metrics.get("f1_val")
    return metrics.get("f1") if f1 is None else f1


# Get the latest version of the registered model
all_versions = mlflow_client.search_model_versions(f"name='{model_name}'")
if not all_versions:
    raise RuntimeError(f"No versions found for registered model '{model_name}'.")
latest_version = max(int(v.version) for v in all_versions)
latest_model = mlflow_client.get_model_version(model_name, str(latest_version))

# Retrieve F1 score for the latest version
run = mlflow_client.get_run(latest_model.run_id)
latest_f1 = _f1_from_metrics(run.data.metrics)
if latest_f1 is None:
    raise RuntimeError(
        f"Run {latest_model.run_id} of version {latest_version} has no 'f1_val' or 'f1' metric."
    )

# Retrieve F1 score for the current champion version (if it exists).
# champ_model is pre-bound so the summary print below works on the very
# first run, when no champion alias has been assigned yet.
champ_model = None
try:
    champ_model = mlflow_client.get_model_version_by_alias(model_name, production_alias)
    champ_run = mlflow_client.get_run(champ_model.run_id)
    champ_f1 = _f1_from_metrics(champ_run.data.metrics)
except MlflowException:
    # No existing champion alias; treat its F1 as the lowest possible
    champ_f1 = float("-inf")
if champ_f1 is None:
    # Champion exists but its run logged no F1 metric: any scored model beats it
    champ_f1 = float("-inf")

champ_version = champ_model.version if champ_model is not None else None
print(f"Latest version: {latest_version}, F1 score: {latest_f1:.4f}")
print(f"Champion version: {champ_version}, F1 score: {champ_f1:.4f}")

# Promote to champion only if the latest version clears the configured
# minimum F1 for production AND outperforms the current champion.
if latest_f1 < CFG.f1_threshold:
    print(
        f"Skipping promotion: latest F1 {latest_f1:.4f} is below the required threshold {CFG.f1_threshold:.4f}"
    )
elif latest_f1 > champ_f1:
    mlflow_client.set_registered_model_alias(
        name=model_name,
        version=str(latest_version),
        alias=production_alias
    )
    print(f"Promoted version {latest_version} to '{production_alias}' (F1 {latest_f1:.4f} > {champ_f1:.4f})")
else:
    print(f"Skipping promotion: latest F1 {latest_f1:.4f} ≤ champion F1 {champ_f1:.4f}")