In [0]:
# Databricks notebook source
# Setup: libraries, MLflow experiment/registry, and config
import os
import uuid
from dataclasses import dataclass

import pandas as pd
import numpy as np

import mlflow
from mlflow import sklearn as mlflow_sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Databricks feature engineering
from databricks.feature_engineering import FeatureEngineeringClient

# Config
@dataclass
class Config:
    """Central configuration for the Iris feature-store demo.

    Table and model identifiers are Unity Catalog three-level names of the
    form ``catalog.schema.object``.
    """

    # Unity Catalog table with Iris data
    source_table: str = "workspace.iris.iris_species"
    # Primary key column of the source table; also used as the feature lookup key
    primary_key: str = "id"
    # Target column name
    target_col: str = "species"
    # Registered model name in Unity Catalog. It must be exactly
    # `catalog.schema.model`; "/" is not a legal character in a UC object name.
    registered_model: str = "workspace.iris.iris_rf_classifier"
    # MLflow experiment path (use a workspace path so it shows in UI)
    experiment_path: str = "/Shared/iris_feature_store_demo"

CFG = Config()

# The registered model lives in Unity Catalog, so target the UC registry
# explicitly; otherwise registry calls (registration, aliases) may be routed to
# the legacy workspace registry, which does not understand three-level names.
mlflow.set_registry_uri("databricks-uc")

# Point MLflow to the desired experiment. set_experiment returns the active
# Experiment, so no second lookup (which could return None) is needed.
experiment = mlflow.set_experiment(CFG.experiment_path)

# Initialize Feature Engineering client
fe = FeatureEngineeringClient()

print(f"Experiment set to: {experiment.experiment_id}")
print(f"Registered model: {CFG.registered_model}")



In [None]:
# Create or use a Feature Engineering feature table for Iris
from pyspark.sql import functions as F
from pyspark.sql import types as T

# The catalog is the first component of the three-level source table name
# (`catalog.schema.table`); make it the session default.
catalog_name = CFG.source_table.split(".")[0]
spark.sql(f"USE CATALOG {catalog_name}")

# Feature tables are addressed as `catalog.schema.table`; "/" is not allowed
# in a Unity Catalog object name.
feature_table_name = "workspace.iris.iris_measurements_ft"

# Create a managed feature table if it doesn't exist, directly from the source Unity Catalog table
# Note: In real projects, you might build features with transformations; here we pass through numeric columns
numeric_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Key + features + label, read once and reused by the cells below
source_df = spark.read.table(CFG.source_table).select(CFG.primary_key, *numeric_cols, CFG.target_col)

from databricks.feature_engineering import FeatureLookup

# Ensure feature table existence
# Only the key and the numeric measurements are stored as features; the label
# stays in the source table.
features_df = source_df.select(CFG.primary_key, *numeric_cols)

# create_table has no documented "if not exists" switch, so guard the call
# explicitly to keep the cell re-runnable.
if not spark.catalog.tableExists(feature_table_name):
    fe.create_table(
        name=feature_table_name,
        primary_keys=[CFG.primary_key],
        schema=features_df.schema,
        description="Iris numeric measurements as features",
        tags={"project": "iris", "dataset": CFG.source_table},
    )

# Write features. "merge" upserts on the primary key, so re-running the cell is
# idempotent for the rows present in the source. Rows that were removed from
# the source are NOT deleted from the feature table by this call.
fe.write_table(
    name=feature_table_name,
    df=features_df,
    mode="merge",
)

# Build a training set from the feature table; label is in the source table
feature_lookups = [
    FeatureLookup(
        table_name=feature_table_name,
        lookup_key=CFG.primary_key,
        feature_names=numeric_cols,
    )
]

# The primary key is needed to join features but must not become a model input:
# the classifier is fit on the numeric measurements only, so the key is
# excluded from the training set that is later packaged with the model.
training_set = fe.create_training_set(
    df=source_df.select(CFG.primary_key, CFG.target_col),
    feature_lookups=feature_lookups,
    label=CFG.target_col,
    exclude_columns=[CFG.primary_key],
)

# Materialize the joined features + label on the driver as a pandas frame
training_pd = training_set.load_df().toPandas()

# Feature matrix (cast to float) and string-typed label vector
X = training_pd.loc[:, numeric_cols].astype(float)
y = training_pd[CFG.target_col].astype(str)

# First hold out 30% of the rows, stratified by label ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
# ... then cut the hold-out in half into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"Shapes - train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")



In [None]:
# Train with MLflow autolog
import json

# Autolog records parameters and training metrics for the fit below. Model
# logging is disabled here because the run already logs the model through
# fe.log_model at artifact path "model"; letting autolog write its own model to
# the same path would mix two different model formats in one artifact folder.
mlflow.sklearn.autolog(log_models=False)

# Hyperparameters passed verbatim to RandomForestClassifier
run_params = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": 42,
}

# Train, evaluate and log the model inside a single MLflow run so that every
# metric and artifact below is attached to the same run id.
with mlflow.start_run(run_name="iris_rf_training") as run:
    # Fit on the training split only; validation/test rows are held out.
    clf = RandomForestClassifier(**run_params)
    clf.fit(X_train, y_train)

    # Evaluate on validation
    val_preds = clf.predict(X_val)
    f1_val = f1_score(y_val, val_preds, average="weighted")

    # "f1_val" is the metric the promotion cell looks up first for a model version.
    mlflow.log_metric("f1_val", float(f1_val))
    # Record which columns were used as model inputs, as a JSON artifact.
    mlflow.log_dict({"numeric_features": list(X.columns)}, "features.json")

    # Final test metrics for reporting
    test_preds = clf.predict(X_test)
    f1_test = f1_score(y_test, test_preds, average="weighted")
    mlflow.log_metric("f1_test", float(f1_test))

    # Log the training set lineage using Feature Engineering client
    # The model is logged under artifact path "model" and, via
    # registered_model_name, registered under CFG.registered_model.
    fe.log_model(
        model=clf,
        artifact_path="model",
        flavor=mlflow_sklearn,
        training_set=training_set,
        registered_model_name=CFG.registered_model,
    )

    # URI of the artifact logged above, built from this run's id.
    latest_model_uri = f"runs:/{run.info.run_id}/model"
    print("Validation F1:", f1_val, " Test F1:", f1_test)
    print("Model URI:", latest_model_uri)

# Look up the most recently active run; the id stays None when there is none.
latest_run = mlflow.last_active_run()
if latest_run is None:
    latest_run_id = None
else:
    latest_run_id = latest_run.info.run_id
print("Latest Run:", latest_run_id)



In [None]:
# Compare latest model vs current champion by F1 and update alias if better
from mlflow.tracking import MlflowClient

# Model and alias this cell operates on, plus the registry client used for
# every lookup and update below.
model_name, champion_alias = CFG.registered_model, "champion"
client = MlflowClient()

# Helper: get metric value for a model version (f1_val prioritized, fallback to f1_test)
def _get_model_version_f1(model_name: str, version: str) -> float:
    """Return the F1 score logged by the run behind a registered model version.

    Args:
        model_name: Name of the registered model.
        version: Version of that model to inspect.

    Returns:
        The run's ``f1_val`` metric when it was logged, otherwise its
        ``f1_test`` metric, otherwise NaN.
    """
    source_run_id = client.get_model_version(model_name, version).run_id
    logged = dict(client.get_run(source_run_id).data.metrics)
    # Validation F1 wins; test F1 is only a fallback.
    for metric_key in ("f1_val", "f1_test"):
        if metric_key in logged:
            return float(logged[metric_key])
    return float("nan")

# Resolve current champion version (if any)
from mlflow.exceptions import MlflowException

champion_version = None
try:
    champion_version = client.get_model_version_by_alias(model_name, champion_alias).version
except MlflowException as exc:
    # A missing alias is expected before the first promotion. Keep going, but
    # report the reason so genuine registry problems are not silently hidden.
    print(f"Could not resolve alias '{champion_alias}' for '{model_name}': {exc}")

# Get latest created version
# get_latest_versions is stage-based and not available for Unity Catalog
# models, so list the versions with search_model_versions (following page
# tokens) and pick the numerically greatest one.
registered_versions = []
page_token = None
while True:
    page = client.search_model_versions(f"name='{model_name}'", page_token=page_token)
    registered_versions.extend(page)
    page_token = page.token
    if not page_token:
        break

if not registered_versions:
    raise RuntimeError("No registered versions found for model. Ensure previous cell registered the model.")

# Versions are numeric strings; compare as integers so "10" sorts after "9".
latest_version = max(registered_versions, key=lambda mv: int(mv.version)).version

latest_f1 = _get_model_version_f1(model_name, latest_version)
print(f"Latest version {latest_version} F1: {latest_f1}")

champion_f1 = None
if champion_version is not None:
    champion_f1 = _get_model_version_f1(model_name, champion_version)
    print(f"Current champion version {champion_version} F1: {champion_f1}")
else:
    print("No current champion alias set.")

# Promote when there is no champion yet, or when both scores are real numbers
# and the latest version is strictly better.
should_promote = champion_f1 is None or (
    np.isfinite(latest_f1) and np.isfinite(champion_f1) and latest_f1 > champion_f1
)

if should_promote:
    # Point the champion alias to the latest version. Move the alias first so
    # the "promoted_at" tag is only written for a promotion that succeeded.
    client.set_registered_model_alias(model_name, champion_alias, latest_version)
    client.set_model_version_tag(
        model_name,
        latest_version,
        key="promoted_at",
        value=str(pd.Timestamp.now(tz="UTC")),
    )
    print(f"Promoted version {latest_version} to alias '{champion_alias}'.")
else:
    print("No promotion performed; champion F1 is greater or equal.")

