In [0]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import ParameterGrid
import mlflow
import mlflow.sklearn

In [0]:
# Model-registry target for every register/lookup call made later in this notebook.
# NOTE(review): "databricks" is understood to select the workspace registry rather
# than Unity Catalog ("databricks-uc") — confirm this is the intended target.
REGISTRY_URI = "databricks"
mlflow.set_registry_uri(REGISTRY_URI)

## Load Training and Scoring Data

In [0]:
# The three sensor columns used as model features, for both training and scoring.
feature_cols = ["temperature", "vibration", "pressure"]

# Training set: the "normal" table, restricted to the feature columns on the
# Spark side before being collected into pandas.
df_train = spark.table("sensor_data_normal").select(*feature_cols).toPandas()
X_train = df_train.to_numpy()  # raw feature matrix — no scaling is applied

# Scoring set: the "mixed" table is collected with all of its columns; only the
# feature columns are extracted into the matrix handed to the model.
df_score_spark = spark.table("sensor_data_mixed")
df_score = df_score_spark.toPandas()
X_score = df_score.loc[:, feature_cols].to_numpy()

## Train the Model

In [0]:
# Hyperparameter search space for the IsolationForest tuning loop.
# Each key maps to the list of candidate values to try; single-element lists
# pin that parameter to one value for every combination.
param_grid = dict(
    n_estimators=[50, 100],
    contamination=[0.005, 0.01],
    max_samples=["auto"],
    random_state=[42],
)

# Materialize every combination up front as a list of keyword-argument dicts.
grid = [combo for combo in ParameterGrid(param_grid)]

In [0]:
mlflow.set_experiment("/Users/xuanang.liu@avanade.com/predictive_maintenance")

# Reset every "best so far" tracker — including the run id — so that re-running
# this cell can never leave a stale value from an earlier execution behind for
# the registration step to pick up.
best_model = None
best_params = None
best_score = -np.inf
best_run_id = None

for params in grid:
    # One MLflow run per hyperparameter combination; `run` is the handle for the
    # run opened by this `with` block.
    with mlflow.start_run() as run:
        # Fit on the training matrix only.
        model = IsolationForest(**params)
        model.fit(X_train)

        # Summarize the candidate by its mean decision_function value over the
        # scoring set.
        # NOTE(review): decision_function is shifted by an offset derived from
        # `contamination`, so this metric may systematically favor the smallest
        # contamination value — confirm this is the intended selection criterion.
        scores = model.decision_function(X_score)
        mean_score = scores.mean()

        # Record parameters, metric and the fitted model under this run.
        mlflow.log_params(params)
        mlflow.log_metric("mean_anomaly_score", mean_score)
        mlflow.sklearn.log_model(sk_model=model, name="model", input_example=X_train[:5])

        # Keep the candidate with the highest mean score, together with the id
        # of the run that logged it.
        if mean_score > best_score:
            best_score = mean_score
            best_model = model
            best_params = params
            best_run_id = run.info.run_id

## Register the Best Model

In [0]:
# Refuse to register when the training loop did not select a model (for example
# an empty grid, or a training cell that failed before its first comparison).
# Without this check a run id left over from an earlier execution could be
# registered silently.
if best_model is None:
    raise RuntimeError("No model was selected by the training loop; nothing to register.")

# The model artifact was logged under the name "model" inside the winning run.
model_uri = f"runs:/{best_run_id}/model"
model_name = "isolation_forest_pm_model"

mlflow.register_model(model_uri=model_uri, name=model_name)