# Model Testing

In [0]:
%pip install lightgbm==4.6.0

In [0]:
import mlflow
import mlflow.lightgbm
from mlflow.tracking import MlflowClient
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from pyspark.sql.functions import pandas_udf, PandasUDFType, struct
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from ast import literal_eval

# Reuse the already-running SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
def _parse_literal_flag(raw):
    """Turn a widget string into a Python value.

    The text is first parsed with ``ast.literal_eval`` (so ``"True"``,
    ``"False"``, ``"0"``, ``"1"`` ... behave exactly as before). Widget values
    are frequently typed as lowercase ``true`` / ``false``, which are not
    Python literals; those two spellings are accepted case-insensitively.

    Raises:
        ValueError / SyntaxError: re-raised from ``literal_eval`` when the text
            is neither a Python literal nor ``true`` / ``false``.
    """
    try:
        return literal_eval(raw)
    except (ValueError, SyntaxError):
        lowered = raw.strip().lower()
        if lowered in ("true", "false"):
            return lowered == "true"
        raise


# Fetch the widget set once instead of once per parameter.
_widget_values = dbutils.widgets.getAll()

# Which prepared data set variant to evaluate; falls back to "unmodified".
preparation = dbutils.widgets.get("preparation") if "preparation" in _widget_values else "unmodified"
# NOTE(review): the fallback is the int -1 while a widget value is a string -
# confirm downstream consumers accept both before relying on job_id.
job_id = dbutils.widgets.get("job_id") if "job_id" in _widget_values else -1
# Toggles the extra museum/park features and the matching test table.
museum_and_parks = (
    _parse_literal_flag(dbutils.widgets.get("museum_and_parks"))
    if "museum_and_parks" in _widget_values
    else False
)

# Source mlflow parameters from training step
# (debugValue is what taskValues.get returns when the notebook runs outside a job).
mlflow_run_name = dbutils.jobs.taskValues.get(taskKey="train_model", key="mlflow_run_name", debugValue="rental_predictions_lightgbm_unmodified_-1")
mlflow_run_id = dbutils.jobs.taskValues.get(taskKey="train_model", key="mlflow_run_id", debugValue="-1")

### Load Registered Model

In [0]:
registered_model_name = "workspace.rental_predictions.price_regressor_lightgbm"

client = MlflowClient()
versions = client.search_model_versions(f"name='{registered_model_name}'")


def _has_run_alias(candidate):
    """Return True when this model version carries the training run's alias."""
    # Aliases are read from a per-version get_model_version call rather than
    # from the search result (kept from the original: "Unity Catalog safe").
    # NOTE(review): aliases are compared against the run name directly, so they
    # are presumably plain strings - confirm against the MLflow version in use.
    detail = client.get_model_version(name=registered_model_name, version=candidate.version)
    return mlflow_run_name in list(detail.aliases)


# First version (in search order) whose alias equals the run name, else None.
version_number = next(
    (candidate.version for candidate in versions if _has_run_alias(candidate)),
    None,
)

if version_number is None:
    raise ValueError(f"No model version found with alias '{mlflow_run_name}'")

print(f"Version: {version_number}, alias: {mlflow_run_name}")

In [0]:
# Address the registered model by the explicit version number found via the alias lookup.
model_uri = "models:/{}/{}".format(registered_model_name, version_number)

# Load through the LightGBM flavor; the UDF below calls predict() on this object.
lgb_model = mlflow.lightgbm.load_model(model_uri=model_uri)

In [0]:
@pandas_udf(DoubleType())
def predict_udf(pdf: pd.DataFrame) -> pd.Series:
    """Score one batch of feature rows with the loaded LightGBM model.

    Args:
        pdf: Batch of rows whose columns are positionally relabelled to
            ``all_features`` (the caller passes a struct built from that list).

    Returns:
        One prediction per input row, as a pandas Series.
    """
    frame = pdf.copy()
    frame.columns = all_features
    # Categorical features are cast to pandas "category" dtype before predict().
    frame = frame.astype({name: "category" for name in categorical_cols})
    # Limit prediction to the model's recorded best iteration.
    n_iterations = lgb_model.best_iteration
    return pd.Series(lgb_model.predict(frame, num_iteration=n_iterations))

### Import Test Data and Inference

In [0]:
from features import (
    categorical_cols,
    numeric_cols,
    target_col
)

# Work on copies. `+=` on the imported names would extend the list objects owned
# by the `features` module in place, so every re-run of this cell (the module
# stays cached) would append the extra columns again and duplicate features.
categorical_cols = list(categorical_cols)
numeric_cols = list(numeric_cols)

if museum_and_parks:
    # Extra features that only exist in the museum/park variant of the data.
    categorical_cols = categorical_cols + ["nearest_museum", "nearest_park"]
    numeric_cols = numeric_cols + ["distance_to_museum", "distance_to_park"]

# Column order used for the UDF input struct: categoricals first, then numerics.
all_features = categorical_cols + numeric_cols

In [0]:
# Table suffix comes from a boolean flag only, so it is safe to format into the
# statement (identifiers cannot be bound as parameters).
museum_parks_str = "_museum_and_parks" if museum_and_parks else ""
test_table = f"workspace.rental_predictions.prepared_testing_sets{museum_parks_str}"

# `preparation` comes from a notebook widget; bind it as a named parameter
# instead of splicing it into the SQL text so quotes in the value can neither
# break nor alter the query.
# Selects the rows of the most recent snapshot for the requested preparation.
test_query = f"""
select *
from {test_table}
where data_set_preparation = :preparation and snapshot_timestamp = (
    select max(snapshot_timestamp)
    from {test_table}
    where data_set_preparation = :preparation)
"""

test_data = spark.sql(test_query, args={"preparation": preparation})

In [0]:
# Bundle the feature columns into a single struct column; the pandas UDF
# receives that struct as a DataFrame and returns one prediction per row.
feature_struct = struct(*all_features)
test_data = test_data.withColumn("price_prediction", predict_udf(feature_struct))

In [0]:
# Render the test rows together with the new `price_prediction` column for a visual check.
display(test_data)

### Evaluation & Metrics

In [0]:
# Residual (actual - predicted) shared by every metric below.
residual = F.col("Price") - F.col("price_prediction")

# Re-open the training run so the test metrics are logged to that same run.
with mlflow.start_run(run_name=mlflow_run_name, run_id=str(mlflow_run_id)) as run:

    metrics = (
        test_data
        .agg(
            # RMSE = sqrt(mean(residual^2)). The square must be applied to each
            # residual before averaging; squaring after the sqrt just returns
            # the mean residual (or NaN when that mean is negative).
            F.sqrt(F.mean(F.pow(residual, 2))).alias("test_rmse"),
            F.mean(F.abs(residual)).alias("test_mae"),
            # try_divide yields NULL for a zero Price, so those rows are
            # skipped by mean() instead of failing the query.
            F.mean(F.try_divide(F.abs(residual), F.col("Price"))).alias("test_mape"),
            # Signed mean error: positive when the model under-predicts on average.
            F.mean(residual).alias("test_bias"),
        )
    ).collect()[0].asDict()

    mlflow.log_metrics(metrics)

display(metrics)

In [0]:
# Pull only the residual column (actual - predicted) to the driver for plotting.
residual_expr = F.col("Price") - F.col("price_prediction")
price_df = test_data.select(residual_expr.alias("price_residual")).toPandas()

# Histogram of test residuals.
fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(price_df["price_residual"], bins=50, edgecolor="k", alpha=0.5, label="Test Errors")
ax.legend()
ax.set_xlabel("Testing Residuals")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Residuals (Test Set)")

# Save the figure locally, then attach it to the training run as an artifact.
fig_path = f"test_residuals_hist_{mlflow_run_name}.png"
fig.savefig(fig_path)
mlflow.log_artifact(fig_path, run_id=str(mlflow_run_id))
plt.show()