# Model Training

### Imports & Parameters

In [0]:
%pip install lightgbm==4.6.0

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error, max_error, r2_score
from ast import literal_eval

In [0]:
# Notebook parameters. Each one is read from a Databricks widget when that widget
# is defined (e.g. when run as a job task) and otherwise falls back to a default,
# so the notebook also runs interactively.
defined_widgets = dbutils.widgets.getAll()  # look the widget names up once, not once per parameter

preparation = dbutils.widgets.get("preparation") if "preparation" in defined_widgets else "unmodified"
job_id      = dbutils.widgets.get("job_id") if "job_id" in defined_widgets else -1

if "museum_and_parks" in defined_widgets:
    # The flag is parsed from the widget text with literal_eval. Normalising the
    # capitalisation first lets "true"/"FALSE" parse as well as "True"/"False"
    # (literal_eval only recognises the exact Python spellings).
    museum_and_parks = literal_eval(dbutils.widgets.get("museum_and_parks").strip().capitalize())
else:
    museum_and_parks = True

# Seed used for the train/validation split further down.
SEED = 42

# Set mlflow experiment path
experiment_path = "/Shared/experiments/rental_predictions"
mlflow.set_experiment(experiment_path)

### Dataset Preparation

In [0]:
# The table suffix is derived from a boolean flag, so it is safe to format into
# the statement. `museum_parks_str` is reused later when building the run name.
museum_parks_str = "_museum_and_parks" if museum_and_parks else ""
source_table = f"workspace.rental_predictions.prepared_training_sets{museum_parks_str}"

# `preparation` comes from a widget, so it is bound as a named parameter
# (:preparation) instead of being interpolated into the SQL text. This avoids
# SQL injection and breakage when the value contains a quote.
# The sub-select restricts the result to the most recent snapshot for that preparation.
query = f"""
select *
from {source_table}
where data_set_preparation = :preparation and snapshot_timestamp = (
    select max(snapshot_timestamp) 
    from {source_table}
    where data_set_preparation = :preparation
)
"""

# Run the query and collect the result into a pandas DataFrame in one step so the
# name `training_data` only ever refers to a pandas object.
training_data = spark.sql(query, args={"preparation": preparation}).toPandas()
display(training_data)

In [0]:
from features import (
    categorical_cols, 
    numeric_cols, 
    target_col
)

# Work on copies of the imported column lists. An in-place `+=` would extend the
# lists owned by the `features` module itself, so re-running this cell would keep
# appending the same columns and produce duplicates.
categorical_cols = list(categorical_cols)
numeric_cols = list(numeric_cols)

if museum_and_parks:
    # Extra features that are used only with the museum/park variant of the training set.
    categorical_cols = categorical_cols + ["nearest_museum", "nearest_park"]
    numeric_cols = numeric_cols + ["distance_to_museum", "distance_to_park"]

In [0]:
# Select the model inputs and cast the categorical columns to the pandas "category"
# dtype in one step. The original comment notes this cast is required so LightGBM
# treats these columns as categorical features rather than needing one-hot encoding.
# DataFrame.astype returns a new frame, so we never assign into a slice of
# `training_data` (which is what triggers pandas' SettingWithCopyWarning).
category_dtypes = {col: "category" for col in categorical_cols}
X = training_data[categorical_cols + numeric_cols].astype(category_dtypes)
y = training_data[target_col]

# Creating training and validation sets for LightGBM model training.  Using an 80-20 split.
# The cast above happens before the split, so both parts come from the same categorical columns.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)

train_set = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_cols,
    free_raw_data=True # release original data from memory
)

# `reference=train_set` ties the validation Dataset to the training Dataset.
valid_set = lgb.Dataset(
    X_valid,
    label=y_valid,
    categorical_feature=categorical_cols,
    reference = train_set,
    free_raw_data=True # release original data from memory
)

# Note: we do not carve out a separate test set here, since we rely on the train/test separation given in the original prompt; the split above only produces training and validation sets.

### Train LightGBM Model

In [0]:
# LightGBM training parameters, passed to lgb.train in the training cell below.
params = dict(
    # Objective and evaluation metric
    objective="regression",
    metric="rmse",
    # Boosting step size and tree size
    learning_rate=0.05,
    num_leaves=63,
    # Feature and row subsampling settings
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    # Leaf size constraint
    min_data_in_leaf=10,
    # Logging verbosity
    verbose=-1,
)

In [0]:
# Enable MLflow autologging for LightGBM.
# NOTE(review): the model is also logged explicitly below via log_model; if autolog
# logs models by default the run will contain two copies of the model - confirm
# whether that is intended.
mlflow.lightgbm.autolog()

# Create the mlflow run_name identifier and publish it as a job task value.
# We will use this for logging metrics in a subsequent task.
run_name = f"rental_predictions_lightgbm_{preparation}{museum_parks_str}_{job_id}"
dbutils.jobs.taskValues.set(key="mlflow_run_name", value=run_name)

with mlflow.start_run(run_name = run_name) as run:
    # Also publish the run id so later tasks can address this exact run.
    dbutils.jobs.taskValues.set(key="mlflow_run_id", value=run.info.run_id)

    # Model signature inferred from the training inputs and the training target.
    signature = mlflow.models.infer_signature(
        X_train, y_train
    )

    # Train for up to 5000 rounds, with early stopping configured at 100 rounds.
    # Both the training and validation sets are evaluated under the names "train" and "valid".
    model = lgb.train(
        params=params,
        train_set=train_set,
        valid_sets=[train_set, valid_set],
        valid_names=["train", "valid"],
        num_boost_round=5000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
        ],
    )

    # Log LightGBM model to mlflow & Databricks' model registry
    registered_model_name = "workspace.rental_predictions.price_regressor_lightgbm"
    logged_model = mlflow.lightgbm.log_model(
        lgb_model=model,
        artifact_path="lightgbm",
        signature = signature,
        input_example=X_train.head(5),
        registered_model_name= registered_model_name
    )

    # Point a registry alias named after run_name at the version that was just registered
    client = MlflowClient()

    client.set_registered_model_alias(
        name=registered_model_name,
        alias=run_name,
        version=logged_model.registered_model_version
    )
    
    # Predictions at the best iteration recorded on the trained model
    train_preds = model.predict(X_train, num_iteration=model.best_iteration)
    valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)

    # Log metrics to mlflow experiment and display.
    # "bias" is the mean residual (actual - predicted).
    metrics = {
        "rmse_train": np.sqrt(mean_squared_error(y_train, train_preds)),
        "rmse_val": np.sqrt(mean_squared_error(y_valid, valid_preds)),
        "mape_train": mean_absolute_percentage_error(y_train, train_preds),
        "mape_val": mean_absolute_percentage_error(y_valid, valid_preds),
        "mae_train": mean_absolute_error(y_train, train_preds),
        "mae_val": mean_absolute_error(y_valid, valid_preds),
        "r2_train": r2_score(y_train, train_preds),
        "r2_val": r2_score(y_valid, valid_preds),
        "bias_train": np.mean(y_train-train_preds),
        "bias_val": np.mean(y_valid-valid_preds),
        "max_error_train": max_error(y_train, train_preds),
        "max_error_val": max_error(y_valid, valid_preds)
    }
    mlflow.log_metrics(metrics)

    # Use distinct loop names: reusing `metrics` as the loop variable would rebind
    # it to the last key (a string) and destroy the dict for any later cell.
    for metric_name, metric_value in metrics.items():
        print(metric_name, metric_value)

### View Model Metrics

In [0]:
# Feature importance of the trained model, measured two ways:
# by "gain" and by "split" (the two importance_type values requested from LightGBM).
importance_gain = model.feature_importance(importance_type="gain")
importance_split = model.feature_importance(importance_type="split")

# feature names
feature_names = model.feature_name()

features_gain = pd.DataFrame({
    "feature": feature_names,
    "importance_gain": importance_gain
}).sort_values(by="importance_gain", ascending=False)

# The split-based table gets its own column name; it was previously labelled
# "importance_gain", which made the two printed tables indistinguishable.
features_split = pd.DataFrame({
    "feature": feature_names,
    "importance_split": importance_split
}).sort_values(by="importance_split", ascending=False)

print(features_gain)
print("\n")
print(features_split)

In [0]:
# Overlay the residual distributions (actual - predicted) of the training and
# validation sets, then attach the saved figure to the MLflow run created above.
residuals_train = y_train - train_preds
residuals_valid = y_valid - valid_preds

fig, ax = plt.subplots(figsize=(15, 8))
hist_style = dict(bins=50, edgecolor='k', alpha=0.5)
ax.hist(residuals_train, label='Training Errors', **hist_style)
ax.hist(residuals_valid, label='Validation Errors', **hist_style)
ax.legend()
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Residuals (Training + Validation Sets)")

# Save under a run-specific file name and log it as an artifact of that run.
fig_path = f"residuals_hist_{run_name}.png"
fig.savefig(fig_path)
mlflow.log_artifact(fig_path, run_id=run.info.run_id)
plt.show()