- Goal: register the best-performing model in the model registry.
- We use the March green trip dataset to test the best 5 models from the hyperparameter tuning that was performed in the last step.
***
1. Select the 5 models with the lowest RMSE values from the last step.
2. Retrain each of these models on the training set and evaluate them on the validation and test sets.
3. Log the parameters and the loss metrics (train loss, val loss, and test loss).
4. Select the model with the lowest test RMSE (`test_rmse`) and promote it to the model registry.

In [1]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Show the working directory; the data path used later in this notebook is relative.
current_dir = os.getcwd()
print("cwd is: ", current_dir)

cwd is:  /workspaces/MLOps/02-experiment_tracking


In [2]:
# Experiment that is searched for the best hyperparameter-optimisation runs.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment that receives the re-trained candidate models.
EXPERIMENT_NAME = "random-forest-best-models"
# Hyperparameters read from each hyperopt run; every value is cast to int before use.
RF_PARAMS = [
    "max_depth",
    "n_estimators",
    "min_samples_split",
    "min_samples_leaf",
    "random_state",
]

In [3]:
# Point MLflow at the local tracking server, select the experiment that will
# hold the re-trained models, and switch on scikit-learn autologging.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
mlflow.sklearn.autolog()

In [4]:
# to load the dataset
def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``."""
    with open(filename, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

In [5]:

def train_and_log_model(data_path, params, name):
    """Train a random forest with the given hyperparameters and log its validation and test RMSE.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and ``test.pkl``;
            each file is unpickled into a ``(features, target)`` pair.
        params: Mapping that holds every key listed in ``RF_PARAMS``; each value
            is converted with ``int()`` before being passed to the model.
        name: Value stored in the run's ``model_name_in_hyperopt`` tag (the
            caller in this notebook passes the source hyperopt run ID).

    Raises:
        KeyError: If ``params`` lacks one of the ``RF_PARAMS`` keys.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    # One MLflow run per candidate model; everything logged below lands in it.
    with mlflow.start_run():
        # Keep only the random-forest hyperparameters and cast them to int
        # (the values are converted because they may arrive as strings).
        new_params = {}
        for param in RF_PARAMS:
            new_params[param] = int(params[param])

        # Tag the run so it can be traced back to where its parameters came from.
        mlflow.set_tag("model_name_in_hyperopt", name)
        
        rf = RandomForestRegressor(**new_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        # NOTE(review): sklearn autologging is enabled elsewhere in this notebook and
        # may derive extra metric names from the variables passed to predict();
        # confirm before renaming X_val / X_test or reordering these calls.
        val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
        mlflow.log_metric("test_rmse", test_rmse)

In [6]:

def run_register_model(data_path: str, top_n: int):
    """Re-train the best hyperopt runs and register the one with the lowest test RMSE.

    Args:
        data_path: Directory handed through to ``train_and_log_model``.
        top_n: Maximum number of runs to take from ``HPO_EXPERIMENT_NAME``,
            ordered by ascending ``metrics.rmse``.

    Raises:
        ValueError: If ``HPO_EXPERIMENT_NAME`` or ``EXPERIMENT_NAME`` does not
            exist on the tracking server.
        RuntimeError: If ``EXPERIMENT_NAME`` contains no active run to register.
    """
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if hpo_experiment is None:
        # get_experiment_by_name returns None for an unknown name; fail with a
        # readable message instead of an AttributeError on `.experiment_id`.
        raise ValueError(
            f"MLflow experiment '{HPO_EXPERIMENT_NAME}' was not found; "
            "run the hyperparameter optimisation step first."
        )
    top_runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"],
    )
    for run in top_runs:
        train_and_log_model(data_path=data_path, params=run.data.params, name=run.info.run_id)

    # Select the model with the lowest test RMSE
    # NOTE: this search covers every active run in EXPERIMENT_NAME, not only
    # the runs created by the loop above.
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' was not found.")
    candidates = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"],
    )
    if not candidates:
        raise RuntimeError(
            f"No active runs found in experiment '{EXPERIMENT_NAME}'; nothing to register."
        )
    best_run = candidates[0]

    # Register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="best_model_taxi_trip_duration")

# Re-train the five best hyperopt runs, as described in the steps at the top of
# this notebook, and register the one with the lowest test RMSE.
data_path = "processed_green_trip_data"
run_register_model(data_path=data_path, top_n=5)

: 

`MlflowClient` can be used for various purposes, such as creating a new experiment, searching experiments and runs, or registering a model.

In [1]:
# ViewType is needed by the run search in the next cell; import it here so this
# section also works on a fresh kernel.
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

# SQLite database used as the tracking backend for this client.
MLFLOW_TRACKING_URI = "sqlite:///backend.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [None]:
# Fetch at most two active runs of experiment 1, lowest RMSE first.
cruns = client.search_runs(
    experiment_ids=1,
    order_by=["metrics.rmse ASC"],
    max_results=2,
    run_view_type=ViewType.ACTIVE_ONLY,
)

cruns

[<Run: data=<RunData: metrics={'rmse': 5.322418787243458}, params={'max_depth': '20',
  'min_samples_leaf': '2',
  'min_samples_split': '3',
  'n_estimators': '23',
  'random_state': '42'}, tags={'mlflow.runName': 'brawny-ram-840',
  'mlflow.source.name': '/home/codespace/anaconda3/envs/env_mlflow/lib/python3.13/site-packages/ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'codespace',
  'model': 'rfr',
  'type': 'hyperparameter tuning'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/1/30c6e283ac8547998edcc2372ffc9a3e/artifacts', end_time=1748617134011, experiment_id='1', lifecycle_stage='active', run_id='30c6e283ac8547998edcc2372ffc9a3e', run_name='brawny-ram-840', run_uuid='30c6e283ac8547998edcc2372ffc9a3e', start_time=1748617126461, status='FINISHED', user_id='codespace'>, inputs=<RunInputs: dataset_inputs=[]>>]

In [19]:
# Run ID of the first run returned by the search.
first_run = cruns[0]
first_run.info.run_id

'30c6e283ac8547998edcc2372ffc9a3e'