# Experiment Tracking

In [1]:
import subprocess
from pathlib import Path


PROJECT_DIR = Path().absolute().parent.parent

## Download dataset

We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). We'll use "**Green** Taxi Trip Records".

Download the data for January, February and March 2022.

In [2]:
DATA_DIR = PROJECT_DIR / "data"
S3_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
FILE_NAMES = [
    "green_tripdata_2022-01.parquet",
    "green_tripdata_2022-02.parquet",
    "green_tripdata_2022-03.parquet",
]


def download_data(file_name: str) -> None:
    file_path = DATA_DIR / file_name
    url = S3_URL + file_name

    if not file_path.is_file():
        print("File does not exist, downloading from S3 bucket.")
        if not file_path.parent.exists():
            file_path.parent.mkdir(parents=True)
        subprocess.run(["wget", "-O", file_path, url])
        print(f"File downloaded successfully and saved at {file_path}")
    else:
        print("File already exists.")


for file_name in FILE_NAMES:
    download_data(file_name)

File already exists.
File already exists.
File already exists.


## Q2. Preprocess the data

Run the script `preprocess_data.py` to preprocess the data and save the resulting files in the `data/processed` folder.

In [3]:
import click
from preprocess_data import run_data_prep


ctx = click.Context(run_data_prep)
ctx.invoke( 
    run_data_prep,
    raw_data_path=str(DATA_DIR),
    dest_path=str(DATA_DIR / "preprocessed")
)

So what's the size of the saved DictVectorizer file?

In [4]:
import os

BYTES_IN_KILOBYTES = 1024

def get_file_size(file_path):
    size_in_bytes = os.path.getsize(file_path)
    size_in_kilobytes = size_in_bytes / BYTES_IN_KILOBYTES
    return size_in_kilobytes


print(
    "Size of DictVectorizer:",
    get_file_size(DATA_DIR / "preprocessed" / "dv.pkl"),
    "KB"
)

Size of DictVectorizer: 150.05859375 KB


## Q3. Train a model with autolog

Modify the  `train.py` script to enable **autologging** with MLflow, execute the script and then launch the MLflow UI to check that the experiment run was properly tracked.

In [9]:
import os
import pickle

import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str | Path):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


# Mlflow setup
mlflow.set_tracking_uri(str(PROJECT_DIR / "mlruns"))
mlflow.set_experiment("hw2_local_experiment")
mlflow.sklearn.autolog()

data_path = DATA_DIR / "preprocessed"
X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))


with mlflow.start_run():

    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)

    print(f"Training complete. RMSE: {rmse:.2f}")
    
    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")

2023/06/12 16:32:46 INFO mlflow.tracking.fluent: Experiment with name 'hw2_local_experiment' does not exist. Creating a new experiment.


Training complete. RMSE: 2.45
Run ID: 54476df742a54cdd80f1cfbf05b3bfc4


In [18]:
# Print the logged parameters and metrics
run = mlflow.get_run(run_id)

print("Parameters:")
[print(f"\t{key}: {value}") for key, value in run.data.params.items()]

print("Metrics:")
[print(f"\t{key}: {value}") for key, value in run.data.metrics.items()]


Parameters:
	min_weight_fraction_leaf: 0.0
	max_depth: 10
	bootstrap: True
	min_samples_leaf: 1
	n_jobs: None
	min_samples_split: 2
	verbose: 0
	warm_start: False
	random_state: 0
	n_estimators: 100
	max_samples: None
	max_leaf_nodes: None
	ccp_alpha: 0.0
	oob_score: False
	max_features: 1.0
	min_impurity_decrease: 0.0
	criterion: squared_error
Metrics:
	training_root_mean_squared_error: 1.9456616836464489
	training_score: 0.2905920668431764
	training_r2_score: 0.2905920668431764
	training_mean_absolute_error: 1.4846553814437824
	mean_squared_error_X_val: 2.453983836538874
	training_mean_squared_error: 3.785599387209934


[None, None, None, None, None, None]

## Q4. Tune model hyperparameters

Now let's try to reduce the validation error by tuning the hyperparameters of the `RandomForestRegressor` using `optuna`.

We have prepared the script `hpo.py` for this exercise.

Make sure that the validation RMSE is logged to the tracking server for each run of the hyperparameter optimization (you will need to add a few lines of code to the `objective` function) and run the script without passing any parameters.

Start the MLflow Server with:

```bash
mlflow server --backend-store-uri sqlite:///mlruns.db --default-artifact-root ./mlruns
```

In [60]:
import pickle

import mlflow
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Set new experiment using sqlite as backend
MLFLOW_TRACKING_URI = PROJECT_DIR / "mlflow.db"
mlflow.set_tracking_uri("sqlite:///" + MLFLOW_TRACKING_URI.as_posix())

# Turn off autologging
mlflow.sklearn.autolog(disable=True)


mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def run_optimization(data_path: str, num_trials: int):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 50, 1),
            "max_depth": trial.suggest_int("max_depth", 1, 20, 1),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10, 1),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4, 1),
            "random_state": 42,
            "n_jobs": -1,
        }

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)

        with mlflow.start_run():
            mlflow.log_params(params)
            mlflow.log_metric("rmse", rmse)

        return rmse

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=num_trials)


data_path = str(DATA_DIR / "preprocessed")
num_trials = 10
run_optimization(data_path, num_trials)

[I 2023-06-06 17:22:47,036] A new study created in memory with name: no-name-a9d4066b-6a4c-4107-b37e-f01352fea9e5
[I 2023-06-06 17:22:47,639] Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.
[I 2023-06-06 17:22:47,766] Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.
[I 2023-06-06 17:22:48,420] Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.
[I 2023-06-06 17:22:48,712] Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 2 with value: 2.44982732

In [61]:
experiment = mlflow.get_experiment_by_name("random-forest-hyperopt")
runs = mlflow.search_runs(experiment.experiment_id)

best_run = runs.loc[runs["metrics.rmse"].idxmin()]
best_run_id = best_run.run_id

print(f"Best run ID: {best_run_id}")
print(f"Best RMSE metrics: {best_run['metrics.rmse']:.2f}")


Best run ID: fd7e32f30f6a434f8684857bc03d2ab9
Best RMSE metrics: 2.45


## Q5. Promote the best model to the model registry

Update the script `register_model.py` so that it selects the model with the lowest RMSE on the test set and registers it to the model registry.

In [62]:
import os
import pickle
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = [
    "max_depth",
    "n_estimators",
    "min_samples_split",
    "min_samples_leaf",
    "random_state",
    "n_jobs",
]


MLFLOW_TRACKING_URI = PROJECT_DIR / "mlflow.db"
mlflow.set_tracking_uri("sqlite:///" + MLFLOW_TRACKING_URI.as_posix())
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)


def run_model_optimizer(data_path: Path, top_n: int = 5):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"],
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"],
    )[0]

    return best_run


best_run = run_model_optimizer(data_path=DATA_DIR / "preprocessed")
mlflow.register_model(
    f"runs:/{best_run.info.run_id}/model",
    "random-forest-model",
)


Registered model 'random-forest-model' already exists. Creating a new version of this model...
2023/06/06 17:22:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-model, version 3
Created version '3' of model 'random-forest-model'.


<ModelVersion: aliases=[], creation_timestamp=1686068574139, current_stage='None', description=None, last_updated_timestamp=1686068574139, name='random-forest-model', run_id='307957faf8a043a6b041cf98c652d961', run_link=None, source='/home/fernando/code/mlops-zoomcamp/mlruns/2/307957faf8a043a6b041cf98c652d961/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [56]:
# Print the best run ID and the best test RMSE
print(f"Best run ID: {best_run.info.run_id}")
print(f"Best test RMSE: {best_run.data.metrics['test_rmse']:.2f}")

Best run ID: b457d6ddac3741e9932fcd0dfd96ec5d
Best test RMSE: 2.29
