# Q1

In [1]:
!pip install mlflow

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [2]:
# Print the version of the installed MLflow command-line tool.
!mlflow --version

mlflow, version 2.3.2


# Q2

In [3]:
# Run the preprocessing script with `data` as the raw-data folder and
# `data/cleaned` as the destination. Later cells read train.pkl, val.pkl and
# test.pkl from `data/cleaned` (presumably written by this script - confirm).
!python preprocess_data.py --raw_data_path data --dest_path data/cleaned

In [13]:
# Show the allocated size (-s) of dv.pkl in human-readable units (-h).
!ls -sh data/cleaned/dv.pkl

152K data/cleaned/dv.pkl


# Q3

In [4]:
import os
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow

# Send all tracking calls to the MLflow server at this address; it must be
# running before the cells below execute.
# A previous `sqlite:///mlflow.db` setting was overridden immediately by this
# call and never took effect, so only the server URI is kept.
# NOTE(review): presumably the server itself uses mlflow.db as its backend
# store - confirm against how the server is launched.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [5]:
def load_pickle(filename: str):
    """Deserialize and return the object stored in the pickle file ``filename``.

    The file is opened in binary mode and is closed again before the loaded
    object is handed back. Unpickling can execute arbitrary code, so only
    point this at files you produced yourself.
    """
    with open(filename, "rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

# Name of the MLflow experiment that collects the baseline training run;
# it is made the active experiment for the runs started below.
TEST_EXPERIMENT = "nyc-taxi-experiment"
mlflow.set_experiment(experiment_name=TEST_EXPERIMENT)

def run_train(data_path: str = 'data/cleaned', max_depth: int = 10):
    """Train a random forest and log its validation RMSE to MLflow.

    Args:
        data_path: Folder containing ``train.pkl`` and ``val.pkl``; each file
            is unpickled into a ``(features, target)`` pair.
        max_depth: Maximum tree depth passed to ``RandomForestRegressor``.
            Defaults to 10, the value that used to be hard-coded.

    The run records the ``rmse`` metric and the ``max_depth`` parameter in
    the currently active MLflow experiment. Nothing is returned.
    """
    # Load the data before opening the run so that a missing or unreadable
    # file does not leave an empty, failed run behind in the experiment.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        rf = RandomForestRegressor(max_depth=max_depth, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # RMSE as the square root of the MSE: the `squared=False` shortcut is
        # deprecated in newer scikit-learn releases and later removed.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)
        mlflow.log_param('max_depth', max_depth)

# Baseline run on the preprocessed data.
run_train(data_path='data/cleaned')

2023/05/28 16:01:01 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


# Q4

In [6]:
import optuna

from optuna.samplers import TPESampler

# Experiment that receives one MLflow run per hyperparameter-search trial;
# it becomes the active experiment from here on.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
mlflow.set_experiment(experiment_name=HPO_EXPERIMENT_NAME)

def run_optimization(data_path: str = 'data/cleaned', num_trials: int = 10):
    """Search random-forest hyperparameters with Optuna, logging each trial to MLflow.

    Args:
        data_path: Folder containing ``train.pkl`` and ``val.pkl``; each file
            is unpickled into a ``(features, target)`` pair.
        num_trials: Number of Optuna trials to run.

    Every trial is executed inside its own MLflow run, which records the
    sampled parameters and the validation ``rmse``. The study minimizes that
    RMSE using a TPE sampler seeded with 42. Nothing is returned.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(trial):
        """Fit one forest with trial-sampled settings and return its validation RMSE."""
        with mlflow.start_run():
            # The step is left at its default of 1; passing it positionally
            # is deprecated by Optuna, which expects it as a keyword.
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 10, 50),
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
                'random_state': 42,
                'n_jobs': -1
            }

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            # RMSE as the square root of the MSE: the `squared=False` shortcut
            # is deprecated in newer scikit-learn releases and later removed.
            rmse = mean_squared_error(y_val, y_pred) ** 0.5
            mlflow.log_metric("rmse", rmse)
            mlflow.log_params(params)
        return rmse

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=num_trials)

# Run the search with the default data folder and trial budget.
run_optimization(data_path='data/cleaned', num_trials=10)


2023/05/28 16:01:06 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.
[32m[I 2023-05-28 16:01:06,986][0m A new study created in memory with name: no-name-38230248-7845-41cd-bfa2-aec735c16f54[0m
[32m[I 2023-05-28 16:01:07,504][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-28 16:01:07,772][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-28 16:01:08,223][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-05-28 16:01

# Q5

In [7]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

# Experiment that collects the retrained top candidates; the same string is
# reused later as the registered model's name.
EXPERIMENT_NAME = "random-forest-best-models"

# Hyperparameters that are cast to int before being passed to the forest.
RF_PARAMS = [
    'max_depth',
    'n_estimators',
    'min_samples_split',
    'min_samples_leaf',
    'random_state',
    'n_jobs',
]

mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
# Enable scikit-learn autologging for the runs started from here on.
# NOTE(review): the registration step expects a `model` artifact in each run,
# presumably produced by autologging - confirm.
mlflow.sklearn.autolog()

def train_and_log_model(data_path, params):
    """Retrain a random forest from logged parameters and record its RMSEs.

    Args:
        data_path: Folder containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``; each file is unpickled into a
            ``(features, target)`` pair.
        params: Mapping of hyperparameter names to values. It must contain
            every name in ``RF_PARAMS``; those entries are converted with
            ``int()``. The mapping itself is left unmodified.

    Raises:
        KeyError: If one of the ``RF_PARAMS`` names is missing from ``params``.
        ValueError: If one of those values cannot be converted to ``int``.

    The model is fitted inside a new MLflow run, which receives the
    ``val_rmse`` and ``test_rmse`` metrics. Nothing is returned.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        # Work on a copy so the caller's mapping (e.g. a run's stored params)
        # is not rewritten in place by the int conversion.
        rf_params = dict(params)
        for param in RF_PARAMS:
            rf_params[param] = int(rf_params[param])

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets.
        # RMSE as the square root of the MSE: the `squared=False` shortcut is
        # deprecated in newer scikit-learn releases and later removed.
        val_rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
        mlflow.log_metric("test_rmse", test_rmse)



# Folder holding the pickled datasets and the number of search candidates
# to retrain.
data_path: str = 'data/cleaned'
top_n: int = 5

client = MlflowClient()

# Retrieve the top_n model runs (lowest validation RMSE first) and log the models
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=top_n,
    order_by=["metrics.rmse ASC"]
)
for run in runs:
    train_and_log_model(data_path=data_path, params=run.data.params)

# Select the model with the lowest test RMSE; only the first row is needed,
# so ask the server for a single result.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
best_runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"]
)
if not best_runs:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise RuntimeError(
        f"No runs found in experiment '{EXPERIMENT_NAME}'; nothing to register."
    )
best_run = best_runs[0]

# Register the best model
run_id = best_run.info.run_id
model_uri = f'runs:/{run_id}/model'
print(f'model_uri {model_uri}')
mlflow.register_model(model_uri=model_uri, name=EXPERIMENT_NAME)


2023/05/28 16:01:09 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'random-forest-best-models'.
2023/05/28 16:01:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-best-models, version 1


model_uri runs:/f1ea05ec44ca48aaa0e614a609c66980/model


Created version '1' of model 'random-forest-best-models'.


<ModelVersion: aliases=[], creation_timestamp=1685282476807, current_stage='None', description='', last_updated_timestamp=1685282476807, name='random-forest-best-models', run_id='f1ea05ec44ca48aaa0e614a609c66980', run_link='', source='mlflow-artifacts:/3/f1ea05ec44ca48aaa0e614a609c66980/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>