In [1]:
import os
import pickle
import click
import mlflow
import optuna

from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import utils

In [3]:
# MLflow tracking server. MLFLOW_TRACKING_URI is honoured when set so the
# notebook can be pointed at another server without editing this cell; the
# literal below remains the default.
TRACKING_URL = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "http://ec2-18-142-183-214.ap-southeast-1.compute.amazonaws.com:5050",
)
# Name of the MLflow experiment the tuning runs are recorded under.
EXPERIMENT_NAME = "random-forest-hyperopt"
# Directory holding the pickled splits (train.pkl / val.pkl).
data_path = "./output/"

In [4]:
# Point MLflow at the tracking server and select the experiment that the
# runs started later in this notebook are recorded under.
mlflow.set_tracking_uri(TRACKING_URL)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://s3-mlflow-artifacts-storage/mlflow/6', creation_time=1685087914152, experiment_id='6', last_update_time=1685087914152, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [5]:
# Resolve both pickle locations first, then load them. Each file unpacks into
# two objects (presumably features and target, judging by the names).
train_pkl = os.path.join(data_path, "train.pkl")
val_pkl = os.path.join(data_path, "val.pkl")

X_train, y_train = utils.load_pickle(train_pkl)
X_val, y_val = utils.load_pickle(val_pkl)

In [6]:
def objective(trial):
    """Optuna objective: train one random forest and return its validation RMSE.

    Each call opens its own MLflow run, samples hyperparameters from
    ``trial``, fits a ``RandomForestRegressor`` on the notebook-level
    ``X_train``/``y_train``, scores it on ``X_val``/``y_val``, and logs the
    parameters, the ``rmse`` metric and the fitted model to that run.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        float: Root mean squared error on the validation split (the study
        minimizes this value).
    """
    with mlflow.start_run():

        mlflow.set_tag("developer", "nelsonlin")
        mlflow.set_tag("model", "RandomForest")

        # Integer search space. ``step`` is keyword-only in current Optuna and
        # 1 is its default, so it is no longer passed positionally.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 50),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
            'random_state': 42,
            'n_jobs': -1
        }

        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # sqrt(MSE) rather than ``squared=False``: that flag was deprecated and
        # later removed from scikit-learn, while this form works on every
        # release. Assumes a single-output target — TODO confirm.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)

        mlflow.log_metric("rmse", rmse)

        # Store the fitted model under the "model" artifact path so that a
        # "runs:/<run_id>/model" URI for this run resolves to real artifacts.
        mlflow.sklearn.log_model(rf, artifact_path="model")

    return rmse

In [7]:
# Trial budget; passed to study.optimize as n_trials.
num_trials = 10

In [8]:
# TPE sampler with a fixed seed so repeated studies draw the same suggestions.
sampler = TPESampler(seed=42)

In [9]:
# Study that minimizes the value returned by the objective, using the seeded
# sampler defined above.
study = optuna.create_study(direction="minimize", sampler=sampler)

[32m[I 2023-05-26 16:23:11,390][0m A new study created in memory with name: no-name-411f383f-9879-4f8f-9c0c-482a6bafede7[0m


In [10]:
# Run the search: the objective is evaluated for num_trials trials.
study.optimize(objective, n_trials=num_trials)

[32m[I 2023-05-26 16:23:13,620][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-26 16:23:14,365][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-26 16:23:15,515][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-05-26 16:23:16,364][0m Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-05-26 16:23:17,212][0m Trial 4 finished

In [55]:
from mlflow.tracking import MlflowClient

In [56]:
# Client used below to query experiments and runs. No URI is passed here, so
# it presumably relies on the tracking URI configured earlier — verify.
client = MlflowClient()

In [57]:
# Look the experiment up by name; its experiment_id is needed for the run search.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

In [58]:
# Ask the server for just one run of this experiment: the one that sorts
# first when ordered by ascending rmse.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.rmse ASC"],
    max_results=1,
)

In [59]:
# The search was capped at one result ordered by ascending rmse, so the first
# element is the lowest-rmse run.
best_run = runs[0]

In [60]:
# Display the selected run (metrics, params, tags and run info).
best_run

<Run: data=<RunData: metrics={'rmse': 2.449827329704216}, params={'max_depth': '15',
 'min_samples_leaf': '4',
 'min_samples_split': '2',
 'n_estimators': '34',
 'n_jobs': '-1',
 'random_state': '42'}, tags={'developer': 'nelsonlin',
 'mlflow.runName': 'youthful-bug-321',
 'mlflow.source.git.commit': '9b60fcf7ac3f82fa589a64091376d0663262f8c6',
 'mlflow.source.name': '/opt/homebrew/Caskroom/miniforge/base/envs/mlflow/lib/python3.10/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'nelsonlin',
 'model': 'RandomForest'}>, info=<RunInfo: artifact_uri='s3://s3-mlflow-artifacts-storage/mlflow/6/3f39350ee30040579c7edb8f64bf471c/artifacts', end_time=1685089395416, experiment_id='6', lifecycle_stage='active', run_id='3f39350ee30040579c7edb8f64bf471c', run_name='youthful-bug-321', run_uuid='3f39350ee30040579c7edb8f64bf471c', start_time=1685089394367, status='FINISHED', user_id='nelsonlin'>>

In [61]:
# Register the best run's model under a fixed registry name.
# NOTE(review): the URI assumes the run logged a model under the "model"
# artifact path — confirm before relying on the registered version.
run_id = best_run.info.run_id
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="green-taxi-tip-amount-best-model")

Successfully registered model 'green-taxi-tip-amount-best-model'.
2023/05/26 16:50:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: green-taxi-tip-amount-best-model, version 1
Created version '1' of model 'green-taxi-tip-amount-best-model'.


<ModelVersion: aliases=[], creation_timestamp=1685091023828, current_stage='None', description='', last_updated_timestamp=1685091023828, name='green-taxi-tip-amount-best-model', run_id='3f39350ee30040579c7edb8f64bf471c', run_link='', source='s3://s3-mlflow-artifacts-storage/mlflow/6/3f39350ee30040579c7edb8f64bf471c/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>