In [2]:
import os
import pickle
import click
import mlflow
import optuna

from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import utils

In [6]:
# MLflow tracking server. The standard MLFLOW_TRACKING_URI environment variable
# takes precedence so the notebook can be pointed at another server without
# editing code; the original EC2 endpoint remains the fallback when it is
# unset or empty.
TRACKING_URL = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or "http://ec2-18-142-183-214.ap-southeast-1.compute.amazonaws.com:5050"
)
# Name of the MLflow experiment under which the tuning runs are recorded.
EXPERIMENT_NAME = "random-forest-hyperopt"
# Directory containing the train.pkl / val.pkl files loaded further down.
data_path = "./output/"

In [4]:
# Point the MLflow client at the tracking server, then select the experiment
# that the runs started later in this notebook are recorded under.
mlflow.set_tracking_uri(TRACKING_URL)
# Kept as the cell's last expression so the returned Experiment is displayed.
mlflow.set_experiment(EXPERIMENT_NAME)

2023/05/26 15:58:34 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://s3-mlflow-artifacts-storage/mlflow/6', creation_time=1685087914152, experiment_id='6', last_update_time=1685087914152, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [7]:
# Build both pickle paths first, then unpack each file into the two objects
# that are later used as model inputs (X_*) and targets (y_*).
train_file = os.path.join(data_path, "train.pkl")
val_file = os.path.join(data_path, "val.pkl")
X_train, y_train = utils.load_pickle(train_file)
X_val, y_val = utils.load_pickle(val_file)

In [10]:
def objective(trial):
    """Train one random forest for an Optuna trial and return its validation RMSE.

    Every call opens its own MLflow run and records the developer/model tags,
    the complete hyperparameter set and the resulting ``rmse`` metric, so each
    trial can be compared and reproduced from the tracking server.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Root mean squared error of the fitted model on the validation data.
    """
    with mlflow.start_run():

        mlflow.set_tag("developer", "nelsonlin")
        mlflow.set_tag("model", "RandomForest")

        # The step of 1 is Optuna's default, so it is simply omitted; passing
        # it positionally is deprecated in recent Optuna releases.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 50),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
            'random_state': 42,
            'n_jobs': -1
        }
        # Store the sampled configuration with the run; without it the logged
        # metric cannot be traced back to the hyperparameters that produced it.
        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        # Take the root explicitly: the `squared=False` flag is deprecated and
        # removed in newer scikit-learn. The result is the same for a
        # single-output target (assumed here -- TODO confirm y_val is 1-D).
        rmse = mean_squared_error(y_val, y_pred) ** 0.5

        mlflow.log_metric("rmse", rmse)

    return rmse

In [13]:
# Number of trials handed to study.optimize; each trial fits one model.
num_trials = 10

In [11]:
# TPE sampler with a fixed seed (intended to make the suggested
# hyperparameter sequence repeatable between runs of the notebook).
sampler = TPESampler(seed=42)

In [12]:
# Study that minimizes the value returned by the objective (validation RMSE),
# drawing hyperparameters from the sampler configured above.
study = optuna.create_study(direction="minimize", sampler=sampler)

[I 2023-05-26 16:06:40,702] A new study created in memory with name: no-name-1e40d7fc-1cc9-4550-8dd7-181005fd9c66


In [14]:
# Run the search: the objective is evaluated num_trials times, and every
# evaluation records its own MLflow run.
study.optimize(objective, n_trials=num_trials)

[I 2023-05-26 16:07:15,858] Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.
[I 2023-05-26 16:07:24,913] Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.
[I 2023-05-26 16:07:34,803] Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.
[I 2023-05-26 16:07:50,060] Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 2 with value: 2.449827329704216.
[I 2023-05-26 16:08:37,453] Trial 4 finished

In [18]:
# Ask the tracking server for experiments whose name equals EXPERIMENT_NAME;
# the result is a list-like collection that the next cell indexes.
experiment_results = mlflow.search_experiments(filter_string=f"name='{EXPERIMENT_NAME}'")

In [19]:
# Keep only the first match. The isinstance guard makes this cell safe to
# re-run: once the name has been rebound to a single Experiment there is
# nothing left to index, so a second execution is a no-op instead of an error.
if isinstance(experiment_results, list):
    if not experiment_results:
        # Same exception type as indexing an empty list, with a clearer message.
        raise IndexError(f"no MLflow experiment named {EXPERIMENT_NAME!r} was found")
    experiment_results = experiment_results[0]

In [23]:
# Display the selected experiment's metadata as the cell output.
experiment_results

<Experiment: artifact_location='s3://s3-mlflow-artifacts-storage/mlflow/6', creation_time=1685087914152, experiment_id='6', last_update_time=1685087914152, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>