In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import cudf
import dask.array as da
from cuml.linear_model import LogisticRegression
from cuml.preprocessing.model_selection import train_test_split
from sklearn.datasets import load_iris

import pandas as pd
import optuna
import numpy as np
import mlflow
import cuml
from cuml.ensemble import RandomForestRegressor
import sklearn
from cuml.metrics import accuracy_score

In [3]:
from contextlib import contextmanager
import time

@contextmanager
def timed(name):
    t0 = time.time()
    yield
    t1 = time.time()
    print("..%-24s:  %8.4f" % (name, t1 - t0))

In [4]:
def objective(trial):
    
    max_depth = trial.suggest_int("max_depth", 5, 15)
    n_estimators = trial.suggest_int("n_estimators", 100, 750)

    classifier = RandomForestRegressor(max_depth=max_depth,
                                       n_estimators=n_estimators)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    
    score = accuracy_score(y_valid, y_pred)
    return score

In [5]:
N_TRIALS = 25
INPUT_FILE = "/home/hyperopt/data/air_par.parquet"

In [6]:
with timed("etl"):
    df = cudf.read_parquet(INPUT_FILE)
    df['Dest'] = df['Dest'].astype('category').cat.codes.astype('float32')
    df['Origin'] = df['Origin'].astype('category').cat.codes.astype('float32')
    df['UniqueCarrier'] = df['UniqueCarrier'].astype('category').cat.codes.astype('float32')
    
    X, y = df.drop(["ArrDelayBinary"], axis=1), df["ArrDelayBinary"].astype('int32')

..etl                     :    1.6268


In [7]:
with timed("hpo"):
    study = optuna.create_study(direction="maximize") # Equivalent to an experiment, a set of trials
    study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2020-06-12 17:00:26,404][0m Finished trial#0 with value: 0.8305467963218689 with parameters: {'max_depth': 11, 'n_estimators': 456}. Best is trial#0 with value: 0.8305467963218689.[0m
[32m[I 2020-06-12 17:00:38,409][0m Finished trial#1 with value: 0.8306936025619507 with parameters: {'max_depth': 10, 'n_estimators': 210}. Best is trial#1 with value: 0.8306936025619507.[0m
[32m[I 2020-06-12 17:01:12,615][0m Finished trial#2 with value: 0.8308894038200378 with parameters: {'max_depth': 14, 'n_estimators': 608}. Best is trial#2 with value: 0.8308894038200378.[0m
[32m[I 2020-06-12 17:01:19,251][0m Finished trial#3 with value: 0.8305041790008545 with parameters: {'max_depth': 9, 'n_estimators': 111}. Best is trial#2 with value: 0.8308894038200378.[0m
[32m[I 2020-06-12 17:01:34,596][0m Finished trial#4 with value: 0.8305754065513611 with parameters: {'max_depth': 10, 'n_estimators': 268}. Best is trial#2 with value: 0.8308894038200378.[0m
[32m[I 2020-06-12 17:02:04,659

..hpo                     :  529.2843


In [8]:
params_max_depth, params_n_estimators = study.trials_dataframe()['params_max_depth'], study.trials_dataframe()['params_n_estimators']

In [9]:
with timed("sequential-call"):
    max_acc = -1
    for i in range(N_TRIALS):
        classifier = RandomForestRegressor(max_depth=params_max_depth[i],
                                           n_estimators=params_n_estimators[i])

        X_train, X_valid, y_train, y_valid = train_test_split(X, y)

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_valid)
        score = accuracy_score(y_valid, y_pred)
        if score > max_acc:
            max_acc = score

..sequential-call         :  529.1178


In [10]:
print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  25
Best trial:
  Value:  0.831117570400238
  Params: 
    max_depth: 12
    n_estimators: 471


In [11]:
# def mlflow_callback(study, trial):
#     trial_value = trial.value if trial.value is not None else float("nan")
#     with mlflow.start_run(run_name=study.study_name):
#         mlflow.log_params(trial.params)
#         mlflow.log_metrics({"accuracy": trial_value})

In [12]:
# with timed("mlflow-gpu"):
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=N_TRIALS, timeout=600, callbacks=[mlflow_callback])


In [13]:
# CPU with 750 estimators max does not finish running after hours.
# def objective_cpu(trial):
    
#     max_depth = trial.suggest_int("max_depth", 5, 15)
#     n_estimators = trial.suggest_int("n_estimators", 100, 750)

#     classifier = sklearn.ensemble.RandomForestRegressor(max_depth=max_depth,
#                                        n_estimators=n_estimators)

#     X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_, y_)
    
#     classifier.fit(X_train, y_train)
#     y_pred = classifier.predict(X_valid)
    
#     score = accuracy_score(y_valid, y_pred)
#     return score

In [14]:
# with timed("cpu-etl"):
#     df_pd = pd.read_parquet(INPUT_FILE)
#     df_pd['Dest'] = df_pd['Dest'].astype('category').cat.codes.astype('float32')
#     df_pd['Origin'] = df_pd['Origin'].astype('category').cat.codes.astype('float32')
#     df_pd['UniqueCarrier'] = df_pd['UniqueCarrier'].astype('category').cat.codes.astype('float32')
    
#     X_, y_ = df_pd.drop(["ArrDelayBinary"], axis=1), df_pd["ArrDelayBinary"].astype('int32')
    
# with timed("cpu-hpo"):
#     study = optuna.create_study(direction="maximize") # Equivalent to an experiment, a set of trials
#     study.optimize(objective_cpu, n_trials=N_TRIALS)

In [15]:
with timed("mlflow-cpu"):
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_cpu, n_trials=N_TRIALS, timeout=600, callbacks=[mlflow_callback])

NameError: name 'objective_cpu' is not defined

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))