In [1]:
import cudf
import dask.array as da
from cuml.preprocessing.model_selection import train_test_split
from sklearn.datasets import load_iris

import pandas as pd
import optuna
import numpy as np
import mlflow
import cuml
from cuml.ensemble import RandomForestClassifier
import sklearn
from cuml.metrics import accuracy_score

import random
import time

from joblib import parallel_backend

In [2]:
from contextlib import contextmanager
import time

@contextmanager
def timed(name):
    t0 = time.time()
    yield
    t1 = time.time()
    print("..%-24s:  %8.4f" % (name, t1 - t0))

In [3]:
N_TRIALS = 10
INPUT_FILE = "/home/hyperopt/data/air_par.parquet"
n_gpu = 2

In [4]:
import time

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:46841  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 49.16 GB


In [5]:
df = cudf.read_parquet(INPUT_FILE)
X, y = df.drop(["ArrDelayBinary"], axis=1), df["ArrDelayBinary"].astype('int32')

In [6]:
def print_results(study):
    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [7]:
def objective(trial):
    # Please write actual objective function here.
    max_depth = trial.suggest_int("max_depth", 5, 7)
    n_estimators = trial.suggest_int("n_estimators", 100, 500)

    classifier = RandomForestClassifier(max_depth=max_depth,
                         n_estimators=n_estimators)


    X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    
    return score

In [8]:
with timed("multi-gpu"):
    study = optuna.create_study(storage="sqlite:///optuna_db.db", direction="maximize",
                                load_if_exists=True)
    with parallel_backend("dask", n_jobs=n_gpu):
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=n_gpu)

[I 2020-06-22 22:03:31,060] A new study created with name: no-name-f04f06e5-8ac7-4f65-9a45-916563e28f48


..multi-gpu               :  108.3841


In [9]:
print_results(study)

Number of finished trials:  10
Best trial:
  Value:  0.830996572971344
  Params: 
    max_depth: 7
    n_estimators: 194


In [10]:
params_max_depth, params_n_estimators = study.trials_dataframe()['params_max_depth'], study.trials_dataframe()['params_n_estimators']

In [11]:
def seq_call(max_depth, n_estimators):
    classifier = RandomForestClassifier(max_depth=max_depth, n_estimators = n_estimators)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    return score, max_depth, n_estimators

In [17]:
scores = []
from joblib import Parallel, delayed
with timed("no-optuna-call"):
    with parallel_backend("dask", n_jobs=n_gpu):
        results = Parallel()(delayed(seq_call)(max_depth=params_max_depth[i],
                     n_estimators=params_n_estimators[i]) for i in range(N_TRIALS))
    print(results)

[(0.8307356238365173, 6, 375), (0.8306080102920532, 6, 396), (0.831063985824585, 7, 337), (0.8309422135353088, 5, 311), (0.831070601940155, 7, 324), (0.830725371837616, 5, 238), (0.8307965993881226, 5, 308), (0.830644428730011, 6, 183), (0.8310064077377319, 7, 469), (0.8308534026145935, 5, 488)]
..no-optuna-call          :   99.0706


In [12]:
def mlflow_callback(study, trial):
    trial_value = trial.value if trial.value is not None else float("nan")
    with mlflow.start_run(run_name=study.study_name):
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"accuracy": trial_value})

In [20]:
with timed("mlflow-callback"):
    study = optuna.create_study(storage="sqlite:///mlflow_db.db", direction="maximize",
                               load_if_exists=True)
    with parallel_backend("dask", n_jobs=n_gpu):
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=n_gpu, timeout=600, callbacks=[mlflow_callback])

[I 2020-06-22 21:03:03,693] A new study created with name: no-name-ac871ab4-5e43-4b54-9dae-afd1d5224bfe


..mlflow-callback         :   93.4839


In [21]:
print_results(study)

Number of finished trials:  10
Best trial:
  Value:  0.8313869833946228
  Params: 
    max_depth: 7
    n_estimators: 109


In [7]:
from cuml.dask.ensemble import RandomForestClassifier as dask_RF
def objective_mg(trial):
    # Please write actual objective function here.
    max_depth = trial.suggest_int("max_depth", 5, 7)
    n_estimators = trial.suggest_int("n_estimators", 100, 500)

    import dask_cudf 
    
    classifier = dask_RF(max_depth=max_depth,
                         n_estimators=n_estimators)


    X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    X_train = dask_cudf.from_cudf(X_train, npartitions=2)
    X_valid = dask_cudf.from_cudf(X_valid, npartitions=2)
    
    y_train = dask_cudf.from_cudf(y_train, npartitions=2)
    y_valid = dask_cudf.from_cudf(y_valid, npartitions=2)
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    return score


In [9]:
# TODO : Setup postgres and try this 
# with timed("multi-GPU-estimators"):
#     study = optuna.create_study(direction="maximize",
#             storage="sqlite:///as.db")
#     study.optimize(objective_mg, n_trials=N_TRIALS, n_jobs=n_gpu,
#                   )

[I 2020-06-22 22:10:42,471] A new study created with name: no-name-d3d5a7c2-030e-4adf-9401-ad5cc085690c
[W 2020-06-22 22:10:48,408] Setting status of trial#0 as TrialState.FAIL because of the following error: AssertionError('Attempting to create treelite from un-fit forest.',)
Traceback (most recent call last):
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/optuna/study.py", line 734, in _run_trial
    result = func(trial)
  File "<ipython-input-7-9f13c1f783dd>", line 22, in objective_mg
    y_pred = classifier.predict(X_valid)
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/randomforestclassifier.py", line 305, in predict
    delayed=delayed)
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/randomforestclassifier.py", line 311, in predict_using_fil
    self.local_model = self._concat_treelite_models()
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/base.py", line 115, in _concat_treeli

AssertionError: Attempting to create treelite from un-fit forest.

[W 2020-06-22 22:10:53,357] Setting status of trial#2 as TrialState.FAIL because of the following error: AssertionError('Attempting to create treelite from un-fit forest.',)
Traceback (most recent call last):
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/optuna/study.py", line 734, in _run_trial
    result = func(trial)
  File "<ipython-input-7-9f13c1f783dd>", line 22, in objective_mg
    y_pred = classifier.predict(X_valid)
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/randomforestclassifier.py", line 305, in predict
    delayed=delayed)
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/randomforestclassifier.py", line 311, in predict_using_fil
    self.local_model = self._concat_treelite_models()
  File "/opt/conda/envs/rapids/lib/python3.6/site-packages/cuml/dask/ensemble/base.py", line 115, in _concat_treelite_models
    mod_bytes = self.client.compute(model_serialized_futures, sync=True)
  File "/opt/conda/en

In [None]:
# # CPU with 750 estimators max does not finish running after hours.
# def objective_cpu(trial):
    
#     max_depth = trial.suggest_int("max_depth", 5, 15)
#     n_estimators = trial.suggest_int("n_estimators", 100, 750)

#     classifier = sklearn.ensemble.RandomForestRegressor(max_depth=max_depth,
#                                        n_estimators=n_estimators)

#     X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_, y_)
    
#     classifier.fit(X_train, y_train)
#     y_pred = classifier.predict(X_valid)
    
#     score = accuracy_score(y_valid, y_pred)
#     return score

In [None]:
# with timed("cpu-etl"):
#     df_pd = pd.read_parquet(INPUT_FILE)
#     X_, y_ = df_pd.drop(["ArrDelayBinary"], axis=1), df_pd["ArrDelayBinary"].astype('int32')
    
# with timed("cpu-hpo"):
#     study = optuna.create_study(direction="maximize") # Equivalent to an experiment, a set of trials
#     study.optimize(objective_cpu, n_trials=N_TRIALS, n_jobs=-1)