In [1]:
from cuml.preprocessing.model_selection import train_test_split

import cuml
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from cuml.datasets.classification import make_classification

from joblib import parallel_backend

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# By default the local CUDA cluster uses every GPU on this host.
cluster = LocalCUDACluster(
    threads_per_worker=1,
    ip="",
    dashboard_address="8002",
)
c = Client(cluster)

# Performance optimization
n_streams = 8

# Ask the client which workers are connected; the keys are treated as the worker list.
workers = c.has_what().keys()
n_workers = len(workers)

# Last expression of the cell: render the client summary.
c



0,1
Client  Scheduler: tcp://172.17.0.2:40823  Dashboard: http://172.17.0.2:8002/status,Cluster  Workers: 2  Cores: 2  Memory: 49.16 GB


In [2]:
from contextlib import contextmanager
import time

@contextmanager
def timed(name):
    """Context manager that prints how long the enclosed block took.

    Parameters
    ----------
    name : str
        Label printed in front of the elapsed time (left-justified and
        padded to 24 characters by the format string).

    Yields
    ------
    None

    Notes
    -----
    The timing line is printed even if the enclosed block raises; the
    exception is not swallowed and still propagates to the caller.
    """
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Report in `finally` so a failing body still shows how long it ran.
        elapsed = time.perf_counter() - t0
        print("..%-24s:  %8.4f" % (name, elapsed))

In [3]:
def train_and_eval():
    """Fit a random forest on a split of the global X/y and score it.

    Takes no arguments: the feature matrix ``X`` and labels ``y`` are read
    from the notebook's global namespace.

    Returns
    -------
    The value returned by ``accuracy_score`` for the validation split.
    """
    model = RandomForestClassifier()

    # Fixed random_state so every call splits the data the same way.
    split = train_test_split(X, y, random_state=77)
    features_fit, features_holdout, labels_fit, labels_holdout = split

    model.fit(features_fit, labels_fit)
    predictions = model.predict(features_holdout)
    return accuracy_score(labels_holdout, predictions)

In [4]:
# Synthetic binary-classification dataset shared by every benchmark below.
X, y = make_classification(
    n_samples=10_000_000,
    n_features=10,
    n_classes=2,
)
import joblib

# Ten independent fits dispatched through joblib's dask backend.
with timed("dask-backend"), parallel_backend("dask", n_jobs=n_workers, client=c):
    joblib.Parallel(verbose=100)(
        joblib.delayed(train_and_eval)() for _ in range(10)
    )

[Parallel(n_jobs=2)]: Using backend DaskDistributedBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   18.0s
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:   22.1s
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:   25.7s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   30.4s
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:   36.0s
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:   41.7s
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:   46.8s
[Parallel(n_jobs=2)]: Done   8 out of  10 | elapsed:   52.4s remaining:   13.1s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.0min finished
..dask-backend            :   61.4132


In [5]:
# Same ten fits, this time dispatched through joblib's loky backend.
with timed("loky-backend"), parallel_backend("loky", n_jobs=n_workers):
    joblib.Parallel(verbose=100)(
        joblib.delayed(train_and_eval)() for _ in range(10)
    )

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   17.2s
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:   17.6s
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:   31.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   31.9s
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:   46.2s
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:   46.6s
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done   8 out of  10 | elapsed:  1.0min remaining:   15.3s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.3min finished
..loky-backend            :   75.7332


In [6]:
# Baseline: run the ten fits one after another without joblib.
with timed("seq"):
    for i in range(10):
        # Only the most recent score is kept in `s`.
        s = train_and_eval()

..seq                     :   76.3795
