This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for regression tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML.

In [1]:
from tpot import TPOTRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Synthetic dataset configuration
NSAMPLES = 50000
NFEATURES = 50
SEED = 12  # one seed for both data generation and the train/test split

# For cuML with TPOT, you must use CPU data (such as NumPy arrays)
X, y = make_regression(
    n_samples=NSAMPLES,
    n_features=NFEATURES,
    n_informative=NFEATURES,  # all features are informative
    random_state=SEED,
    noise=200,
)

# Cast the feature matrix to single precision; the target is left as generated.
X = X.astype("float32")

# Use SEED here instead of repeating the literal 12, so changing SEED
# updates the split together with the generated data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting).

In [3]:
# TPOT search settings
GENERATIONS = 5
POP_SIZE = 100
CV = 5

# Collect the constructor arguments first, then build the regressor from them.
tpot_settings = dict(
    generations=GENERATIONS,
    population_size=POP_SIZE,
    random_state=SEED,
    config_dict="TPOT cuML",
    n_jobs=1,  # cuML requires n_jobs=1
    cv=CV,
    verbosity=2,
)
tpot = TPOTRegressor(**tpot_settings)

# Run the pipeline search on the training split.
tpot.fit(X_train, y_train)

# Predict on the held-out split and report R^2.
preds = tpot.predict(X_test)
test_r2 = r2_score(y_test, preds)
print(test_r2)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: -40245.878012401336
Generation 2 - Current best internal CV score: -40245.878012401336
Generation 3 - Current best internal CV score: -40245.878012401336
Generation 4 - Current best internal CV score: -40245.87130877891
Generation 5 - Current best internal CV score: -40245.87130877891
Best pipeline: Ridge(RobustScaler(input_matrix))
0.8281615479382644


In [4]:
# Write the exported pipeline to a Python file.
pipeline_path = 'tpot_regression_cuml_pipeline.py'
tpot.export(pipeline_path)

# Call export() again without a path and print what it returns.
pipeline_source = tpot.export()
print(pipeline_source)

import numpy as np
import pandas as pd
from cuml.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=12)

# Average CV score on the training set was: -40245.87130877891
exported_pipeline = make_pipeline(
    RobustScaler(),
    Ridge()
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 12)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

