In [1]:
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/tensorrt_torchtrt_efficientnet/nvidia_logo.png" width="90px">

# Distributed Hyperparameter Tuning: Optuna + Spark Dataframes


This demo demonstrates distributed hyperparameter tuning for XGBoost using Spark Dataframes.  
We implement best practices to precompute data and maximize computations on the GPU.  

Reference: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

#### Note:
Before running, please make sure you've followed the relevant [setup instructions](../README.md) for your environment (standalone or databricks).


In [2]:
from typing import Iterable, List, Dict, Optional, Union, Sequence, Any
import math
import os
import requests
import pandas as pd
import optuna
from optuna.samplers import TPESampler
import xgboost as xgb
from pyspark.sql import SparkSession, DataFrame
from pyspark import TaskContext, SparkConf
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType, BooleanType

### Download the dataset

We'll use the [red wine quality dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv) to regress wine quality based on features such as acidity, sugar content, etc.  

**Note**: This example uses a small dataset for demonstration purposes. The performance advantages of distributed training are best realized with large datasets and computational workloads.

In [3]:
cwd = os.getcwd()
os.mkdir(os.path.join(cwd, "data")) if not os.path.exists(os.path.join(cwd, "data")) else None
filepath = os.path.join(cwd, "data", "winequality-red.csv")

In [4]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

response = requests.get(url)
if response.status_code == 200:
    with open(filepath, "wb") as f:
        f.write(response.content)
    print(f"File downloaded and saved to {filepath}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

File downloaded and saved to /home/rishic/Code/myforks/spark-rapids-examples/examples/ML+DL-Examples/Optuna-Spark/optuna-examples/data/winequality-red.csv


## Part 1. Running Optuna locally

In [5]:
import cudf
from cuml.metrics.regression import mean_squared_error
from cuml.model_selection import train_test_split

### Prepare data

In [6]:
data = cudf.read_csv(filepath, delimiter=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Prepare the train/validation sets. Precompute the Quantile DMatrix, which is used by histogram-based tree methods to save memory.

In [7]:
X = data.iloc[:, :-1].values
y = data["quality"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
Xy_train_qdm = xgb.QuantileDMatrix(X_train, y_train)  # Precompute Quantile DMatrix to avoid repeated quantization every trial.

### Objective function

We define the objective and a hyperparameter search space to optimize via the `trial.suggest_` methods.  

In each trial, new hyperparameters will be suggested based on previous results. See [optuna.trial.Trial](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html) API for a full list of functions.

In [8]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "tree_method": "gpu_hist",
        "device": "cuda",
    }

    booster = xgb.train(params=params, dtrain=Xy_train_qdm, num_boost_round=trial.suggest_int("num_boost_round", 100, 500))
    predictions = booster.inplace_predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False).get()
    
    return rmse   

Create the study and optimize. By default, the study results will be stored in memory.

In [9]:
study = optuna.create_study(study_name="optuna-xgboost-local", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=10)

[I 2024-12-11 23:47:48,356] A new study created in memory with name: optuna-xgboost-local


[I 2024-12-11 23:47:48,724] Trial 0 finished with value: 0.6377619522504244 and parameters: {'learning_rate': 0.005611516415334507, 'max_depth': 10, 'subsample': 0.7453942447208348, 'colsample_bytree': 0.6187255599871848, 'min_child_weight': 4, 'num_boost_round': 162}. Best is trial 0 with value: 0.6377619522504244.
[I 2024-12-11 23:47:49,676] Trial 1 finished with value: 0.6703788974319568 and parameters: {'learning_rate': 0.0013066739238053278, 'max_depth': 9, 'subsample': 0.6210592611560484, 'colsample_bytree': 0.7226689489062432, 'min_child_weight': 1, 'num_boost_round': 488}. Best is trial 0 with value: 0.6377619522504244.
[I 2024-12-11 23:47:49,819] Trial 2 finished with value: 0.6181751362616256 and parameters: {'learning_rate': 0.04622589001020832, 'max_depth': 3, 'subsample': 0.2227337188467456, 'colsample_bytree': 0.22423428436076215, 'min_child_weight': 7, 'num_boost_round': 310}. Best is trial 2 with value: 0.6181751362616256.
[I 2024-12-11 23:47:49,942] Trial 3 finished wi

In [10]:
trial = study.best_trial
print("Best RMSE: ", trial.value)
print("Best hyperparameters: ", trial.params)

Best RMSE:  0.6060010014477214
Best hyperparameters:  {'learning_rate': 0.0123999678368461, 'max_depth': 2, 'subsample': 0.9711053963763306, 'colsample_bytree': 0.7863761821930588, 'min_child_weight': 19, 'num_boost_round': 458}


## Part 2. Distributed Optuna on Spark 

### PySpark

For standalone users, we need to create the Spark session with the Spark-Rapids plugin. For Databricks users, the Spark session will be preconfigured and this cell can be skipped.

In [11]:
def get_rapids_jar():
    SPARK_RAPIDS_VERSION = "25.10.0"
    rapids_jar = f"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar"
    if not os.path.exists(rapids_jar):
        print("Downloading Spark Rapids jar")
        url = f"https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia/rapids-4-spark_2.12/{SPARK_RAPIDS_VERSION}/{rapids_jar}"
        response = requests.get(url)
        if response.status_code == 200:
            with open(rapids_jar, "wb") as f:
                f.write(response.content)
            print(f"File '{rapids_jar}' downloaded and saved successfully.")
        else:
            print(f"Failed to download the plugin. Status code: {response.status_code}")
    else:
        print("Plugin file already exists. Skipping download.")
    return rapids_jar

def initialize_spark(rapids_jar: str):
    import socket
    hostname = socket.gethostname()
    conda_env = os.environ.get("CONDA_PREFIX")

    conf = SparkConf()
    conf.setMaster(f"spark://{hostname}:7077")  # Assuming master is on host and default port. 
    conf.set("spark.task.maxFailures", "1")
    conf.set("spark.task.resource.gpu.amount", f"{1/4}")  # Setting to 1/4 for single-node demo. In practice, set to 1. 
    conf.set("spark.executor.resource.gpu.amount", "1")
    conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    conf.set("spark.pyspark.python", f"{conda_env}/bin/python")
    conf.set("spark.pyspark.driver.python", f"{conda_env}/bin/python")
    conf.set("spark.jars", rapids_jar)
    conf.set("spark.executorEnv.PYTHONPATH", rapids_jar)
    conf.set("spark.rapids.memory.gpu.minAllocFraction", "0.0001")
    conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
    conf.set("spark.locality.wait", "0s")
    conf.set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
    conf.set("spark.rapids.memory.gpu.pooling.enabled", "false")
    conf.set("spark.sql.execution.sortBeforeRepartition", "false")
    conf.set("spark.rapids.sql.format.parquet.reader.type", "MULTITHREADED")
    conf.set("spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel", "20")
    conf.set("spark.rapids.sql.multiThreadedRead.numThreads", "20")
    conf.set("spark.rapids.sql.python.gpu.enabled", "true")
    conf.set("spark.rapids.memory.pinnedPool.size", "2G")
    conf.set("spark.python.daemon.module", "rapids.daemon")
    conf.set("spark.rapids.sql.batchSizeBytes", "512m")
    conf.set("spark.sql.adaptive.enabled", "false")
    conf.set("spark.sql.files.maxPartitionBytes", "512m")
    conf.set("spark.rapids.sql.concurrentGpuTasks", "2")
    conf.set("spark.rapids.sql.explain", "NONE")
    
    spark = SparkSession.builder.appName("optuna-spark-xgboost").config(conf=conf).getOrCreate()
    return spark

if 'spark' not in globals():
    rapids_jar = get_rapids_jar()
    spark = initialize_spark(rapids_jar)

Plugin file already exists. Skipping download.


24/12/11 23:47:51 WARN Utils: Your hostname, cb4ae00-lcedt resolves to a loopback address: 127.0.1.1; using 10.110.47.100 instead (on interface eno1)
24/12/11 23:47:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/12/11 23:47:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/11 23:47:52 WARN RapidsPluginUtils: RAPIDS Accelerator 25.02.1 using cudf 25.02.1, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e
24/12/11 23:47:52 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.


#### Helper Class

First we'll define a helper class. This will store the hyperparameters we want optimized in each trial, and easily convert that into a schema for the output dataframe.

In [12]:
class OptunaParams:
    def __init__(self):
        self.hyperparameters = {}

    def add_categorical_param(self, name: str, choices: Sequence[Union[None, bool, int, float, str]]):
        """
        Adds a categorical hyperparameter to be tuned via Optuna's trial.suggest_categorical().
        """
        self.hyperparameters[name] = { "type": "categorical", "choices": choices }
    
    def add_int_param(self, name: str, low: int, high: int, step: int = 1, log: bool = False):
        """
        Adds an integer hyperparameter to be tuned via Optuna's trial.suggest_int().
        """
        self.hyperparameters[name] = { "type": "int", "low": low, "high": high, "step": step, "log": log }
    
    def add_float_param(self, name: str, low: float, high: float, step: Optional[float] = None, log: bool = False):
        """
        Adds a float hyperparameter to be tuned via Optuna's trial.suggest_float().
        """
        self.hyperparameters[name] = { "type": "float", "low": low, "high": high, "step": step,"log": log }

    def suggest_params(self, trial) -> Dict[str, Union[int, float, str, bool]]:
        """
        Converts the hyperparameter space into a dictionary of suggested values in Optuna format,
        to be called within the objective function.
        """
        suggested_params = {}
        for name, config in self.hyperparameters.items():
            if config["type"] == "categorical":
                suggested_params[name] = trial.suggest_categorical(name, config["choices"])
            elif config["type"] == "int":
                suggested_params[name] = trial.suggest_int(
                    name, config["low"], config["high"], step=config["step"], log=config["log"]
                )
            elif config["type"] == "float":
                suggested_params[name] = trial.suggest_float(
                    name, config["low"], config["high"], step=config.get("step", None), log=config["log"]
                )
        return suggested_params

    def to_schema(self) -> StructType:
        """
        Converts the hyperparameter space into a Spark StructType output schema.
        """
        fields = []
        for name, config in self.hyperparameters.items():
            if config["type"] == "float":
                fields.append(StructField(name, DoubleType(), False))
            elif config["type"] == "int":
                fields.append(StructField(name, IntegerType(), False))
            elif config["type"] == "categorical":
                if isinstance(config["choices"][0], str):
                    fields.append(StructField(name, StringType(), False))
                elif isinstance(config["choices"][0], bool):
                    fields.append(StructField(name, BooleanType(), False))
                elif isinstance(config["choices"][0], (int, float)):
                    fields.append(StructField(name, DoubleType(), False))
                else:
                    raise ValueError(f"Unsupported categorical type for field {name}")
        
        # Study will also return the best achieved loss:
        fields.append(StructField("best_value", DoubleType(), False)) 
        return StructType(fields)

### Optuna Task

This implementation demonstrates **Spark I/O**.

This means that Spark will read the dataset and create a duplicate of the dataset for each worker (1 partition = 1 duplicate), then map the tuning task onto each partition.  
In practice, this enables the code to be chained to other Dataframe operations (e.g. ETL stages) without the intermediate step of writing to DBFS, at the cost of some overhead during duplication.

For the alternative implementation using **Worker I/O**, see the [JoblibSpark notebook](optuna-joblibspark.ipynb). 

In the task, each worker will:
1. Concatenate the pandas partition batches to form the dataset
2. Load the study from the MySQL storage backend
3. Optimize over the objective for the assigned number of trials, sending results back to the database after each iteration

In [13]:
def task_udf(pdf_iter: Iterable[pd.DataFrame],
             xgb_params: Dict[str, Any],
             optuna_params: OptunaParams,
             trials_per_task: List[int],            
             driver_ip: str,
             study_name: str,
             seed: int) -> Iterable[pd.DataFrame]:

    import cudf
    from cuml.metrics.regression import mean_squared_error
    from cuml.model_selection import train_test_split
    
    tc = TaskContext.get()
    assert "gpu" in tc.resources(), "GPU resource not found."
    num_trials = trials_per_task[tc.partitionId()]

    df_list = []
    for pdf in pdf_iter:
        df_list.append(cudf.DataFrame.from_pandas(pdf))
    
    data = cudf.concat(df_list)
    X = data.iloc[:, :-1].values
    y = data["quality"].values
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    tuning_max_bin = "max_bin" in optuna_params.hyperparameters
    if not tuning_max_bin:
        max_bin = xgb_params.get("max_bin", 256)
        # Precompute Quantile DMatrix to avoid repeated quantization every trial.
        Xy_train_qdm = xgb.QuantileDMatrix(X_train, y_train, max_bin=max_bin)

    def objective(trial):
        tuning_params = optuna_params.suggest_params(trial)
        xgb_params.update(tuning_params)

        if tuning_max_bin:
            # If tuning the max_bin param, we must recompute the QDM every trial, since the quantiles change.
            if "n_estimators" not in xgb_params:
                xgb_params["n_estimators"] = 100  # Default value if not tuning.

            model = xgb.XGBRegressor(**xgb_params)
            model.fit(X_train, y_train)
            booster = model.get_booster()
        else:
            # Train the model with xgb.train() API using the precomputed QDM.
            num_boost_round = xgb_params.get("n_estimators", 100)
            booster = xgb.train(params=xgb_params, dtrain=Xy_train_qdm, num_boost_round=num_boost_round)
        
        predictions = booster.inplace_predict(X_val)
        rmse = mean_squared_error(y_val, predictions, squared=False).get()
        
        return rmse

    study = optuna.load_study(
        study_name=study_name,
        storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna",
        sampler=TPESampler(seed=seed),
    )

    print(f"Running {num_trials} trials on partition {tc.partitionId()}.")
    study.optimize(objective, n_trials=num_trials)

    result_dict = {f"{key}": [value] for key, value in study.best_params.items()}
    result_dict['best_value'] = [study.best_value]
    
    yield pd.DataFrame(result_dict)

## Setup and run the Optuna study

Get the driver IP for the MySQL database.  
- For standalone users, make sure you've followed the [database setup instructions](../README.md#setup-database-for-optuna). The database should be on 'localhost'. 
- For databricks users, the database should already be setup on the driver node by the init script.

In [14]:
# check if we're running on databricks
on_databricks = os.environ.get("DATABRICKS_RUNTIME_VERSION", False)

In [15]:
if on_databricks:
    driver_ip = spark.conf.get("spark.driver.host")
else:
    driver_ip = "localhost"

print(f"MySQL database is hosted on {driver_ip}")

MySQL database is hosted on localhost


Create a new study, referencing the MySQL database as the storage backend.

In [16]:
study_name = "optuna-xgboost-dataframe"
seed = 42

try:
    # Delete the study if it already exists
    optuna.delete_study(
        study_name=study_name, 
        storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna"
    )
except:
    pass

optuna.create_study(
    study_name=study_name,
    storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna",
    sampler=TPESampler(seed=seed)
)

[I 2024-12-11 23:47:53,347] A new study created in RDB with name: optuna-xgboost-dataframe


<optuna.study.study.Study at 0x756423c12560>

Define the number of tasks, number of trials, and trials per task. 

**NOTE**: for standalone users running on a single worker, the 4 tasks will all be assigned to the same worker and will time-share the GPU for demonstration. In practice, you should set `spark.task.resource.gpu.amount=1` and set num_tasks to the number of workers in the cluster so that each task gets full access to the GPU.

In [17]:
def partition_trials(total_trials: int, total_tasks: int) -> List[int]:
    base_size = total_trials // total_tasks
    extra = total_trials % total_tasks
    partitions = [base_size] * total_tasks
    for i in range(extra):
        partitions[i] += 1
    
    return partitions

In [18]:
num_tasks = 4
num_trials = 100
trials_per_task = partition_trials(num_trials, num_tasks)
print(f"Trials per task: {trials_per_task}")

Trials per task: [25, 25, 25, 25]


#### Define params
Define the XGBoost model params and the hyperparams for Optuna to tune. 

In [19]:
# Keep these params consistent:
xgb_params = {
    "objective": "reg:squarederror",
    "verbosity": 0,
    "tree_method": "gpu_hist",
    "device": "cuda",
    "seed": seed,
}

In [20]:
# Tune these params:
hyperparams = OptunaParams()
hyperparams.add_int_param("n_estimators", low=100, high=500)
hyperparams.add_float_param("learning_rate", low=1e-3, high=0.1, log=True)
hyperparams.add_int_param("max_depth", low=1, high=10)
hyperparams.add_float_param("subsample", low=0.05, high=1.0)
hyperparams.add_float_param("colsample_bytree", low=0.05, high=1.0)
hyperparams.add_int_param("min_child_weight", low=1, high=20)

out_schema = hyperparams.to_schema()

We'll also define the following helper function, which will create duplicates of the dataframe held in separate partitions.

In [21]:
def coalesce_tree_union(df: DataFrame, num_duplicates: int):
    """
    Coalesce the DataFrame to a single partition and recursively self-union to create duplicates.
    """
    input_df = df.coalesce(1).cache()
    current_df = input_df
    
    if num_duplicates <= 1:
        return current_df

    recursions = int(math.log(num_duplicates, 2))
    remainder = num_duplicates - 2 ** recursions

    for _ in range(recursions):
        current_df = current_df.union(current_df)

    for _ in range(remainder):
        current_df = current_df.union(input_df)
    
    return current_df

#### Load dataset

Read the data from the local directory with Spark and then duplicate it to prepare to run the task.

In [22]:
if on_databricks:
    # once the dataset is in dbfs, databricks appends "dbfs:" to the filepath automatically
    filepath = '/FileStore/optuna-data/winequality-red.csv'
else:
    cwd = os.getcwd()
    filepath = os.path.join(cwd, "data", "winequality-red.csv")

in_schema = StructType([
    StructField("fixed acidity", DoubleType(), True),
    StructField("volatile acidity", DoubleType(), True),
    StructField("citric acid", DoubleType(), True),
    StructField("residual sugar", DoubleType(), True),
    StructField("chlorides", DoubleType(), True),
    StructField("free sulfur dioxide", DoubleType(), True),
    StructField("total sulfur dioxide", DoubleType(), True),
    StructField("density", DoubleType(), True),
    StructField("pH", DoubleType(), True),
    StructField("sulphates", DoubleType(), True),
    StructField("alcohol", DoubleType(), True),
    StructField("quality", IntegerType(), True)
])

data_df = spark.read.csv(filepath, header=True, schema=in_schema, sep=";")
data_df = coalesce_tree_union(data_df, num_duplicates=num_tasks)    

### Run the study

Map the Optuna task onto the dataframe and collect the results (it might take a few minutes).

In [23]:
result_df = data_df.mapInPandas(lambda pdf_iter: 
                                task_udf(pdf_iter,
                                         xgb_params=xgb_params,
                                         optuna_params=hyperparams,
                                         trials_per_task=trials_per_task,
                                         driver_ip=driver_ip,
                                         study_name=study_name,
                                         seed=seed),
                                         schema=out_schema).toPandas()

                                                                                

In [24]:
results = result_df.iloc[0].to_dict()
best_value = results.pop("best_value")

print(f"Best parameters: {results}")
print(f"Best value: {best_value}")

Best parameters: {'n_estimators': 419.0, 'learning_rate': 0.015039610889407229, 'max_depth': 10.0, 'subsample': 0.6630214978050138, 'colsample_bytree': 0.8524338650689898, 'min_child_weight': 2.0}
Best value: 0.533100375625104
