In [1]:
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/tensorrt_torchtrt_efficientnet/nvidia_logo.png" width="90px">

# Distributed Hyperparameter Tuning: Optuna + Spark Dataframes


This demo demonstrates distributed hyperparameter tuning for XGBoost using Spark Dataframes.  
We implement best practices to precompute data and maximize computations on the GPU.  

Reference: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

#### Note:
Before running, please make sure you've followed the relevant [setup instructions](../README.md) for your environment (standalone or databricks).


In [None]:
from typing import Iterable, List, Dict, Optional, Union, Sequence, Tuple
import math
import json
import os
import requests
import pandas as pd
import optuna
from optuna.samplers import TPESampler
import xgboost as xgb
from pyspark.sql import SparkSession, DataFrame
from pyspark import TaskContext, SparkConf
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType, BooleanType

### Download the dataset

We'll use the [red wine quality dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv) to regress wine quality based on features such as acidity, sugar content, etc.  

**Note**: This example uses a small dataset for demonstration purposes. The performance advantages of distributed training are best realized with large datasets and computational workloads.

In [3]:
cwd = os.getcwd()
os.mkdir(os.path.join(cwd, "data")) if not os.path.exists(os.path.join(cwd, "data")) else None
filepath = os.path.join(cwd, "data", "winequality-red.csv")

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

response = requests.get(url)
if response.status_code == 200:
    with open(filepath, "wb") as f:
        f.write(response.content)
    print(f"File downloaded and saved to {filepath}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

File downloaded and saved to /home/rishic/Code/myforks/spark-rapids-examples/examples/ML+DL-Examples/Optuna-Spark/optuna-examples/data/winequality-red.csv


## Part 1. Running Optuna locally

In [None]:
import cudf
from cuml.metrics.regression import mean_squared_error
from cuml.model_selection import train_test_split

### Prepare data

In [None]:
data = cudf.read_csv(filepath, delimiter=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Prepare the train/validation sets. Precompute the Quantile DMatrix, which is used by histogram-based tree methods to save memory.

In [6]:
X = data.iloc[:, :-1].values
y = data["quality"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
Xy_train_qdm = xgb.QuantileDMatrix(X_train, y_train)  # Precompute Quantile DMatrix to avoid repeated quantization every trial.

### Objective function

We define the objective and a hyperparameter search space to optimize via the `trial.suggest_` methods.  

In each trial, new hyperparameters will be suggested based on previous results. See [optuna.trial.Trial](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html) API for a full list of functions.

In [7]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "max_bins": 64,
        "tree_method": "gpu_hist",
        "device": "cuda",
    }

    booster = xgb.train(params, dtrain=Xy_train_qdm, num_boost_round=1000)
    predictions = booster.inplace_predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False).get()
    
    return rmse   

Create the study and optimize. By default, the study results will be stored in memory.

In [None]:
study = optuna.create_study(study_name="optuna-xgboost-local", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=10)

[I 2024-12-06 18:48:20,863] A new study created in memory with name: local-xgboost


[I 2024-12-06 18:48:22,724] Trial 0 finished with value: 0.5448661551754318 and parameters: {'learning_rate': 0.005611516415334507, 'max_depth': 10, 'subsample': 0.7453942447208348, 'colsample_bytree': 0.6187255599871848, 'min_child_weight': 4}. Best is trial 0 with value: 0.5448661551754318.
[I 2024-12-06 18:48:22,964] Trial 1 finished with value: 0.6762149201768457 and parameters: {'learning_rate': 0.002051110418843397, 'max_depth': 1, 'subsample': 0.8728673384861885, 'colsample_bytree': 0.6210592611560484, 'min_child_weight': 15}. Best is trial 0 with value: 0.5448661551754318.
[I 2024-12-06 18:48:24,697] Trial 2 finished with value: 0.6871436852105118 and parameters: {'learning_rate': 0.0010994335574766201, 'max_depth': 10, 'subsample': 0.8408205087604007, 'colsample_bytree': 0.25172215514436236, 'min_child_weight': 4}. Best is trial 0 with value: 0.5448661551754318.
[I 2024-12-06 18:48:25,281] Trial 3 finished with value: 0.6096809835807359 and parameters: {'learning_rate': 0.0023

In [9]:
trial = study.best_trial
print("Best RMSE: ", trial.value)
print("Best hyperparameters: ", trial.params)

Best RMSE:  0.5448661551754318
Best hyperparameters:  {'learning_rate': 0.005611516415334507, 'max_depth': 10, 'subsample': 0.7453942447208348, 'colsample_bytree': 0.6187255599871848, 'min_child_weight': 4}


## Part 2. Distributed Optuna on Spark 

First we'll define a helper class. This will store the hyperparameters we want optimized in each trial, and easily convert that into a schema for the output dataframe.

In [10]:
class OptunaParams:
    def __init__(self):
        self.hyperparameters = {}

    def add_categorical_param(self, name: str, choices: Sequence[Union[None, bool, int, float, str]]):
        """
        Adds a categorical hyperparameter to be tuned via Optuna's trial.suggest_categorical().
        """
        self.hyperparameters[name] = { "type": "categorical", "choices": choices }
    
    def add_int_param(self, name: str, low: int, high: int, step: int = 1, log: bool = False):
        """
        Adds an integer hyperparameter to be tuned via Optuna's trial.suggest_int().
        """
        self.hyperparameters[name] = { "type": "int", "low": low, "high": high, "step": step, "log": log }
    
    def add_float_param(self, name: str, low: float, high: float, step: Optional[float] = None, log: bool = False):
        """
        Adds a float hyperparameter to be tuned via Optuna's trial.suggest_float().
        """
        self.hyperparameters[name] = { "type": "float", "low": low, "high": high, "step": step,"log": log }

    def suggest_params(self, trial) -> Dict[str, Union[int, float, str, bool]]:
        """
        Converts the hyperparameter space into a dictionary of suggested values in Optuna format,
        to be called within the objective function.
        """
        suggested_params = {}
        for name, config in self.hyperparameters.items():
            if config["type"] == "categorical":
                suggested_params[name] = trial.suggest_categorical(name, config["choices"])
            elif config["type"] == "int":
                suggested_params[name] = trial.suggest_int(
                    name, config["low"], config["high"], step=config["step"], log=config["log"]
                )
            elif config["type"] == "float":
                suggested_params[name] = trial.suggest_float(
                    name, config["low"], config["high"], step=config.get("step", None), log=config["log"]
                )
        return suggested_params

    def to_schema(self) -> StructType:
        """
        Converts the hyperparameter space into a Spark StructType output schema.
        """
        fields = []
        for name, config in self.hyperparameters.items():
            if config["type"] == "float":
                fields.append(StructField(name, DoubleType(), False))
            elif config["type"] == "int":
                fields.append(StructField(name, IntegerType(), False))
            elif config["type"] == "categorical":
                if isinstance(config["choices"][0], str):
                    fields.append(StructField(name, StringType(), False))
                elif isinstance(config["choices"][0], bool):
                    fields.append(StructField(name, BooleanType(), False))
                elif isinstance(config["choices"][0], (int, float)):
                    fields.append(StructField(name, DoubleType(), False))
                else:
                    raise ValueError(f"Unsupported categorical type for field {name}")
        
        # Study will also return the best achieved loss:
        fields.append(StructField("best_value", DoubleType(), False)) 
        return StructType(fields)

## Part 2a. Worker I/O

We'll first demonstrate a distributed implementation that uses **worker I/O**. 

This means that each worker will read the full dataset from the filepath rather than passing the data in a dataframe.  
In practice, this requires the dataset to be written to a distributed file system accessible to all workers prior to tuning. 

### Optuna Task

Define the task UDF to distribute across the Spark cluster. In each task, the worker will:
1. Read the dataset from the specified filepath.
2. Load the study from the MySQL storage backend.
3. Optimize over the objective for the assigned number of trials, sending results back to the database after each iteration.

In [None]:
def task_udf(pdf_iter: Iterable[pd.DataFrame],
             load_data: callable,
             optuna_params: OptunaParams,
             trials_per_task: List[int],             
             driver_ip: str,
             study_name: str,
             seed: int,
             filepath: str = None) -> Iterable[pd.DataFrame]:

    from cuml.metrics.regression import mean_squared_error
    from cuml.model_selection import train_test_split

    def get_gpu_id(task_context: TaskContext) -> int:
        if task_context is None:
            raise RuntimeError("_get_gpu_id should not be invoked from driver side.")
        
        resources = task_context.resources()
        if "gpu" not in resources:
            raise RuntimeError(
                "Couldn't get the gpu id, Please check the GPU resource configuration"
            )
        
        return int(resources["gpu"].addresses[0].strip())  # Return the first GPU ID for multi-GPU setups.
    
    tc = TaskContext.get()
    gpu_id = get_gpu_id(tc)
    num_trials = trials_per_task[tc.partitionId()]
    
    X, y = load_data(pdf_iter=pdf_iter, filepath=filepath)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    if "max_bins" not in optuna_params.hyperparameters:
        Xy_train_qdm = xgb.QuantileDMatrix(X_train, y_train)  # Precompute Quantile DMatrix to avoid repeated quantization every trial.

    def objective(trial):
        params = {
            "objective": "reg:squarederror",
            "verbosity": 0,
            "tree_method": "gpu_hist",
            "device": f"cuda:{gpu_id}",
            "seed": seed,
        }
        params.update(optuna_params.suggest_params(trial))

        if "max_bins" in params:
            # If tuning the max_bins param, we must recompute the QDM every trial, since the quantiles change.
            if "num_estimators" not in params:
                params["num_estimators"] = 1000

            model = xgb.XGBRegressor(**params)
            model.fit(X_train, y_train)
            booster = model.get_booster()
        else:
            # Train the model with xgb.train() API using the precomputed QDM.
            num_boost_round = params.get("num_estimators", 1000)
            booster = xgb.train(params, dtrain=Xy_train_qdm, num_boost_round=num_boost_round)
            
        # Perform in-place predictions on GPU using the booster.
        predictions = booster.inplace_predict(X_val)
        rmse = mean_squared_error(y_val, predictions, squared=False).get()
        
        return rmse

    study = optuna.load_study(
        study_name=study_name,
        storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna",
        sampler=TPESampler(seed=seed),
    )

    print(f"Running {num_trials} trials on partition {tc.partitionId()}.")
    study.optimize(objective, n_trials=num_trials)

    result_dict = {f"{key}": [value] for key, value in study.best_params.items()}
    result_dict['best_value'] = [study.best_value]
    
    yield pd.DataFrame(result_dict)

We'll pass in the 'read_data' callable to read data from the filepath.

In [None]:
def read_data(**kwargs):
    """
    Read the data from the given filepath and return the X, y arrays.
    """
    import cudf

    filepath = kwargs.get("filepath")
    if filepath.startswith("/dbfs/"):
        # Check to ensure GPU direct storage is disabled for cuDF on databricks.
        libcudf_policy = os.environ.get('LIBCUDF_CUFILE_POLICY')
        if libcudf_policy != 'OFF':
            raise RuntimeError("Set LIBCUDF_CUFILE_POLICY=OFF to read from DBFS with cuDF.")    

    data = cudf.read_csv(filepath, delimiter=";")
    X = data.iloc[:, :-1].values
    y = data["quality"].values

    return X, y

### PySpark

For standalone users, we need to create the Spark session with the Spark-Rapids plugin. For Databricks users, the Spark session will be preconfigured and this cell can be skipped.

In [None]:
def get_rapids_jar():
    SPARK_RAPIDS_VERSION = "24.10.1"
    rapids_jar = f"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar"
    if not os.path.exists(rapids_jar):
        print("Downloading Spark Rapids jar")
        url = f"https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/{SPARK_RAPIDS_VERSION}/{rapids_jar}"
        response = requests.get(url)
        if response.status_code == 200:
            with open(rapids_jar, "wb") as f:
                f.write(response.content)
            print(f"File '{rapids_jar}' downloaded and saved successfully.")
        else:
            print(f"Failed to download the plugin. Status code: {response.status_code}")
    else:
        print("Plugin file already exists. Skipping download.")
    return rapids_jar

def initialize_spark(rapids_jar: str):
    import socket
    hostname = socket.gethostname()
    conda_env = os.environ.get("CONDA_PREFIX")

    conf = SparkConf()
    conf.setMaster(f"spark://{hostname}:7077")  # Assuming master is on host and default port. 
    conf.set("spark.task.maxFailures", "1")
    conf.set("spark.task.resource.gpu.amount", f"{1/4}")  # Setting to 1/4 for single-node demo. In practice, set to 1. 
    conf.set("spark.executor.resource.gpu.amount", "1")
    conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    conf.set("spark.pyspark.python", f"{conda_env}/bin/python")
    conf.set("spark.pyspark.driver.python", f"{conda_env}/bin/python")
    conf.set("spark.jars", rapids_jar)
    conf.set("spark.executorEnv.PYTHONPATH", rapids_jar)
    conf.set("spark.rapids.memory.gpu.minAllocFraction", "0.0001")
    conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
    conf.set("spark.locality.wait", "0s")
    conf.set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
    conf.set("spark.rapids.memory.gpu.pooling.enabled", "false")
    conf.set("spark.sql.execution.sortBeforeRepartition", "false")
    conf.set("spark.rapids.sql.format.parquet.reader.type", "MULTITHREADED")
    conf.set("spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel", "20")
    conf.set("spark.rapids.sql.multiThreadedRead.numThreads", "20")
    conf.set("spark.rapids.sql.python.gpu.enabled", "true")
    conf.set("spark.rapids.memory.pinnedPool.size", "2G")
    conf.set("spark.python.daemon.module", "rapids.daemon")
    conf.set("spark.rapids.sql.batchSizeBytes", "512m")
    conf.set("spark.sql.adaptive.enabled", "false")
    conf.set("spark.sql.files.maxPartitionBytes", "512m")
    conf.set("spark.rapids.sql.concurrentGpuTasks", "2")
    conf.set("spark.rapids.sql.explain", "NONE")
    
    spark = SparkSession.builder.appName("optuna-spark-xgboost").config(conf=conf).getOrCreate()
    return spark

if 'spark' not in globals():
    rapids_jar = get_rapids_jar()
    spark = initialize_spark(rapids_jar)

## Setup and run the Optuna study

Get the driver IP for the MySQL database.  
- For standalone users, make sure you've followed the [database setup instructions](../README.md#setup-database-for-optuna). The database should be on 'localhost'. 
- For databricks users, the database should already be setup on the driver node by the init script.

In [27]:
# check if we're running on databricks
on_databricks = os.environ.get("DATABRICKS_RUNTIME_VERSION", False)

In [28]:
if on_databricks:
    driver_ip = spark.conf.get("spark.driver.host")
else:
    driver_ip = "localhost"

print(f"MySQL database is hosted on {driver_ip}")

MySQL database is hosted on localhost


Create a new study, referencing the MySQL database as the storage backend.

In [None]:
study_name = "optuna-xgboost-worker-io"
seed = 42

try:
    # Delete the study if it already exists
    optuna.delete_study(
        study_name=study_name, 
        storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna"
    )
except:
    pass

optuna.create_study(
    study_name=study_name,
    storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna",
    sampler=TPESampler(seed=seed)
)

[I 2024-12-06 18:49:59,047] A new study created in RDB with name: optuna-spark-xgboost-worker-io


<optuna.study.study.Study at 0x71a45e42ffd0>

Define the number of tasks, number of trials, and trials per task. 

**NOTE**: for standalone users running on a single worker, the 4 tasks will all be assigned to the same worker and will time-share the GPU for demonstration. In practice, you should set `spark.task.resource.gpu.amount=1` and set num_tasks to the number of workers in the cluster so that each task gets full access to the GPU.

In [30]:
def partition_trials(total_trials: int, total_tasks: int) -> List[int]:
    base_size = total_trials // total_tasks
    extra = total_trials % total_tasks
    partitions = [base_size] * total_tasks
    for i in range(extra):
        partitions[i] += 1
    
    return partitions

In [31]:
num_tasks = 4
num_trials = 100
trials_per_task = partition_trials(num_trials, num_tasks)
print(f"Trials per task: {trials_per_task}")

Trials per task: [25, 25, 25, 25]


Define the hyperparameter search space.

In [32]:
hyperparams = OptunaParams()
hyperparams.add_float_param("learning_rate", low=1e-3, high=0.1, log=True)
hyperparams.add_int_param("max_depth", low=1, high=10)
hyperparams.add_float_param("subsample", low=0.05, high=1.0)
hyperparams.add_float_param("colsample_bytree", low=0.05, high=1.0)
hyperparams.add_int_param("min_child_weight", low=1, high=20)
hyperparams.add_int_param("max_bins", low=32, high=256)

out_schema = hyperparams.to_schema()

**For Databricks**: we must download the dataset to DBFS so that all workers can access it.

In [None]:
if on_databricks:
    dbutils.fs.mkdirs("/FileStore/optuna-data")
    filepath = "/dbfs/FileStore/optuna-data/winequality-red.csv"
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

    response = requests.get(url)
    if response.status_code == 200:
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"File downloaded and saved to {filepath}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

Define a dummy dataframe with a partition for each task.

In [None]:
dummy_rdd = spark.sparkContext.parallelize([(i,) for i in range(num_tasks)], numSlices=num_tasks)
dummy_df = dummy_rdd.toDF(schema=["task_id"])

                                                                                

### Run the study

Map the Optuna task onto the dataframe and collect the results.

In [None]:
result_df = dummy_df.mapInPandas(lambda pdf_iter: 
                                 task_udf(pdf_iter, 
                                          load_data=read_data,
                                          optuna_params=hyperparams,
                                          trials_per_task=trials_per_task,
                                          driver_ip=driver_ip,
                                          study_name=study_name,
                                          seed=seed,
                                          filepath=filepath),
                                          schema=out_schema).toPandas()

                                                                                

In [36]:
results = result_df.iloc[0].to_dict()
print("Best Results:\n", json.dumps(results, indent=4))

Best Results:
 {
    "learning_rate": 0.08605370736264656,
    "max_depth": 9.0,
    "subsample": 0.6368572972930286,
    "colsample_bytree": 0.5549539635999119,
    "min_child_weight": 2.0,
    "max_bins": 99.0,
    "best_value": 0.5306980019370002
}


## Part 2b. Spark I/O

This second implementation uses **Spark I/O**.

By this we mean Spark reads the dataset and creates a duplicate of the dataset for each worker (1 partition = 1 duplicate), then maps the tuning task onto each partition.  
In practice, this enables the code to be chained to other Dataframe operations (e.g. ETL stages) without the intermediate step of writing to DBFS, at the cost of memory overhead during duplication.


### Optuna Task

We'll use the same task as before, but instead of reading the dataset from the filepath, the task_udf will be mapped onto the dataframe partition.  
The task_udf will be given an iterator of batches over the partition. See the [mapInPandas docs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInPandas.html) for more info.

In [None]:
def concat_data(**kwargs):
    """
    Concatenate the arrow batches and return the X, y arrays.
    """
    import cudf

    pdf_iter = kwargs.get("pdf_iter")

    df_list = []
    for pdf in pdf_iter:
        df_list.append(cudf.DataFrame.from_pandas(pdf))
    
    data = cudf.concat(df_list)
    X = data.iloc[:, :-1].values
    y = data["quality"].values

    return X, y

We'll create a new study for this run using the MySQL database, and define the number of tasks/trials.

In [None]:
study_name = "optuna-xgboost-spark-io"
seed = 42

try:
    # Delete the study if it already exists
    optuna.delete_study(
        study_name=study_name, 
        storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna"
    )
except:
    pass

optuna.create_study(
    study_name=study_name,
    storage=f"mysql://optuna_user:optuna_password@{driver_ip}/optuna",
    sampler=TPESampler(seed=seed)
)

[I 2024-12-06 18:51:02,723] A new study created in RDB with name: optuna-spark-xgboost-spark-io


<optuna.study.study.Study at 0x71a45e476b00>

We'll also define the following helper function, which will create *n* duplicates of a dataframe in separate partitions.

In [39]:
def coalesce_tree_union(df: DataFrame, num_duplicates: int):
    """
    Coalesce the DataFrame to a single partition and recursively self-union to create duplicates.
    """
    input_df = df.coalesce(1).cache()
    current_df = input_df
    
    if num_duplicates <= 1:
        return current_df

    recursions = int(math.log(num_duplicates, 2))
    remainder = num_duplicates - 2 ** recursions

    for _ in range(recursions):
        current_df = current_df.union(current_df)

    for _ in range(remainder):
        current_df = current_df.union(input_df)
    
    return current_df

#### Load dataset

This time, we'll read the data from the local directory with Spark and then duplicate it to prepare to run the task.

In [None]:
if on_databricks:
    # once the dataset is in dbfs, databricks appends "dbfs:" to the filepath automatically
    filepath = '/FileStore/optuna-data/winequality-red.csv'
else:
    cwd = os.getcwd()
    filepath = os.path.join(cwd, "data", "winequality-red.csv")

in_schema = StructType([
    StructField("fixed acidity", DoubleType(), True),
    StructField("volatile acidity", DoubleType(), True),
    StructField("citric acid", DoubleType(), True),
    StructField("residual sugar", DoubleType(), True),
    StructField("chlorides", DoubleType(), True),
    StructField("free sulfur dioxide", DoubleType(), True),
    StructField("total sulfur dioxide", DoubleType(), True),
    StructField("density", DoubleType(), True),
    StructField("pH", DoubleType(), True),
    StructField("sulphates", DoubleType(), True),
    StructField("alcohol", DoubleType(), True),
    StructField("quality", IntegerType(), True)
])

data_df = spark.read.csv(filepath, header=True, schema=in_schema, sep=";")
data_df = coalesce_tree_union(data_df, num_duplicates=num_tasks)    

### Run the study

Map the Optuna task onto the dataframe and collect the results.

In [42]:
result_df = data_df.mapInPandas(lambda pdf_iter: 
                                task_udf(pdf_iter, 
                                         load_data=concat_data,
                                         optuna_params=hyperparams,
                                         trials_per_task=trials_per_task,
                                         driver_ip=driver_ip,
                                         study_name=study_name,
                                         seed=seed),
                                         schema=out_schema).toPandas()

                                                                                

In [43]:
results = result_df.iloc[0].to_dict()
print("Best Results:\n", json.dumps(results, indent=4))

Best Results:
 {
    "learning_rate": 0.05394082588108075,
    "max_depth": 9.0,
    "subsample": 0.816260723947209,
    "colsample_bytree": 0.6155035966483708,
    "min_child_weight": 4.0,
    "max_bins": 43.0,
    "best_value": 0.5435414669831914
}
