In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from sklearn.datasets import load_iris
import pandas as pd
from pyspark.sql import SparkSession
import os

In [None]:
# Number of logical CPUs reported by the OS; os.cpu_count() may return None
# when the count cannot be determined, in which case fall back to 4.
num_cores = os.cpu_count()
if not num_cores:
    num_cores = 4

In [None]:
# Local Spark session for the hyperparameter search.
# - master "local[*]": run inside this process using all available cores.
# - default parallelism / shuffle partitions: twice the detected core count.
# - the remaining options pin the session time zone to UTC, turn the web UI
#   off, and fix the application id and driver host.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("HPO")
    .config("spark.default.parallelism", num_cores * 2)
    .config("spark.sql.shuffle.partitions", num_cores * 2)
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.ui.enabled", "false")
    .config("spark.app.id", "Test")
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

In [None]:
# Load the iris dataset that ships with scikit-learn and pull out the
# feature values and the class targets.
data = load_iris()
X, y = data.data, data.target

# pandas frame with one column per feature (named after data.feature_names)
# plus a `label` column holding the target.
iris_pandas = pd.DataFrame(X, columns=data.feature_names).assign(label=y)

In [None]:
# Hand the pandas frame to the Spark session to obtain a Spark DataFrame.
# No explicit schema is passed, so column types are left to Spark's inference.
iris_sdf = spark.createDataFrame(iris_pandas)

In [None]:
# Names of the raw feature columns, taken from the sklearn dataset object.
feature_cols = data.feature_names

# VectorAssembler packs the listed columns into a single vector column named
# "features", which is the column the classifier is configured to read.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# This cell rebinds `iris_sdf` to its own transformed version. Without the
# drop, running the cell a second time fails because the "features" output
# column already exists. DataFrame.drop is a no-op when the column is absent,
# so the first run is unaffected and re-runs become safe.
iris_sdf = assembler.transform(iris_sdf.drop("features"))

In [None]:
# randomSplit returns one DataFrame per weight: roughly 80% of the rows for
# training and roughly 20% held out for testing (seeded for repeatability).
train_sdf, test_sdf = iris_sdf.randomSplit([0.8, 0.2], seed=42)

# Legacy names read by the later cells. Despite what they suggest, these are
# NOT a feature matrix and a label vector: `X_train` is the complete training
# DataFrame and `y_train` is the complete held-out test DataFrame.
X_train, y_train = train_sdf, test_sdf

In [None]:
# Base estimator for the grid search. The seed is fixed explicitly because a
# random forest is stochastic (bootstrap sampling, feature subsetting); left
# unset, PySpark falls back to a hash-derived default seed, so results could
# not be reproduced from a fresh kernel.
rf_clf = RandomForestClassifier(featuresCol='features', labelCol='label', seed=42)

In [None]:
# Hyperparameter search space for the random forest.
# Grid size: 3 depths * 5 tree counts * 4 leaf sizes * 4 gain thresholds
# = 240 parameter combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(rf_clf.maxDepth, [2, 4, 6])
    .addGrid(rf_clf.numTrees, [10, 20, 30, 50, 100])
    .addGrid(rf_clf.minInstancesPerNode, [1, 2, 4, 10])
    .addGrid(rf_clf.minInfoGain, [0.0, 0.1, 0.2, 0.01])
    .build()
)

In [None]:

# Scores a DataFrame of predictions by accuracy, comparing the `prediction`
# column against the `label` column.
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

# Number of models CrossValidator may fit concurrently: one fewer than the
# detected core count (presumably to leave a core for the driver/notebook),
# but never less than 1.
parallelism = max(1, num_cores - 1)

print("Using CrossValidator parallelism:", parallelism)

# 3-fold cross-validation over the whole parameter grid. The seed fixes how
# rows are assigned to folds; without it the fold split, and therefore the
# per-combination scores and the selected model, can differ between runs.
cross_validator = CrossValidator(estimator=rf_clf,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 numFolds=3,
                                 seed=42,
                                 parallelism=parallelism)

In [None]:
# Run the grid search. The grid holds 3 * 5 * 4 * 4 = 240 parameter
# combinations and numFolds is 3, so this fits on the order of 720 forests;
# expect this cell to dominate the notebook's runtime.
# NOTE: despite its name, X_train is the full training split produced by
# randomSplit (features and label together), not a bare feature matrix.
cv_model = cross_validator.fit(X_train)

In [None]:
# Model that CrossValidator selected as best across the grid.
best_model = cv_model.bestModel

# Every param of the best model (defaults included), keyed by param name.
best_params = {param.name: value for param, value in best_model.extractParamMap().items()}

# avgMetrics holds one fold-averaged metric per entry of the parameter grid.
# The best cross-validation score is the extreme of that list in whichever
# direction the evaluator considers better (larger, for accuracy).
best_score = max(cv_model.avgMetrics) if evaluator.isLargerBetter() else min(cv_model.avgMetrics)

# Accuracy of the best model on the very split it was fitted on. This is an
# optimistic number and is NOT a cross-validation score, so it is reported
# under its own label.
train_accuracy = evaluator.evaluate(best_model.transform(X_train))

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Score:", best_score)
print("Training Set Accuracy:", train_accuracy)

# Output recorded from an earlier run, kept for reference only. It is a
# comment rather than a live expression so the cell does not display a stale,
# hard-coded dict as if it were fresh output.
# The 0.9487179487179487 that the earlier run printed as "Best
# Cross-Validation Score" was accuracy on the training split, i.e. what is
# now printed as "Training Set Accuracy".
#
# Best Hyperparameters:
# {'bootstrap': True,
#  'cacheNodeIds': False,
#  'checkpointInterval': 10,
#  'featureSubsetStrategy': 'auto',
#  'featuresCol': 'features',
#  'impurity': 'gini',
#  'labelCol': 'label',
#  'leafCol': '',
#  'maxBins': 32,
#  'maxDepth': 2,
#  'maxMemoryInMB': 256,
#  'minInfoGain': 0.1,
#  'minInstancesPerNode': 1,
#  'minWeightFractionPerNode': 0.0,
#  'numTrees': 10,
#  'predictionCol': 'prediction',
#  'probabilityCol': 'probability',
#  'rawPredictionCol': 'rawPrediction',
#  'seed': -5222259564144254699,
#  'subsamplingRate': 1.0}

In [None]:
# Final check on held-out data.
# NOTE: despite its name, y_train is the second (0.2-weight) DataFrame
# returned by randomSplit, i.e. the test split, not a label vector.
predictions = best_model.transform(y_train)
# Same accuracy evaluator that was used during cross-validation.
accuracy = evaluator.evaluate(predictions)
print("Test Set Accuracy:", accuracy) # recorded from an earlier run: Test Set Accuracy: 0.9696969696969697