# Tuning RandomForestRegressor using CrossValidator

In [None]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator
import time

## Create synthetic dataset

In [None]:
# Size of the synthetic regression problem and the element type used for it.
n_rows = 50_000
n_cols = 300
dtype = "float32"

# Generate the data with a fixed seed, then cast both the feature matrix
# and the target vector to `dtype`.
X, y = make_regression(n_samples=n_rows, n_features=n_cols, random_state=1)
X, y = X.astype(dtype), y.astype(dtype)

# Hold out 20% of the rows as a test set (fixed seed for the split as well).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

## Convert dataset to Spark DataFrame

In [None]:
# Each pandas row holds one sample: its feature values under "features" and
# its target under "label".
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# SparkSession supplied by the runtime - confirm.

# Training split
pd_data_train = pd.DataFrame(data={"features": list(X_train), "label": y_train})
df_train = spark.createDataFrame(pd_data_train)

# Test split
pd_data_test = pd.DataFrame(data={"features": list(X_test), "label": y_test})
df_test = spark.createDataFrame(pd_data_test)

In [None]:
# Show the column names and types of the training DataFrame.
df_train.printSchema()

In [None]:
def build_rf_regressor(estimator_class):
    """Instantiate and configure a random forest regressor.

    Args:
        estimator_class: Estimator class to instantiate with no arguments. It
            must provide the fluent setters ``setFeaturesCol``, ``setLabelCol``
            and ``setFeatureSubsetStrategy``.

    Returns:
        The object returned by the last setter, configured to read features
        from the "features" column, labels from the "label" column, and to use
        the "all" feature subset strategy.
    """
    regressor = estimator_class()
    # Re-bind after every call so the result is exactly what the fluent chain
    # would have produced, whatever each setter returns.
    regressor = regressor.setFeaturesCol("features")
    regressor = regressor.setLabelCol("label")
    regressor = regressor.setFeatureSubsetStrategy("all")
    return regressor

## CrossValidator builder

We will use this function to build both the Spark RAPIDS ML (GPU) and the Spark ML (CPU) CrossValidator objects,
demonstrating their common API. We will then verify that the two yield similar performance on our synthetic dataset.

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

def create_crossvalidator(cv_class, rf_regressor_class, *, num_folds=3):
    """Build an unfitted cross validator that tunes a random forest regressor.

    The same function serves both the Spark RAPIDS ML and the Spark ML
    implementations, since only the two classes passed in differ.

    Args:
        cv_class: CrossValidator class to instantiate.
        rf_regressor_class: RandomForestRegressor class to instantiate.
        num_folds: Number of cross-validation folds. Defaults to 3.

    Returns:
        A ``cv_class`` instance wired with the regressor, an evaluator on the
        "label" column, and a 2 x 2 grid over ``maxDepth`` and ``maxBins``.
    """
    # Name the tuning metric explicitly instead of relying on the evaluator's
    # default, so the selection criterion is visible here.
    evaluator = RegressionEvaluator().setLabelCol("label").setMetricName("rmse")

    # Reuse the notebook's shared estimator builder rather than repeating its
    # setter chain, so both stay configured identically.
    rf_reg = build_rf_regressor(rf_regressor_class)

    # Hyperparameter combinations to try: every (maxDepth, maxBins) pair.
    grid = (
        ParamGridBuilder()
        .addGrid(rf_reg.maxDepth, [5, 8])
        .addGrid(rf_reg.maxBins, [32, 64])
        .build()
    )

    return (
        cv_class()
        .setEstimator(rf_reg)
        .setEvaluator(evaluator)
        .setEstimatorParamMaps(grid)
        .setNumFolds(num_folds)
    )

## Spark RAPIDS ML (GPU)

In [None]:
# Spark RAPIDS ML classes; they carry the same names as the pyspark.ml classes
# imported in the Spark ML (CPU) section further down.
from spark_rapids_ml.tuning import CrossValidator
from spark_rapids_ml.regression import RandomForestRegressor

# Build the (not yet fitted) cross validator from the Spark RAPIDS ML classes.
cross_validator = create_crossvalidator(CrossValidator, RandomForestRegressor)

### Tuning

In [None]:
# Time the fit with perf_counter(): it is a monotonic clock, so the measured
# duration is not distorted if the system clock is adjusted during the run.
start_time = time.perf_counter()
cv_model = cross_validator.fit(df_train)
print(f"Tuning took: {time.perf_counter() - start_time} sec")

In [None]:
# Apply the fitted cross-validation model to the held-out test set.
transformed_df = cv_model.transform(df_test)

# No metric name is set, so the evaluator's default metric is what gets
# reported under the "rmse" label below.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")
print(f"rmse: {evaluator.evaluate(transformed_df)}")

Check that the RMSE is smaller than the standard deviation of the label column, indicating that the model is making non-trivial predictions.

In [None]:
from pyspark.sql.functions import stddev
# The aggregation produces a single row, so read the value straight from that
# row instead of converting the result to a pandas DataFrame first.
print(f'label stddev: {transformed_df.select(stddev("label").alias("stddev")).first()["stddev"]}')

## Spark ML (CPU)

In [None]:
# These imports rebind CrossValidator and RandomForestRegressor, replacing the
# Spark RAPIDS ML classes imported in the GPU section above.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.regression import RandomForestRegressor

# Same builder as in the GPU section, now given the Spark ML classes.
# `cross_validator` is rebound too.
cross_validator = create_crossvalidator(CrossValidator, RandomForestRegressor)

### Tuning

Convert the array SQL type to the VectorUDT type expected by Spark ML. (Note: in addition to the array-type DataFrame used above, Spark RAPIDS ML also accepts VectorUDT DataFrames, as well as a scalar-column format - see the docs.)

In [None]:
from pyspark.ml.functions import array_to_vector

# Replace the array-typed "features" column with a vector column of the same
# name; "label" is selected unchanged.
vector_df_train = df_train.select(
    array_to_vector(df_train["features"]).alias("features"),
    "label",
)
vector_df_test = df_test.select(
    array_to_vector(df_test["features"]).alias("features"),
    "label",
)

In [None]:
# Time the fit with perf_counter(): it is a monotonic clock, so the measured
# duration is not distorted if the system clock is adjusted during the run.
start_time = time.perf_counter()
cv_model = cross_validator.fit(vector_df_train)
print(f"Tuning took: {time.perf_counter() - start_time} sec")

In [None]:
# No metric name is set, so the evaluator's default metric is what gets
# reported under the "rmse" label below.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")

# Score the fitted model on the vector-typed test set.
print(f"rmse: {evaluator.evaluate(cv_model.transform(vector_df_test))}")