# Random Forest Regression

In [None]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator
import time

### Create synthetic dataset

In [None]:
# Size and element type of the synthetic regression problem.
n_rows, n_cols = 50000, 300
dtype = "float32"

# Generate features and targets with a fixed seed, downcasting both to `dtype`.
X, y = (
    arr.astype(dtype)
    for arr in make_regression(n_samples=n_rows, n_features=n_cols, random_state=1)
)

# Hold out 20% of the rows as the test split (also seeded).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

### Convert dataset to Spark DataFrame

In [None]:
# One pandas frame per split: each row holds its feature array in "features"
# and its target value in "label".
pd_data_train, pd_data_test = (
    pd.DataFrame({"features": list(features), "label": labels})
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# NOTE(review): `spark` is not defined in the cells shown here; presumably it is
# a SparkSession provided by the environment — confirm.
df_train, df_test = (
    spark.createDataFrame(pdf) for pdf in (pd_data_train, pd_data_test)
)

In [None]:
# Print the schema of the training Spark DataFrame to inspect its column types.
df_train.printSchema()

### Regressor builder
We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) random forest regressor objects, demonstrating the common API, and to verify that they yield similar performance on our synthetic dataset.

In [None]:
def build_rf_regressor(
    estimator_class,
    *,
    num_trees=50,
    max_depth=9,
    feature_subset_strategy="all",
    seed=None,
):
    """Build a random forest regressor configured through the fluent setter API.

    The same setters are called on whichever estimator class is passed in, so
    the GPU and CPU regressors built from it are configured identically.

    Args:
        estimator_class: Zero-argument callable returning an estimator whose
            ``setFeaturesCol``, ``setLabelCol``, ``setFeatureSubsetStrategy``,
            ``setNumTrees`` and ``setMaxDepth`` setters each return the estimator.
        num_trees: Value passed to ``setNumTrees``.
        max_depth: Value passed to ``setMaxDepth``.
        feature_subset_strategy: Value passed to ``setFeatureSubsetStrategy``.
        seed: Optional value passed to ``setSeed``. When ``None`` (the default)
            ``setSeed`` is not called and the estimator keeps its own default.

    Returns:
        The configured estimator (the result of the last setter call).
    """
    regressor = (
        estimator_class()
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setFeatureSubsetStrategy(feature_subset_strategy)
        .setNumTrees(num_trees)
        .setMaxDepth(max_depth)
    )
    # Only touch the seed when the caller asks for one, so the default call
    # performs exactly the same setter sequence as before.
    if seed is not None:
        regressor = regressor.setSeed(seed)
    return regressor

## Spark RAPIDS ML (GPU)

Note: the spark-rapids-ml RandomForestRegressor leverages the RAPIDS cuML multi-node multi-GPU (MNMG) implementation and inherits its current *embarrassingly parallel* design. See the RAPIDS API docs for more information: https://docs.rapids.ai/api/cuml/stable/api/#cuml.dask.ensemble.RandomForestRegressor

In [None]:
# Import the Spark RAPIDS ML estimator and configure it through the shared builder.
from spark_rapids_ml.regression import RandomForestRegressor
gpu_rf_regressor = build_rf_regressor(RandomForestRegressor)

The Spark RAPIDS ML estimator can be persisted and reloaded in the same way as a Spark ML estimator.

In [None]:
# Location where the unfitted GPU estimator is saved and then reloaded from.
estimator_path = "/tmp/spark-rapids-ml-rf-regressor-estimator"

In [None]:
# Save the estimator (the writer is put in overwrite mode first), then load it
# back; the reloaded copy is the one that gets fitted.
gpu_rf_regressor.write().overwrite().save(estimator_path)
gpu_rf_regressor_loaded = RandomForestRegressor.load(estimator_path)

### Fit

In [None]:
# Time the GPU fit with a monotonic clock, so the reported duration cannot be
# skewed by system clock adjustments during the run.
start_time = time.perf_counter()
gpu_model = gpu_rf_regressor_loaded.fit(df_train)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the tree count reported by the fitted GPU model.
gpu_model.getNumTrees

### Transform

In [None]:
# Location where the fitted GPU model is saved and then reloaded from.
model_path = "/tmp/spark-rapids-ml-rf-regressor-model"

In [None]:
# Persist the fitted GPU model to model_path with the writer in overwrite mode.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Reload the saved model through the fitted model's own reader.
gpu_model_loaded = gpu_model.read().load(model_path)

In [None]:
# Display the tree count reported by the reloaded model, for comparison with
# the value shown before saving.
gpu_model_loaded.getNumTrees

In [None]:
# Score the test set with the reloaded model, naming the output column "prediction".
transformed_df = gpu_model_loaded.setPredictionCol("prediction").transform(df_test)

In [None]:
# Print the schema of the scored DataFrame.
transformed_df.printSchema()

In [None]:
# Count the rows of the scored DataFrame.
transformed_df.count()

In [None]:
# Show 10 rows ordered by "features"; the CPU results are displayed with the
# same sort so the two tables can be compared.
transformed_df.select("features","label","prediction").sort("features").show(10)

Check the RMSE on the test set of the GPU trained model.

In [None]:
# Evaluator that compares the "prediction" column against the "label" column.
# No metric name is set, so the evaluator's default metric is used
# (presumably RMSE, matching the printed label — confirm against the PySpark docs).
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")
print(f"rmse: {evaluator.evaluate(transformed_df)}")

Check that the RMSE is smaller than the standard deviation of the label column, indicating that the model is making non-trivial predictions.

In [None]:
from pyspark.sql.functions import stddev

# Aggregate the standard deviation of "label" into a one-row result, pull it
# into pandas, and print the single value as a baseline for the RMSE.
print(f'label stddev: {transformed_df.select(stddev("label").alias("stddev")).toPandas()["stddev"][0]}')


## Spark ML (CPU)

In [None]:
# Import the Spark ML (CPU) estimator under an alias so that it does not rebind
# the name `RandomForestRegressor`, which the GPU cells use for the Spark RAPIDS
# ML class. Re-running those cells after this one still resolves to the GPU class.
from pyspark.ml.regression import RandomForestRegressor as SparkMLRandomForestRegressor
cpu_rf_regressor = build_rf_regressor(SparkMLRandomForestRegressor)

Convert the array SQL type to the VectorUDT type expected by Spark ML. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames in addition to the array-type DataFrame used above, as well as a scalar-column format; see the docs.)

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
# Pass the array-typed "features" column through array_to_vector (keeping the
# column name) and carry "label" over unchanged.
vector_df_train = df_train.select(array_to_vector(df_train.features).alias("features"),"label")

### Fit

In [None]:
# Time the CPU fit with a monotonic clock, so the reported duration cannot be
# skewed by system clock adjustments during the run.
start_time = time.perf_counter()
cpu_model = cpu_rf_regressor.fit(vector_df_train)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the tree count reported by the fitted CPU model.
cpu_model.getNumTrees

### Transform

In [None]:
# Apply the same array_to_vector conversion to the test split.
vector_df_test = df_test.select(array_to_vector(df_test.features).alias("features"),"label")

In [None]:
# Score the vector-typed test set with the CPU model, naming the output column "prediction".
cpu_transformed_df = cpu_model.setPredictionCol("prediction").transform(vector_df_test)

In [None]:
# Show 10 rows ordered by "features", mirroring the display of the GPU results.
cpu_transformed_df.select("features","label","prediction").sort("features").show(10)

The test set RMSEs of the GPU model (above) and the CPU model (below) are similar.

In [None]:
# Evaluate the CPU predictions with the `evaluator` object created in the GPU section.
print(f"rmse: {evaluator.evaluate(cpu_transformed_df)}")