# Pyspark Random Forest Regression

### 1. Set up spark context and SparkSession

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark Random Forest Regression") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

### 2. load dataset

In [2]:
df = spark.read.format('com.databricks.spark.csv').\
                               options(header='true', \
                               inferschema='true').load("./data/WineData.csv",header=True);

In [3]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [4]:
# convert the data to dense vector
def transData(row):
    return Row(label=row["quality"],
               features=Vectors.dense([row["fixed acidity"],
                                       row["volatile acidity"],
                                       row["citric acid"],
                                       row["residual sugar"],
                                       row["chlorides"], 
                                       row["free sulfur dioxide"],
                                       row["total sulfur dioxide"],
                                       row["residual sugar"],
                                       row["density"],                                          
                                       row["pH"],
                                       row["sulphates"],
                                       row["alcohol"]
                                      ]))

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

transformed = df.rdd.map(transData).toDF() 
transformed.show(6)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.4,0.7,0.0,1.9,...|    5|
|[7.8,0.88,0.0,2.6...|    5|
|[7.8,0.76,0.04,2....|    5|
|[11.2,0.28,0.56,1...|    6|
|[7.4,0.7,0.0,1.9,...|    5|
|[7.4,0.66,0.0,1.8...|    5|
+--------------------+-----+
only showing top 6 rows



In [7]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.7, 0.3])

In [8]:
# Train a RandomForest model.
rf = RandomForestRegressor()
model = rf.fit(trainingData)

In [9]:
model.getNumTrees

20

In [10]:
# Make predictions.
predictions = model.transform(testData)

In [11]:
predictions.show(10)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[4.6,0.52,0.15,2....|    4| 5.889719869104179|
|[5.0,0.38,0.01,1....|    6| 6.655705370010426|
|[5.0,0.4,0.5,4.3,...|    6| 6.353853647732902|
|[5.1,0.51,0.18,2....|    7| 6.661956266488519|
|[5.2,0.32,0.25,1....|    5| 5.384293372390834|
|[5.3,0.47,0.11,2....|    7|6.8320080555657325|
|[5.3,0.57,0.01,1....|    7| 6.632849078239775|
|[5.4,0.74,0.0,1.2...|    6| 5.842063865526534|
|[5.6,0.31,0.37,1....|    5| 5.435963994046107|
|[5.6,0.31,0.78,13...|    6| 5.394307996995469|
+--------------------+-----+------------------+
only showing top 10 rows



In [12]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+-----------------+-----+--------------------+
|       prediction|label|            features|
+-----------------+-----+--------------------+
|5.889719869104179|    4|[4.6,0.52,0.15,2....|
|6.655705370010426|    6|[5.0,0.38,0.01,1....|
|6.353853647732902|    6|[5.0,0.4,0.5,4.3,...|
|6.661956266488519|    7|[5.1,0.51,0.18,2....|
|5.384293372390834|    5|[5.2,0.32,0.25,1....|
+-----------------+-----+--------------------+
only showing top 5 rows



In [13]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.592509


# Pyspark Random Forest Classifier

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
df = spark.read.format('com.databricks.spark.csv').\
                               options(header='true', \
                               inferschema='true').load("./data/WineData.csv",header=True);

In [16]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [17]:
# convert the data to dense vector
def transData(row):
    return Row(label=row["quality"],
               features=Vectors.dense([row["fixed acidity"],
                                       row["volatile acidity"],
                                       row["citric acid"],
                                       row["residual sugar"],
                                       row["chlorides"], 
                                       row["free sulfur dioxide"],
                                       row["total sulfur dioxide"],
                                       row["residual sugar"],
                                       row["density"],                                          
                                       row["pH"],
                                       row["sulphates"],
                                       row["alcohol"]
                                      ]))

In [18]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data = df.rdd.map(transData).toDF() 
data.show(6)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.4,0.7,0.0,1.9,...|    5|
|[7.8,0.88,0.0,2.6...|    5|
|[7.8,0.76,0.04,2....|    5|
|[11.2,0.28,0.56,1...|    6|
|[7.4,0.7,0.0,1.9,...|    5|
|[7.4,0.66,0.0,1.8...|    5|
+--------------------+-----+
only showing top 6 rows



In [19]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
labelIndexer.transform(data).show()

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|[7.4,0.7,0.0,1.9,...|    5|         0.0|
|[7.8,0.88,0.0,2.6...|    5|         0.0|
|[7.8,0.76,0.04,2....|    5|         0.0|
|[11.2,0.28,0.56,1...|    6|         1.0|
|[7.4,0.7,0.0,1.9,...|    5|         0.0|
|[7.4,0.66,0.0,1.8...|    5|         0.0|
|[7.9,0.6,0.06,1.6...|    5|         0.0|
|[7.3,0.65,0.0,1.2...|    7|         2.0|
|[7.8,0.58,0.02,2....|    7|         2.0|
|[7.5,0.5,0.36,6.1...|    5|         0.0|
|[6.7,0.58,0.08,1....|    5|         0.0|
|[7.5,0.5,0.36,6.1...|    5|         0.0|
|[5.6,0.615,0.0,1....|    5|         0.0|
|[7.8,0.61,0.29,1....|    5|         0.0|
|[8.9,0.62,0.18,3....|    5|         0.0|
|[8.9,0.62,0.19,3....|    5|         0.0|
|[8.5,0.28,0.56,1....|    7|         2.0|
|[8.1,0.56,0.28,1....|    5|         0.0|
|[7.4,0.59,0.08,4....|    4|         3.0|
|[7.9,0.32,0.51,1....|    6|         1.0|
+--------------------+-----+------

In [20]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(data)

featureIndexer.transform(data).show()   

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|[7.4,0.7,0.0,1.9,...|    5|[7.4,0.7,0.0,1.9,...|
|[7.8,0.88,0.0,2.6...|    5|[7.8,0.88,0.0,2.6...|
|[7.8,0.76,0.04,2....|    5|[7.8,0.76,0.04,2....|
|[11.2,0.28,0.56,1...|    6|[11.2,0.28,0.56,1...|
|[7.4,0.7,0.0,1.9,...|    5|[7.4,0.7,0.0,1.9,...|
|[7.4,0.66,0.0,1.8...|    5|[7.4,0.66,0.0,1.8...|
|[7.9,0.6,0.06,1.6...|    5|[7.9,0.6,0.06,1.6...|
|[7.3,0.65,0.0,1.2...|    7|[7.3,0.65,0.0,1.2...|
|[7.8,0.58,0.02,2....|    7|[7.8,0.58,0.02,2....|
|[7.5,0.5,0.36,6.1...|    5|[7.5,0.5,0.36,6.1...|
|[6.7,0.58,0.08,1....|    5|[6.7,0.58,0.08,1....|
|[7.5,0.5,0.36,6.1...|    5|[7.5,0.5,0.36,6.1...|
|[5.6,0.615,0.0,1....|    5|[5.6,0.615,0.0,1....|
|[7.8,0.61,0.29,1....|    5|[7.8,0.61,0.29,1....|
|[8.9,0.62,0.18,3....|    5|[8.9,0.62,0.18,3....|
|[8.9,0.62,0.19,3....|    5|[8.9,0.62,0.19,3....|
|[8.5,0.28,0.56,1....|    7|[8.5,0.28,0.56,1....|


In [21]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.7, 0.3])

In [22]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

In [23]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [24]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

In [25]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [26]:
# Make predictions.
predictions = model.transform(testData)

In [27]:
# Select example rows to display.
predictions.select("features","label","predictedLabel").show(5)

+--------------------+-----+--------------+
|            features|label|predictedLabel|
+--------------------+-----+--------------+
|[5.0,0.42,0.24,2....|    8|             6|
|[5.0,1.02,0.04,1....|    4|             5|
|[5.1,0.47,0.02,1....|    6|             6|
|[5.2,0.32,0.25,1....|    5|             5|
|[5.2,0.645,0.0,2....|    6|             6|
+--------------------+-----+--------------+
only showing top 5 rows



In [28]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.388889


In [29]:
rfModel = model.stages[2]
rfModel

RandomForestClassificationModel (uid=rfc_e9e9efc375d9) with 10 trees

In [30]:
rfModel.trees

[DecisionTreeClassificationModel (uid=dtc_1a4a5effb453) of depth 5 with 51 nodes,
 DecisionTreeClassificationModel (uid=dtc_0e1f3d9288fd) of depth 5 with 59 nodes,
 DecisionTreeClassificationModel (uid=dtc_1213437c5d74) of depth 5 with 55 nodes,
 DecisionTreeClassificationModel (uid=dtc_2bd8edefd6fe) of depth 5 with 59 nodes,
 DecisionTreeClassificationModel (uid=dtc_d0a23ca40b68) of depth 5 with 53 nodes,
 DecisionTreeClassificationModel (uid=dtc_d4e20b9a9f80) of depth 5 with 57 nodes,
 DecisionTreeClassificationModel (uid=dtc_e188b475db2c) of depth 5 with 57 nodes,
 DecisionTreeClassificationModel (uid=dtc_5b90a9b8c375) of depth 5 with 53 nodes,
 DecisionTreeClassificationModel (uid=dtc_3e93c72bffc9) of depth 5 with 63 nodes,
 DecisionTreeClassificationModel (uid=dtc_c98f26bf9a6c) of depth 5 with 55 nodes]