# Tree Methods Documentation Examples.

We'll work through the following tree-based methods:
- Decision Tree
- Random Forest
- Gradient Boosted Trees.

We will also go a little beyond the documentation example and show some useful evaluation features.
We will also show how you can use multi-class evaluators for binary-classification problems.

In [1]:
# Import SparkSession so that we can start a new Spark session:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook under the application name "trees"
# (getOrCreate() hands back an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("trees")
    .getOrCreate()
)

In [3]:
from pyspark.ml import Pipeline
# A simple pipeline, which acts as an estimator.  A Pipeline consists of a sequence of stages, each of which is
# either an Estimator or a Transformer.
# When Pipeline.fit() is called, the stages are executed in order.

In [4]:
from pyspark.ml.classification import (RandomForestClassifier, 
                                       GBTClassifier,
                                       DecisionTreeClassifier)

In [5]:
# from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, GBTRegressor
# Notice how you can make use of tree methods for regression problems as well.

In [6]:
# Read the sample dataset, which is stored in LIBSVM format, into a DataFrame.
data = (
    spark.read
    .format("libsvm")
    .load("sample_libsvm_data.txt")
)

In [7]:
# Preview the loaded DataFrame; the output below lists the top 20 rows
# of its "label" and "features" columns.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Randomly divide the rows into a training set (weight 0.7) and a test set
# (weight 0.3); the fixed seed makes the split repeatable across runs.
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=123)

In [9]:
# We start off by creating the most basic model: the DecisionTreeClassifier.
# No arguments are needed here: the classifier's default column names
# ("features" and "label") already match the columns of the loaded DataFrame.
dec_tree_cls = DecisionTreeClassifier()

# Other parameters you can play around with include:
# --- maxDepth
# --- maxBins
# --- minInfoGain

In [10]:
# Instantiate a RandomForestClassifier that is configured with 100 trees.
rnd_forest_cls = RandomForestClassifier(numTrees=100)

# An important parameter to play around with is:
# --- numTrees (the number of trees in the forest).

In [11]:
# Instantiate a gradient-boosted trees classifier, leaving every parameter at its default.
grad_boosted_cls = GBTClassifier()

In [12]:
# Train each of the three classifiers on the training split.
fitted_dtc = dec_tree_cls.fit(dataset=train_data)
fitted_rfc = rnd_forest_cls.fit(dataset=train_data)
fitted_gbt = grad_boosted_cls.fit(dataset=train_data)

In [13]:
# Use the fitted models to transform the test data.  Each result is a DataFrame
# of the test rows with the model's prediction columns appended (see the output
# of the next cell: "rawPrediction", "probability" and "prediction").
dtc_preds = fitted_dtc.transform(dataset=test_data)
rfc_preds = fitted_rfc.transform(dataset=test_data)
gbt_preds = fitted_gbt.transform(dataset=test_data)

In [14]:
# Have a look at the predictions of each model:
dtc_preds.show()
rfc_preds.show()
gbt_preds.show()
# NOTE: gbt_preds does contain the "rawPrediction" and "probability" columns here.
# They are missing from the instructor's output because the instructor is using
# an older PySpark version.

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[100,101,102...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[121,122,123...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [0.0,38.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[150,151,152...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [15]:
# Use an evaluator on the predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

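# We get access to more metrics using the MulticlassClassificationEvaluator, such as:
# --- weighted precision
# --- weighted recall
# --- accuracy
# --- f1
# --- etc...
# Note: AUROC is not one of them; the area under the ROC curve ("areaUnderROC")
# is provided by the BinaryClassificationEvaluator instead.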

In [16]:
# Evaluator that scores a predictions DataFrame by its accuracy.
accuracy_eval = MulticlassClassificationEvaluator(metricName="accuracy")
# The metric name can be one of the following:
# --- f1
# --- weightedPrecision
# --- weightedRecall
# --- accuracy

In [17]:
# Print a heading, then evaluate the decision-tree predictions; the notebook
# displays the value of the final expression as the cell's output.
print("Decision Tree Accuracy:")
accuracy_eval.evaluate(dataset=dtc_preds)

Decision Tree Accuracy:


0.8823529411764706

In [18]:
# Print a heading, then evaluate the random-forest predictions; the notebook
# displays the value of the final expression as the cell's output.
print("Random Forest Classifier Accuracy:")
accuracy_eval.evaluate(dataset=rfc_preds)

Random Forest Classifier Accuracy:


1.0

In [19]:
# Print a heading, then evaluate the gradient-boosted-trees predictions; the
# notebook displays the value of the final expression as the cell's output.
print("GBT Classifier Accuracy:")
accuracy_eval.evaluate(dataset=gbt_preds)

GBT Classifier Accuracy:


0.8823529411764706

## See how we can grab feature importance:

In [20]:
# Estimate of the importance of each feature, taken from the fitted random forest.
fitted_rfc.featureImportances

# From the documentation:
# Feature importance for single decision trees can have high variance due to correlated predictor variables.
# Consider using a RandomForestClassifier to determine feature importance instead.

SparseVector(692, {155: 0.0007, 156: 0.0012, 176: 0.0012, 182: 0.0007, 207: 0.0103, 208: 0.0074, 234: 0.0008, 239: 0.0012, 240: 0.0004, 241: 0.0005, 242: 0.0005, 243: 0.0015, 244: 0.0204, 245: 0.0051, 258: 0.0012, 260: 0.0032, 262: 0.0088, 266: 0.0003, 267: 0.0002, 271: 0.0019, 272: 0.0237, 273: 0.0066, 274: 0.0047, 290: 0.0006, 291: 0.0129, 293: 0.0005, 295: 0.0009, 299: 0.0061, 300: 0.0092, 302: 0.0175, 313: 0.0003, 318: 0.0057, 320: 0.0008, 323: 0.0134, 327: 0.0019, 330: 0.0093, 343: 0.0004, 345: 0.0113, 346: 0.0078, 347: 0.0021, 350: 0.0077, 351: 0.0006, 352: 0.0025, 355: 0.0011, 356: 0.0053, 357: 0.0116, 358: 0.0138, 372: 0.0178, 373: 0.0075, 377: 0.0272, 378: 0.0106, 380: 0.0011, 382: 0.0016, 384: 0.0014, 385: 0.0249, 386: 0.0151, 388: 0.002, 398: 0.0012, 399: 0.0143, 401: 0.0061, 405: 0.0513, 406: 0.01, 407: 0.011, 408: 0.0014, 412: 0.0093, 413: 0.0108, 414: 0.0059, 427: 0.0077, 429: 0.0007, 433: 0.0318, 434: 0.0297, 435: 0.0083, 436: 0.0006, 437: 0.0013, 441: 0.0259, 442: 0.002