### Decision Tree, Random Forest and Gradient Boosting examples in Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier

In [2]:
# Get (or create) the SparkSession used by every later cell; the application
# is registered under the name 'Trees'.
spark = (
    SparkSession.builder
    .appName('Trees')
    .getOrCreate()
)

In [3]:
# Load the LIBSVM-formatted sample file into a DataFrame; the resulting frame
# exposes a `label` column and a sparse `features` vector column.
df = spark.read.load('../datasets/sample_libsvm_data.txt', format='libsvm')

In [4]:
# Preview the first 20 rows, truncating long cell values for readability.
df.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
# Train/test split: roughly 70% of the rows for training and 30% held out
# for evaluation (randomSplit treats the weights as approximate proportions).
# A fixed seed makes the split -- and therefore every accuracy figure derived
# from it -- repeatable across kernel restarts.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

### Create models with default hyperparameters

In [5]:
# One untrained estimator per tree-based algorithm, each constructed with its
# default settings: decision tree, random forest, gradient-boosted trees.
dtc, rfc, gbc = (
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GBTClassifier(),
)

In [7]:
# Fit each estimator on the training split, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbc_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbc)
)

In [8]:
# Apply every fitted model to the held-out split; each result is the test
# DataFrame as returned by that model's transform().
dtc_pred, rfc_pred, gbc_pred = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbc_model)
)

### Evaluation

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [10]:
# Evaluator that scores a predictions DataFrame with the 'accuracy' metric.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept
# unchanged because the reporting cell further down refers to it by this name.
eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
)

In [18]:
# Report test-set accuracy for each model: a heading line followed by the
# score, in the order GBT, random forest, decision tree.
reports = (
    ('GBC ACCURACY:', gbc_pred),
    ('\nRFC ACCURACY:', rfc_pred),
    ('\nDTC ACCURACY:', dtc_pred),
)
for heading, predictions in reports:
    print(heading)
    print(eval.evaluate(predictions))

GBC ACCURACY:
0.967741935483871

RFC ACCURACY:
0.967741935483871

DTC ACCURACY:
0.967741935483871


`That's it: we've seen a simple example of how to use the Decision Tree, Random Forest and Gradient-Boosted Tree classifiers.`

Note: the same three algorithms can also be used for regression problems by importing `DecisionTreeRegressor`, `RandomForestRegressor` and `GBTRegressor` from `pyspark.ml.regression`.