In [1]:
from pyspark.sql import SparkSession
# Start a Spark session named 'mytree', or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [2]:
# Importing pipeline from ml
from pyspark.ml import Pipeline
# Importing tree classifiers from ml.classification
from pyspark.ml.classification import RandomForestClassifier,GBTClassifier,DecisionTreeClassifier

In [3]:
# Load the sample dataset, stored in LIBSVM format, into a DataFrame
data = (
    spark.read
    .format('libsvm')
    .load('Tree_Methods/sample_libsvm_data.txt')
)

In [4]:
# Preview the first 20 rows (label and sparse feature vector)
data.show(n=20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# 70/30 train/test split. The seed fixes which rows land in each part, so
# the fitted models and the scores further down are reproducible on re-run.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Create one estimator per tree method, using the default 'label' and
# 'features' columns. A fixed seed is passed to each so that repeated runs
# of the notebook build the same models (the random forest in particular
# relies on random sampling while training).
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)
gbt = GBTClassifier(seed=42)

In [10]:
# Train every estimator on the training split, in the order
# decision tree -> random forest -> gradient-boosted trees
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [11]:
# Apply each fitted model to the held-out split; the resulting DataFrames
# carry the prediction columns used for inspection and evaluation below
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [15]:
# Inspect the first 20 decision-tree predictions
dtc_preds.show(n=20)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[123,124,125...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[129,130,131...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [17]:
# The GBT predictions shown below include rawPrediction and probability
# columns alongside prediction.
# NOTE(review): an earlier note here said GBT output lacks rawPrediction;
# presumably that was true of an older Spark release. Confirm before relying on it.
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.10999902810676...|[0.90203102392811...|       0.0|
|  0.0|(692,[126,127,128...|[1.23684048042532...|[0.92227603747858...|       0.0|
|  0.0|(692,[126,127,128...|[1.43568457210882...|[0.94641282740144...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127

In [18]:
# Using an evaluator to score the predictions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
# Score the decision-tree predictions by accuracy. metricName must be set
# explicitly: left at its default, MulticlassClassificationEvaluator reports
# the F1 score, which is not what the name acc_eval promises.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc_eval.evaluate(dtc_preds)

1.0

In [20]:
# Feature importances of the random forest: the higher a feature's value,
# the more impact that feature has on the prediction
rfc_model.featureImportances

SparseVector(692, {99: 0.0008, 101: 0.0003, 120: 0.0005, 121: 0.001, 130: 0.0005, 153: 0.0007, 154: 0.0003, 155: 0.0018, 177: 0.0005, 179: 0.0008, 181: 0.0001, 182: 0.0005, 208: 0.0131, 211: 0.0001, 213: 0.0003, 214: 0.0055, 216: 0.0007, 217: 0.0011, 235: 0.0056, 243: 0.0005, 262: 0.0084, 266: 0.0005, 270: 0.0007, 271: 0.0074, 272: 0.008, 273: 0.0056, 275: 0.0006, 290: 0.0177, 293: 0.0013, 295: 0.0012, 299: 0.0114, 300: 0.0178, 302: 0.0072, 304: 0.0008, 313: 0.0008, 318: 0.0015, 319: 0.0006, 330: 0.0065, 342: 0.0009, 343: 0.0003, 344: 0.008, 345: 0.0105, 347: 0.0072, 351: 0.0271, 353: 0.0007, 356: 0.0332, 357: 0.0287, 372: 0.007, 377: 0.0013, 378: 0.0108, 379: 0.02, 380: 0.0015, 382: 0.0007, 383: 0.0006, 384: 0.0077, 385: 0.0137, 386: 0.0056, 397: 0.0014, 398: 0.0012, 400: 0.0061, 405: 0.0035, 406: 0.0623, 407: 0.0094, 408: 0.0029, 409: 0.0015, 410: 0.0011, 412: 0.003, 413: 0.0006, 428: 0.0083, 429: 0.0238, 432: 0.0008, 433: 0.0299, 434: 0.029, 435: 0.017, 438: 0.0027, 442: 0.0006, 455