# Decision Tree Methods

Decision Trees are just the trees of values to decide what branch will be reviewed next based on the previous branch value. 
The mathematical methods are just the entropy and the information gain. 

In [2]:
import findspark
findspark.init('/home/ubuntu/Spark/spark-3.3.0-bin-hadoop3')

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Tree_Methods_Theory_and_Reading').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/14 10:16:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark.ml import Pipeline

In [5]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

We can also import some of the Classifier types for our regression operations. Just simply use: 'from pyspark.ml.classification import' and choose from the dropdown values. 

In [6]:
data = spark.read.format('libsvm').load('sample_libsvm_data.txt')

22/09/14 10:33:53 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [7]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [11]:
# Note:  we are not providing any column names into the dtc, rfc, gbt vars. As the only columns available are just the ones we need so the
# label and features.
# The more trees the more computation time.
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()

In [17]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [18]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [19]:
print('dtc')
dtc_preds.show()
print('rfc')
rfc_preds.show()
print('gbt')
gbt_preds.show()

dtc
+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,148...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[150,151,152...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [21]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [24]:
print('DTC ACCURACY: ')
acc_eval.evaluate(dtc_preds)

DTC ACCURACY: 


0.96875

In [25]:
print('RFC ACCURACY: ')
acc_eval.evaluate(rfc_preds)

RFC ACCURACY: 


1.0

In [26]:
print('GBT ACCURACY: ')
acc_eval.evaluate(gbt_preds)

GBT ACCURACY: 


0.96875

In [27]:
rfc_model.featureImportances

SparseVector(692, {100: 0.0006, 101: 0.0004, 125: 0.0013, 153: 0.0016, 182: 0.0002, 188: 0.0015, 207: 0.0005, 208: 0.0076, 212: 0.0004, 213: 0.0013, 215: 0.0026, 216: 0.0089, 217: 0.0015, 230: 0.001, 233: 0.0037, 234: 0.0014, 240: 0.001, 243: 0.0038, 244: 0.0088, 262: 0.0005, 263: 0.0196, 268: 0.0006, 270: 0.0009, 271: 0.0019, 273: 0.0006, 274: 0.0038, 286: 0.0006, 288: 0.0007, 289: 0.0071, 290: 0.0012, 294: 0.0003, 295: 0.001, 300: 0.0142, 303: 0.0028, 318: 0.0008, 319: 0.0008, 322: 0.0158, 323: 0.0077, 324: 0.0017, 327: 0.0006, 329: 0.0078, 330: 0.0099, 341: 0.0021, 342: 0.0019, 344: 0.007, 345: 0.018, 346: 0.0016, 347: 0.0022, 350: 0.0169, 351: 0.0291, 352: 0.002, 356: 0.0065, 357: 0.0198, 373: 0.0173, 374: 0.0024, 376: 0.0013, 377: 0.0209, 378: 0.044, 379: 0.01, 380: 0.001, 382: 0.0005, 384: 0.013, 385: 0.0197, 386: 0.0077, 388: 0.0012, 399: 0.0071, 402: 0.0024, 405: 0.0361, 406: 0.0429, 407: 0.0212, 408: 0.0005, 412: 0.0154, 414: 0.0065, 415: 0.0042, 416: 0.0029, 426: 0.0078, 427: