In [2]:
import findspark

In [3]:
import os

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one user's home directory; when it is unset or
# empty, fall back to the original local install path.
findspark.init(os.environ.get('SPARK_HOME') or '/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')

In [4]:
import pyspark

In [5]:
from pyspark.sql import SparkSession

In [6]:
# Obtain the active SparkSession, or start a new one whose application name
# is 'DTRF'.
spark = (
    SparkSession.builder
    .appName('DTRF')
    .getOrCreate()
)

In [7]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier

In [8]:
# Read the sample data file (LIBSVM format) into a DataFrame; the path is
# resolved relative to the working directory.
df = spark.read.load('sample_libsvm_data.txt', format='libsvm')

In [9]:
# Preview the loaded data: first 20 rows, long cell values truncated
# (these are the defaults of show(), spelled out).
df.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# Estimators to compare: a 100-tree random forest, a single decision tree and
# gradient-boosted trees. Each one gets an explicit seed so that re-running the
# notebook trains the same models rather than relying on the library's
# implicit default seed.
RFC = RandomForestClassifier(numTrees=100, seed=42)
DTC = DecisionTreeClassifier(seed=42)
GBT = GBTClassifier(seed=42)

In [11]:
# Split the data roughly 70% / 30% into training and test sets. The fixed seed
# makes the partition (and therefore every metric computed below) reproducible
# across kernel restarts; without it each run evaluates on different rows.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Fit each estimator on the training split, in the order
# random forest -> decision tree -> gradient-boosted trees.
RFC_trained, DTC_trained, GBT_trained = [
    estimator.fit(train_df) for estimator in (RFC, DTC, GBT)
]

In [14]:
# Score the held-out split with every fitted model. Each result is the test
# DataFrame with the model's output columns appended (the outputs shown below
# list rawPrediction, probability and prediction).
RFC_test, DTC_test, GBT_test = [
    fitted.transform(test_df)
    for fitted in (RFC_trained, DTC_trained, GBT_trained)
]

In [15]:
# Inspect the random forest's scored test rows (first 20, truncated cells --
# the defaults of show(), spelled out).
RFC_test.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[124,125,126...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[126,127,128...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[128,129,130...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[150,151,152...|  [87.0,13.0]|[0.87,0.13]|       0.0|
|  0.0|(692,[153,154,155...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[154,155,156...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[234,235,237...|  [81.0,19.0]|[0.81,0.19]|       0.0|
|  1.0|(692,[100,101,102...|   [2.0,98.0]|[0.02,0.98]|       1.0|
|  1.0|(692,[119,120,121...|  [22.0,78.0]|[0.22,0.78]|       1.0|
|  1.0|(692,[123,124,125...|   [1.0,99.0]|[0.01,0.99]|       1.0|
|  1.0|(69

In [16]:
# Inspect the gradient-boosted trees' scored test rows (first 20, truncated
# cells -- the defaults of show(), spelled out).
GBT_test.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.50735770516607...|[0.95323450996218...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[128,129,130...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[150,151,152...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[153,154,155...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[154,155,156...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[234,235,237...|[1.18123643056328...|[0.91392054418160...|       0.0|
|  1.0|(692,[100

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
# Accuracy evaluator shared by all three models. The column names are the
# evaluator's defaults, written out to show which columns are compared.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [19]:
# Accuracy of the random forest on the held-out split.
evaluator.evaluate(dataset=RFC_test)

1.0

In [20]:
# Accuracy of the single decision tree on the held-out split.
evaluator.evaluate(dataset=DTC_test)

1.0

In [21]:
# Accuracy of the gradient-boosted trees on the held-out split.
evaluator.evaluate(dataset=GBT_test)

1.0