In [2]:
import findspark

In [3]:
import os

# Tell findspark where the Spark installation lives. Prefer the SPARK_HOME
# environment variable so the notebook is not tied to one user's home
# directory; fall back to the original install path when it is not set.
SPARK_HOME = os.environ.get('SPARK_HOME', '/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')
findspark.init(SPARK_HOME)

In [4]:
import pyspark

In [5]:
from pyspark.sql import SparkSession

In [6]:
# Start a Spark session named 'DTRF' for this notebook, or reuse an existing one.
spark = (
    SparkSession.builder
    .appName('DTRF')
    .getOrCreate()
)

In [7]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier

In [8]:
# Load the LIBSVM-formatted sample file into a DataFrame.
df = spark.read.load('sample_libsvm_data.txt', format='libsvm')

In [9]:
# Preview the loaded rows (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# Three tree-based classifiers to compare on the same data. The column names
# are spelled out here; they are the estimators' defaults, so the
# configuration is unchanged.
RFC = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=100)
DTC = DecisionTreeClassifier(featuresCol='features', labelCol='label')
GBT = GBTClassifier(featuresCol='features', labelCol='label')

In [11]:
# 70/30 train/test split. A fixed seed keeps the split (and therefore the
# metrics computed further down) repeatable across runs on the same input;
# without it every run evaluates on a different random subset.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Fit each classifier on the training split.
RFC_trained = RFC.fit(dataset=train_df)
DTC_trained = DTC.fit(dataset=train_df)
GBT_trained = GBT.fit(dataset=train_df)

In [14]:
# Score the held-out split with each fitted model; the result frames carry
# the added rawPrediction / probability / prediction columns.
RFC_test = RFC_trained.transform(dataset=test_df)
DTC_test = DTC_trained.transform(dataset=test_df)
GBT_test = GBT_trained.transform(dataset=test_df)

In [15]:
# Inspect the random-forest predictions next to the true labels.
RFC_test.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[124,125,126...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[126,127,128...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[128,129,130...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[150,151,152...|  [87.0,13.0]|[0.87,0.13]|       0.0|
|  0.0|(692,[153,154,155...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[154,155,156...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[234,235,237...|  [81.0,19.0]|[0.81,0.19]|       0.0|
|  1.0|(692,[100,101,102...|   [2.0,98.0]|[0.02,0.98]|       1.0|
|  1.0|(692,[119,120,121...|  [22.0,78.0]|[0.22,0.78]|       1.0|
|  1.0|(692,[123,124,125...|   [1.0,99.0]|[0.01,0.99]|       1.0|
|  1.0|(69

In [16]:
# Inspect the gradient-boosted-tree predictions next to the true labels.
GBT_test.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.50735770516607...|[0.95323450996218...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[128,129,130...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[150,151,152...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[153,154,155...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[154,155,156...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[234,235,237...|[1.18123643056328...|[0.91392054418160...|       0.0|
|  1.0|(692,[100

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
# Accuracy evaluator comparing the 'prediction' column with 'label'
# (both column names are the evaluator's defaults, written out explicitly).
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [19]:
# Accuracy of the random forest on the held-out split.
evaluator.evaluate(dataset=RFC_test)

1.0

In [20]:
# Accuracy of the single decision tree on the held-out split.
evaluator.evaluate(dataset=DTC_test)

1.0

In [21]:
# Accuracy of the gradient-boosted trees on the held-out split.
evaluator.evaluate(dataset=GBT_test)

1.0

In [22]:
# Display the fitted random forest's per-feature importance scores
# (rendered as a sparse vector indexed by feature position).
RFC_trained.featureImportances

SparseVector(692, {100: 0.0005, 101: 0.001, 129: 0.0003, 132: 0.0005, 154: 0.0002, 156: 0.0005, 157: 0.0005, 180: 0.0004, 181: 0.0009, 188: 0.0005, 203: 0.0005, 208: 0.0064, 212: 0.0005, 214: 0.0011, 216: 0.0057, 234: 0.0071, 238: 0.0005, 239: 0.0011, 243: 0.0006, 244: 0.0082, 258: 0.0035, 263: 0.0004, 267: 0.0005, 268: 0.0005, 271: 0.001, 272: 0.007, 273: 0.0003, 291: 0.0066, 294: 0.0017, 295: 0.0005, 296: 0.0026, 298: 0.0006, 299: 0.0008, 300: 0.0145, 301: 0.0305, 317: 0.0285, 322: 0.0013, 323: 0.0015, 324: 0.0005, 326: 0.0013, 327: 0.0005, 328: 0.0088, 329: 0.007, 342: 0.0011, 345: 0.0144, 346: 0.001, 349: 0.001, 350: 0.0086, 351: 0.019, 352: 0.001, 353: 0.001, 354: 0.0014, 355: 0.0006, 356: 0.0085, 357: 0.0026, 358: 0.0051, 372: 0.0082, 374: 0.0006, 375: 0.0005, 378: 0.0178, 379: 0.0299, 380: 0.0048, 383: 0.0027, 384: 0.0071, 385: 0.0238, 386: 0.0054, 398: 0.0006, 399: 0.0062, 400: 0.0072, 405: 0.0104, 406: 0.0484, 407: 0.0285, 413: 0.0168, 425: 0.003, 427: 0.017, 429: 0.0009, 431:

### Another Example: Classifying Colleges as Private or Public (`College.csv`)

In [23]:
# Read the college data set; the first line is a header row and column
# types are inferred from the data.
# NOTE(review): this rebinds `df`, replacing the LIBSVM frame loaded earlier.
df = spark.read.csv(
    path='College.csv',
    header=True,
    inferSchema=True,
)

In [25]:
# Print the column names and types of the frame currently bound to `df`.
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [26]:
# List the column names of `df` as a plain Python list.
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [29]:
# Peek at the first record to see concrete values for every column.
df.head(n=1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [27]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [35]:
# Pack the numeric columns into a single 'features' vector column. The two
# string columns, 'School' and 'Private', are left out of the input list.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate',
    ],
)

In [36]:
# Append the assembled 'features' vector column to the raw frame.
df_n = assembler.transform(dataset=df)

In [37]:
# Encode the string column 'Private' as a numeric label column.
# 'frequencyDesc' is StringIndexer's default ordering, written out explicitly:
# the most frequent value receives index 0.0.
pi_indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndexer',
    stringOrderType='frequencyDesc',
)

In [38]:
# Learn the string-to-index mapping from df_n, then apply it to the same
# frame to add the 'PrivateIndexer' column.
df_f = (
    pi_indexer
    .fit(df_n)
    .transform(df_n)
)

In [39]:
# Keep only the two columns the classifiers need: the feature vector and the label.
df_final = df_f.select('features', 'PrivateIndexer')

In [40]:
# Preview the modelling frame (first 20 rows, long cell values truncated).
df_final.show(n=20, truncate=True)

+--------------------+--------------+
|            features|PrivateIndexer|
+--------------------+--------------+
|[1660.0,1232.0,72...|           0.0|
|[2186.0,1924.0,51...|           0.0|
|[1428.0,1097.0,33...|           0.0|
|[417.0,349.0,137....|           0.0|
|[193.0,146.0,55.0...|           0.0|
|[587.0,479.0,158....|           0.0|
|[353.0,340.0,103....|           0.0|
|[1899.0,1720.0,48...|           0.0|
|[1038.0,839.0,227...|           0.0|
|[582.0,498.0,172....|           0.0|
|[1732.0,1425.0,47...|           0.0|
|[2652.0,1900.0,48...|           0.0|
|[1179.0,780.0,290...|           0.0|
|[1267.0,1080.0,38...|           0.0|
|[494.0,313.0,157....|           0.0|
|[1420.0,1093.0,22...|           0.0|
|[4302.0,992.0,418...|           0.0|
|[1216.0,908.0,423...|           0.0|
|[1130.0,704.0,322...|           0.0|
|[3540.0,2001.0,10...|           1.0|
+--------------------+--------------+
only showing top 20 rows



In [41]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier

In [42]:
# Classifiers for the college data: the label is the indexed 'Private' column.
# 'features' is the estimators' default feature column, written out explicitly.
RFC = RandomForestClassifier(featuresCol='features', labelCol='PrivateIndexer', numTrees=150)
DTC = DecisionTreeClassifier(featuresCol='features', labelCol='PrivateIndexer')
GBT = GBTClassifier(featuresCol='features', labelCol='PrivateIndexer')

In [43]:
# 70/30 train/test split. A fixed seed keeps the split (and therefore the
# evaluation scores below) repeatable across runs on the same input;
# without it every run evaluates on a different random subset.
df_train, df_test = df_final.randomSplit([0.7, 0.3], seed=42)

In [44]:
# Fit each classifier on the training split of the college data.
RFC_trained = RFC.fit(dataset=df_train)
DTC_trained = DTC.fit(dataset=df_train)
GBT_trained = GBT.fit(dataset=df_train)

In [45]:
# Score the held-out split with each fitted model.
RFC_test = RFC_trained.transform(dataset=df_test)
DTC_test = DTC_trained.transform(dataset=df_test)
GBT_test = GBT_trained.transform(dataset=df_test)

In [47]:
# Show which columns the random-forest model added to the scored frame.
RFC_test.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndexer: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [51]:
# Show which columns the gradient-boosted-tree model added to the scored frame.
GBT_test.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndexer: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [48]:
# Binary evaluator over the 'rawPrediction' column against the indexed label.
# The metric is area under the ROC curve (the evaluator's default, written
# out explicitly), so the scores below are AUC values, not accuracy.
eval_ = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='PrivateIndexer',
    metricName='areaUnderROC',
)

In [49]:
# Random-forest score on the held-out split.
eval_.evaluate(dataset=RFC_test)

0.9610709732256697

In [50]:
# Decision-tree score on the held-out split.
eval_.evaluate(dataset=DTC_test)

0.8848278793030175

In [52]:
# Gradient-boosted-tree score on the held-out split.
eval_.evaluate(dataset=GBT_test)

0.9285592860178498