In [0]:
import pyspark

In [0]:
from pyspark.sql import SparkSession



In [0]:
# Obtain the active SparkSession explicitly rather than relying on a `spark`
# global that this notebook never defines; getOrCreate() returns the session
# that is already running when one exists.
spark = SparkSession.builder.getOrCreate()

# Load the Iris CSV, treating the first line as the header and letting Spark
# infer each column's type from the data.
df = spark.read.csv('/FileStore/tables/Iris.csv', header=True, inferSchema=True)


In [0]:
# Print the first five rows of the loaded data.
df.show(n=5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [0]:
import pandas as pd
# Bring five rows to the driver and display them transposed, so each Spark
# column becomes a row of the rendered table.
first_rows = df.take(5)
pd.DataFrame(first_rows, columns=df.columns).T

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
SepalLengthCm,5.1,4.9,4.7,4.6,5.0
SepalWidthCm,3.5,3.0,3.2,3.1,3.6
PetalLengthCm,1.4,1.4,1.3,1.5,1.4
PetalWidthCm,0.2,0.2,0.2,0.2,0.2
Species,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa


In [0]:
# Keep only the columns whose Spark dtype is 'double'.
numeric_features = [col_name for col_name, col_type in df.dtypes if col_type == 'double']

# Summary statistics for those columns, transposed so that each feature is a row.
numeric_summary = df.select(numeric_features).describe()
numeric_summary.toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
SepalLengthCm,150,5.843333333333335,0.8280661279778637,4.3,7.9
SepalWidthCm,150,3.0540000000000007,0.43359431136217375,2.0,4.4
PetalLengthCm,150,3.7586666666666693,1.764420419952262,1.0,6.9
PetalWidthCm,150,1.1986666666666672,0.7631607417008414,0.1,2.5


In [0]:
# List the column names of the DataFrame.
df.schema.names

Out[13]: ['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [0]:
# The four measurement columns used as model inputs.
numericCols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]

# Combines the measurement columns into one vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=numericCols)


In [0]:
# Add the assembled feature-vector column to `df`.
# This cell reassigns `df`, so on a second run the output column would already
# be present and the transform would be rejected. Dropping the assembler's
# output column first (a no-op when it is absent) keeps the cell re-runnable.
df = assembler.transform(df.drop(assembler.getOutputCol()))
df.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|  9|     

In [0]:
# Encode the string 'Species' column as a numeric 'labelIndex' column.
label_stringIdx = StringIndexer(inputCol='Species', outputCol='labelIndex')

# This cell reassigns `df`, so on a second run 'labelIndex' would already be
# present and the indexer would refuse to write to it. Dropping the output
# column first (a no-op when it is absent) keeps the cell re-runnable.
df_unindexed = df.drop(label_stringIdx.getOutputCol())
df = label_stringIdx.fit(df_unindexed).transform(df_unindexed)
df.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|labelIndex|
+---+-------------+------------+-------------+------------+-----------+-----------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|  

In [0]:
# Bring the first 110 rows to the driver and display them transposed, so the
# feature vector and label index can be inspected next to the raw columns.
leading_rows = df.take(110)
pd.DataFrame(leading_rows, columns=df.columns).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,100,101,102,103,104,105,106,107,108,109
Id,1,2,3,4,5,6,7,8,9,10,...,101,102,103,104,105,106,107,108,109,110
SepalLengthCm,5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,...,6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2
SepalWidthCm,3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,...,3.3,2.7,3.0,2.9,3.0,3.0,2.5,2.9,2.5,3.6
PetalLengthCm,1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,...,6.0,5.1,5.9,5.6,5.8,6.6,4.5,6.3,5.8,6.1
PetalWidthCm,0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,...,2.5,1.9,2.1,1.8,2.2,2.1,1.7,1.8,1.8,2.5
Species,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,...,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica
features,"[5.1, 3.5, 1.4, 0.2]","[4.9, 3.0, 1.4, 0.2]","[4.7, 3.2, 1.3, 0.2]","[4.6, 3.1, 1.5, 0.2]","[5.0, 3.6, 1.4, 0.2]","[5.4, 3.9, 1.7, 0.4]","[4.6, 3.4, 1.4, 0.3]","[5.0, 3.4, 1.5, 0.2]","[4.4, 2.9, 1.4, 0.2]","[4.9, 3.1, 1.5, 0.1]",...,"[6.3, 3.3, 6.0, 2.5]","[5.8, 2.7, 5.1, 1.9]","[7.1, 3.0, 5.9, 2.1]","[6.3, 2.9, 5.6, 1.8]","[6.5, 3.0, 5.8, 2.2]","[7.6, 3.0, 6.6, 2.1]","[4.9, 2.5, 4.5, 1.7]","[7.3, 2.9, 6.3, 1.8]","[6.7, 2.5, 5.8, 1.8]","[7.2, 3.6, 6.1, 2.5]"
labelIndex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [0]:
# 70/30 random split with a fixed seed, followed by the size of each part.
train, test = df.randomSplit([0.7, 0.3], seed=2018)
print("Training Dataset Count: {}".format(train.count()))
print("Test Dataset Count: {}".format(test.count()))

Training Dataset Count: 103
Test Dataset Count: 47


In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Train a random forest on the training split and score the test split.
# The seed is fixed (same value as the train/test split) so that the fitted
# forest, and therefore the predictions below, are reproducible across runs.
rf = RandomForestClassifier(featuresCol='features', labelCol='labelIndex', seed=2018)
rfModel = rf.fit(train)

# transform() appends the rawPrediction, probability and prediction columns.
predictions = rfModel.transform(test)
predictions.show(25)

+---+-------------+------------+-------------+------------+---------------+-----------------+----------+--------------------+--------------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|         features|labelIndex|       rawPrediction|         probability|prediction|
+---+-------------+------------+-------------+------------+---------------+-----------------+----------+--------------------+--------------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|    Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  9|          4.4|         2.9|          1.4|         0.2|    Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
| 12|          4.8|         3.4|          1.6|         0.2|    Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
| 13|          4.8|         3.0|          1.4|      

In [0]:
# Compare the true label index with the predicted one for the first ten test rows.
predictions.select(["labelIndex", "prediction"]).show(n=10)

+----------+----------+
|labelIndex|prediction|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 10 rows



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Evaluate the random-forest predictions on the test split.
# metricName is set explicitly: the evaluator's default metric is "f1", so
# without it the value stored and printed as "accuracy" would be an F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 1.0
Test Error = 0.0


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# Build (prediction, label) pairs for the RDD-based MulticlassMetrics API.
# Row order has no effect on a confusion matrix, so no sort is performed, and
# the columns are selected once, in the (prediction, label) order passed on.
preds_and_labels = predictions.select(
    'prediction',
    F.col('labelIndex').cast(FloatType()).alias('labelIndex'),
)

# Convert each Row to a plain tuple and print the confusion matrix as an array.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

[[15.  0.  0.]
 [ 0. 13.  0.]
 [ 0.  0. 19.]]


In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

In [0]:
# Decision-tree estimator over the assembled features, with tree depth capped at 5.
DT = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='labelIndex',
    maxDepth=5,
)

In [0]:
# Fit the decision tree on the training split.
model = DT.fit(train)

In [0]:
# Score the test split with the fitted decision tree.
pre = model.transform(test)

In [0]:
# Show the first five decision-tree predictions.
pre.show(n=5)

+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------+-------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|labelIndex| rawPrediction|  probability|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------+-------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|[35.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|[35.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
| 12|          4.8|         3.4|          1.6|         0.2|Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|[35.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
| 13|          4.8|         3.0|          1.4|         0.1|Iris-setosa|[4.8,3.0,1.4,0.1]|       0.0|[35.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
| 17|        

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
# Logistic-regression estimator over the assembled features, limited to 100 iterations.
Lr = LogisticRegression(
    featuresCol='features',
    labelCol='labelIndex',
    maxIter=100,
)

In [0]:
# Fit the logistic regression on the training split.
model1 = Lr.fit(train)

In [0]:
# Score the test split with the fitted logistic regression.
pred = model1.transform(test)

In [0]:
# Show the first five logistic-regression predictions.
pred.show(n=5)

+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------------+--------------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|labelIndex|       rawPrediction|         probability|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------------+--------------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|[75.7592630558382...|[1.0,2.4407301286...|       0.0|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|[60.7878932582113...|[1.0,9.1115658038...|       0.0|
| 12|          4.8|         3.4|          1.6|         0.2|Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|[72.4706197742107...|[1.0,7.8095854497...|       0.0|
| 13|          4.8|         3.0|          1.4|         0.1|Iris-setosa|[4.8,

In [0]:

from pyspark.ml.classification import NaiveBayes

In [0]:
# Naive Bayes estimator over the assembled features, using the library's default settings.
N = NaiveBayes(labelCol='labelIndex', featuresCol='features')

In [0]:
# Fit the Naive Bayes model on the training split.
modell = N.fit(train)

In [0]:
# Score the test split with the fitted Naive Bayes model.
predi = modell.transform(test)

In [0]:
# Show the first five Naive Bayes predictions.
predi.show(n=5)

+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------------+--------------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|labelIndex|       rawPrediction|         probability|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+----------+--------------------+--------------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|[-11.939019600880...|[0.75274834450338...|       0.0|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|[-10.787228281999...|[0.65150031968770...|       0.0|
| 12|          4.8|         3.4|          1.6|         0.2|Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|[-12.001851209966...|[0.70001683723394...|       0.0|
| 13|          4.8|         3.0|          1.4|         0.1|Iris-setosa|[4.8,