In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
sp=SparkSession.builder.appName('irisapp').getOrCreate()

In [11]:
diris=sp.read.csv('d:\\iris.csv',header=True,inferSchema=True)
diris.show(5)
diris.count()

+-----------+----------+-----------+----------+-------+
|sepallength|sepalwidth|petallength|petalwidth|variety|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| Setosa|
|        4.9|       3.0|        1.4|       0.2| Setosa|
|        4.7|       3.2|        1.3|       0.2| Setosa|
|        4.6|       3.1|        1.5|       0.2| Setosa|
|        5.0|       3.6|        1.4|       0.2| Setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



150

In [12]:
diris.printSchema()
diris.select('variety').distinct().show()

root
 |-- sepallength: double (nullable = true)
 |-- sepalwidth: double (nullable = true)
 |-- petallength: double (nullable = true)
 |-- petalwidth: double (nullable = true)
 |-- variety: string (nullable = true)

+----------+
|   variety|
+----------+
| Virginica|
|    Setosa|
|Versicolor|
+----------+



In [13]:
diris.columns[:4]

['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

In [24]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
va=VectorAssembler(inputCols=diris.columns[:4],outputCol='inputfeat')
df=va.transform(diris)
df.show(5)

+-----------+----------+-----------+----------+-------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|variety|        inputfeat|
+-----------+----------+-----------+----------+-------+-----------------+
|        5.1|       3.5|        1.4|       0.2| Setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2| Setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2| Setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2| Setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2| Setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-------+-----------------+
only showing top 5 rows



In [25]:
ind=StringIndexer(inputCol='variety',outputCol='Sp')
df1=ind.fit(df).transform(df)
df1.show()

+-----------+----------+-----------+----------+-------+-----------------+---+
|sepallength|sepalwidth|petallength|petalwidth|variety|        inputfeat| Sp|
+-----------+----------+-----------+----------+-------+-----------------+---+
|        5.1|       3.5|        1.4|       0.2| Setosa|[5.1,3.5,1.4,0.2]|0.0|
|        4.9|       3.0|        1.4|       0.2| Setosa|[4.9,3.0,1.4,0.2]|0.0|
|        4.7|       3.2|        1.3|       0.2| Setosa|[4.7,3.2,1.3,0.2]|0.0|
|        4.6|       3.1|        1.5|       0.2| Setosa|[4.6,3.1,1.5,0.2]|0.0|
|        5.0|       3.6|        1.4|       0.2| Setosa|[5.0,3.6,1.4,0.2]|0.0|
|        5.4|       3.9|        1.7|       0.4| Setosa|[5.4,3.9,1.7,0.4]|0.0|
|        4.6|       3.4|        1.4|       0.3| Setosa|[4.6,3.4,1.4,0.3]|0.0|
|        5.0|       3.4|        1.5|       0.2| Setosa|[5.0,3.4,1.5,0.2]|0.0|
|        4.4|       2.9|        1.4|       0.2| Setosa|[4.4,2.9,1.4,0.2]|0.0|
|        4.9|       3.1|        1.5|       0.1| Setosa|[4.9,3.1,

In [27]:
finaldata=df1.select('inputfeat','sp')
finaldata.show()

+-----------------+---+
|        inputfeat| sp|
+-----------------+---+
|[5.1,3.5,1.4,0.2]|0.0|
|[4.9,3.0,1.4,0.2]|0.0|
|[4.7,3.2,1.3,0.2]|0.0|
|[4.6,3.1,1.5,0.2]|0.0|
|[5.0,3.6,1.4,0.2]|0.0|
|[5.4,3.9,1.7,0.4]|0.0|
|[4.6,3.4,1.4,0.3]|0.0|
|[5.0,3.4,1.5,0.2]|0.0|
|[4.4,2.9,1.4,0.2]|0.0|
|[4.9,3.1,1.5,0.1]|0.0|
|[5.4,3.7,1.5,0.2]|0.0|
|[4.8,3.4,1.6,0.2]|0.0|
|[4.8,3.0,1.4,0.1]|0.0|
|[4.3,3.0,1.1,0.1]|0.0|
|[5.8,4.0,1.2,0.2]|0.0|
|[5.7,4.4,1.5,0.4]|0.0|
|[5.4,3.9,1.3,0.4]|0.0|
|[5.1,3.5,1.4,0.3]|0.0|
|[5.7,3.8,1.7,0.3]|0.0|
|[5.1,3.8,1.5,0.3]|0.0|
+-----------------+---+
only showing top 20 rows



In [28]:
train,test=finaldata.randomSplit([.70,.30])

In [30]:
from pyspark.ml.classification  import DecisionTreeClassifier
dtcmodel=DecisionTreeClassifier(labelCol='sp',featuresCol='inputfeat')

In [31]:
model=dtcmodel.fit(train)

In [32]:
model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c9b3157c7af0, depth=4, numNodes=13, numClasses=3, numFeatures=4

In [33]:
pre_res=model.transform(test)

In [35]:
pre_res.select('inputfeat','sp','prediction').show()

+-----------------+---+----------+
|        inputfeat| sp|prediction|
+-----------------+---+----------+
|[4.4,3.2,1.3,0.2]|0.0|       0.0|
|[4.6,3.2,1.4,0.2]|0.0|       0.0|
|[4.8,3.0,1.4,0.1]|0.0|       0.0|
|[4.8,3.0,1.4,0.3]|0.0|       0.0|
|[4.8,3.4,1.9,0.2]|0.0|       0.0|
|[5.0,2.3,3.3,1.0]|1.0|       1.0|
|[5.0,3.2,1.2,0.2]|0.0|       0.0|
|[5.0,3.4,1.5,0.2]|0.0|       0.0|
|[5.0,3.6,1.4,0.2]|0.0|       0.0|
|[5.1,3.4,1.5,0.2]|0.0|       0.0|
|[5.1,3.5,1.4,0.3]|0.0|       0.0|
|[5.1,3.8,1.6,0.2]|0.0|       0.0|
|[5.2,3.4,1.4,0.2]|0.0|       0.0|
|[5.2,4.1,1.5,0.1]|0.0|       0.0|
|[5.5,2.4,3.8,1.1]|1.0|       1.0|
|[5.6,2.8,4.9,2.0]|2.0|       2.0|
|[5.7,2.8,4.1,1.3]|1.0|       1.0|
|[5.7,3.8,1.7,0.3]|0.0|       0.0|
|[5.7,4.4,1.5,0.4]|0.0|       0.0|
|[5.8,2.7,5.1,1.9]|2.0|       2.0|
+-----------------+---+----------+
only showing top 20 rows



In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [40]:
ev=MulticlassClassificationEvaluator(labelCol='sp',predictionCol='prediction')
acc=ev.evaluate(pre_res)
print('Accuaracy of model:',acc)
print('Test Error of model:',(1-acc))

Accuaracy of model: 0.9018583042973286
Test Error of model: 0.0981416957026714


In [56]:
#IndexToString converstion
from pyspark.ml.feature import IndexToString
itos=IndexToString(inputCol='Sp',outputCol='va_category')
b=itos.transform(df1)
b.select('Sp','va_category').distinct().show()

+---+-----------+
| Sp|va_category|
+---+-----------+
|1.0| Versicolor|
|0.0|     Setosa|
|2.0|  Virginica|
+---+-----------+



In [63]:
from pyspark.ml.feature import IndexToString
itos=IndexToString(inputCol='Sp',outputCol='va_category')
b=itos.transform(df1)
b.show()

+-----------+----------+-----------+----------+-------+-----------------+---+-----------+
|sepallength|sepalwidth|petallength|petalwidth|variety|        inputfeat| Sp|va_category|
+-----------+----------+-----------+----------+-------+-----------------+---+-----------+
|        5.1|       3.5|        1.4|       0.2| Setosa|[5.1,3.5,1.4,0.2]|0.0|     Setosa|
|        4.9|       3.0|        1.4|       0.2| Setosa|[4.9,3.0,1.4,0.2]|0.0|     Setosa|
|        4.7|       3.2|        1.3|       0.2| Setosa|[4.7,3.2,1.3,0.2]|0.0|     Setosa|
|        4.6|       3.1|        1.5|       0.2| Setosa|[4.6,3.1,1.5,0.2]|0.0|     Setosa|
|        5.0|       3.6|        1.4|       0.2| Setosa|[5.0,3.6,1.4,0.2]|0.0|     Setosa|
|        5.4|       3.9|        1.7|       0.4| Setosa|[5.4,3.9,1.7,0.4]|0.0|     Setosa|
|        4.6|       3.4|        1.4|       0.3| Setosa|[4.6,3.4,1.4,0.3]|0.0|     Setosa|
|        5.0|       3.4|        1.5|       0.2| Setosa|[5.0,3.4,1.5,0.2]|0.0|     Setosa|
|        4

In [62]:
from pyspark.ml.classification import RandomForestClassifier

In [65]:
dt=RandomForestClassifier(labelCol='sp',featuresCol='inputfeat')
model=dt.fit(train)

In [66]:
pre_result=model.transform(test)
pre_result.show()

+-----------------+---+--------------+---------------+----------+
|        inputfeat| sp| rawPrediction|    probability|prediction|
+-----------------+---+--------------+---------------+----------+
|[4.4,3.2,1.3,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.6,3.2,1.4,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.1]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.3]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|0.0|[16.0,4.0,0.0]|  [0.8,0.2,0.0]|       0.0|
|[5.0,2.3,3.3,1.0]|1.0|[0.0,20.0,0.0]|  [0.0,1.0,0.0]|       1.0|
|[5.0,3.2,1.2,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.0,3.4,1.5,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.0,3.6,1.4,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.1,3.4,1.5,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.1,3.5,1.4,0.3]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.1,3.8,1.6,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.2,3.4,

In [67]:
ev=MulticlassClassificationEvaluator(labelCol='sp',predictionCol='prediction')
acc=ev.evaluate(pre_result)
print('Accuaracy of model:',acc)
print('Test Error of model:',(1-acc))

Accuaracy of model: 0.9509291521486644
Test Error of model: 0.04907084785133564
