In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [2]:
# Reuse an already-running context/session when there is one, so this cell can
# be re-run (or run inside an environment that pre-creates Spark) without
# failing on a second SparkContext being constructed.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the iris CSV: the first row is treated as the header and column types
# are inferred from the data.
df = spark.read.csv('iris.csv', header=True, inferSchema=True)

In [4]:
# Show the column names of the loaded DataFrame.
df.columns

['ID', 'petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'species']

In [5]:
# Print the inferred schema (column names, types, nullability).
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [6]:
# Preview the first five rows. The limit is applied in Spark so only those
# rows are collected to the driver, instead of converting the whole table to
# pandas and then discarding all but the head.
df.limit(5).toPandas()

Unnamed: 0,ID,petal_length,petal_width,sepal_length,sepal_width,species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [18]:
# Feature columns are every column except the class label and the row ID.
# list.remove raises ValueError if either column is unexpectedly absent.
cols = list(df.columns)
for excluded in ('species', 'ID'):
    cols.remove(excluded)
cols

['petal_length', 'petal_width', 'sepal_length', 'sepal_width']

In [19]:
# Combine the numeric feature columns into a single vector column, "features".
assembler = VectorAssembler(
    inputCols=cols,
    outputCol="features",
)

In [20]:
# Append the assembled "features" vector column to the data and preview it
# (20 rows, which is also show()'s default).
dfTransform = assembler.transform(df)
dfTransform.select("features").show(n=20)

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows



In [52]:
# Encode the species string as a numeric class index in a new "indexed" column.
stringIndexer = StringIndexer(inputCol="species", outputCol="indexed")

# Bound to its own name rather than `model`: the classifier cell further down
# assigns `model` as well, and sharing the name makes out-of-order re-runs
# call .fit/.transform on the wrong object.
indexerModel = stringIndexer.fit(dfTransform)

dfIndexed = indexerModel.transform(dfTransform)
dfIndexed.show()

+---+------------+-----------+------------+-----------+-----------+-----------------+-------+
| ID|petal_length|petal_width|sepal_length|sepal_width|    species|         features|indexed|
+---+------------+-----------+------------+-----------+-----------+-----------------+-------+
|  1|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|    0.0|
|  2|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|    0.0|
|  3|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|    0.0|
|  4|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|    0.0|
|  5|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|    0.0|
|  6|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|    0.0|
|  7|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|    0.0|
|  8|         5.0|        3.4|         1.5|        0.2|Iris-

In [42]:
# One-hot encode the numeric class index into a sparse vector column "label".
encoder = OneHotEncoder(inputCol="indexed", outputCol="label")

# In Spark 2.x OneHotEncoder is a plain Transformer; from Spark 3.0 it is an
# Estimator that must be fitted before it can transform. Fit when the API
# requires it so the cell runs on either version.
oneHot = encoder.fit(dfIndexed) if hasattr(encoder, "fit") else encoder

dfOneHotLabel = oneHot.transform(dfIndexed)
dfOneHotLabel.show()

+---+------------+-----------+------------+-----------+-----------+-----------------+-------+-------------+
| ID|petal_length|petal_width|sepal_length|sepal_width|    species|         features|indexed|        label|
+---+------------+-----------+------------+-----------+-----------+-----------------+-------+-------------+
|  1|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|    0.0|(2,[0],[1.0])|
|  2|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|    0.0|(2,[0],[1.0])|
|  3|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|    0.0|(2,[0],[1.0])|
|  4|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|    0.0|(2,[0],[1.0])|
|  5|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|    0.0|(2,[0],[1.0])|
|  6|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|    0.0|(2,[0],[1.0])|
|  7|         4.6|        3.

In [53]:
# Logistic-regression estimator (not yet fitted). The label is the numeric
# species index; features are read from the "features" vector column, which is
# also the estimator's default.
model = LogisticRegression(featuresCol="features", labelCol="indexed")

In [55]:
# Fit the estimator on the indexed data; `lr` holds the fitted model.
lr = model.fit(dataset=dfIndexed)

In [57]:
# Score the same DataFrame the model was fitted on (no held-out split here),
# so these predictions reflect training fit only.
pred = lr.transform(dataset=dfIndexed)

In [62]:
# Pull the per-class probability vectors to the driver as a pandas Series.
# Select the column in Spark first so only that column is collected, rather
# than converting every column of `pred` to pandas and discarding the rest.
l = pred.select('probability').toPandas()['probability']

In [63]:
import numpy as np

In [70]:
# Predicted class index per row = position of the largest class probability.
np.asarray(list(l)).argmax(axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])