# Iris - Logistic Regression
This example applies Logistic Regression in order to predict the type of iris plant based on sepal and petal length and width.

## Import some useful libraries and get Spark Context

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sc= SparkContext()
sqlContext = SQLContext(sc)

## Load the database, convert it to Spark and split data into train and test sets

[Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris)

Attribute Information:
 	
- sepal length in cm
- sepal width in cm
- petal length in cm
- petal width in cm
- class:
    - Iris-Setosa
    - Iris-Versicolour
    - Iris-Virginica

In [2]:
pdf = pd.read_csv('iris.data')
pdf.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
data = sqlContext.createDataFrame(pdf)
pdf.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


## Data exploration

In [3]:
pdf.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
data.printSchema()
data.show(3)

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 3 rows



In [5]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="species", outputCol="label")
indexed = indexer.fit(data).transform(data)
indexed.select(['label']).distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



In [6]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['sepal_length', "sepal_width", "petal_length", "petal_width"], outputCol = 'features')
vData = vectorAssembler.transform(indexed)
vData = vData.select(['features', 'label'])
vData.show(3)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[4.9,3.0,1.4,0.2]|  2.0|
|[4.7,3.2,1.3,0.2]|  2.0|
|[4.6,3.1,1.5,0.2]|  2.0|
+-----------------+-----+
only showing top 3 rows



In [7]:
train, test = vData.randomSplit([0.7, 0.3])

print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 97
Test Dataset Count: 52


## Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

In [9]:
print(lrModel.coefficientMatrix)

DenseMatrix([[ 1.85224054, -2.49059757,  0.3630131 , -0.47410595],
             [ 1.07119956, -6.86851579,  2.01729437,  5.78217843],
             [-2.9234401 ,  9.35911336, -2.38030747, -5.30807248]])


In [10]:
predictions = lrModel.transform(test)
predictions.select('prediction', 'label', 'rawPrediction', 'probability').show(10)

+----------+-----+--------------------+--------------------+
|prediction|label|       rawPrediction|         probability|
+----------+-----+--------------------+--------------------+
|       2.0|  2.0|[0.74074967469340...|[2.46649027073291...|
|       2.0|  2.0|[0.14483551492912...|[4.74599798305744...|
|       2.0|  2.0|[2.20225799972517...|[2.78870341416500...|
|       2.0|  2.0|[2.17472360667128...|[1.80869178562501...|
|       2.0|  2.0|[1.95457337772487...|[6.34904082638008...|
|       2.0|  2.0|[1.56535779453205...|[1.35176035798657...|
|       2.0|  2.0|[1.03093697044323...|[9.60531135365334...|
|       2.0|  2.0|[1.93001244057738...|[5.25775144121538...|
|       2.0|  2.0|[1.41781018603363...|[8.21251882416524...|
|       2.0|  2.0|[0.90858138739628...|[1.63796198167632...|
+----------+-----+--------------------+--------------------+
only showing top 10 rows



In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))
print('Higher the AUC, better the model is at predicting')

Test Area Under ROC 0.9616575375089308
Higher the AUC, better the model is at predicting
