# Classification with Multi Layer Perceptron

General Pipeline:

- Importing Data
- RFormula transformation
- Split into train and test
- Building the model
- Prediction on the test set
- Evaluation

## Importing

In [1]:
import pyspark, findspark
from pyspark.sql import SparkSession

findspark.init()

spark = SparkSession.builder.appName("nb").getOrCreate()

In [13]:
from pyspark.ml.feature    import RFormula, VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Loading Data

In [3]:
data = spark.read.load(
    "../../data/iris.csv",
    format="csv",
    sep=",",
    header = True, 
    inferSchema=True)

data.show(2)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 2 rows



## Data Preparation

In [6]:
asb = VectorAssembler(
    inputCols=["sepallength","sepalwidth","petallength","petalwidth"],
    outputCol="features"
)

data = asb.transform(data)
data.show(2)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 2 rows



In [8]:
ind = StringIndexer(
    inputCol="class",
    outputCol="target"
)

data = ind.fit(data).transform(data)
data.show(2)

+-----------+----------+-----------+----------+-----------+-----------------+------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|target|
+-----------+----------+-----------+----------+-----------+-----------------+------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|   0.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|   0.0|
+-----------+----------+-----------+----------+-----------+-----------------+------+
only showing top 2 rows



## Split into Train and Test

In [9]:
dataTrain, dataTest = data.randomSplit([0.7, 0.3], seed=11)

In [10]:
dataTrain.count(), dataTest.count()

(106, 44)

## Model Development and Training

In [18]:
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="target",
    maxIter=1000,
    layers=[4,5,4,3]
)

model = mlp.fit(dataTrain)

## Predicting on Test Set

In [19]:
predictions = model.transform(dataTest)
predictions.select("target", "prediction", "probability", "rawPrediction").show(truncate=False)

+------+----------+-----------------------------------------------------------------+------------------------------------------------------------+
|target|prediction|probability                                                      |rawPrediction                                               |
+------+----------+-----------------------------------------------------------------+------------------------------------------------------------+
|0.0   |0.0       |[1.0,4.035458900236947E-108,3.4681781007377533E-189]             |[227.37834072831674,-19.90572928986262,-206.56661243449756] |
|0.0   |0.0       |[1.0,4.035180726191833E-108,3.4679658876130403E-189]             |[227.37838413891598,-19.90575481408345,-206.5666302144172]  |
|0.0   |0.0       |[1.0,4.0352818996242034E-108,3.4680430708738136E-189]            |[227.3783683498968,-19.905745530579026,-206.56662374762288] |
|0.0   |0.0       |[1.0,4.035391202268137E-108,3.4681264555344E-189]                |[227.37835129268382,-19.905735501

## Model Evaluation

In [20]:
evaluation = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="target",
    metricName="accuracy"
)

auroc = evaluation.evaluate(predictions)

print(auroc)

0.9318181818181818
