# Multi-Class with NaiveBayes

General Pipeline:

- Importing Data
- RFormula transformation
- Split into train and test
- Building the model
- Prediction on the test set
- Evaluation

## Importing

In [2]:
import findspark

# findspark.init() puts the local Spark installation's Python bindings on
# sys.path, so it has to run *before* the first pyspark import; calling it
# afterwards cannot help an import that has already succeeded or failed.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new
# one registered under the application name "nb".
spark = SparkSession.builder.appName("nb").getOrCreate()

In [18]:
from pyspark.ml.feature    import RFormula
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Loading Data

In [4]:
# Read the Iris CSV: comma-separated, first row is the header, and column
# types are inferred from the values instead of defaulting to strings.
csvReader = (
    spark.read
    .format("csv")
    .options(sep=",", header=True, inferSchema=True)
)
data = csvReader.load("../../data/iris.csv")

# Peek at the first two rows to confirm the file parsed as expected.
data.show(n=2)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 2 rows



## Data Preparation

RFormula automatically encodes the categorical `class` column as a numeric label column (`target`) and assembles all remaining columns into a single `features` vector.

In [7]:
# 'class ~ .' means: use `class` as the label and every other column as a
# predictor. The encoded label is written to `target` and the assembled
# predictor vector to `features`.
rformula = RFormula(
    formula='class ~ .',
    featuresCol="features",
    labelCol="target"
)

# Make the cell safe to re-run: `data` is overwritten below, so on a second
# execution it would already contain the generated columns and the transform
# would fail because the output columns exist. Dropping columns that are not
# present is a no-op, so the first run is unaffected.
dataRaw = data.drop(rformula.getFeaturesCol(), rformula.getLabelCol())
data = rformula.fit(dataRaw).transform(dataRaw)

In [9]:
# Preview the assembled feature vectors next to their encoded labels.
preparedCols = ["features", "target"]
data.select(preparedCols).show(n=10)

+-----------------+------+
|         features|target|
+-----------------+------+
|[5.1,3.5,1.4,0.2]|   0.0|
|[4.9,3.0,1.4,0.2]|   0.0|
|[4.7,3.2,1.3,0.2]|   0.0|
|[4.6,3.1,1.5,0.2]|   0.0|
|[5.0,3.6,1.4,0.2]|   0.0|
|[5.4,3.9,1.7,0.4]|   0.0|
|[4.6,3.4,1.4,0.3]|   0.0|
|[5.0,3.4,1.5,0.2]|   0.0|
|[4.4,2.9,1.4,0.2]|   0.0|
|[4.9,3.1,1.5,0.1]|   0.0|
+-----------------+------+
only showing top 10 rows



## Split into Train and Test

In [10]:
# Random 70/30 split into train and test; the fixed seed keeps the split
# the same from run to run.
trainWeight, testWeight = 0.7, 0.3
dataTrain, dataTest = data.randomSplit(weights=[trainWeight, testWeight], seed=11)

In [11]:
# Row counts of the (train, test) partitions, in that order.
tuple(part.count() for part in (dataTrain, dataTest))

(106, 44)

## Model Development and Training

In [14]:
# Estimator settings collected in one place: which columns hold the inputs
# and labels, the smoothing parameter, and the Naive Bayes variant to fit.
nbParams = dict(
    featuresCol="features",
    labelCol="target",
    smoothing=1.0,
    modelType="multinomial",
)
nb = NaiveBayes(**nbParams)

# Fit on the training partition only; the test rows stay unseen until scoring.
model = nb.fit(dataTrain)

## Predicting on Test Set

In [16]:
# Score the held-out rows with the fitted model.
predictions = model.transform(dataTest)

# Show the true label beside the model outputs, without truncating the
# probability / raw-score vectors.
outputCols = ["target", "prediction", "probability", "rawPrediction"]
predictions.select(*outputCols).show(truncate=False)

+------+----------+-------------------------------------------------------------+-------------------------------------------------------------+
|target|prediction|probability                                                  |rawPrediction                                                |
+------+----------+-------------------------------------------------------------+-------------------------------------------------------------+
|0.0   |0.0       |[0.6587987467330385,0.21653768475482915,0.1246635685121324]  |[-11.39899773898843,-12.511651239114011,-13.063797178011296] |
|0.0   |0.0       |[0.6887599627973885,0.19882345920313785,0.11241657799947365] |[-11.3137265639653,-12.556202098068914,-13.12640797168702]   |
|0.0   |0.0       |[0.7005249907870801,0.19239017802355332,0.10708483118936661] |[-10.875038987983933,-12.167343541611542,-12.753247693562319]|
|0.0   |0.0       |[0.6405418590167965,0.22709220985084544,0.13236593113235806] |[-11.601408769851181,-12.638367096911905,-13.1781529500

## Model Evaluation

In [20]:
# Compare the predicted class index with the encoded label and report
# accuracy on the test set.
evaluation = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="target",
    metricName="accuracy"
)

# The evaluator is configured for accuracy (not area under the ROC curve),
# so the result is named after the metric it actually holds.
accuracy = evaluation.evaluate(predictions)

print(accuracy)

0.8636363636363636
