In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import findspark
from pyspark.mllib.evaluation import MulticlassMetrics
import numpy as np
import pandas as pd
import shap

In [2]:
findspark.init()

In [3]:
spark = SparkSession.builder.appName("ClassificationMultiModels").getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv(r"C:\Users\Robyi\Documents\Data Science Dataset\iris.csv", header = True, inferSchema = True)

In [6]:
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [7]:
df = df.dropDuplicates()

In [8]:
df = df.dropna(how="any")

In [9]:
assembler = VectorAssembler(inputCols=[
    "sepal_length", "sepal_width", "sepal_length", "petal_width"], 
                            outputCol="features")
df = assembler.transform(df)

In [10]:
Standarization = StandardScaler(inputCol="features", outputCol="features_scaled")
df = Standarization.fit(df).transform(df)

In [11]:
Labeler = StringIndexer(inputCol="species", outputCol="target")
df = Labeler.fit(df).transform(df)

In [12]:
df.show()

+------------+-----------+------------+-----------+----------+-----------------+--------------------+------+
|sepal_length|sepal_width|petal_length|petal_width|   species|         features|     features_scaled|target|
+------------+-----------+------------+-----------+----------+-----------------+--------------------+------+
|         5.1|        3.5|         1.4|        0.3|    setosa|[5.1,3.5,5.1,0.3]|[6.15124937481215...|   2.0|
|         5.0|        3.4|         1.6|        0.4|    setosa|[5.0,3.4,5.0,0.4]|[6.03063664197270...|   2.0|
|         4.4|        3.2|         1.3|        0.2|    setosa|[4.4,3.2,4.4,0.2]|[5.30696024493597...|   2.0|
|         4.8|        3.4|         1.6|        0.2|    setosa|[4.8,3.4,4.8,0.2]|[5.78941117629379...|   2.0|
|         5.0|        3.3|         1.4|        0.2|    setosa|[5.0,3.3,5.0,0.2]|[6.03063664197270...|   2.0|
|         6.3|        2.5|         5.0|        1.9| virginica|[6.3,2.5,6.3,1.9]|[7.59860216888560...|   1.0|
|         4.5|     

In [13]:
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [14]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", metricName="accuracy")

In [None]:
mlp = MultilayerPerceptronClassifier(featuresCol="features_scaled", labelCol="target")

paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, [[4, 5, 3], [4, 10,3]]) \
    .addGrid(mlp.stepSize, [0.01, 0.1]) \
    .build()
# layer di akhir 3 karena kelas targetnya 3
# layer di awal 4 karena adaa 4 fitur
# dan hidden layer 5 dan 10

crossval = CrossValidator(estimator=mlp, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
mlp_model = crossval.fit(train)

In [18]:
best_model = mlp_model.bestModel
predictions = best_model.transform(test)

In [19]:
predictions.select("features_scaled", "target", "prediction", "probability").show(truncate=False)

+---------------------------------------------------------------------------+------+----------+-----------------------------------------------------------------+
|features_scaled                                                            |target|prediction|probability                                                      |
+---------------------------------------------------------------------------+------+----------+-----------------------------------------------------------------+
|[5.306960244935977,6.864851782803006,5.306960244935977,0.26389603791390986]|2.0   |2.0       |[1.2470917410435777E-5,2.6814929004228116E-17,0.9999875290825896]|
|[5.548185710614884,7.322508568323207,5.548185710614884,0.26389603791390986]|2.0   |2.0       |[6.0718926317053424E-6,6.767045148240526E-18,0.9999939281073682] |
|[5.548185710614884,8.237822139363608,5.548185710614884,0.26389603791390986]|2.0   |2.0       |[8.790392276017615E-7,1.677816090346063E-19,0.9999991209607724]  |
|[5.789411176293792,7.093680

In [21]:
predictionAndLabels = predictions.select("prediction", "target").rdd.map(tuple)

In [23]:
predictionAndLabels = predictions.select(
    col("prediction").cast("double"),
    col("target").cast("double")
).rdd.map(tuple)

In [24]:
metrics = MulticlassMetrics(predictionAndLabels)



In [25]:
accuracy = evaluator.evaluate(predictions)
print(f"Multi-Class Accuracy: {accuracy:.4f}")

Multi-Class Accuracy: 0.9583


In [26]:
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

Confusion Matrix:
[[ 6.  0.  0.]
 [ 1.  3.  0.]
 [ 0.  0. 14.]]


In [27]:
metrics = ["accuracy", "weightedPrecision", "weightedRecall", "f1", "weightedFMeasure"]

for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(
        labelCol="target", 
        predictionCol="prediction", 
        metricName=metric
    )
    score = evaluator.evaluate(predictions)
    print(f"{metric}: {score:.4f}")

accuracy: 0.9583
weightedPrecision: 0.9643
weightedRecall: 0.9583
f1: 0.9570
weightedFMeasure: 0.9570


In [28]:
predictions.crosstab("target", "prediction").show()

+-----------------+---+---+---+
|target_prediction|0.0|1.0|2.0|
+-----------------+---+---+---+
|              1.0|  1|  3|  0|
|              0.0|  6|  0|  0|
|              2.0|  0|  0| 14|
+-----------------+---+---+---+

