## Import libraries

In [18]:
from sklearn.datasets import load_digits
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.torch.distributor import TorchDistributor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Create Spark Session

In [19]:
def create_spark():
    """Return a local SparkSession set up for small, test-sized workloads.

    The session runs on ``local[*]`` under the application name
    ``TestSuite``. Every entry of ``settings`` below is applied to the
    builder, in order, before ``getOrCreate()`` is called.

    Returns:
        The SparkSession object produced by ``getOrCreate()``.
    """
    settings = {
        'spark.sql.shuffle.partitions': '4',
        'spark.default.parallelism': '4',
        'spark.sql.session.timeZone': 'UTC',
        'spark.ui.enabled': 'false',
        'spark.app.id': 'Test',
        'spark.driver.host': 'localhost',
    }

    builder = SparkSession.builder.master("local[*]").appName("TestSuite")
    for option, setting in settings.items():
        builder = builder.config(key=option, value=setting)

    return builder.getOrCreate()

In [20]:
# Session handle used by every Spark call in the rest of the notebook.
spark = create_spark()

## Load Data

In [29]:
# scikit-learn's bundled digits dataset; the cells below read its
# `.data`, `.feature_names` and `.target` attributes.
digits = load_digits()

In [30]:
# One row per sample: the named feature columns first, then the label
# appended as the last column, `target`.
digits_pandas_df = (
    pd.DataFrame(data=digits.data, columns=digits.feature_names)
    .assign(target=digits.target)
)

In [31]:
# Peek at the first row to check the feature columns and the appended `target`.
digits_pandas_df.head(1)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0


In [32]:
# Hand the pandas frame over to Spark; column names carry over unchanged.
digits_spark_df = spark.createDataFrame(digits_pandas_df)

In [33]:
# Pick the input columns by name instead of by position. The original
# `columns[:-1]` slice only worked on the first execution: once this cell has
# run, the frame ends with the assembled `features` column, so the slice would
# pull the `target` label into the inputs and the assembler would refuse to
# overwrite the existing output column.
features = [
    column
    for column in digits_spark_df.columns
    if column not in ('target', 'features')
]
assembler = VectorAssembler(inputCols=features, outputCol='features')

# Dropping a column that is absent is a no-op, so this is safe on the first
# run and discards the stale vector column on any re-run.
digits_spark_df = assembler.transform(digits_spark_df.drop('features'))

In [34]:
# 80/20 random split with weights [0.8, 0.2]; the fixed seed keeps the split repeatable.
train, test = digits_spark_df.randomSplit([0.8, 0.2], seed=42)

## Build Model

In [None]:
# Layer sizes from input to output: one input unit per assembled feature
# column, hidden layers of 64 and 32 units, and 10 output units.
layers = [len(features), 64, 32, 10]

# Hyperparameters gathered in one place so they are easy to scan and tweak.
mlp_options = {
    'featuresCol': 'features',
    'labelCol': 'target',
    'maxIter': 100,
    'layers': layers,
    'blockSize': 128,
    'seed': 42,
}
classifier = MultilayerPerceptronClassifier(**mlp_options)

# Fit on the training split only, then score the held-out split.
model = classifier.fit(train)
predicted = model.transform(test)

## Evaluate Model

In [None]:
# Display name -> Spark metric name for each score reported below.
metric_names = {
    "accuracy": "accuracy",
    "precision": "weightedPrecision",
    "recall": "weightedRecall",
    "f1": "f1",
}

# Every evaluator is pointed at this notebook's label and prediction columns
# up front, so the reporting loop only has to evaluate and print.
metrics = {
    label: MulticlassClassificationEvaluator(
        labelCol="target",
        predictionCol="prediction",
        metricName=spark_metric,
    )
    for label, spark_metric in metric_names.items()
}

for name, evaluator in metrics.items():
    print(name, evaluator.evaluate(predicted))

# Recorded results from an earlier run:
# accuracy 0.9442815249266863
# precision 0.9456355927351696
# recall 0.9442815249266864
# f1 0.9440330464833463

In [None]:
# Confusion matrix: one row per true label, one column per predicted class.
# The pivot values are passed explicitly so that every class gets a column
# even when the model never predicts it (otherwise the matrix silently loses
# columns), and so Spark does not need an extra job to discover the distinct
# prediction values. Predictions are doubles, hence the float conversion.
class_labels = [float(label) for label in digits.target_names]

conf_matrix = (
    predicted
    .groupBy('target')
    .pivot('prediction', class_labels)
    .count()
    .fillna(0)  # label/prediction pairs that never occur come back as null
    .orderBy('target')
)

In [None]:
# Print the whole matrix; truncate=False keeps cell contents from being cut off.
conf_matrix.show(truncate=False)

In [None]:
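# Recorded output of conf_matrix.show(truncate=False) from an earlier run
# (rows: true `target` label, columns: predicted class):
# +------+---+---+---+---+---+---+---+---+---+---+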
# |target|0.0|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|
# +------+---+---+---+---+---+---+---+---+---+---+
# |0     |29 |0  |0  |0  |0  |0  |1  |0  |0  |0  |
# |1     |0  |29 |0  |1  |0  |0  |1  |0  |3  |0  |
# |2     |0  |0  |33 |0  |0  |0  |0  |0  |0  |0  |
# |3     |0  |0  |0  |35 |0  |2  |0  |0  |0  |3  |
# |4     |0  |0  |0  |0  |32 |0  |0  |1  |0  |0  |
# |5     |2  |0  |0  |0  |0  |33 |0  |0  |0  |0  |
# |6     |0  |0  |0  |0  |0  |0  |32 |0  |0  |0  |
# |7     |0  |0  |0  |0  |0  |0  |0  |35 |0  |0  |
# |8     |0  |1  |0  |0  |1  |0  |0  |0  |30 |1  |
# |9     |0  |0  |0  |0  |0  |0  |0  |0  |2  |34 |
# +------+---+---+---+---+---+---+---+---+---+---+