# Spark-ML Classification

In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler, StringIndexer
from pyspark.ml.classification import NaiveBayes, LogisticRegression, MultilayerPerceptronClassifier, \
                                      DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
# Location of the iris CSV. The IRIS_CSV_PATH environment variable overrides the
# default, so the notebook can run on machines without this exact directory layout.
data_path = os.environ.get('IRIS_CSV_PATH', '/home/lorenzo/spark-repo/0_data/iris.csv')

# The first row holds the column names; let Spark infer the column types.
df = spark.read.csv(data_path, header=True, inferSchema=True)

df.show(10)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



### Vectorized dataframe

In [4]:
# Pack the four numeric measurement columns into a single "features" vector column.
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
va = VectorAssembler(inputCols=feature_cols, outputCol="features")
vect_df = va.transform(df)

In [5]:
# Preview the first 10 rows, including the newly assembled "features" column.
vect_df.show(n=10)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| setosa|[4.4,2.9,1.4,0.2]|
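|         4.9|        3.1|         1.5|        0.1| setosa|[4.9,3.1,1.5,0.1]|
+------------+-----------+------------+-----------+-------+-----------------+
only showing top 10 rows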

### Encode target label to integer value

In [6]:
# Encode the "species" string column as a numeric class index stored in "label".
indexer = StringIndexer(inputCol='species', outputCol="label")
indexer_model = indexer.fit(vect_df)
ind_vect_df = indexer_model.transform(vect_df)

In [7]:
# Preview the first 10 rows, including the numeric "label" column.
ind_vect_df.show(n=10)

+------------+-----------+------------+-----------+-------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|label|
+------------+-----------+------------+-----------+-------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|  2.0|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|  2.0|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|  2.0|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|  2.0|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|  2.0|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|  2.0|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|  2.0|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|  2.0|
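|         4.4|        2.9|         1.4|        0.2| setosa|[4.4,2.9,1.4,0.2]|  2.0|
|         4.9|        3.1|         1.5|        0.1| setosa|[4.9,3.1,1.5,0.1]|  2.0|
+------------+-----------+------------+-----------+-------+-----------------+-----+
only showing top 10 rows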

### Train-test split

In [8]:
# Seeded random split with 0.6 / 0.4 weights; the resulting sizes are only
# approximately proportional to the weights, so print the actual row counts.
split_weights = [0.6, 0.4]
train_df, test_df = ind_vect_df.randomSplit(split_weights, seed=1)
print(f'Train df length: {train_df.count()}')
print(f'Test df length: {test_df.count()}')

Train df length: 92
Test df length: 58


### Naive Bayes

In [9]:
# Fit a multinomial Naive Bayes classifier on the training split and score the test split.
# NOTE(review): the multinomial variant is usually meant for count-like features, while
# these measurements are continuous -- confirm this model type is the intended choice.
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
    modelType='multinomial',
)
nb_model = nb.fit(train_df)
nb_preds = nb_model.transform(test_df)

In [10]:
# Show the true species/label next to the Naive Bayes probabilities and predicted class.
nb_preview = nb_preds.select('species', 'label', 'probability', 'prediction')
nb_preview.show(10)

+----------+-----+--------------------+----------+
|   species|label|         probability|prediction|
+----------+-----+--------------------+----------+
|    setosa|  2.0|[0.29235086158619...|       2.0|
|    setosa|  2.0|[0.22557259321273...|       2.0|
|    setosa|  2.0|[0.20910187293567...|       2.0|
|    setosa|  2.0|[0.22598041891034...|       2.0|
|versicolor|  0.0|[0.53256925394281...|       0.0|
|    setosa|  2.0|[0.21550771742625...|       2.0|
|    setosa|  2.0|[0.20020012489469...|       2.0|
|    setosa|  2.0|[0.23510176487596...|       2.0|
|    setosa|  2.0|[0.23996071368133...|       2.0|
|    setosa|  2.0|[0.16817219327217...|       2.0|
+----------+-----+--------------------+----------+
only showing top 10 rows



In [11]:
# Accuracy of the Naive Bayes predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
nb_acc = evaluator.evaluate(nb_preds)
print(f'Naive Bayes accuracy: {nb_acc}')

Naive Bayes accuracy: 0.5862068965517241


### Logistic Regression

In [12]:
# Fit logistic regression with default hyperparameters, then score the test split.
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(train_df)
lr_preds = lr_model.transform(test_df)

In [13]:
# Show the true species/label next to the logistic regression probabilities and predicted class.
lr_preview = lr_preds.select('species', 'label', 'probability', 'prediction')
lr_preview.show(10)

+----------+-----+--------------------+----------+
|   species|label|         probability|prediction|
+----------+-----+--------------------+----------+
|    setosa|  2.0|       [1.0,0.0,0.0]|       0.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|versicolor|  0.0|       [1.0,0.0,0.0]|       0.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[8.20855181417647...|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
+----------+-----+--------------------+----------+
only showing top 10 rows



In [14]:
# Accuracy of the logistic regression predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
lr_acc = evaluator.evaluate(lr_preds)
print(f'Logistic regression accuracy: {lr_acc}')

Logistic regression accuracy: 0.9310344827586207


### Multilayer Perceptron

In [15]:
# Layer sizes: 4 inputs (one per feature), two hidden layers of 4 units,
# and 3 outputs (one per class). The seed is fixed for reproducibility.
mlp = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    layers=[4, 4, 4, 3],
    seed=875,
)
mlp_model = mlp.fit(train_df)
mlp_preds = mlp_model.transform(test_df)

In [16]:
# Show the true species/label next to the perceptron probabilities and predicted class.
mlp_preview = mlp_preds.select('species', 'label', 'probability', 'prediction')
mlp_preview.show(10)

+----------+-----+--------------------+----------+
|   species|label|         probability|prediction|
+----------+-----+--------------------+----------+
|    setosa|  2.0|[4.75256285651320...|       2.0|
|    setosa|  2.0|[4.75258142890000...|       2.0|
|    setosa|  2.0|[4.75262266491023...|       2.0|
|    setosa|  2.0|[4.75256989739500...|       2.0|
|versicolor|  0.0|[1.0,2.0441260514...|       0.0|
|    setosa|  2.0|[4.75258621898566...|       2.0|
|    setosa|  2.0|[4.75258399298743...|       2.0|
|    setosa|  2.0|[4.75255843096186...|       2.0|
|    setosa|  2.0|[4.75256252147323...|       2.0|
|    setosa|  2.0|[4.75265373795343...|       2.0|
+----------+-----+--------------------+----------+
only showing top 10 rows



In [17]:
# Accuracy of the multilayer perceptron predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
mlp_acc = evaluator.evaluate(mlp_preds)
print(f'Multilayer perceptron accuracy: {mlp_acc}')

Multilayer perceptron accuracy: 0.9655172413793104


### Random Forest

In [18]:
# Fit a 100-tree random forest with a fixed seed, then score the test split.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=100,
    seed=543,
)
rf_model = rf.fit(train_df)
rf_preds = rf_model.transform(test_df)

In [19]:
# Show the true species/label next to the random forest probabilities and predicted class.
rf_preview = rf_preds.select('species', 'label', 'probability', 'prediction')
rf_preview.show(10)

+----------+-----+--------------------+----------+
|   species|label|         probability|prediction|
+----------+-----+--------------------+----------+
|    setosa|  2.0|    [0.01,0.03,0.96]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|     [0.01,0.0,0.99]|       2.0|
|versicolor|  0.0|[0.75907618151222...|       0.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|     [0.02,0.0,0.98]|       2.0|
|    setosa|  2.0|     [0.01,0.0,0.99]|       2.0|
|    setosa|  2.0|       [0.0,0.0,1.0]|       2.0|
+----------+-----+--------------------+----------+
only showing top 10 rows



In [20]:
# Accuracy of the random forest predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
rf_acc = evaluator.evaluate(rf_preds)
print(f'Random forest accuracy: {rf_acc}')

Random forest accuracy: 0.9482758620689655
