# Classification using pyspark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install Java8
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz # download spark 3.1.1
!tar xf spark-3.1.1-bin-hadoop2.7.tgz # unzip it
!pip install -q findspark # install findspark
!pip install pyspark==3.1.1 # install pyspark
!ls

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

import findspark
findspark.init()

import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors
from pyspark.ml import feature
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

clustering_dataset.csv	iris.data    spark-3.1.1-bin-hadoop2.7	    spark-3.1.1-bin-hadoop2.7.tgz.1
gas_emissions.csv	sample_data  spark-3.1.1-bin-hadoop2.7.tgz


In [2]:
df = spark.read.csv("/content/iris.data")
df.show()

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|
|5.1|3.5|1.4|0.3|Iris-setosa|
|5.7|3.8|1.7|0.3|Iris-setosa|
|5.1|3.8|1.5|0.3|Iris-setosa|
+---+---+---+---+-----------+
only showing top 20 rows



In [3]:
df = df.select(F.col('_c0').alias('sepal_length'),
               F.col('_c1').alias('sepal_width'),
               F.col('_c2').alias('petal_length'),
               F.col('_c3').alias('petal_width'),
               F.col('_c4').alias('species'),)
df.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [4]:
df.dtypes

[('sepal_length', 'string'),
 ('sepal_width', 'string'),
 ('petal_length', 'string'),
 ('petal_width', 'string'),
 ('species', 'string')]

In [5]:
df2 = df.selectExpr("cast(sepal_length as float)",
                    "cast(sepal_width as float)",
                    "cast(petal_length as float)",
                    "cast(petal_width as float)",
                    "species")
df2.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [6]:
df2.dtypes

[('sepal_length', 'float'),
 ('sepal_width', 'float'),
 ('petal_length', 'float'),
 ('petal_width', 'float'),
 ('species', 'string')]

In [7]:
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [8]:
vector_assembler = feature.VectorAssembler(inputCols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
                                           outputCol = 'features')
df2 = vector_assembler.transform(df2)

In [9]:
indexer = feature.StringIndexer(inputCol = 'species', outputCol = 'label')
df2 = indexer.fit(df2).transform(df2)
df2.show(n = 2, truncate = False)

+------------+-----------+------------+-----------+-----------+-------------------------------------------------------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|species    |features                                                     |label|
+------------+-----------+------------+-----------+-----------+-------------------------------------------------------------+-----+
|5.1         |3.5        |1.4         |0.2        |Iris-setosa|[5.099999904632568,3.5,1.399999976158142,0.20000000298023224]|0.0  |
|4.9         |3.0        |1.4         |0.2        |Iris-setosa|[4.900000095367432,3.0,1.399999976158142,0.20000000298023224]|0.0  |
+------------+-----------+------------+-----------+-----------+-------------------------------------------------------------+-----+
only showing top 2 rows



### Dataset split

In [10]:
splits = df2.randomSplit([0.7, 0.3], seed = 101)
df_tr = splits[0]
df_tst = splits[1]

In [11]:
df_tr.show()

+------------+-----------+------------+-----------+---------------+--------------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|        species|            features|label|
+------------+-----------+------------+-----------+---------------+--------------------+-----+
|         4.3|        3.0|         1.1|        0.1|    Iris-setosa|[4.30000019073486...|  0.0|
|         4.4|        2.9|         1.4|        0.2|    Iris-setosa|[4.40000009536743...|  0.0|
|         4.4|        3.0|         1.3|        0.2|    Iris-setosa|[4.40000009536743...|  0.0|
|         4.4|        3.2|         1.3|        0.2|    Iris-setosa|[4.40000009536743...|  0.0|
|         4.6|        3.2|         1.4|        0.2|    Iris-setosa|[4.59999990463256...|  0.0|
|         4.6|        3.4|         1.4|        0.3|    Iris-setosa|[4.59999990463256...|  0.0|
|         4.6|        3.6|         1.0|        0.2|    Iris-setosa|[4.59999990463256...|  0.0|
|         4.7|        3.2|         1.3|        0.2

In [12]:
df_tr.count()

102

In [13]:
df_tst.show()

+------------+-----------+------------+-----------+---------------+--------------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|        species|            features|label|
+------------+-----------+------------+-----------+---------------+--------------------+-----+
|         4.5|        2.3|         1.3|        0.3|    Iris-setosa|[4.5,2.2999999523...|  0.0|
|         4.6|        3.1|         1.5|        0.2|    Iris-setosa|[4.59999990463256...|  0.0|
|         4.8|        3.4|         1.6|        0.2|    Iris-setosa|[4.80000019073486...|  0.0|
|         4.8|        3.4|         1.9|        0.2|    Iris-setosa|[4.80000019073486...|  0.0|
|         4.9|        3.0|         1.4|        0.2|    Iris-setosa|[4.90000009536743...|  0.0|
|         4.9|        3.1|         1.5|        0.1|    Iris-setosa|[4.90000009536743...|  0.0|
|         5.0|        2.0|         3.5|        1.0|Iris-versicolor|   [5.0,2.0,3.5,1.0]|  1.0|
|         5.0|        2.3|         3.3|        1.0

In [14]:
df_tst.count()

48

### Models

In [15]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
models = ['naive bayes', 'decision tree']
accuracy = []
f1 = []

#### Naive Bayes

In [17]:
nb = NaiveBayes(modelType = 'multinomial')
nb_model = nb.fit(df_tr)
pred = nb_model.transform(df_tst)

In [18]:
pred.columns

['sepal_length',
 'sepal_width',
 'petal_length',
 'petal_width',
 'species',
 'features',
 'label',
 'rawPrediction',
 'probability',
 'prediction']

In [19]:
pred.select('label', 'probability', 'prediction').show(truncate = False)

+-----+--------------------------------------------------------------+----------+
|label|probability                                                   |prediction|
+-----+--------------------------------------------------------------+----------+
|0.0  |[0.5750219584755774,0.25057822528261237,0.1743998162418101]   |0.0       |
|0.0  |[0.6864575093386172,0.19265338304282892,0.12088910761855388]  |0.0       |
|0.0  |[0.7161675808925836,0.1771046575420379,0.10672776156537848]   |0.0       |
|0.0  |[0.6682399101371,0.20500435727907615,0.1267557325838238]      |0.0       |
|0.0  |[0.7030077977232515,0.18390043525156033,0.11309176702518807]  |0.0       |
|0.0  |[0.7280960194734826,0.17001235008514878,0.10189163044136856]  |0.0       |
|1.0  |[0.08110989922547217,0.46331366566630805,0.45557643510821977] |1.0       |
|1.0  |[0.1095242305646629,0.4603095643688824,0.4301662050664548]    |1.0       |
|0.0  |[0.6067242108824067,0.23923972426575532,0.15403606485183807]  |0.0       |
|0.0  |[0.767710

In [20]:
evaluator = MulticlassClassificationEvaluator(metricName = 'accuracy')
accuracy.append(evaluator.evaluate(pred))

In [21]:
evaluator = MulticlassClassificationEvaluator(metricName = 'f1')
f1.append(evaluator.evaluate(pred))

#### Decision Tree

In [22]:
dt = DecisionTreeClassifier()
dt_model = dt.fit(df_tr)
pred = dt_model.transform(df_tst)

In [23]:
pred.show()

+------------+-----------+------------+-----------+---------------+--------------------+-----+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|        species|            features|label| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+---------------+--------------------+-----+--------------+-------------+----------+
|         4.5|        2.3|         1.3|        0.3|    Iris-setosa|[4.5,2.2999999523...|  0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.1|         1.5|        0.2|    Iris-setosa|[4.59999990463256...|  0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.4|         1.6|        0.2|    Iris-setosa|[4.80000019073486...|  0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.4|         1.9|        0.2|    Iris-setosa|[4.80000019073486...|  0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.9|        3.0|         1.4|        0.2|    

In [24]:
evaluator = MulticlassClassificationEvaluator(metricName = 'accuracy')
accuracy.append(evaluator.evaluate(pred))

In [25]:
evaluator = MulticlassClassificationEvaluator(metricName = 'f1')
f1.append(evaluator.evaluate(pred))

### Model selection

In [26]:
models

['naive bayes', 'decision tree']

In [27]:
accuracy

[0.9375, 0.9791666666666666]

In [28]:
f1

[0.9373376623376624, 0.9791121766928219]

In [29]:
df_results = spark.createDataFrame(data = zip(models, accuracy, f1),
                                   schema = ['models', 'accuracy', 'f1'])

In [30]:
df_results.dtypes

[('models', 'string'), ('accuracy', 'double'), ('f1', 'double')]

In [31]:
df_results.printSchema()

root
 |-- models: string (nullable = true)
 |-- accuracy: double (nullable = true)
 |-- f1: double (nullable = true)



In [32]:
df_results.show()

+-------------+------------------+------------------+
|       models|          accuracy|                f1|
+-------------+------------------+------------------+
|  naive bayes|            0.9375|0.9373376623376624|
|decision tree|0.9791666666666666|0.9791121766928219|
+-------------+------------------+------------------+

