# Lab 5 for Big Data programming.
# Apache Spark Machine Learning using Dataframes in Google Colab




# 1.	Setup an Apache Spark instance in Google Colab

In [None]:
# Run once.

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!pip install -q findspark

#Run Once
import os
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"
import findspark
findspark.init()


# 2.	Create a Spark session

In [47]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()
spark


# 3.	Download the Iris dataset and another dataset of your choosing



In [50]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" -O sample_data/iris.data

--2022-03-20 20:19:50--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘sample_data/iris.data’


2022-03-20 20:19:50 (185 MB/s) - ‘sample_data/iris.data’ saved [4551/4551]



# 4.	Import the Iris dataset into a dataframe and insert screenshot of df.show()command output:

In [51]:
#df = spark.read.csv('sample_data/iris.data', header=False, sep=",", inferSchema=True)
df = spark.read.csv('sample_data/iris.data', inferSchema=True)\
.toDF("SepalLength","SepalWidth","PetalLength","PetalWidth","Class")

# 5.	Spark ML can only deal with one features column - so we need to vectorise the multiple columns into one:

In [52]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [63]:
vector_assembler=VectorAssembler(\
inputCols=["SepalLength","SepalWidth","PetalLength","PetalWidth"],\
outputCol="features")
df=vector_assembler.transform(df)
df.show(3)

+-----------+----------+-----------+----------+-----------+----------+-----------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|ClassIndex|         features|
+-----------+----------+-----------+----------+-----------+----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|       0.0|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|       0.0|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|       0.0|[4.7,3.2,1.3,0.2]|
+-----------+----------+-----------+----------+-----------+----------+-----------------+
only showing top 3 rows



In [64]:
df_temp=df.drop("SepalLength","SepalWidth","PetalLength","PetalWidth")
df_temp.show(3)


+-----------+----------+-----------------+
|      Class|ClassIndex|         features|
+-----------+----------+-----------------+
|Iris-setosa|       0.0|[5.1,3.5,1.4,0.2]|
|Iris-setosa|       0.0|[4.9,3.0,1.4,0.2]|
|Iris-setosa|       0.0|[4.7,3.2,1.3,0.2]|
+-----------+----------+-----------------+
only showing top 3 rows



# 6.	The final data preparation step is to index the Class column - to use numeric rather than text values - run the following command and display your output of Class, features & ClassIndex columns:

In [65]:
#from pyspark.ml.feature import StringIndexer
#l_indexer=StringIndexer(inputCol="Class", outputCol="ClassIndex")
#df = l_indexer.fit(df).transform(df)

IllegalArgumentException: ignored

In [67]:
df.show(10)

+-----------+----------+-----------+----------+-----------+----------+-----------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|ClassIndex|         features|
+-----------+----------+-----------+----------+-----------+----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|       0.0|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|       0.0|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|       0.0|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|       0.0|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|       0.0|[5.0,3.6,1.4,0.2]|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|       0.0|[5.4,3.9,1.7,0.4]|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|       0.0|[4.6,3.4,1.4,0.3]|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|       0.0|[5.0,3.4,1.5,0.2]|
|        4.4|       2

# 7.	Split your data into training and test datasets:

In [69]:
(trainingData,testData) = df.randomSplit([0.7,0.3])

# 8.	Decision Tree Classifier 
## Specify the DecisionTreeClassifier and train the model on your training dataset:


In [70]:
from pyspark.ml.classification import DecisionTreeClassifier 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [71]:
dt = DecisionTreeClassifier(labelCol="ClassIndex",featuresCol="features")
model=dt.fit(trainingData)

# 9.	Test your model with your test dataset: 

In [72]:
predictions = model.transform(testData)

In [74]:
predictions.select("prediction","ClassIndex").show(15)

+----------+----------+
|prediction|ClassIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
+----------+----------+
only showing top 15 rows



# 10.	Run an evaluator function to show the accuracy of your model:

In [75]:
evaluator= MulticlassClassificationEvaluator(\
labelCol="ClassIndex", predictionCol="prediction",\
metricName="accuracy")
accuracy=evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Set accuracy = " +str(accuracy))

Test Error = 0.047619
Test Set accuracy = 0.9523809523809523


# 11.	Random Forest Classifier

## Specify the RandomForestClassifier, train the model on your training dataset, predict using your test dataset, and run an evaluator to test accuracy:


In [76]:
from pyspark.ml.classification import RandomForestClassifier 
rf=RandomForestClassifier(labelCol="ClassIndex",\
featuresCol="features",numTrees=10)
model=rf.fit(trainingData)
predictions=model.transform(testData)
predictions.select("prediction","ClassIndex").show(5)

+----------+----------+
|prediction|ClassIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
+----------+----------+
only showing top 5 rows



In [77]:
evaluator= \
MulticlassClassificationEvaluator(labelCol="ClassIndex",\
predictionCol="prediction",metricName="accuracy")
accuracy=evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Set accuracy = " +str(accuracy))

Test Error = 0.0714286
Test Set accuracy = 0.9285714285714286


# 12.	Naive Bayes Classifier
## Specify the NaiveBayes classifier, train the model on your training dataset, predict using your test dataset, and run an evaluator to test accuracy:


In [79]:
from pyspark.ml.classification import NaiveBayes 
nb=NaiveBayes(labelCol="ClassIndex",\
featuresCol="features",smoothing=1.0,\
modelType="multinomial")
model=nb.fit(trainingData)


In [81]:
predictions=model.transform(testData)
predictions.select("Class","ClassIndex",
"probability","prediction").show(5)

+---------------+----------+--------------------+----------+
|          Class|ClassIndex|         probability|prediction|
+---------------+----------+--------------------+----------+
|    Iris-setosa|       0.0|[0.67469033867968...|       0.0|
|    Iris-setosa|       0.0|[0.72261167673864...|       0.0|
|    Iris-setosa|       0.0|[0.65606788034250...|       0.0|
|    Iris-setosa|       0.0|[0.71533253396407...|       0.0|
|Iris-versicolor|       1.0|[0.11076393934278...|       1.0|
+---------------+----------+--------------------+----------+
only showing top 5 rows



In [82]:
evaluator= \
MulticlassClassificationEvaluator(labelCol="ClassIndex",\
predictionCol="prediction",metricName="accuracy")
accuracy=evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Set accuracy = " +str(accuracy))

Test Error = 0.0714286
Test Set accuracy = 0.9285714285714286
