In [1]:
sc.version # version of spark

'2.4.8'

In [2]:
!python --version 

Python 3.7.4


# Q4) Apply SVM on IRIS dataset uploaded over hadoop cluster and print the confusion matrix.

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create a Spark session
spark = SparkSession.builder.appName("SVM for Iris Classification").getOrCreate()

# Read the IRIS dataset
path = "gs://iriscsvfiles/iris.csv"
df = spark.read.option("inferSchema", "true").option("header", "true").csv(path)

In [6]:
df.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [7]:
# Drop the first column
df = df.drop("Id")

# Convert the target column to numeric labels
indexer = StringIndexer(inputCol="Species", outputCol="label")
df = indexer.fit(df).transform(df)

# Assemble input features
assembler = VectorAssembler(inputCols=df.columns[1:4], outputCol="features")
df = assembler.transform(df)

# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Train the One-vs-Rest (OvR) SVM model
svm = LinearSVC(maxIter=10, regParam=0.1)
ovr = OneVsRest(classifier=svm, labelCol="label", featuresCol="features")
model = ovr.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Cast the "label" column to Double type
#predictions = predictions.withColumn("label", col("label").cast("double"))

In [10]:
# Evaluate the model using a confusion matrix
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6388888888888888


In [11]:
# Print the confusion matrix
predictions.select("label", "prediction").groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       2.0|   12|
|  1.0|       2.0|   13|
|  0.0|       0.0|   11|
+-----+----------+-----+

