# Spark LogisticRegression Demo

Run the Colab demo: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EVPji_9wCPQizaNF-N9fSfyRoI9UpgH6?usp=sharing)

## Setup Spark in Colab

In [1]:
# Install Java, Spark 3.3.2 and py4j
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark


In [2]:
# Export the JDK and Spark install locations through the process environment
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.2-bin-hadoop3",
    }
)


In [3]:
# Start (or reuse) the Spark session used by the rest of the notebook
import findspark

# Keep this call ahead of the pyspark import below; findspark is presumably
# what makes the unpacked Spark's pyspark importable — confirm in findspark docs
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Colab Spark MLlib Setup")
    .getOrCreate()
)

print("Spark Session started successfully!")


Spark Session started successfully!



# Load the Iris Dataset

In [4]:
# Download Iris dataset
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O iris.csv


In [5]:
# Read the header-less CSV with inferred column types, then name the columns
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "label"]
df = (
    spark.read
    .csv("iris.csv", inferSchema=True, header=False)
    .toDF(*columns)
)
df.show(5)


+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



# Prepare Data for MLlib


In [6]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Encode the string `label` column as a numeric `label_index` column
indexer = StringIndexer(inputCol="label", outputCol="label_index")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

# Pack the four measurement columns into a single `features` vector column
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features",
)
df = assembler.transform(df)

# Keep only the two columns the model needs
df = df.select("features", "label_index")
df.show(5)


+-----------------+-----------+
|         features|label_index|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|        0.0|
|[4.9,3.0,1.4,0.2]|        0.0|
|[4.7,3.2,1.3,0.2]|        0.0|
|[4.6,3.1,1.5,0.2]|        0.0|
|[5.0,3.6,1.4,0.2]|        0.0|
+-----------------+-----------+
only showing top 5 rows



# Train & Test Split + Train Model

In [7]:
from pyspark.ml.classification import LogisticRegression

# 70/30 train/test split with a fixed seed
train_data, test_data = df.randomSplit(weights=[0.7, 0.3], seed=123)

# Fit a logistic regression on the training portion
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label_index",
    maxIter=10,
)
model = lr.fit(train_data)


# Make Predictions & Evaluate

In [8]:
# Score the held-out rows, preview a few, then report summary metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def _build_evaluator(metric_name):
    """Return an evaluator that compares `label_index` with `prediction` using `metric_name`."""
    return MulticlassClassificationEvaluator(
        labelCol="label_index",
        predictionCol="prediction",
        metricName=metric_name,
    )


# Apply the fitted model to the test split and show the first rows
predictions = model.transform(test_data)
predictions.select("features", "label_index", "prediction", "probability").show(5)

# One evaluator per reported metric
accuracy_evaluator = _build_evaluator("accuracy")
precision_evaluator = _build_evaluator("weightedPrecision")
recall_evaluator = _build_evaluator("weightedRecall")
f1_evaluator = _build_evaluator("f1")

# Compute each metric on the predictions
accuracy = accuracy_evaluator.evaluate(predictions)
precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
f1_score = f1_evaluator.evaluate(predictions)

# Report, rounded to two decimals
print("\n Evaluation Metrics:")
print(f" Accuracy  : {accuracy:.2f}")
print(f" Precision : {precision:.2f}")
print(f" Recall    : {recall:.2f}")
print(f" F1 Score  : {f1_score:.2f}")


+-----------------+-----------+----------+--------------------+
|         features|label_index|prediction|         probability|
+-----------------+-----------+----------+--------------------+
|[4.4,3.0,1.3,0.2]|        0.0|       0.0|[0.99997707075701...|
|[4.6,3.2,1.4,0.2]|        0.0|       0.0|[0.99997157586396...|
|[4.7,3.2,1.3,0.2]|        0.0|       0.0|[0.99995112558622...|
|[4.8,3.0,1.4,0.3]|        0.0|       0.0|[0.99913046469204...|
|[4.8,3.1,1.6,0.2]|        0.0|       0.0|[0.99959022777438...|
+-----------------+-----------+----------+--------------------+
only showing top 5 rows


 Evaluation Metrics:
 Accuracy  : 0.95
 Precision : 0.96
 Recall    : 0.95
 F1 Score  : 0.95
