In [1]:
# Install Java
!apt-get install openjdk-11-jdk -y > /dev/null

# Download Spark (correct URL & version)
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz

# Extract Spark
!tar -xzf spark-3.4.1-bin-hadoop3.tgz

# Install Python packages
!pip install -q pyspark findspark


In [2]:
import os
from pyspark.sql import SparkSession

# Tell PySpark where to find the JDK and the Spark distribution.
# NOTE(review): both paths are hard-coded for a Colab-style layout
# (/usr/lib/jvm/..., /content/...) — adjust when running elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

# Put Spark's bin directory on PATH exactly once. The guard keeps the cell
# idempotent: without it every re-run would append another copy to PATH.
_spark_bin = f"{os.environ['SPARK_HOME']}/bin"
if _spark_bin not in os.environ["PATH"].split(":"):
    os.environ["PATH"] += f":{_spark_bin}"

# Create the SparkSession directly (findspark is not needed for this).
# local[*] runs Spark in-process using all available cores; getOrCreate()
# returns the already-running session when the cell is executed again.
spark = SparkSession.builder \
    .appName("WorkingSparkInColab") \
    .master("local[*]") \
    .getOrCreate()

# Bare last expression: display the session summary as the cell output
spark


In [3]:
import pandas as pd

# Column names for the Iris file: four numeric measurements plus the species label
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']

# Fetch the raw CSV from the UCI repository into pandas; the names are supplied
# here, so the first line of the file is read as data rather than as a header
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_pd = pd.read_csv(url, header=None, names=columns)

# Hand the pandas frame over to Spark and preview the first five rows
iris_spark = spark.createDataFrame(iris_pd)
iris_spark.show(n=5)


+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StringIndexer

# Learn a string -> numeric index mapping for the species column, then
# append the result as a new "label_index" column
indexer = StringIndexer(inputCol="label", outputCol="label_index")
indexer_model = indexer.fit(iris_spark)
iris_indexed = indexer_model.transform(iris_spark)

# Display each species next to the index it was assigned
label_mapping = iris_indexed.select("label", "label_index").distinct()
label_mapping.show()


+---------------+-----------+
|          label|label_index|
+---------------+-----------+
|    Iris-setosa|        0.0|
|Iris-versicolor|        1.0|
| Iris-virginica|        2.0|
+---------------+-----------+



In [5]:
from pyspark.ml.feature import VectorAssembler

# The four measurement columns that are packed into one vector column
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Add the assembled "features" column alongside the existing columns
iris_features = assembler.transform(iris_indexed)

# Preview the vector/label pairs without truncating the vector text
feature_preview = iris_features.select("features", "label_index")
feature_preview.show(5, truncate=False)


+-----------------+-----------+
|features         |label_index|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|0.0        |
|[4.9,3.0,1.4,0.2]|0.0        |
|[4.7,3.2,1.3,0.2]|0.0        |
|[4.6,3.1,1.5,0.2]|0.0        |
|[5.0,3.6,1.4,0.2]|0.0        |
+-----------------+-----------+
only showing top 5 rows



In [6]:
from pyspark.ml.classification import LogisticRegression

# Seeded 80/20 random split into training and test DataFrames
splits = iris_features.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

# Logistic regression on the assembled features, capped at 10 iterations
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label_index',
    maxIter=10,
)
lr_model = lr.fit(train_data)


In [7]:
# Apply the fitted model to the test split; transform() adds the model's
# output columns, including "prediction"
predictions = lr_model.transform(test_data)

# Show the first ten rows: input vector, true label index, predicted index
preview_cols = ["features", "label_index", "prediction"]
predictions.select(*preview_cols).show(10)


+-----------------+-----------+----------+
|         features|label_index|prediction|
+-----------------+-----------+----------+
|[4.4,3.0,1.3,0.2]|        0.0|       0.0|
|[4.6,3.2,1.4,0.2]|        0.0|       0.0|
|[4.6,3.6,1.0,0.2]|        0.0|       0.0|
|[4.8,3.1,1.6,0.2]|        0.0|       0.0|
|[4.9,3.1,1.5,0.1]|        0.0|       0.0|
|[5.0,3.2,1.2,0.2]|        0.0|       0.0|
|[5.0,3.6,1.4,0.2]|        0.0|       0.0|
|[5.1,3.8,1.5,0.3]|        0.0|       0.0|
|[5.4,3.7,1.5,0.2]|        0.0|       0.0|
|[5.4,3.9,1.3,0.4]|        0.0|       0.0|
+-----------------+-----------+----------+
only showing top 10 rows



In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing the predicted index against the true label index,
# configured to report the accuracy metric
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_index",
    predictionCol="prediction",
)

# Score the test-set predictions and report the result to two decimals
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.2f}")


Test Accuracy = 1.00
