In [1]:
# Install Java
!apt-get install openjdk-11-jdk -y > /dev/null

# Download Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xzf spark-3.4.1-bin-hadoop3.tgz

# Install Python dependencies
!pip install -q pyspark


In [2]:
import os
from pyspark.sql import SparkSession

# Point this process at the JDK and at the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

# Put Spark's bin directory on PATH. The membership check keeps the cell
# idempotent: re-running it does not append the same entry again.
_spark_bin = f"{os.environ['SPARK_HOME']}/bin"
if _spark_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += f"{os.pathsep}{_spark_bin}"

# Create the session, or reuse the one already active in this kernel.
# "local[*]" runs Spark inside this process using all available cores.
spark = (
    SparkSession.builder
    .appName("SparkClusteringExample")
    .master("local[*]")
    .getOrCreate()
)


In [3]:
import pandas as pd

# Source CSV for the Wholesale Customers dataset (UCI repository).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"

# Read the CSV over HTTP into pandas first ...
df_pd = pd.read_csv(filepath_or_buffer=url)

# ... then hand the pandas frame to Spark and preview the first five rows.
df_spark = spark.createDataFrame(data=df_pd)
df_spark.show(n=5)


+-------+------+-----+----+-------+------+----------------+----------+
|Channel|Region|Fresh|Milk|Grocery|Frozen|Detergents_Paper|Delicassen|
+-------+------+-----+----+-------+------+----------------+----------+
|      2|     3|12669|9656|   7561|   214|            2674|      1338|
|      2|     3| 7057|9810|   9568|  1762|            3293|      1776|
|      2|     3| 6353|8808|   7684|  2405|            3516|      7844|
|      1|     3|13265|1196|   4221|  6404|             507|      1788|
|      2|     3|22615|5410|   7198|  3915|            1777|      5185|
+-------+------+-----+----+-------+------+----------------+----------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import VectorAssembler

# Build the feature vector from every column except 'Channel' and 'Region'.
# The two columns are excluded by name rather than by position, so the
# selection stays correct even if the column order of the input changes.
excluded_cols = {"Channel", "Region"}
feature_cols = [c for c in df_spark.columns if c not in excluded_cols]

# VectorAssembler packs the selected columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_features = assembler.transform(df_spark)
df_features.select("features").show(5, truncate=False)


+--------------------------------------------+
|features                                    |
+--------------------------------------------+
|[12669.0,9656.0,7561.0,214.0,2674.0,1338.0] |
|[7057.0,9810.0,9568.0,1762.0,3293.0,1776.0] |
|[6353.0,8808.0,7684.0,2405.0,3516.0,7844.0] |
|[13265.0,1196.0,4221.0,6404.0,507.0,1788.0] |
|[22615.0,5410.0,7198.0,3915.0,1777.0,5185.0]|
+--------------------------------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.clustering import KMeans

# Configure k-means: read vectors from "features", look for 3 clusters, and
# fix the seed so the fit is configured identically on every run.
kmeans = KMeans().setFeaturesCol("features").setK(3).setSeed(1)
model = kmeans.fit(df_features)

# Apply the fitted model; transform() appends a "prediction" column holding
# the cluster index assigned to each row.
clusters = model.transform(df_features)
(
    clusters
    .select("features", "prediction")
    .show(10, truncate=False)
)


+---------------------------------------------+----------+
|features                                     |prediction|
+---------------------------------------------+----------+
|[12669.0,9656.0,7561.0,214.0,2674.0,1338.0]  |1         |
|[7057.0,9810.0,9568.0,1762.0,3293.0,1776.0]  |1         |
|[6353.0,8808.0,7684.0,2405.0,3516.0,7844.0]  |1         |
|[13265.0,1196.0,4221.0,6404.0,507.0,1788.0]  |1         |
|[22615.0,5410.0,7198.0,3915.0,1777.0,5185.0] |0         |
|[9413.0,8259.0,5126.0,666.0,1795.0,1451.0]   |1         |
|[12126.0,3199.0,6975.0,480.0,3140.0,545.0]   |1         |
|[7579.0,4956.0,9426.0,1669.0,3321.0,2566.0]  |1         |
|[5963.0,3648.0,6192.0,425.0,1716.0,750.0]    |1         |
|[6006.0,11093.0,18881.0,1159.0,7425.0,2098.0]|1         |
+---------------------------------------------+----------+
only showing top 10 rows



In [6]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluator settings written out explicitly; these are the same values
# ClusteringEvaluator() uses when constructed without arguments.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

# Score the clustered rows and report the result to three decimal places.
silhouette = evaluator.evaluate(dataset=clusters)
print(f"Silhouette Score = {silhouette:.3f}")


Silhouette Score = 0.648


In [7]:
# Fetch the fitted cluster centers from the model.
centers = model.clusterCenters()

# Print a header, then one line per center prefixed with its cluster index.
print("Cluster Centers:")
for idx, centroid in enumerate(centers):
    print(f"Cluster {idx}: {centroid}")


Cluster Centers:
Cluster 0: [32768.01333333  4827.68        5723.14666667  5535.92
  1074.12        2066.64      ]
Cluster 1: [7390.95845697 4439.76854599 6292.1958457  2495.53412463 2238.65281899
 1158.44807122]
Cluster 2: [11849.17857143 24717.10714286 33887.71428571  3409.32142857
 15459.71428571  4483.85714286]
