In [2]:
# Install PySpark into the kernel's own environment. `%pip` (rather than a bare
# `pip`, which only works through IPython automagic) targets the running kernel,
# and the version pin keeps re-runs on the release this notebook was executed with.
%pip install pyspark==3.5.2


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=48ba56a9d5bceed54320c05a8e8f67a6afbe1060b9cee1b33119f3dc9d091f9f
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


# Building a K-Means clustering model on a Spark cluster with PySpark.

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors

In [7]:
# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

# Read the LIBSVM-formatted sample file into a DataFrame.
# NOTE(review): the path is absolute (looks like a Colab working directory);
# adjust it when running the notebook elsewhere.
dataset = (
    spark.read
    .format("libsvm")
    .load("/content/sample_kmeans_data.txt")
)

# Preview up to 10 rows without truncating the feature vectors.
dataset.show(n=10, truncate=False)

+-----+-------------------------+
|label|features                 |
+-----+-------------------------+
|0.0  |(3,[],[])                |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|
+-----+-------------------------+



In [8]:
# Fit a k-means model with two clusters. The fixed seed makes the
# centroid initialisation repeatable between runs.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

In [9]:
# Make predictions: apply the fitted model to the same rows it was trained on.
# Per the Spark ML docs, the returned DataFrame carries each row's cluster
# index in a `prediction` column by default.
predictions = model.transform(dataset)

In [10]:
# Score the clustering with the Silhouette coefficient. The metric and
# distance measure below are ClusteringEvaluator's defaults, spelled out so
# the printed label is visibly tied to the configuration.
evaluator = ClusteringEvaluator(
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Interpretation: a Silhouette score close to 1 means the clusters are compact
# and well separated. The score is computed on the same dataset the model was
# fitted on, so it describes how well these rows are partitioned rather than
# how the model would behave on unseen data.

Silhouette with squared euclidean distance = 0.9997530305375207


In [11]:
# Display the centroid of every cluster found by the model, one per line
# beneath a heading.
centers = model.clusterCenters()
print("Cluster Centers: ", *centers, sep="\n")


Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]
