### Simple K-Means Clustering example

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one for this notebook.
spark = (
    SparkSession.builder
    .appName('Clustring')
    .getOrCreate()
)

In [4]:
# Read the sample points (stored in LIBSVM text format) into a DataFrame
# with `label` and `features` columns, then preview the rows.
df = spark.read.load('../datasets/sample_kmeans_data.txt', format='libsvm')
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [5]:
# Clustering is unsupervised, so keep only the feature vectors and leave the label column behind.
final_data = df.select(df['features'])
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [7]:
# Configure K-Means for two clusters; the fixed seed keeps the run repeatable.
kmeans = KMeans(k=2, seed=101)

In [8]:
# Train the clustering model on the feature vectors.
model = kmeans.fit(dataset=final_data)

In [11]:
# Within Set Sum of Squared Errors (WSSSE): total squared distance from each
# point to its assigned cluster center (lower means tighter clusters).
#
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0.
# The training summary reports the same cost for the data the model was fit
# on, so prefer it and only fall back to computeCost() on older releases
# whose summary has no `trainingCost` attribute.
try:
    wssse = model.summary.trainingCost
except AttributeError:
    wssse = model.computeCost(final_data)
print(wssse)

0.11999999999994547


In [12]:
# Cluster centers: one coordinate array per cluster, as learned by the fitted model.
# The bare `centers` expression on the last line makes the notebook display the value.
centers = model.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [15]:
# Results: label every point with a cluster index, which appears in a new
# `prediction` column alongside the original features.
result = model.transform(dataset=final_data)
result.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



#### That's how we build a simple K-Means clustering model using Spark