## K-Means Algorithm

In [70]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.feature import StringIndexer

#### Read the iris dataset

In [None]:
def mapLibSVM(row):
    """Convert one indexed iris row into a ``(label, features)`` pair.

    Args:
        row: An indexable row. Assumed layout (TODO confirm against the CSV
            header): four numeric measurements at indices 0-3, the string
            label at index 4, and the numeric label index at index 5.

    Returns:
        A tuple of ``row[5]`` and a dense vector built from ``row[:4]``.
    """
    # Use all four measurements; the earlier slice row[:3] dropped the
    # column at index 3 from the feature vector.
    return (row[5], Vectors.dense(row[:4]))

In [None]:
# Load the iris CSV file, treating the first line as a header and letting
# Spark infer the column types.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("datasets/iris.data")
)

In [None]:
# Print a preview of the loaded DataFrame to check the parsed columns.
df.show()

In [None]:
# Encode the string "label" column as a numeric "labelIndex" column.
label_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
indexer_model = label_indexer.fit(df)
# NOTE: `indexer` ends up bound to the transformed DataFrame, not the estimator.
indexer = indexer_model.transform(df)
indexer.show()

# Rebuild df as a two-column ("label", "features") DataFrame by running
# mapLibSVM over every row.
mapped_rows = indexer.rdd.map(mapLibSVM)
df = mapped_rows.toDF(["label", "features"])
df.show()

In [None]:
# Configure the k-means estimator: three clusters, with a fixed seed so the
# initialization is repeatable.
kmeans = KMeans(k=3, seed=3)


In [None]:
# Fit the configured k-means estimator on df to obtain the trained model.
model = kmeans.fit(df)

In [None]:
# Evaluate clustering by computing Within Set Sum of Squared Errors.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0.
# When it is unavailable, read the cost from the training summary instead;
# that is the same quantity here because df is the data the model was
# fitted on.
if hasattr(model, "computeCost"):
    wssse = model.computeCost(df)
else:
    wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

In [None]:
# Fetch the fitted cluster centers and print them one per line under a
# heading.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

In [None]:
# Apply the fitted model to df and keep the resulting DataFrame.
result = model.transform(df)

In [None]:
# Keep only the assigned cluster and the label so the two can be compared
# side by side.
predictions = result.select("prediction", "label")
predictions.show()