In [186]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, DoubleType, StringType, IntegerType
import math
from operator import add
import numpy as np

In [127]:
# Start (or reuse) the Spark session that backs every DataFrame/RDD in this notebook.
spark = SparkSession.builder.appName("KMeansWithMapReduce").getOrCreate()

# Explicit schema: four numeric measurements plus the species label, all nullable.
schema = (
    StructType()
    .add("sepal_length", DoubleType(), True)
    .add("sepal_width", DoubleType(), True)
    .add("petal_length", DoubleType(), True)
    .add("petal_width", DoubleType(), True)
    .add("species", StringType(), True)
)

df = spark.read.format("csv").option("header", True).schema(schema).load("./data/IRIS.csv")
# na.drop() returns a NEW DataFrame; the result has to be assigned, otherwise rows with
# nulls stay in `df` and later reach the distance arithmetic as None values.
df = df.na.drop()
# Keep the DataFrame as the last expression so the cell still displays its column summary.
df

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]

In [128]:
# Print a preview of the loaded DataFrame to confirm the CSV parsed against the schema.
df.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [129]:
# Number of clusters to build.
k = 3
# Distance bound used when comparing an old centroid with its recomputed position.
threshold = 0.4

# Draw k rows without replacement (fixed seed) to serve as the starting centroids.
sampled_rows = df.rdd.takeSample(False, k, seed=0)
# Centroid structure: (index, [centroid features]) e.g., (1, [2.0, 3.0, 5.0, 0.2])
centroids = [
    (position, [row.sepal_length, row.sepal_width, row.petal_length, row.petal_width])
    for position, row in enumerate(sampled_rows)
]

In [130]:
# Show the centroid structure.
# Iterate over the list itself instead of range(3) so the cell keeps working when k is
# changed, and use a real loop name: `_` is the name IPython uses for the previous output.
for centroid in centroids:
    print(centroid)

(0, [5.8, 2.7, 5.1, 1.9])
(1, [6.0, 3.0, 4.8, 1.8])
(2, [7.4, 2.8, 6.1, 1.9])


In [131]:
def _row_to_point(row):
    """Convert one DataFrame row into ([features], 1); the trailing 1 is a per-point count."""
    features = [row.sepal_length, row.sepal_width, row.petal_length, row.petal_width]
    return (features, 1)


# Work on the row-level RDD behind the DataFrame.
points = df.rdd
# Point structure: ([point features], count) e.g., ([2.0, 5.0, 2.5, 0.6], 1)
points_rdd = points.map(_row_to_point)
# cache() returns the RDD itself, so it stays the value displayed by this cell.
points_rdd.cache()

PythonRDD[119] at RDD at PythonRDD.scala:53

In [132]:
# Peek at the first three mapped points to confirm the ([features], 1) structure.
points_rdd.take(3)

[([5.1, 3.5, 1.4, 0.2], 1),
 ([4.9, 3.0, 1.4, 0.2], 1),
 ([4.7, 3.2, 1.3, 0.2], 1)]

In [133]:
""" calculate distance """
def calculateDistance(point, centroid):
    """Return the Euclidean distance between two feature vectors.

    Args:
        point: sequence of numeric coordinates.
        centroid: sequence of numeric coordinates with the same length as ``point``.

    Returns:
        float: square root of the summed squared coordinate differences.

    Raises:
        ValueError: if the two vectors do not have the same number of coordinates.
    """
    # Fail loudly on a dimension mismatch instead of silently ignoring extra
    # centroid coordinates or raising an unexplained IndexError mid-loop.
    if len(point) != len(centroid):
        raise ValueError(
            "point and centroid must have the same length, got %d and %d"
            % (len(point), len(centroid))
        )
    squared_sum = sum((p - c) ** 2 for p, c in zip(point, centroid))
    return math.sqrt(squared_sum)

In [134]:
""" belongs to Centroid """
def belongCluster(point, centroids):
    """Return the index of the centroid nearest to ``point``.

    Args:
        point: feature vector of the point to classify.
        centroids: iterable of (index, [features]) entries.

    Returns:
        The index stored in the nearest entry. Ties keep the earliest entry, and 0 is
        returned when no entry has a distance below +inf (e.g. ``centroids`` is empty).
    """
    best_index, best_distance = 0, float("+inf")
    for entry in centroids:
        candidate = calculateDistance(point, entry[1])
        # Only a strictly smaller distance replaces the current best.
        if not candidate < best_distance:
            continue
        best_distance = candidate
        best_index = entry[0]
    return best_index

In [135]:
""" Reduce all points in each centroid """
def accumulatedCluster(p1, p2):
    """Merge two partial cluster aggregates.

    Args:
        p1: ([feature sums], count) aggregate.
        p2: ([feature sums], count) aggregate.

    Returns:
        tuple: (element-wise sum of both feature lists, sum of both counts).
    """
    left_features, left_count = p1[0], p1[1]
    right_features, right_count = p2[0], p2[1]
    merged_features = [a + b for a, b in zip(left_features, right_features)]
    return (merged_features, left_count + right_count)

In [136]:
# Map phase: pair every point with the index of its nearest centroid,
# producing (centroid_index, ([features], 1)) records.
# NOTE(review): the lambda reads the global `centroids`; jobs launched after that name is
# rebound may see the newer value — confirm this is the intended behaviour.
pointMapCentroid_rdd = points_rdd.map(
    lambda point: (belongCluster(point[0], centroids), point)
)

In [137]:
# Inspect five (centroid_index, point) pairs produced by the map phase.
pointMapCentroid_rdd.take(5)

[(1, ([5.1, 3.5, 1.4, 0.2], 1)),
 (1, ([4.9, 3.0, 1.4, 0.2], 1)),
 (1, ([4.7, 3.2, 1.3, 0.2], 1)),
 (1, ([4.6, 3.1, 1.5, 0.2], 1)),
 (1, ([5.0, 3.6, 1.4, 0.2], 1))]

In [138]:
# Reduce phase: fold all points sharing a centroid index into one
# ([summed features], point count) aggregate per cluster.
pointReducedCentroid_rdd = pointMapCentroid_rdd.reduceByKey(accumulatedCluster)

In [139]:
# Inspect up to three records of the reduced RDD.
pointReducedCentroid_rdd.take(3)

[(1,
  ([630.0000000000002,
    349.00000000000006,
    353.00000000000006,
    104.19999999999996],
   113)),
 (0, ([96.1, 43.900000000000006, 84.09999999999998, 31.199999999999996], 16)),
 (2, ([150.40000000000003, 65.2, 126.7, 44.4], 21))]

In [140]:
# Turn each cluster's (feature sums, count) aggregate into its mean feature vector.
# The pipeline is rebuilt from pointMapCentroid_rdd rather than from
# pointReducedCentroid_rdd itself: the self-referential form divided already-averaged
# values again whenever the cell was re-run.
pointReducedCentroid_rdd = (
    pointMapCentroid_rdd
    .reduceByKey(accumulatedCluster)
    .mapValues(lambda total: np.divide(total[0], total[1]).tolist())
)

In [141]:
# Inspect up to three records of the averaged RDD.
pointReducedCentroid_rdd.take(3)

[(1,
  [5.575221238938055,
   3.0884955752212395,
   3.123893805309735,
   0.9221238938053093]),
 (0, [6.00625, 2.7437500000000004, 5.256249999999999, 1.9499999999999997]),
 (2,
  [7.161904761904763,
   3.104761904761905,
   6.033333333333333,
   2.1142857142857143])]

In [142]:
# Materialize the RDD's records on the driver so they can be used as plain Python data.
reduced_points = pointReducedCentroid_rdd.collect()

In [143]:
# Print each collected record on its own line.
# A named loop variable is used because `_` is the name IPython uses for the previous output.
for reduced_point in reduced_points:
    print(reduced_point)

(1, [5.575221238938055, 3.0884955752212395, 3.123893805309735, 0.9221238938053093])
(0, [6.00625, 2.7437500000000004, 5.256249999999999, 1.9499999999999997])
(2, [7.161904761904763, 3.104761904761905, 6.033333333333333, 2.1142857142857143])


In [144]:
# Build the new centroid list ordered by cluster index, and put the current
# centroids in the same order so the two lists line up.
new_centroids = list(reduced_points)
new_centroids.sort()
centroids.sort()

In [145]:
# Display the recomputed centroids (nothing is created in this cell).
new_centroids

[(0, [6.00625, 2.7437500000000004, 5.256249999999999, 1.9499999999999997]),
 (1,
  [5.575221238938055,
   3.0884955752212395,
   3.123893805309735,
   0.9221238938053093]),
 (2,
  [7.161904761904763,
   3.104761904761905,
   6.033333333333333,
   2.1142857142857143])]

In [146]:
# Display the current (previous-iteration) centroids for comparison with new_centroids.
centroids

[(0, [5.8, 2.7, 5.1, 1.9]),
 (1, [6.0, 3.0, 4.8, 1.8]),
 (2, [7.4, 2.8, 6.1, 1.9])]

In [147]:
# Check whether enough centroids have stopped moving.
# Old and new centroids are paired by cluster index, not by list position: a cluster that
# received no points is absent from new_centroids, which would otherwise shift the pairing
# or raise an IndexError.
new_centroid_by_index = dict(new_centroids)
updated_centroids = []
convergence_percentage = 0  # count of centroids that moved less than `threshold`
for cluster_index, old_features in centroids:
    # A cluster with no recomputed centroid keeps its previous position (distance 0).
    new_features = new_centroid_by_index.get(cluster_index, old_features)
    dist = calculateDistance(old_features, new_features)
    print(dist)

    if dist < threshold:
        convergence_percentage += 1
    updated_centroids.append((cluster_index, new_features))

# Later cells read `centroids` as the updated set.
centroids = updated_centroids
# Converged only when strictly more than 80% of the centroids are within the threshold.
percentage = len(centroids)*80/100

if convergence_percentage > percentage:
    print("Centroids converged")
else:
    print("not converged, still run")

0.2671463784145307
1.9412024357056865
0.4471375322975196
not converged, still run


In [175]:
# Schema for the result tables: cluster index followed by the four feature columns.
# It is bound to `schema_result` because that is the name the createDataFrame calls use;
# previously only `schema_result2` was defined, so a fresh kernel hit a NameError.
schema_result = (
    StructType()
    .add("cluster_index", IntegerType(), True)
    .add("sepal_length", DoubleType(), True)
    .add("sepal_width", DoubleType(), True)
    .add("petal_length", DoubleType(), True)
    .add("petal_width", DoubleType(), True)
)
# Keep the earlier name bound too, in case other cells still refer to it.
schema_result2 = schema_result

# Flatten (cluster_index, ([features], count)) records into one row per point.
df_result1 = spark.createDataFrame(
    pointMapCentroid_rdd.map(
        lambda p: (p[0], p[1][0][0], p[1][0][1], p[1][0][2], p[1][0][3])
    ),
    schema=schema_result,
)
df_result1.show()

+-------------+------------+-----------+------------+-----------+
|cluster_index|sepal_length|sepal_width|petal_length|petal_width|
+-------------+------------+-----------+------------+-----------+
|            1|         5.1|        3.5|         1.4|        0.2|
|            1|         4.9|        3.0|         1.4|        0.2|
|            1|         4.7|        3.2|         1.3|        0.2|
|            1|         4.6|        3.1|         1.5|        0.2|
|            1|         5.0|        3.6|         1.4|        0.2|
|            1|         5.4|        3.9|         1.7|        0.4|
|            1|         4.6|        3.4|         1.4|        0.3|
|            1|         5.0|        3.4|         1.5|        0.2|
|            1|         4.4|        2.9|         1.4|        0.2|
|            1|         4.9|        3.1|         1.5|        0.1|
|            1|         5.4|        3.7|         1.5|        0.2|
|            1|         4.8|        3.4|         1.6|        0.2|
|         

In [168]:
# Write the per-point cluster assignments as CSV with a header row.
# mode("overwrite") lets the notebook be re-run: the default save mode raises an error
# when ./result1 already exists from a previous run.
df_result1.write.mode("overwrite").option("header", True).csv("./result1")

In [187]:
# Build the centroid table: one row per (cluster_index, [features]) entry.
# Uses `schema_result2`, the schema name that is actually defined in this notebook (the
# previous `schema_result` reference raised NameError on a fresh kernel). The centroids
# are a small driver-side list, so the rows are passed to createDataFrame directly
# instead of going through parallelize().map().
centroid_rows = [
    (cluster_index, features[0], features[1], features[2], features[3])
    for cluster_index, features in centroids
]
df_result2 = spark.createDataFrame(centroid_rows, schema=schema_result2)
df_result2.show()

+-------------+-----------------+------------------+-----------------+------------------+
|cluster_index|     sepal_length|       sepal_width|     petal_length|       petal_width|
+-------------+-----------------+------------------+-----------------+------------------+
|            0|          6.00625|2.7437500000000004|5.256249999999999|1.9499999999999997|
|            1|5.575221238938055|3.0884955752212395|3.123893805309735|0.9221238938053093|
|            2|7.161904761904763| 3.104761904761905|6.033333333333333|2.1142857142857143|
+-------------+-----------------+------------------+-----------------+------------------+



In [190]:
# Write the centroid table as CSV with a header row; coalesce(1) reduces it to a single
# partition before writing.
# mode("overwrite") lets the notebook be re-run: the default save mode raises an error
# when ./result2 already exists from a previous run.
df_result2.coalesce(1).write.mode("overwrite").option("header", True).csv("./result2")