In [1]:
"""
    2018/03/14, a local search algorithm for K-Means with no outliers (LS).
    Based on (Kmeans PaperVersion OO), this is a new version without class KMeans.
    
    Result:
        2018/03/14, version 0.1 origin K-Means (Result is different from the result of Kmeans origin.)
        2018/03/15, version 0.1 correctness check!
                    version 0.2 aims to add local search part. (from Kmeans PaperVersion OO)
                    synthetic test : n=100, pass
"""

'\n    2018/03/14, a local search algorithm for K-Means with no outliers (LS).\n    Based on (Kmeans PaperVersion OO), this is a new version without class KMeans.\n    \n    Result:\n        2018/03/14, version 0.1 origin K-Means (Result is different from the result of Kmeans origin.)\n        2018/03/15, version 0.1 correctness check!\n                    version 0.2 aims to add local search part. (from Kmeans PaperVersion OO)\n                    synthetic test : n=100, pass\n'

In [2]:
# -------- import -------- (version 0.1)

import math
import copy
import time
import random

# -------- parameters -------- (version 0.1)

# number of centers / clusters
k = 3
# number of coordinates per point that take part in distance computations
dimension = 4
# relative cost improvement an iteration must achieve to be accepted
threshold = 1e-4
# upper bound on the number of iterations of the main loop
iterationTime = 10

# file name and field separator used by the (commented-out) real-data loader
path = "Skin_NonSkin.txt"
splitChar = "\t"


# -------- variables -------- (version 0.1)

# number of synthetic points to create; overwritten later with data.count()
size = 100
# iterations performed so far
iterCount = 0
# current set of centers, filled in by the initial-centers cell
centers = []
# placeholder cost; the main loop recomputes it before it is ever compared
costTotal = 99999999

# timestamp at which timing started, set by the preparation cell
timeBeg = 0

In [3]:
# -------- create synthetic data -------- (version 0.2)
"""
k = 3, dimension = 4, clusterSize = [ , , ]
"""
def createData(n):
    """Create n random labelled points spread over three synthetic clusters.

    Every point is a list of four integer coordinates followed by the label
    (1, 2 or 3) of the cluster it was drawn from.  The label is picked
    uniformly at random, then each coordinate is drawn uniformly from the
    inclusive range that the cluster defines for it.

    n      -- number of points to create
    return -- list of n points, each of the form [c0, c1, c2, c3, label]
    """
    # inclusive (low, high) range of every coordinate, keyed by cluster label
    coordinateRanges = {
        1: ((0, 10), (0, 10), (10, 20), (10, 20)),
        2: ((50, 60), (0, 10), (10, 30), (30, 40)),
        3: ((0, 10), (20, 30), (60, 80), (100, 200)),
    }
    points = []
    for _ in range(n):
        label = random.randint(1, 3)
        coordinates = [random.randint(low, high) for low, high in coordinateRanges[label]]
        points.append(coordinates + [label])
    return points


In [4]:
# -------- functions --------- (version 0.1)

def showTime():
    """Print the number of seconds elapsed since the module-level timeBeg."""
    elapsed = time.time() - timeBeg
    print(elapsed)

def costKM(centerTemp):
    """Return the clustering cost of the module-level RDD `data` for the given centers.

    The cost is the sum, over every record of `data`, of the squared distance
    between that record and its closest center in centerTemp.  Only the first
    `dimension` entries of a record and of a center enter the distance:
    sum((x[i]-c[i])**2 for i in range(dimension)).

    centerTemp -- candidate set of centers
    return     -- total cost of that candidate set
    """
    def nearestSquaredDistance(point):
        # squared distance from `point` to the closest candidate center
        return min(
            sum((point[axis] - center[axis]) ** 2 for axis in range(dimension))
            for center in centerTemp
        )

    return data.map(nearestSquaredDistance).reduce(lambda left, right: left + right)

def updateCluster(x, centerList=None, dim=None):
    """Return the 1-based index of the center closest to point x.

    The distance is the squared Euclidean distance over the first `dim`
    entries of the point and of each center.  Ties go to the center with the
    lowest index.

    x          -- the point (coordinates, optionally followed by extra entries)
    centerList -- centers to compare against; defaults to the first k entries
                  of the module-level `centers`
    dim        -- number of leading coordinates to use; defaults to the
                  module-level `dimension`
    return     -- index (starting at 1) of the nearest center
    raises     -- ValueError when there is no center to compare against
    """
    if centerList is None:
        centerList = [centers[i] for i in range(k)]
    if dim is None:
        dim = dimension
    if len(centerList) == 0:
        raise ValueError("updateCluster needs at least one center")

    bestIndex = -1
    # Start from infinity instead of a fixed number: with a finite sentinel a
    # point farther than the sentinel from every center was given the invalid
    # label 0.
    bestDistance = float("inf")
    for index, center in enumerate(centerList):
        distance = sum((x[j] - center[j]) ** 2 for j in range(dim))
        if distance < bestDistance:
            bestIndex = index
            bestDistance = distance
    return bestIndex + 1

def swap(A,centerTemp,i):
    """Return a copy of centerTemp in which the center at position i is replaced by A.

    This is the single swap of the local search: the candidate set
    (C + [u]) minus [v], where u = A and v = centerTemp[i].  The list passed
    in is left untouched.
    """
    before = centerTemp[:i]
    after = centerTemp[i + 1:]
    return before + [A] + after
   
# version 0.1 : showTime,costKM,swap

def originResult():
    """Print, for each label 1..k, how many records of the RDD `data` end with that label."""
    for label in range(1, k + 1):
        members = data.filter(lambda record: record[-1] == label)
        print("Origin cluster " + str(label) + " : " + str(members.count()))
        
# version 0.2 : originResult

def compareCenters(cA,cB,p,numCenters=None,dim=None):
    """Tell whether two sets of centers agree up to p decimal places.

    cA and cB are lists of centers, cA=[c1A,c2A,...] with ciA=[d1,d2,...].
    Each coordinate is compared as int(d * 10**p), i.e. truncated after p
    decimal places: for p == 2, 16.87 != 16.88 but 16.887 == 16.888.
    Entries beyond the first `dim` (such as a trailing cluster label) are
    ignored.

    cA, cB     -- the two sets of centers, compared position by position
    p          -- number of decimal places that must agree
    numCenters -- how many centers to compare; defaults to the module-level k
    dim        -- coordinates per center to compare; defaults to the
                  module-level dimension
    return     -- True when every compared coordinate agrees, False otherwise
    """
    if numCenters is None:
        numCenters = k
    if dim is None:
        dim = dimension

    scale = 10 ** p
    for i in range(numCenters):
        ciA = cA[i]
        ciB = cB[i]
        for j in range(dim):
            # Compare for inequality in both directions.  The previous test
            # (difference >= 1) only fired when cA was larger than cB, so a
            # center that moved towards smaller values looked unchanged.
            if int(ciA[j] * scale) != int(ciB[j] * scale):
                return False
    return True

# version 0.3 : compare centers before and after

In [5]:
# -------- preparation -------- (version 0.1)
# Builds the RDD `data` that every later cell works on, reports its size and
# starts the timer read by showTime().
# NOTE(review): `sc` is not defined anywhere in the visible notebook;
# presumably it is the SparkContext provided by the PySpark kernel - confirm.

# create RDD

# 1. from files (real data)
# data = sc.textFile(path).map(lambda line: line.split(splitChar)).map(lambda x : [float(a) for a in x] )

# 2. random lists (synthetic data) (version 0.2)
# each record is [c0, c1, c2, c3, label] as produced by createData
datalist = createData(size)
data = sc.parallelize(datalist)
# print how many records carry each label 1..k
originResult()

# replace the requested size with the number of records actually in the RDD
size = data.count()
print("All " + str(size) + " points." )

# start timing
timeBeg = time.time()

Origin cluster 1 : 37
Origin cluster 2 : 32
Origin cluster 3 : 31
All 100 points.


In [6]:
# -------- find initial centers --------
# Fills the module-level `centers` with k starting centers.

# 1. random pick
# Sample k records without replacement; the seed is itself drawn at random,
# so the starting centers differ from run to run.  The sampled records are
# taken from `data` as they are, so each center still ends with the label of
# the record it came from.
centers = data.takeSample(False, k, random.randint(1,1000))
# 2. k-means++
# (not implemented)

# 3. cheat
# centers = []

print("Begin with these " + str(k) + " points.")
print(centers)

Begin with these 3 points.
[[7, 23, 65, 198, 3], [0, 30, 65, 123, 3], [6, 7, 19, 13, 1]]


In [7]:
# -------- iterations --------
# Each iteration
#   1. assigns every point to its nearest current center,
#   2. moves every center to the mean of its cluster,
#   3. tries to replace centers by data points (single swaps) whenever that
#      lowers the cost,
# and keeps the result only if the cost dropped by more than `threshold`
# (relative).  The loop stops after `iterationTime` iterations, when step 2
# leaves the centers unchanged, or when the improvement is too small.
print(iterCount)
print(iterationTime)
while iterCount < iterationTime :

    iterCount += 1
    print("Iteration " + str(iterCount) + " : ")

    # update costTotal: cost of the centers this iteration starts from
    costTotal = costKM(centers)
    print("origin cost = " + str(costTotal))

    # update cluster data in RDD: the last entry of every record becomes the
    # index of its nearest current center
    data = data.map( lambda x: x[0:-1]+[updateCluster(x)] )

    icenters = copy.deepcopy(centers)
    # a temporary improved set of centers: the mean of every cluster
    for i in range(k):
        tempCluster = data.filter(lambda x: x[-1]==i+1)
        sizeCluster = tempCluster.count()
        print("Cluster " + str(i+1) + ", size : " + str(sizeCluster))
        if sizeCluster == 0:
            # No point was assigned to this center.  reduce() fails on an
            # empty RDD and the mean would divide by zero, so keep the
            # previous center; the swap step below may still replace it.
            continue
        sumCluster  = tempCluster.reduce(lambda x,y: [x[j]+y[j] for j in range(dimension+1)])
        icenters[i]  = [1.0*sumCluster[h]/sizeCluster for h in range(dimension)] + [i+1]
    print("A temporary improved set of centers:")
    print(icenters)
    costTemp = costKM(icenters)

    # version 0.3 break immediately when the means did not move
    if(compareCenters(icenters,centers,2)):
        print("No update!")
        break

    # centers   is the origin set of centers
    # costTotal is the cost with center set 'centers'
    #
    # icenters  is a temporary improved set of centers
    # costTemp  is the cost with center set 'icenters'
    #
    # costSwap  is the cost with swap
    #
    # costSwapT is the cost with swap() (temporary)

    # for each center and non-center, perform a swap
    listAllPoints = data.collect()
    costSwap      = costTemp
    for u in listAllPoints:
        # position whose replacement by u lowers the cost the most, -1 if none
        swapF = -1
        for swapPos in range(k):
            costSwapT = costKM(swap(u,icenters,swapPos))
            if costSwapT < costSwap  :
                swapF = swapPos
                costSwap = costSwapT
        if swapF != -1:
            icenters = swap(u,icenters,swapF)
    del listAllPoints
    print("After swap:")
    print(icenters)
    # break

    # accept the new centers only if they improve the cost by more than the
    # relative threshold, otherwise stop
    costTemp = costKM(icenters)
    if costTemp < (1-threshold)*costTotal:
        print(costTemp)
        centers = copy.deepcopy(icenters)
        costTotal = costTemp
    else:
        print(costTemp)
        print("GG2...")
        break

showTime()

0
10
Iteration 1 : 
origin cost = 109007
Cluster 1, size : 8
Cluster 2, size : 23
Cluster 3, size : 69
A temporary improved set of centers:
[[6.875, 24.875, 71.5, 186.375, 1], [4.956521739130435, 25.217391304347824, 69.52173913043478, 133.7391304347826, 2], [27.52173913043478, 5.072463768115942, 17.681159420289855, 24.043478260869566, 3]]
After swap:
[[56, 4, 21, 37, 3], [5, 25, 70, 151, 2], [4, 7, 16, 17, 3]]
29957
Iteration 2 : 
origin cost = 29957
Cluster 1, size : 32
Cluster 2, size : 31
Cluster 3, size : 37
A temporary improved set of centers:
[[54.25, 4.875, 20.9375, 34.8125, 1], [5.451612903225806, 25.129032258064516, 70.03225806451613, 147.32258064516128, 2], [4.405405405405405, 5.243243243243243, 14.864864864864865, 14.72972972972973, 3]]
After swap:
[[54.25, 4.875, 20.9375, 34.8125, 1], [5.451612903225806, 25.129032258064516, 70.03225806451613, 147.32258064516128, 2], [4.405405405405405, 5.243243243243243, 14.864864864864865, 14.72972972972973, 3]]
28896.5045772
Iteration 3 :