In [1]:
"""
2018/03/23, KMEANS multiple version
"""

'\n2018/03/23, KMEANS multiple version\n'

In [2]:
# -------- import --------

import math
import time
import copy
import random

# -------- parameters --------

k             = 3
z             = 2
dimension     = 2
threshold     = 5 * 1e-1
iterationTime = 10

path          = "syntheticdata"
splitChar     = " "

dataFromFile  = True  # if True, read from files; else, create a random dataset (size)
centerChoice  = 1 # 1->random, 2->kmeans++, 3->fixed

# -------- variables -------- (version 0.1)

iterCount     = 0
centers       = []
costTotal     = 99999999

outliers      = []
timeBeg       = 0

In [3]:
def compareCenters(cA,cB,p):
    """version 0.1 from KMeans PaperVersion, compare centers before and after
            now we have two set of centers, cA=[c1A,c2A,c3A...], ciA=[d1,d2,d3...]
            int(di*10^p), for example, if p==2, 16.87!=16.88 && 16.887==16.888
       version 0.3 fit new data
    """
    for i in range(k):
        ciA = cA[i]
        ciB = cB[i]
        for j in range(dimension):
            if int(ciA[j]*10**p)-int(ciB[j]*10**p) >=1:
                return False
    return True

def costKM():
    """need to update clustering before call this function"""
    cost = data.map(lambda x: x[-1]).reduce(lambda x,y: x+y)
    return cost

def updateCluster(u,centerT):
    """except outliers(x[1]==-1)"""
    if u[1] == -1:
        return [u[0],-1,0]
    costMin = 9999999
    pos     = -1
    for i in range(k):
        costT = sum([(u[0][j]-centerT[i][j])**2 for j in range(dimension)])
        if costT < costMin:
            costMin = costT
            pos     = i
    return [u[0],pos+1,costMin]
    
def isOutlier(x,standardValue):
    """if the cost of a point is bigger than standardValue, it's a outlier point which belongs to cluster -1
    """
    if (x[-1] < standardValue):
        return x
    else:
        return [x[0],-1,0]
    
def isOutlierTemp(x,standardValue):
    """version 0.5 we have a standard value, but we haven't decide to discard these points permanently.
            if the cost of a point is bigger than standardValue, it's a outlier point
            for those outliers, x[-1]>=standardValue, set x[1]=0, x[-1]=0
    """
    if (x[-1] < standardValue):
        return x
    else:
        return [x[0],0,0]
    
def updateClusterSwap(x,centerT,p):
    """except outliers(x[1]==-1)"""
    if x[1] == -1:
        return 0
    
    if x[1] != p+1 :
        newCenter = centerT[p]
        distance = sum([(x[0][i]-newCenter[i])**2 for i in range(dimension)])
        if distance > x[-1]:
            return x[-1]
        else:
            return distance
    else:
        costMin = 9999999
        pos     = -1
        for i in range(k):
            costT = sum([(x[0][j]-centerT[i][j])**2 for j in range(dimension)])
            if costT < costMin:
                costMin = costT
                pos = i
        return costMin
    
def swap(A,centerT,i):
    """the swap is performed by replacing C by (C+[u])\[v] (in fact for each u in U and each v in C)"""
    return centerT[:i] + [A] + centerT[i+1:]

def costWO(centerT,p):
    """
    version 0.5 for part 3, a new center set with outliers
    version 0.6 updateCost return 0 for outliers(lambda x:x[1]<1), hence we don't need .filter() here
    """
    newData = data.map(lambda x: updateClusterSwap(x,centerT,p))
    costSum = newData.reduce(lambda x,y:x+y) - sum(newData.top(z))
    return costSum

def confirmOutlier(x):
    """check isOutlierTemp
       version 0.6 x[1]==0 -> x[1]=-1
    """
    if (x[1]!=0):
        return x
    else:
        return [x[0],-1,0]

In [4]:
data = sc.textFile(path).map(lambda line: [[float(i) for i in line.split(splitChar)],0,0])

# pick centers

if centerChoice == 1:
    sample = data.takeSample(False,k,random.randint(1,1000))
    centers = [i[0] for i in sample]
elif centerChoice == 2:
    centers = []
else:
    centers = []
    #kmeans++

In [5]:
timeBeg = time.time()

data = data.map(lambda x: updateCluster(x,centers))

standard = (data.map(lambda x:x[-1]).top(z))[-1]
data = data.map(lambda x: isOutlier(x,standard))

costTotal = costKM()

while (iterationTime > iterCount):
    
    # centerN,costN are the best result now. centerTemp,costTemp are improved results.
    centerN  = copy.deepcopy(centers)
    costN = costTotal
    
    ########################################################
    # -------- part 1 local search with no outliers --------
    while (True):
        centerTemp = copy.deepcopy(centerN)
        # a temporary improved set of centerN : centerTemp
        for i in range(k):
            tempCluster = data.filter(lambda x: x[1]==i+1 ).map(lambda x:x[0])
            sizeCluster = tempCluster.count()
            sumCluster  = tempCluster.reduce(lambda x,y:[x[i]+y[i] for i in range(dimension)])
            centerTemp[i]  = [sumCluster[h]/sizeCluster for h in range(dimension)]
        # break immediately
        if(compareCenters(centerN,centerTemp,2)):
            break
            
        data.map(lambda x: updateCluster(centerTemp))
        costTemp = costKM()
        # for each center and non-center, perform a swap
        listAllPoints  = data.filter(lambda x: x[1]!= -1).collect()
        costBestSwap   = costTemp
        centerbestSwap = copy.deepcopy(centerTemp)
        for u in listAllPoints:
            for swapPos in range(k):
                centerSwap = swap(u[0],centerTemp,swapPos)
                costSwap   = data.map(lambda x:updateClusterSwap(x,centerSwap,swapPos)).reduce(lambda x,y:x+y)
                if costSwap < costBestSwap :
                    costBestSwap   = costSwap
                    centerbestSwap = swap(u[0],centerTemp,swapPos)
        del listAllPoints
        centerTemp = copy.deepcopy(centerbestSwap)
        data = data.map(lambda x: updateCluster(x,centerTemp))

        costTemp = data.map(lambda x:x[-1]).reduce(lambda x,y:x+y)
        if costTemp < (1-threshold)*costN:
            centerN = copy.deepcopy(centerTemp)
            costN   = costTemp
        else:
            data = data.map(lambda x: updateCluster(x,centerN))
            # centerN and costN don't have to change
            break   
            
    # -------- part 2 cost of discarding z additional outliers --------
    """there is a problem: a outlier point x should belong to cluster -1, we will calculate cost without outliers
       but we havent decide whether 'outliers' in this part should be outliers or not
       therefore, we set these x: x[1] = 0 instead."""
    # find new temporary outliers (cost)
    outlierTemp = data.map(lambda x:x[-1]).top(z)
    if sum(outlierTemp) > threshold*costN:
        # costTotal2 = costTotal1 - sum(...)
        # ostTotal2 < (1-threshold)* costTotal1 <=> sum(...) > threshold*costTotal
        data = data.map(lambda x: isOutlierTemp(x,outlierTemp[-1])) # add some temporary outliers
        costN = costN - sum(outlierTemp)  # <=> costKM()
    
    # part 3
    listAllPoints = data.filter(lambda x:[1]>0).collect()
    
    centerTemp = copy.deepcopy(centerN)
    costTemp   = costN
    for u in listAllPoints:
        for swapPos in range(k):
            centerSwap = swap(u[0],centerN,swapPos)
            costSwap   = costWO(centerSwap,swapPos)
            if costSwap < costTemp :
                costTemp   = costSwap
                centerTemp = copy.deepcopy(centerSwap)
    # centerN is centers with the most improved swap
    if(costN!=costTemp): # if there's a swap
        centerN = copy.deepcopy(centerTemp)
        # update data (updateCluster update those x[1]!=-1, but there exists some temporary outliers)
        data = data.map(lambda x: x if x[1]==0 else updateCluster(x,centerN))
        # calculate new outliers, x[1]=0 (temporary outliers)
        standardTemp = data.map(lambda x:x[-1]).top(z)
        data = data.map(lambda x: isOutlierTemp(x,standardTemp[-1]))

    # -------- part 4 final check --------
    # update the solution allowing additional outliers if the solution value improved significantly
    if ( costN < (1-threshold)*costTotal ):
        centers = copy.deepcopy(centerN)
        data = data.map(lambda x: confirmOutlier(x))
        costTotal = costN
        print costTotal, centers
    else:
        data = data.map(lambda x: updateCluster(x,centers))
        break
    
timeEnd = time.time()

3581.35414164 [[3.0, 25.0], [12.269565217391305, 7.869565217391305], [12.0, 16.0]]


In [6]:
for i in range(k):
    print(data.filter(lambda x:x[1]==i+1).count())

54
113
128


In [7]:
print(data.filter(lambda x:x[1]==-1).collect())

[[[20.0, 1.0], -1, 0], [[19.0, 1.0], -1, 0], [[20.0, 2.0], -1, 0], [[20.0, 1.0], -1, 0], [[20.0, 1.0], -1, 0]]


In [8]:
print timeEnd-timeBeg

571.098846912


In [9]:
print centers

[[3.0, 25.0], [12.269565217391305, 7.869565217391305], [12.0, 16.0]]
