In [1]:
"""
    2018/03/15, K-Means PaperVersion with Outliers
    Local Search Methods for k-Means with Outliers, VLDB 2017
    from KMeans PaperVersion
"""

'\n    2018/03/15, K-Means PaperVersion with Outliers\n    Local Search Methods for k-Means with Outliers, VLDB 2017\n    from KMeans PaperVersion\n'

In [2]:
# -------- import --------

import math
import time
import copy
import random

# -------- parameters --------

k             = 3
z             = 3
dimension     = 4
threshold     = 1e-4
iterationTime = 10

path          = "Skin_NonSkin.txt"
splitChar     = "\t"

dataFromFile  = False # if True, read from files; else, create a random dataset (size)

# -------- variables -------- (version 0.1)

size          = 100
iterCount     = 0
centers       = []
costTotal     = 99999999

outliers      = []

timeBeg       = 0

In [3]:
# -------- create synthetic data -------- from KMeans PaperVersion (LS)

"""
k = 3, dimension = 4, clusterSize = [ , , ]
"""
def createData(n):
    listData = []
    for i in range(n):
        choice = random.randint(1,3)
        if choice == 1:
            newIn = [random.randint(0,10),random.randint(0,10),random.randint(10,20),random.randint(10,20),1]
            newIn = [float(newIn[a]) for a in range(4)] + [1]
            listData.append(newIn)
        elif choice == 2:
            newIn = [random.randint(50,60),random.randint(0,10),random.randint(10,30),random.randint(30,40),2]
            newIn = [float(newIn[a]) for a in range(4)] + [2]
            listData.append(newIn)
        else: 
            newIn = [random.randint(0,10),random.randint(20,30),random.randint(60,80),random.randint(100,200),3]
            newIn = [float(newIn[a]) for a in range(4)] + [3]
            listData.append(newIn)
    return listData


In [4]:
# -------- functions --------

def clusterResult():
    """
    version 0.1 from KMeans PV: show the origin cluster: size of the ith cluster
    version 0.3 modified: fit new data type
    rename as clusterResult (used to be originResult)
    """
    outlierCount = size
    for i in range(k):
        csize = data.filter(lambda x: x[1]==i+1).count()
        outlierCount = outlierCount - csize
        print("Origin cluster " + str(i+1) + " : " + str(csize))
    print("Outliers: " + str(outlierCount))

def costKM():
    """
    version 0.1 from KMeans PV: show the total cost of the cluster with center set centerTemp
    version 0.2 discard some outliers (lambda x: x[-1]== -1)
    version 0.3 discard some outliers(RDD), now x[-1] is the cost
        costSum = data.filter(lambda x: x[1]!= -1).map(lambda x: x[-1]).reduce(lambda x,y: x+y)
    version 0.4 now we assume that all outliers have a 0 cost
    """
    costSum = data.map(lambda x: x[-1]).reduce(lambda x,y: x+y)
    return costSum

def updateCluster(u,centerTemp):
    """
    version 0.3 find a cluster for just one point and return [clusterNum,costMin]
    version 0.4 
        data = data.map(lambda x: [[x[0]]]+updateCluster(x,centerTemp) if x[1]!= -1 else x)
        is replaced by data = data.map(lambda x: [[x[0]]]+updateCluster(x,centerTemp))
        hence, if x[1]==-1, we return [-1,0] in this function
    """
    if u[1] == -1:
        return [u[0],-1,0]
    costMin = 9999999
    pos     = -1
    for i in range(k):
        A = u[0]
        B = centerTemp[i][0]
        costTemp = sum([(A[j]-B[j])**2 for j in range(dimension)])
        if costTemp < costMin:
            costMin = costTemp
            pos = i
    return [u[0],pos+1,costMin]

def costKMTemp(centerTemp):
    # centerTemp are supposed to be new centers
    # sum(x[i]-c[i]**2 for i in range(dimension)) is the distance between two points
    # min(distance(x,c) for c in centerTemp) is the distance between a point and center points
    costSum = data.map(lambda x:(min(sum((x[0][i]-c[0][i])**2 for i in range(dimension)) for c in centerTemp))).reduce(lambda x,y : x+y)
    return costSum
    
def compareCenters(cA,cB,p):
    """version 0.1 from KMeans PaperVersion, compare centers before and after
            now we have two set of centers, cA=[c1A,c2A,c3A...], ciA=[d1,d2,d3...]
            int(di*10^p), for example, if p==2, 16.87!=16.88 && 16.887==16.888
       version 0.3 fit new data
    """
    for i in range(k):
        ciA = cA[i][0]
        ciB = cB[i][0]
        for j in range(dimension):
            if int(ciA[j]*10**p)-int(ciB[j]*10**p) >=1:
                return False
    return True

def swap(A,centerTemp,i):
    """version 0.1 from KMeans PaperVersion, perform a swap.
        the swap is performed by replacing C by (C+[u])\[v] (in fact for each u in U and each v in C)"""
    return centerTemp[:i] + [A] + centerTemp[i+1:]

def isOutlier(x,standardValue):
    """version 0.5 if the cost of a point is bigger than standardValue, it's a outlier point
            for those outliers, x[-1]>=standardValue, set x[1]=-1, x[-1]=0
    """
    if (x[-1] < standardValue):
        return x
    else:
        return [x[0],-1,0]
    
def isOutlierTemp(x,standardValue):
    """version 0.5 we have a standard value, but we haven't decide to discard these points permanently.
            if the cost of a point is bigger than standardValue, it's a outlier point
            for those outliers, x[-1]>=standardValue, set x[1]=0, x[-1]=0
    """
    if (x[-1] < standardValue):
        return x
    else:
        return [x[0],0,0]
    
def isNotOutlier(x,centerTemp):
    """UNDO isOutlierTemp
       version 0.5 we've already set some 'outliers' x[1]=0, but if cost doesn't decrease significantly, 
           we set points x[1] = clusterNo they should be in again.
    """
    if (x[1]!=0):
        return x
    else:
        return updateCluster(x,centerTemp)
        

In [5]:
# -------- create RDD -------- from KMeans PaperVersion (LS)

""" version 0.3
        line            = [d0,d1...clusterNo]
        x               = [line[:-1],clusterNo,cost]
        clusterNo       = line[-1] = x[1]   
        cost(x,centers) = x[2] = 999999(temporary) """

if dataFromFile :
    # 1. from files (real data)
    data = sc.textFile(path).map(lambda line: line.split(splitChar)).map(lambda x : [float(a) for a in x[:-1]],int(x[-1]), 999999)
else:
    # 2. random lists (synthetic data) (version 0.2)
    datalist = createData(size)
    datalist = [[x[:-1],x[-1],0] for x in datalist]
    print datalist[1]
    data = sc.parallelize(datalist)

clusterResult()

size = data.count()
print("All " + str(size) + " points." )

# start timing
timeBeg = time.time()


[[6.0, 9.0, 15.0, 15.0], 1, 0]
Origin cluster 1 : 33
Origin cluster 2 : 25
Origin cluster 3 : 42
Outliers: 0
All 100 points.


In [6]:
# -------- pick centers --------

# an arbitrary set of k points from U
centers = data.takeSample(False,k,random.randint(1,1000))
print(centers)

# K-means++ (k+2z)
# TODO


data = data.map(lambda x: updateCluster(x,centers))

# -------- compute outliers --------
# outliers : lambda x : x[-1] == -1, x[-1] == 0 
# (cost=0 means that we dont need to .filter() before sum up)
# sort all cost and set standard
standard = (data.map(lambda x:x[-1]).top(z))[-1]
data = data.map(lambda x: isOutlier(x,standard))

"""
# wanna know whether we've found z outliers or not, try these codes:
outlierCount = data.filter(lambda x:x[1]==-1).count()
print outlierCount
"""

[[[9.0, 23.0, 62.0, 195.0], 3, 0], [[4.0, 27.0, 63.0, 142.0], 3, 0], [[56.0, 7.0, 23.0, 40.0], 2, 0]]


"\n# wanna know whether we've found z outliers or not, try these codes:\noutlierCount = data.filter(lambda x:x[1]==-1).count()\nprint outlierCount\n"

In [7]:
# -------- iterations --------
"""
we already have:
data(RDD), centers[], outliers[],costTotal(Float) 
as final results.

new variables:

as temporary results.
"""

while (iterCount < iterationTime):
    
    costTemp = costKM()
    
    ########################################################
    # -------- part 1 local search with no outliers --------
    # costTotal1,centers1,centerN,costSwap,costTemp
    centers1 = copy.deepcopy(centers)
    
    while True :
        # update costTotal1
        costTotal1 = costKM() #result of last iteration
        print("costTotal1"+ str(costTotal1))
        # update cluster data in RDD
        data = data.map(lambda x: updateCluster(x,centers1))

        centerN = copy.deepcopy(centers1)
        # a temporary improved set of centers1
        for i in range(k):
            tempCluster = data.filter(lambda x: x[1]==i+1 ).map(lambda x:x[0])
            sizeCluster = tempCluster.count()
            sumCluster  = tempCluster.reduce(lambda x,y:[x[i]+y[i] for i in range(dimension)])
            centerN[i]  = [[sumCluster[h]/sizeCluster for h in range(dimension)],i+1,0]
        costTemp = costKMTemp(centerN)
        
        # break immediately
        if(compareCenters(centerN,centers1,2)):
            break
            
        # for each center and non-center, perform a swap
        listAllPoints = data.filter(lambda x: x[1]!= -1).collect()
        costSwap      = costTemp
        for u in listAllPoints:
            swapF = -1
            for swapPos in range(k):
                centerSwap = swap(u,centerN,swapPos)
                costSwapT = costKMTemp(centerSwap)
                if costSwapT < costSwap  :
                    swapF = swapPos
                    costSwap = costSwapT
            if swapF != -1:
                centerN = swap(u,centerN,swapF)
        del listAllPoints

        costTemp = costKMTemp(centerN)
        if costTemp < (1-threshold)*costTotal1:
            costTotal1 = costTemp
            centers1 = copy.deepcopy(centerN)
            data = data.map(lambda x: updateCluster(x,centers1))
        else:
            break      
    # now, centers1 is a temporary improved set of centers, costTotal1 is the improved cost
    print centers1
        
    ###################################################################
    # -------- part 2 cost of discarding z additional outliers --------
    """there is a problem: a outlier point x should belong to cluster -1, we will calculate cost without outliers
       but we havent decide whether 'outliers' in this part should be outliers or not
       therefore, we set these x: x[1] = 0 instead."""
    # find new temporary outliers (cost)
    outlierTemp = data.map(lambda x:x[-1]).top(z)
    costTotal2 = costTotal1
    if sum(outlierTemp) > threshold*costTotal1:
        # costTotal2 = costTotal1 - sum(...)
        # ostTotal2 < (1-threshold)* costTotal1 <=> sum(...) >threshold*costTotal
        data = data.map(lambda x: isOutlierTemp(x,outlierTemp[-1]))
        costTotal2 = costTotal1 - sum(outlierTemp)
    del outlierTemp
    # centers1 is still the set of centers, costTotal2 is the improved cost
    
    ########################################################################################################
    # -------- part 3 for each center an non-center, perform a swap and discard additional outliers --------
    listAllPoints = data.filter(lambda x: x[1]>=1).collect()
    centerN = copy.deepcopy(centers1)
    for u in listAllPoints:
        swapF = -1
        for swapPos in range(k):
            centerSwap = swap(u,centerN,swapPos)
            costSwapT = costKMTemp(centerSwap)
                if costSwapT < costSwap  :
                    swapF = swapPos
                    costSwap = costSwapT
            if swapF != -1:
                centerN = swap(u,centerN,swapF)
    
    
    
    
    
    
    
    
    
    break
    # update the solution allowing additional outliers if the solution value improved significantly

costTotal1119585.0
costTotal158642.9908154
[[[5.818181818181818, 24.272727272727273, 68.54545454545455, 187.36363636363637], 1, 0], [[5.451612903225806, 24.580645161290324, 71.25806451612904, 133.0], 2, 0], [[28.89090909090909, 5.1454545454545455, 18.69090909090909, 25.10909090909091], 3, 0]]


In [8]:
timeEnd = time.time()
print("Time: " + str(timeEnd-timeBeg))

Time: 52.0346910954


In [9]:
clusterResult()

Origin cluster 1 : 14
Origin cluster 2 : 28
Origin cluster 3 : 52
Outliers: 6
