In [1]:
import csv
import matplotlib
import matplotlib.pyplot as plt
# Make every figure in this notebook default to a square 5 x 5 inch canvas.
matplotlib.rcParams.update({'figure.figsize': [5, 5]})

In [2]:
def draw_point(x, y, color, size=20):
    """Scatter-plot the given coordinates with pyplot.

    Args:
        x: x coordinate(s) forwarded to ``plt.scatter``.
        y: y coordinate(s) forwarded to ``plt.scatter``.
        color: marker colour, passed as scatter's ``c`` argument.
        size: marker size, passed as scatter's ``s`` argument (default 20).
    """
    plt.scatter(x, y, s=size, c=color)

In [3]:
import numpy as np

In [4]:
def read_points(dataset_name):
    """Load labelled points from a comma-separated file.

    The first record is treated as a header and discarded.  In every other
    record the first column is the label and the remaining columns are
    parsed as floats.  Empty records (e.g. a trailing blank line) are
    skipped instead of raising ``IndexError``.

    Args:
        dataset_name: path of the CSV file to read.

    Returns:
        A ``(points, labels)`` tuple of two parallel lists: ``points[i]`` is
        the list of float features of row ``i`` and ``labels[i]`` is its
        label string.

    Raises:
        ValueError: if a feature column cannot be converted to ``float``.
    """
    points = []
    labels = []
    with open(dataset_name, 'r') as fin:
        reader = csv.reader(fin, delimiter=',')
        # Drop the header record; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to parse.
                continue
            labels.append(row[0])
            points.append([float(value) for value in row[1:]])
    return points, labels

In [5]:
def generateMinMax(points):
    """Compute the per-coordinate minimum and maximum over ``points``.

    Args:
        points: sequence of equally long coordinate sequences.

    Returns:
        A ``(mins, maxs)`` tuple of lists, one entry per coordinate.  Both
        lists are empty when ``points`` is empty.
    """
    lows, highs = [], []
    for coords in points:
        if not lows:
            # First (non-empty) point seeds both bounds.
            lows = [coords[k] for k in range(len(coords))]
            highs = [coords[k] for k in range(len(coords))]
            continue
        for k in range(len(coords)):
            value = coords[k]
            lows[k] = min(lows[k], value)
            highs[k] = max(highs[k], value)
    return (lows, highs)

In [6]:
def distance(x, y):
    """Return the Euclidean distance between two coordinate sequences.

    Args:
        x: first point; its length decides how many coordinates are used.
        y: second point, indexed at the same positions as ``x``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    squared_total = 0
    for idx, x_val in enumerate(x):
        squared_total += (x_val - y[idx]) ** 2
    return squared_total ** 0.5

In [7]:
def kMeans(dataset_name, cluster_num, seed=None):
    """Cluster a labelled CSV dataset with k-means and return its purity.

    Points and labels are loaded with ``read_points``.  Initial centers are
    drawn uniformly inside the bounding box of the data.  Each iteration
    assigns every point to its nearest center (``distance``) and replaces
    the centers with the cluster means; if any cluster ends up empty the
    centers are redrawn at random.  Iteration stops when the centers no
    longer change.

    Purity is the sum over clusters of the size of the most frequent label
    in that cluster, divided by the total number of points.

    Args:
        dataset_name: path of the CSV file (header row, label in column 0).
        cluster_num: number of clusters; must be between 1 and the number of
            distinct points, otherwise an empty cluster is unavoidable and
            the restart loop could never finish.
        seed: optional seed for a private ``numpy.random.RandomState``.
            With the default ``None`` the global ``numpy.random`` state is
            used, exactly as before this parameter existed.

    Returns:
        The purity of the converged clustering as a float in ``(0, 1]``.

    Raises:
        ValueError: if the dataset has no points or ``cluster_num`` is out
            of the valid range.
    """
    points, labels = read_points(dataset_name)
    if not points:
        raise ValueError(
            "dataset contains no points: {0}".format(dataset_name))
    distinct_points = len(set(tuple(point) for point in points))
    if not 1 <= cluster_num <= distinct_points:
        raise ValueError(
            "cluster_num must be between 1 and {0} (number of distinct "
            "points), got {1}".format(distinct_points, cluster_num))

    rng = np.random if seed is None else np.random.RandomState(seed)
    # The bounding box never changes, so compute it once for all restarts.
    mins, maxs = generateMinMax(points)

    def random_centers():
        """Draw cluster_num centers uniformly inside the data bounding box."""
        return [[rng.uniform(low, high) for low, high in zip(mins, maxs)]
                for _ in range(cluster_num)]

    def assign(centers):
        """Return, for each cluster, the indices of the points nearest to it."""
        # Indices (not the points themselves) are stored so that labels can
        # be looked up directly, even when several rows share coordinates.
        members = [[] for _ in range(cluster_num)]
        for index, point in enumerate(points):
            nearest = 0
            best = distance(point, centers[0])
            for candidate in range(1, cluster_num):
                d = distance(point, centers[candidate])
                if d < best:
                    best, nearest = d, candidate
            members[nearest].append(index)
        return members

    def centroids(members):
        """Return the mean of every cluster, or [] if any cluster is empty."""
        means = []
        for indices in members:
            if not indices:
                return []
            totals = [0] * len(points[indices[0]])
            for index in indices:
                for j, value in enumerate(points[index]):
                    totals[j] += value
            means.append([total / len(indices) for total in totals])
        return means

    centers = random_centers()
    while True:
        # A fresh assignment is built on every pass, so nothing from a
        # discarded (empty-cluster) attempt can leak into the next one.
        members = assign(centers)
        new_centers = centroids(members)
        if not new_centers:
            centers = random_centers()
            continue
        if new_centers == centers:
            break
        centers = new_centers

    majority_total = 0
    for indices in members:
        label_counts = {}
        for index in indices:
            label = labels[index]
            label_counts[label] = label_counts.get(label, 0) + 1
        majority_total += max(label_counts.values())
    # "* 1.0" keeps this a true division under Python 2 as well.
    return majority_total * 1.0 / len(points)

In [8]:
# Print the clustering purity on the cancer dataset for several cluster counts.
for i in (2, 3, 5, 10):
    print(kMeans(dataset_name='datasets/cancer.csv', cluster_num=i))

0.854130052724
0.866432337434
0.84007029877
0.898066783831
