In [4]:
import numpy as np
from multiprocessing import Pool
import pickle
import pandas as pd
import sys

In [5]:
# load data from csv file
data = np.loadtxt('data.csv', delimiter=',')

In [6]:
# load labels from csv file
labels = np.loadtxt('label.csv', delimiter=',', dtype=np.int64)

In [7]:
# distance functions
def euclidean(x,y):
    return np.sqrt(np.sum((x - y) ** 2))

def cosine(x,y):
    return 1 - np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

def jaccard(x,y):
    return 1 - np.sum(np.minimum(x,y)) / np.sum(np.maximum(x,y))

In [8]:
# total classes
total_classes = len(set(labels))

In [9]:
# k means function
def k_means(X, k, distance, intital_centroids=None, max_iter=None, sse_check = False):

    if intital_centroids is None:
        # initialize centers randomly
        centroids = X[np.random.choice(X.shape[0], k, replace=False), :]
    else:
        centroids = intital_centroids
    # initialize clusters
    clusters = np.zeros(X.shape[0])
    # initialize previous clusters
    clusters_old = np.ones(X.shape[0])
    old_sse = sys.maxsize

    iteration = 1
    # loop until clusters stop changing
    while iteration != (max_iter+1) if max_iter != None else (not np.array_equal(clusters, clusters_old)):
        # assign previous clusters
        clusters_old = clusters
        # assign clusters
        clusters = np.array([np.argmin([distance(x, c) for c in centroids]) for x in X])

        old_centroids = centroids
        # update centroids
        centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # compute SSE
        sse = np.sum([np.sqrt(np.sum([(centroids[i] - j)**2 for j in X[clusters == i]])) for i in range(k)])
        if sse > old_sse and sse_check:
          return iteration, old_sse, old_centroids, clusters_old
        else:
          old_sse = sse    

        iteration +=1
    return iteration-1, sse, centroids, clusters

In [10]:
# intialize 10 random centroids
np.random.seed(0)
centroids_list = [data[np.random.choice(data.shape[0], total_classes, replace=False), :] for i in range(10)]

In [None]:
# run k means 10 times for each distance function until centroids don't change
with Pool(10) as p:
    euclidean_results_list = p.starmap(k_means, [(data, total_classes, euclidean, centroids_list[i]) for i in range(10)])
    cosine_results_list = p.starmap(k_means, [(data, total_classes, cosine, centroids_list[i]) for i in range(10)])
    jaccard_results_list = p.starmap(k_means, [(data, total_classes, jaccard, centroids_list[i]) for i in range(10)])

In [None]:
print(f"euclidean mean iterations: {np.mean([i for i in map(lambda x: x[0], euclidean_results_list)])}")
print(f"cosine mean iterations: {np.mean([i for i in map(lambda x: x[0], cosine_results_list)])}")
print(f"jaccard mean iterations: {np.mean([i for i in map(lambda x: x[0], jaccard_results_list)])}")

euclidean mean iterations: 57.4
cosine mean iterations: 71.8
jaccard mean iterations: 76.4


In [None]:
print(f"euclidean mean sse: {np.mean([i for i in map(lambda x: x[1], euclidean_results_list)])}")
print(f"cosine mean sse: {np.mean([i for i in map(lambda x: x[1], cosine_results_list)])}")
print(f"jaccard mean sse: {np.mean([i for i in map(lambda x: x[1], jaccard_results_list)])}")

euclidean mean sse: 498957.90150772256
cosine mean sse: 492985.15354220837
jaccard mean sse: 493908.68078405876


In [None]:
df_sse = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_sse'])

In [None]:
# first row euclidean results
df_sse.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[1], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[1], euclidean_results_list)])]

# second row cosine results
df_sse.loc[1] = ['cosine'] + [i for i in map(lambda x: x[1], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[1], cosine_results_list)])]

# third row jaccard results
df_sse.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[1], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[1], jaccard_results_list)])]

In [None]:
df_sse

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_sse
0,euclidean,497369.973138,498373.008594,501979.549921,497595.886745,495970.603712,502902.106489,502485.014038,495941.604503,495792.986439,501168.281499,498957.901508
1,cosine,494538.400801,492155.810466,489785.107294,492909.321142,492074.469673,492264.15788,491927.031693,493610.321457,502670.959054,487915.955962,492985.153542
2,jaccard,493502.620326,493619.351553,491948.97831,493335.894878,493596.69571,493684.9621,493679.364233,492691.474397,499455.106675,493572.359661,493908.680784


In [11]:
def k_means_accuracy(clusters, labels, k):
  assigned_labels = np.zeros(len(labels))
  for i in range(k):
    points_indices = np.where(clusters == i)[0]
    true_labels = labels[points_indices]
    majority_vote = np.argmax(np.bincount(true_labels))
    assigned_labels[points_indices] = majority_vote

  label_comparison = np.equal(assigned_labels,labels).astype(int)
  return np.sum(label_comparison)/len(labels) * 100

In [None]:
df_accuracy = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_accuracy'])

In [None]:
# first row euclidean results
df_accuracy.loc[0] = ['euclidean'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], euclidean_results_list)] + [0]

# second row cosine results
df_accuracy.loc[1] = ['cosine'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], cosine_results_list)] + [0]

# third row jaccard results
df_accuracy.loc[2] = ['jaccard'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], jaccard_results_list)] + [0]

In [None]:
df_accuracy['average_accuracy'] = df_accuracy[[c for c in df_accuracy.columns if c != 'distance_metric' and c != 'average_accuracy']].mean(axis=1)

In [None]:
df_accuracy

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_accuracy
0,euclidean,59.77,53.32,64.95,53.9,60.12,59.98,60.25,60.13,60.17,60.35,59.294
1,cosine,54.78,61.48,64.1,62.41,61.38,61.42,61.33,62.48,60.76,62.69,61.283
2,jaccard,54.43,60.25,62.93,54.74,60.21,60.5,60.5,62.4,62.79,60.27,59.902


In [None]:
df_iterations = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_iteration'])

In [None]:
# first row euclidean results
df_iterations.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[0], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[0], euclidean_results_list)])]

# second row cosine results
df_iterations.loc[1] = ['cosine'] + [i for i in map(lambda x: x[0], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[0], cosine_results_list)])]

# third row jaccard results
df_iterations.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[0], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[0], jaccard_results_list)])]

In [None]:
df_iterations

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_iteration
0,euclidean,48,86,45,49,63,39,57,57,40,90,57.4
1,cosine,46,97,36,46,116,102,92,40,88,55,71.8
2,jaccard,55,80,38,50,73,96,87,39,83,163,76.4


In [None]:
# run k means 10 times for each distance function until sse increases in next iteration
with Pool(10) as p:
    euclidean_results_list = p.starmap(k_means, [(data, total_classes, euclidean, centroids_list[i],None,True) for i in range(10)])
    cosine_results_list = p.starmap(k_means, [(data, total_classes, cosine, centroids_list[i],None,True) for i in range(10)])
    jaccard_results_list = p.starmap(k_means, [(data, total_classes, jaccard, centroids_list[i],None,True) for i in range(10)])

In [None]:
df_iterations_2 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_iteration'])

# first row euclidean results
df_iterations_2.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[0], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[0], euclidean_results_list)])]

# second row cosine results
df_iterations_2.loc[1] = ['cosine'] + [i for i in map(lambda x: x[0], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[0], cosine_results_list)])]

# third row jaccard results
df_iterations_2.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[0], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[0], jaccard_results_list)])]

In [None]:
df_iterations_2

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_iteration
0,euclidean,14,21,6,13,2,2,8,9,2,8,8.5
1,cosine,2,2,2,2,17,10,2,2,2,35,7.6
2,jaccard,2,2,2,2,15,4,2,2,2,7,4.0


In [None]:
df_sse_2 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_sse'])

# first row euclidean results
df_sse_2.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[1], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[1], euclidean_results_list)])]

# second row cosine results
df_sse_2.loc[1] = ['cosine'] + [i for i in map(lambda x: x[1], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[1], cosine_results_list)])]

# third row jaccard results
df_sse_2.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[1], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[1], jaccard_results_list)])]

In [None]:
df_sse_2

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_sse
0,euclidean,497757.027836,498091.531892,501758.414452,496813.624607,507523.354156,497535.823629,498338.139281,502637.784706,505962.107824,503386.724611,500980.453299
1,cosine,493111.897942,491740.206231,492045.660223,449112.073113,490244.149407,489577.408135,497395.027002,481519.460185,469246.264764,488172.126172,484216.427317
2,jaccard,487233.036854,493062.680351,496257.702022,446912.596335,491951.534792,499517.555272,498025.107537,475020.696965,466124.957274,501359.763018,485546.563042


In [None]:
df_accuracy_2 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_accuracy'])

# first row euclidean results
df_accuracy_2.loc[0] = ['euclidean'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], euclidean_results_list)] + [0]

# second row cosine results
df_accuracy_2.loc[1] = ['cosine'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], cosine_results_list)] + [0]

# third row jaccard results
df_accuracy_2.loc[2] = ['jaccard'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], jaccard_results_list)] + [0]

df_accuracy_2['average_accuracy'] = df_accuracy_2[[c for c in df_accuracy_2.columns if c != 'distance_metric' and c != 'average_accuracy']].mean(axis=1)

In [None]:
df_accuracy_2

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_accuracy
0,euclidean,58.69,53.59,62.88,53.96,38.33,34.59,60.05,55.48,33.65,52.68,50.39
1,cosine,34.43,48.4,42.41,38.62,58.39,57.94,41.16,33.98,37.52,62.62,45.547
2,jaccard,34.6,48.69,41.73,38.28,57.0,52.35,39.76,34.45,36.62,54.89,43.837


In [12]:
# run k means 10 times for each distance function until 50 iterations
with Pool(10) as p:
    euclidean_results_list = p.starmap(k_means, [(data, total_classes, euclidean, centroids_list[i],50,False) for i in range(10)])
    cosine_results_list = p.starmap(k_means, [(data, total_classes, cosine, centroids_list[i],50,False) for i in range(10)])
    jaccard_results_list = p.starmap(k_means, [(data, total_classes, jaccard, centroids_list[i],50,False) for i in range(10)])

In [None]:
df_iterations_3 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_iteration'])

# first row euclidean results
df_iterations_3.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[0], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[0], euclidean_results_list)])]

# second row cosine results
df_iterations_3.loc[1] = ['cosine'] + [i for i in map(lambda x: x[0], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[0], cosine_results_list)])]

# third row jaccard results
df_iterations_3.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[0], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[0], jaccard_results_list)])]

In [None]:
df_iterations_3

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_iteration
0,euclidean,50,50,50,50,50,50,50,50,50,50,50.0
1,cosine,50,50,50,50,50,50,50,50,50,50,50.0
2,jaccard,50,50,50,50,50,50,50,50,50,50,50.0


In [13]:
df_sse_3 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_sse'])

# first row euclidean results
df_sse_3.loc[0] = ['euclidean'] + [i for i in map(lambda x: x[1], euclidean_results_list)] + [np.mean([i for i in map(lambda x: x[1], euclidean_results_list)])]

# second row cosine results
df_sse_3.loc[1] = ['cosine'] + [i for i in map(lambda x: x[1], cosine_results_list)] + [np.mean([i for i in map(lambda x: x[1], cosine_results_list)])]

# third row jaccard results
df_sse_3.loc[2] = ['jaccard'] + [i for i in map(lambda x: x[1], jaccard_results_list)] + [np.mean([i for i in map(lambda x: x[1], jaccard_results_list)])]

In [14]:
df_sse_3

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_sse
0,euclidean,497369.973138,498100.235591,501979.549921,497595.886745,496508.829192,502902.106489,502475.062772,495991.109806,495792.986439,500707.045889,498942.278598
1,cosine,494538.400801,492907.287718,489785.107294,492909.321142,492030.715887,493129.890258,491404.052853,493610.321457,502668.933836,487900.842141,493088.487339
2,jaccard,493543.675119,493474.184151,491948.97831,493335.894878,493579.070339,492502.217121,492927.340666,492691.474397,501785.098723,491046.471475,493683.440518


In [15]:
df_accuracy_3 = pd.DataFrame(columns=['distance_metric','centroid_1', 'centroid_2', 'centroid_3', 'centroid_4', 'centroid_5', 'centroid_6', 'centroid_7', 'centroid_8', 'centroid_9', 'centroid_10', 'average_accuracy'])

# first row euclidean results
df_accuracy_3.loc[0] = ['euclidean'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], euclidean_results_list)] + [0]

# second row cosine results
df_accuracy_3.loc[1] = ['cosine'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], cosine_results_list)] + [0]

# third row jaccard results
df_accuracy_3.loc[2] = ['jaccard'] + [k_means_accuracy(i, labels, total_classes) for i in map(lambda x: x[3], jaccard_results_list)] + [0]

df_accuracy_3['average_accuracy'] = df_accuracy_3[[c for c in df_accuracy_3.columns if c != 'distance_metric' and c != 'average_accuracy']].mean(axis=1)

In [16]:
df_accuracy_3 

Unnamed: 0,distance_metric,centroid_1,centroid_2,centroid_3,centroid_4,centroid_5,centroid_6,centroid_7,centroid_8,centroid_9,centroid_10,average_accuracy
0,euclidean,59.77,52.95,64.95,53.9,60.39,59.98,60.27,60.15,60.17,59.2,59.173
1,cosine,54.78,56.74,64.1,62.41,57.07,61.83,61.65,62.48,60.29,62.68,60.403
2,jaccard,54.42,60.71,62.93,54.74,60.11,60.44,60.21,62.4,63.1,58.91,59.797
