In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
def euclidean_distance(point, data):
    return np.sqrt(np.sum((point - data)**2, axis=1))

def cosine_distance(point, data):
    dot = np.dot(point, data)
    data_norm = np.linalg.norm(data)
    point_norm = np.linalg.norm(point, axis=1)
    return 1 - dot / (data_norm * point_norm)

def jaccard_distance(point, data):
    # print(point[0].shape, data[0].shape)
    minm = np.sum(np.minimum(point, data), axis=1)
    maxm = np.sum(np.maximum(point, data), axis=1)
    return 1 - minm / maxm

In [3]:
df = pd.read_csv('kmeans_data/data.csv', header=None)
labels = pd.read_csv('kmeans_data/label.csv', header=None)

In [4]:
df.shape

(10000, 784)

In [5]:
def calculate_distances(data, distance_function, centroids):
    clusters = [[] for _ in range(len(centroids))]
    preds = []
    for x in data:
        distances = [distance_function(centroids, x)]
        cluster_idx = np.argmin(distances)
        preds.append(cluster_idx)
        clusters[cluster_idx].append(x)
    return clusters, preds


# def calculate_distances(data, distance_function, centroids):
#     distances = distance_function(centroids, data)
#     cluster_labels = np.argmin(distances, axis=1)
#     return cluster_labels

In [6]:
def kmeans(data, k, dist_metric, iterations):
    centroids = data.sample(n=k).values
    for _ in range(iterations):
        clusters, preds = calculate_distances(data.values, dist_metric, centroids)
        prev_centroids = centroids
        centroids = [np.mean(cluster, axis=0) for cluster in clusters]
        for i, centroid in enumerate(centroids):
            if np.isnan(centroid).any():
                centroids[i] = prev_centroids[i]
    assert len(centroids) == k

    return centroids, preds

In [7]:
def calculate_sse(data, centroids, preds):
    sse = 0
    for i,center in enumerate(centroids):
        # print(euclidean_distance(center, data[np.array(preds) == i].values))
        sse += np.sum(euclidean_distance(center, data[np.array(preds) == i].values)**2)
    return sse

## Kmeans using Euclidean Distance

In [8]:
eu_centroids, eu_preds = kmeans(df, 10, euclidean_distance, 100)
calculate_sse(df, eu_centroids, eu_preds)


25322070190.63885

## Kmeans using Cosine Distance

In [9]:
co_centroids, co_preds = kmeans(df, 10, cosine_distance, 100)
calculate_sse(df, co_centroids, co_preds)

25419338339.34366

## Kmeans using Jaccard Distance

In [10]:
jc_centroids, jc_preds = kmeans(df, 10, jaccard_distance, 100)
calculate_sse(df, jc_centroids, jc_preds)

25697520677.576557

--------------------------------------------------------------

In [11]:
from sklearn.metrics import confusion_matrix
from scipy.stats import mode

In [17]:
def majority_vote(preds, true_labels, k):
    from collections import Counter
    label_mapping = {}
    for cluster in range(k):
        cluster_labels = true_labels[np.array(preds) == cluster]
        if len(cluster_labels) > 0:
            most_common_label = Counter(cluster_labels.dropna()[0]).most_common(1)[0][0]
            label_mapping[cluster] = most_common_label
    return label_mapping

In [18]:
np.array(eu_preds) == 1

array([False, False, False, ..., False, False, False])

In [19]:
eu_label_assignment = majority_vote(eu_preds, labels, 10)
co_label_assignment = majority_vote(co_preds, labels, 10)
jc_label_assignment = majority_vote(jc_preds, labels, 10)

In [20]:
predicted_labels_eu = [eu_label_assignment[pred] for pred in eu_preds]
eu_acc = np.mean(predicted_labels_eu == labels[0]) * 100
print("Kmeans Accuracy with Euclidean Distance:", eu_acc, "%")

predicted_labels_co = [co_label_assignment[pred] for pred in co_preds]
co_acc = np.mean(predicted_labels_co == labels[0]) * 100
print("Kmeans Accuracy with Cosine Distance:", co_acc, "%")

predicted_labels_jc = [jc_label_assignment[pred] for pred in jc_preds]
jc_acc = np.mean(predicted_labels_jc == labels[0]) * 100
print("Kmeans Accuracy with Jaccard Distance:", jc_acc, "%")

Kmeans Accuracy with Euclidean Distance: 59.88 %
Kmeans Accuracy with Cosine Distance: 61.21 %
Kmeans Accuracy with Jaccard Distance: 55.96 %


In [21]:
def kmeans_convergence(data, k, dist_metric):
    centroids = data.sample(n=k).values
    iteration = 0
    while True:
        clusters, preds = calculate_distances(data.values, dist_metric, centroids)
        prev_centroids = centroids
        centroids = [np.mean(cluster, axis=0) for cluster in clusters]
        for i, centroid in enumerate(centroids):
            if np.isnan(centroid).any():
                centroids[i] = prev_centroids[i]
        
        iteration += 1

        if np.allclose(prev_centroids, centroids):
            break
        
    assert len(centroids) == k

    return centroids, preds, iteration

In [22]:
eu_conv_centroids, eu_conv_preds, eu_iter = kmeans_convergence(df, 10, euclidean_distance)
co_conv_centroids, co_conv_preds, co_iter = kmeans_convergence(df, 10, cosine_distance)
jc_conv_centroids, jc_conv_preds, jc_iter = kmeans_convergence(df, 10, jaccard_distance)

print("Kmeans Convergence Iterations with Euclidean Distance:", eu_iter)
print("Kmeans Convergence Iterations with Cosine Distance:", co_iter)
print("Kmeans Convergence Iterations with Jaccard Distance:", jc_iter)

print("Kmeans Convergence SSE with Euclidean Distance:", calculate_sse(df, eu_conv_centroids, eu_conv_preds))
print("Kmeans Convergence SSE with Cosine Distance:", calculate_sse(df, co_conv_centroids, co_conv_preds))
print("Kmeans Convergence SSE with Jaccard Distance:", calculate_sse(df, jc_conv_centroids, jc_conv_preds))

Kmeans Convergence Iterations with Euclidean Distance: 68
Kmeans Convergence Iterations with Cosine Distance: 71
Kmeans Convergence Iterations with Jaccard Distance: 61
Kmeans Convergence SSE with Euclidean Distance: 25465533501.70494
Kmeans Convergence SSE with Cosine Distance: 25554525140.559067
Kmeans Convergence SSE with Jaccard Distance: 25515634343.188545


In [23]:
def calculate_distances_sse(data, distance_function, centroids):
    clusters = [[] for _ in range(len(centroids))]
    preds = []
    dist = []
    for x in data:
        distances = [distance_function(centroids, x)]
        cluster_idx = np.argmin(distances)
        preds.append(cluster_idx)
        dist.append(distances[0][cluster_idx])
        clusters[cluster_idx].append(x)
    return dist, clusters, preds

In [28]:
def kmeans_sse(data, k, dist_metric):
    centroids = data.sample(n=k).values
    prev_sse = np.inf
    iteration = 0
    while True:
        distances, clusters, preds = calculate_distances_sse(data.values, dist_metric, centroids)
        prev_centroids = centroids
        centroids = [np.mean(cluster, axis=0) for cluster in clusters]
        for i, centroid in enumerate(centroids):
            if np.isnan(centroid).any():
                centroids[i] = prev_centroids[i]
        
        iteration += 1
        curr_sse = np.sum(distances)
        # print(curr_sse, prev_sse)
        if prev_sse < curr_sse:
            break
        prev_sse = curr_sse
        
    assert len(centroids) == k

    return centroids, preds, iteration

In [29]:
eu_sse_centroids, eu_sse_preds, eu_sse_iter = kmeans_sse(df, 10, euclidean_distance)
co_sse_centroids, co_sse_preds, co_sse_iter = kmeans_sse(df, 10, cosine_distance)
jc_sse_centroids, jc_sse_preds, jc_sse_iter = kmeans_sse(df, 10, jaccard_distance)

print("Kmeans SSE Iterations with Euclidean Distance:", eu_sse_iter)
print("Kmeans SSE Iterations with Cosine Distance:", co_sse_iter)
print("Kmeans SSE Iterations with Jaccard Distance:", jc_sse_iter)

print("Kmeans SSE with Euclidean Distance:", calculate_sse(df, eu_sse_centroids, eu_sse_preds))
print("Kmeans SSE with Cosine Distance:", calculate_sse(df, co_sse_centroids, co_sse_preds))
print("Kmeans SSE with Jaccard Distance:", calculate_sse(df, jc_sse_centroids, jc_sse_preds))

Kmeans SSE Iterations with Euclidean Distance: 40
Kmeans SSE Iterations with Cosine Distance: 69
Kmeans SSE Iterations with Jaccard Distance: 20
Kmeans SSE with Euclidean Distance: 25577491450.67655
Kmeans SSE with Cosine Distance: 25559656397.190304
Kmeans SSE with Jaccard Distance: 25485535652.641655


In [None]:
def assign_labels_to_clusters(cluster_assignments, true_labels, num_clusters):
    assigned_labels = []
    for cluster in range(num_clusters):
        cluster_indices = np.where(cluster_assignments == cluster)[0]
        # print(cluster_indices)
        if len(cluster_indices) > 0:
            majority_label = mode(true_labels.iloc[cluster_indices])[0]
            assigned_labels.append(majority_label)
        else:
            assigned_labels.append(None) 
    return assigned_labels

In [196]:
# assign_labels_to_clusters(np.array(eu_preds), labels, 10)