In [1]:
import numpy as np

from data import Data
from KMeans import KMeans
from KMeans import eucledian_distance
from KMeans import cosine_similarity_distance

from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score

In [2]:
# Build the project's dataset wrapper and prepare it for the digit list below
# (presumably this restricts the data to those four MNIST classes -- confirm
# against data.py).
mnist_data = Data()
mnist_data.prepare_dataset([2, 3, 8, 9])

# Pull the train/test splits out of the wrapper under short local names.
train_images = mnist_data.x_train
test_images = mnist_data.x_test
train_labels = mnist_data.y_train
test_labels = mnist_data.y_test

In [3]:
# Normalise from the splits stored on `mnist_data` rather than from
# `train_images` / `test_images` themselves: the previous form overwrote its
# own input, so re-running this cell normalised already-normalised data.
# Reading from the wrapper keeps the cell idempotent.
# NOTE(review): assumes normalize_data returns a new array and does not modify
# its argument in place -- confirm against data.py.
train_images = mnist_data.normalize_data(mnist_data.x_train)
test_images = mnist_data.normalize_data(mnist_data.x_test)

  X_standardized = np.where(std_dev != 0, (X - mean) / std_dev, 0)


PCA Algorithm

In [4]:
def standardize(X):
    """Return a column-wise standardised copy of ``X``.

    Each column has its mean subtracted and is divided by its (population)
    standard deviation. Columns whose standard deviation is exactly zero are
    divided by 1 instead, so constant columns become all zeros rather than
    producing a division by zero.
    """
    column_mean = np.mean(X, axis=0)
    column_scale = np.std(X, axis=0)
    # Constant columns: swap the zero divisor for 1 (this array is local).
    constant_columns = column_scale == 0
    column_scale[constant_columns] = 1
    return (X - column_mean) / column_scale

def cov(X):
    """Return ``X.T @ X`` divided by the number of rows of ``X``.

    When the columns of ``X`` already have zero mean this is the population
    covariance matrix (normalised by ``m``, not ``m - 1``).
    """
    n_rows = X.shape[0]
    gram = np.dot(X.T, X)
    return gram / n_rows

def get_eigenvectors(X, top_n):
    """Fit the PCA basis on ``X``.

    ``X`` is standardised column-wise, its covariance matrix is
    eigendecomposed, and the eigenvectors belonging to the ``top_n`` largest
    eigenvalues are kept.

    Returns a 3-tuple:
        * the selected eigenvectors, one per column, ordered by decreasing
          eigenvalue;
        * the per-column mean of the original ``X``;
        * the per-column standard deviation of the original ``X`` (zeros are
          NOT replaced here).
    """
    eigvals, eigvecs = np.linalg.eigh(cov(standardize(X)))
    # eigh gives no descending-order guarantee we rely on: sort explicitly,
    # largest eigenvalue first, then keep the leading top_n.
    descending = np.argsort(eigvals)[::-1]
    keep = descending[:top_n]
    return eigvecs[:, keep], np.mean(X, axis=0), np.std(X, axis=0)

def pca(X, components, mean, std):
    """Project ``X`` onto ``components`` after standardising it.

    Args:
        X: data to project, one sample per row.
        components: projection matrix with one component per column.
        mean: per-column mean to subtract from ``X``.
        std: per-column standard deviation to divide by. Entries equal to
            zero are treated as 1 so constant columns do not cause a
            division by zero.

    Returns:
        ``((X - mean) / std) @ components``.

    The caller's ``std`` is left untouched: the zero replacement is done on a
    fresh array instead of writing into the argument.
    """
    safe_std = np.where(np.asarray(std) == 0, 1, std)
    X_standardized = (X - mean) / safe_std
    return np.dot(X_standardized, components)

# Number of principal components kept; named so the choice is visible and can
# be tuned in one place.
N_COMPONENTS = 10

# The basis and the standardisation statistics come from the training split
# only; the test split is projected with those same training statistics.
components, train_mean, train_std = get_eigenvectors(train_images, N_COMPONENTS)
train_images_pca = pca(train_images, components, train_mean, train_std)
test_images_pca = pca(test_images, components, train_mean, train_std)

SSE and Clustering Accuracy Functions

In [5]:
def compute_sse(X, clusters, centroids):
    """Sum of squared differences between each point and its cluster centroid.

    Args:
        X: data array, indexed along its first axis by the entries of
            ``clusters``.
        clusters: sequence where element ``k`` holds the row indices of ``X``
            assigned to cluster ``k``.
        centroids: indexable where element ``k`` is the centroid of cluster
            ``k``.

    Returns:
        The total over all clusters of ``sum((points - centroid) ** 2)``;
        ``0`` when ``clusters`` is empty.
    """
    return sum(
        np.sum((X[members] - centroids[k]) ** 2)
        for k, members in enumerate(clusters)
    )


def clustering_accuracy(true_labels, predicted_clusters, K):
    """Best-match accuracy between ground-truth classes and cluster ids.

    A contingency table (classes x clusters) is built, then the Hungarian
    algorithm picks the one-to-one class/cluster pairing that maximises the
    number of agreeing samples. The accuracy is that number divided by the
    number of samples.

    Label values are mapped to consecutive indices with ``np.unique``, so
    neither ``true_labels`` nor ``predicted_clusters`` has to take values in
    ``0..K-1`` (e.g. raw digit labels such as 2, 3, 8, 9 work). For inputs that
    do lie in ``0..K-1`` the result is the same as comparing against
    ``range(K)`` directly.

    Args:
        true_labels: ground-truth label per sample.
        predicted_clusters: cluster id per sample, same length as
            ``true_labels``.
        K: expected number of clusters. Kept for interface compatibility; the
            table dimensions are inferred from the data.

    Returns:
        Fraction of samples that agree under the optimal pairing, or ``0.0``
        for empty input.

    Raises:
        ValueError: if the two inputs do not contain the same number of
            elements.
    """
    # Flatten so that e.g. an (n, 1) label column cannot broadcast against an
    # (n,) cluster vector.
    true_labels = np.asarray(true_labels).ravel()
    predicted_clusters = np.asarray(predicted_clusters).ravel()
    if true_labels.shape != predicted_clusters.shape:
        raise ValueError(
            "true_labels and predicted_clusters must have the same length, "
            f"got {true_labels.size} and {predicted_clusters.size}"
        )

    n_samples = true_labels.size
    if n_samples == 0:
        return 0.0

    # Map arbitrary label values onto 0..n-1 row/column indices.
    classes, class_idx = np.unique(true_labels, return_inverse=True)
    clusters, cluster_idx = np.unique(predicted_clusters, return_inverse=True)

    # contingency[i, j] = number of samples of class i placed in cluster j.
    contingency = np.zeros((classes.size, clusters.size), dtype=int)
    np.add.at(contingency, (class_idx, cluster_idx), 1)

    # linear_sum_assignment minimises cost, so negate to maximise matches.
    row_ind, col_ind = linear_sum_assignment(-contingency)
    total_correct = contingency[row_ind, col_ind].sum()
    return total_correct / n_samples

KMeans with Euclidean Distance

In [6]:
# Fit K-Means (K=4, Euclidean convergence function) on the PCA-projected
# training images.
# NOTE(review): nothing reads this fit -- `kmeans` is rebound by the next cell
# before any of its attributes are used. It looks like a leftover trial run,
# but it may still advance whatever random state KMeans uses, so removing it
# could change later results -- confirm before deleting.
kmeans = KMeans(K=4, converge_func=eucledian_distance, plot_steps=False)
kmeans.predict(train_images_pca)

In [7]:
# K-Means (K=4, Euclidean convergence function) on the full-dimensional
# training images.
kmeans = KMeans(K=4, converge_func=eucledian_distance, plot_steps=False)
kmeans.predict(train_images)

# kmeans.clusters holds one collection of sample indices per cluster; turn it
# into a flat "cluster id per sample" vector.
predicted_clusters = np.zeros(len(train_labels))
for cluster_ind, cluster in enumerate(kmeans.clusters):
    predicted_clusters[np.asarray(cluster, dtype=int)] = cluster_ind

sse = compute_sse(train_images, kmeans.clusters, kmeans.centroids)
accuracy = clustering_accuracy(train_labels, predicted_clusters, K=4)

print(f'Original Data - SSE: {sse}')
print(f'Original Data - Clustering Accuracy: {accuracy}')

Original Data - SSE: 14901588.93341656
Original Data - Clustering Accuracy: 0.3711331575201976


In [8]:
# K-Means (K=4, Euclidean convergence function) on the PCA-projected training
# images.
kmeans_pca = KMeans(K=4, converge_func=eucledian_distance, plot_steps=False)
kmeans_pca.predict(train_images_pca)

# Flatten the per-cluster index collections into one cluster id per sample.
predicted_clusters_pca = np.zeros(len(train_labels))
for cluster_idx, cluster in enumerate(kmeans_pca.clusters):
    predicted_clusters_pca[np.asarray(cluster, dtype=int)] = cluster_idx

# SSE is measured in the projected space, so it is not on the same scale as
# the SSE of the un-projected data.
sse_pca = compute_sse(train_images_pca, kmeans_pca.clusters, kmeans_pca.centroids)
accuracy_pca = clustering_accuracy(train_labels, predicted_clusters_pca, K=4)

print(f'PCA Data - SSE: {sse_pca}')
print(f'PCA Data - Clustering Accuracy: {accuracy_pca}')

PCA Data - SSE: 3238412.8512867843
PCA Data - Clustering Accuracy: 0.36280296370714554


KMeans with Cosine Similarity Distance

In [11]:
# Fit K-Means (K=4, cosine-similarity convergence function) on the
# PCA-projected training images.
# NOTE(review): nothing reads this fit -- `kmeans` is rebound by the next cell
# before any of its attributes are used. It looks like a leftover trial run,
# but it may still advance whatever random state KMeans uses, so removing it
# could change later results -- confirm before deleting.
kmeans = KMeans(K=4, converge_func=cosine_similarity_distance, plot_steps=False)
kmeans.predict(train_images_pca)

In [12]:
# K-Means (K=4, cosine-similarity convergence function) on the
# full-dimensional training images.
kmeans = KMeans(K=4, converge_func=cosine_similarity_distance, plot_steps=False)
kmeans.predict(train_images)

# kmeans.clusters holds one collection of sample indices per cluster; turn it
# into a flat "cluster id per sample" vector.
predicted_clusters = np.zeros(len(train_labels))
for cluster_ind, cluster in enumerate(kmeans.clusters):
    predicted_clusters[np.asarray(cluster, dtype=int)] = cluster_ind

# compute_sse always uses squared differences, whichever function was handed
# to KMeans.
sse = compute_sse(train_images, kmeans.clusters, kmeans.centroids)
accuracy = clustering_accuracy(train_labels, predicted_clusters, K=4)

print(f'Original Data - SSE: {sse}')
print(f'Original Data - Clustering Accuracy: {accuracy}')

Original Data - SSE: 14920075.865196228
Original Data - Clustering Accuracy: 0.38808656703922306


In [13]:
# K-Means (K=4, cosine-similarity convergence function) on the PCA-projected
# training images.
kmeans_pca = KMeans(K=4, converge_func=cosine_similarity_distance, plot_steps=False)
kmeans_pca.predict(train_images_pca)

# Flatten the per-cluster index collections into one cluster id per sample.
predicted_clusters_pca = np.zeros(len(train_labels))
for cluster_idx, cluster in enumerate(kmeans_pca.clusters):
    predicted_clusters_pca[np.asarray(cluster, dtype=int)] = cluster_idx

# compute_sse always uses squared differences, whichever function was handed
# to KMeans.
sse_pca = compute_sse(train_images_pca, kmeans_pca.clusters, kmeans_pca.centroids)
accuracy_pca = clustering_accuracy(train_labels, predicted_clusters_pca, K=4)

print(f'PCA Data - SSE: {sse_pca}')
print(f'PCA Data - Clustering Accuracy: {accuracy_pca}')

PCA Data - SSE: 3324050.4203613014
PCA Data - Clustering Accuracy: 0.26158482983800074
