In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import rand_score, adjusted_rand_score

In [2]:
def readData(name):
    df = pd.read_csv(name)
    shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    X = shuffled_df.drop(columns=['label'])
    Y = shuffled_df['label']
    X = np.array(X, dtype=np.float32)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print(X_scaled.shape)
    return X_scaled, Y
    
X,Y = readData('fashion-mnist_train.csv')
x_test, y_test = readData('fashion-mnist_test.csv')

(60000, 784)
(10000, 784)


In [3]:
def initialization(X, k):
    cluster_assignments = np.random.randint(0, k, len(X))
    print(cluster_assignments[:20]) 
    centroids = np.array([X[cluster_assignments == i].mean(axis=0) if np.any(cluster_assignments == i)
        else X[np.random.randint(0, len(X))]
    for i in range(k)])
    return cluster_assignments, centroids


In [4]:
def clusterAssign(X, centroids):
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    cluster_assignments = np.argmin(distances, axis=1)
    return cluster_assignments

In [5]:
def clusterUpdate(X, cluster_assignments, k):
    new_centroids = []
    for i in range(k):
        cluster_points = X[cluster_assignments == i]
        if len(cluster_points) == 0:
            new_centroids.append(X[np.random.randint(0, len(X))])
        else:
            new_centroids.append(cluster_points.mean(axis=0))
    return np.array(new_centroids)
    

In [6]:
def kMeansCluster(X, k=10, threshold=1e-3, max_iter=100):
    X = np.array(X, dtype=np.float32)
    #scaler = StandardScaler()
    #X_scaled = scaler.fit_transform(X)
    cluster_assignments, centroids = initialization(X, k)
    for i in range(max_iter):
        new_assignments = clusterAssign(X, centroids)
        new_centroids = clusterUpdate(X, new_assignments, k)
        shift = np.linalg.norm(new_centroids - centroids)
        #print(shift)
        if shift < threshold:
            print(f"Converged after {i+1} iterations.")
            break
        centroids = new_centroids
        cluster_assignments = new_assignments
    return centroids, cluster_assignments
    

In [7]:
def evaluateClustering(Y, clusters):
    rand = rand_score(Y, clusters)
    adj_rand = adjusted_rand_score(Y, clusters)
    return rand, adj_rand
#a,b = evaluateClustering(Y, clusters)
#print(a,b)

In [8]:
results = []
for i in range (5,16):
    centroids, clusters = kMeansCluster(X, k=i, threshold=1e-3)
    rand, adj_rand = evaluateClustering(Y, clusters)
    results.append((i, rand, adj_rand))


[4 4 4 2 0 0 0 3 3 4 4 4 1 4 2 1 1 2 0 1]
[3 2 2 4 4 4 4 5 1 4 3 3 5 0 3 1 0 3 4 4]
Converged after 56 iterations.
[6 3 6 4 0 4 0 4 3 0 1 5 4 4 3 4 5 5 5 1]
Converged after 28 iterations.
[2 6 1 0 2 0 0 3 5 4 0 1 3 4 1 0 7 4 5 5]
Converged after 27 iterations.
[6 5 4 1 6 7 8 0 1 7 5 0 8 5 0 4 4 8 1 3]
Converged after 63 iterations.
[4 1 6 5 0 7 5 4 7 5 1 0 4 9 1 8 3 0 2 9]
Converged after 62 iterations.
[ 5  4  9  5  6 10  2  0  5  3  9  3  7  3  7 10  0  7  5  2]
Converged after 67 iterations.
[ 5  9 11  5  8 10  8 10  5 11  7  7 11  5  2  0  8 11  1  4]
Converged after 61 iterations.
[10  8  5 10 11  0  2  9  7  4  3  8  5  0  6  0  8  2 10  0]
Converged after 44 iterations.
[ 7  1 11  3  5  2  3 11 10  9  9 11  5  5  4  8  9  0  1  7]
Converged after 59 iterations.
[ 2  7  0 12 10  9  9 10  3  3 12 10  0  3  1 10  4 10  0  4]


In [9]:
print("\nK-Means Results (Training Data)")
print(f"{'k':<5}{'RAND Index':<25}{'Adjusted RAND Index':<25}")
print("-" * 55)
for k, rand, adj_rand in results:
    print(f"{k:<5}{rand:<25.4f}{adj_rand:<25.4f}")


K-Means Results (Training Data)
k    RAND Index               Adjusted RAND Index      
-------------------------------------------------------
5    0.8059                   0.2857                   
6    0.8393                   0.3177                   
7    0.8513                   0.3422                   
8    0.8515                   0.3204                   
9    0.8654                   0.3518                   
10   0.8807                   0.3775                   
11   0.8802                   0.3569                   
12   0.8726                   0.3363                   
13   0.8868                   0.3606                   
14   0.9012                   0.4077                   
15   0.8868                   0.3305                   


In [10]:
best_k = max(results, key=lambda x: x[2])[0]
print (best_k)
results_with_best_k = []
for i in range (5):
    centroids, clusters = kMeansCluster(X, k=best_k, threshold=1e-3)
    rand, adj_rand = evaluateClustering(Y, clusters)
    results_with_best_k.append((i, rand, adj_rand))


14
[ 5  7 11  2  4  7 10  5 13  9  2 11  0  6 11  4  4  9 10  0]
[13  4  3  7  7  7 12  9  0  9  3  8  6  6  0 13 12  7  2 11]
Converged after 83 iterations.
[ 6  3  9  7 11  4  5  3  3  7  7  6  2  1 10 13  9  0 11  4]
Converged after 54 iterations.
[ 2  3 12  9  7  7  9  3  8 10  9  2  9  0 13  9 10  6  8 10]
Converged after 74 iterations.
[ 9  8  6  7  3 10 12  3  2  5  8  7  2 10  4 11  9 10  4  1]
Converged after 86 iterations.


In [11]:
print(f"\nK-Means Results with best k{best_k} (Training Data)")
print(f"{'i':<5}{'RAND Index':<25}{'Adjusted RAND Index':<25}")
print("-" * 55)
for i, rand, adj_rand in results_with_best_k:
    print(f"{i:<5}{rand:<25.4f}{adj_rand:<25.4f}")


K-Means Results with best k14 (Training Data)
i    RAND Index               Adjusted RAND Index      
-------------------------------------------------------
0    0.8857                   0.3436                   
1    0.8896                   0.3611                   
2    0.8810                   0.3491                   
3    0.8866                   0.3342                   
4    0.8897                   0.3557                   


In [12]:
clusters_test = clusterAssign(x_test, centroids)
a_test, b_test = evaluateClustering(y_test, clusters_test)
print("RAND Index & Adjusted RAND Index on test data for best k:", a_test, b_test)

RAND Index & Adjusted RAND Index on test data for best k: 0.8894179417941794 0.35365537169318556
