# C K-means MNIST

- What is the majority class of each cluster?
- What is the percentage of the majority class in each cluster?
- Does each number have a cluster?
- If not, which hasn’t?

Do this for 10, 100, 1000 iterations

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import scipy.spatial.distance

from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
import pandas as pd

In [38]:
#load dataset

train = tf.keras.datasets.mnist.load_data()[0]
X_train, Y_train = train[0], train[1]

n = X_train.shape[0]
m = X_train.shape[1]**2

X_train = X_train.reshape([n, m])

In [4]:
#take subsamples

n_sample = 1000
idx = np.random.randint(n, size=n_sample)
x_sample = X_train[idx]
y_sample = Y_train[idx]

k = 10

In [5]:
# create KMeans models

kmeans_10 = KMeans(n_clusters=10, max_iter=10).fit(x_sample)
kmeans_100 = KMeans(n_clusters=10, max_iter=100).fit(x_sample)
kmeans_1000 = KMeans(n_clusters=10, max_iter=1000).fit(x_sample)

In [71]:
def calculate_scores(model):
    scores = []
    count = 0
    
    for i in range(10):
        scores.append([])
    
    for value in model.labels_:
        scores[value].append(y_sample[count])
        count += 1
        
    return scores

kmeans_10_scores = calculate_scores(kmeans_10)
kmeans_100_scores = calculate_scores(kmeans_100)
kmeans_1000_scores = calculate_scores(kmeans_1000)


In [75]:
kmeans_10_clusters = []
kmeans_100_clusters = []
kmeans_1000_clusters = []
for i in range(10):
    # 10 iterations
    local_array = np.bincount(kmeans_10_scores[:][i])
    kmeans_10_clusters.append(local_array)
    # 100 iterations
    local_array = np.bincount(kmeans_100_scores[:][i])
    kmeans_100_clusters.append(local_array)
    #1000 iterations
    local_array = np.bincount(kmeans_1000_scores[:][i])
    kmeans_1000_clusters.append(local_array)

In [92]:
def calc_cluster_percentage(cluster):

    percentages = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            percentages.append(max(array) / sum(array))
        else:
            percentages.append(-1)

    return(percentages)

def get_majority_labels(cluster):
    
    labels = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            labels.append(max_idx[0])
        else:
            labels.append(-1)
    
    return labels
    
def get_missing_labels(labels):
    classes = np.arange(10)
    return np.where(np.isin(classes, labels) == False)[0]

cluster_10_perc = calc_cluster_percentage(kmeans_10_clusters)
cluster_10_labels = get_majority_labels(kmeans_10_clusters)

print(cluster_10_labels)
print(get_missing_labels(cluster_10_labels))
print(cluster_10_perc)

#perc_maj_cluster_100 = calc_cluster_percentage(kmeans_100_clusters)
#perc_maj_cluster_1000 = calc_cluster_percentage(kmeans_1000_clusters)


[5, 4, 2, 1, 7, 3, 0, 6, 6, 0]
[8 9]
[0.48484848484848486, 0.3691275167785235, 0.28688524590163933, 0.5954198473282443, 0.44755244755244755, 0.5078125, 0.896551724137931, 0.8214285714285714, 0.7733333333333333, 0.4722222222222222]


In [56]:
combined_percentages = [perc_maj_cluster_10, perc_maj_cluster_100, perc_maj_cluster_1000]
df1 = pd.DataFrame(np.array(combined_percentages).T, columns=["10","100","1000"])
np.array(combined_percentages).T
df1

Unnamed: 0,10,100,1000
0,0.484848,0.675,0.842105
1,0.369128,0.91954,0.371901
2,0.286885,0.819672,0.307018
3,0.59542,0.450704,0.341667
4,0.447552,0.488372,0.578125
5,0.507812,0.636364,0.818182
6,0.896552,0.335821,0.567568
7,0.821429,0.383562,0.722892
8,0.773333,0.71875,0.712644
9,0.472222,0.768116,0.42236
