# C K-means MNIST

- What is the majority class of each cluster?
- What is the percentage of the majority class in each cluster?
- Does each number have a cluster?
- If not, which hasn’t?

Do this for 10, 100, 1000 iterations

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import scipy.spatial.distance

from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
import pandas as pd

In [2]:
#load dataset

train = tf.keras.datasets.mnist.load_data()[0]
X_train, Y_train = train[0], train[1]

n = X_train.shape[0]
m = X_train.shape[1]**2

X_train = X_train.reshape([n, m])

In [3]:
#take subsamples

n_sample = 1000
idx = np.random.randint(n, size=n_sample)
x_sample = X_train[idx]
y_sample = Y_train[idx]

k = 10

In [4]:
# create KMeans models

kmeans_10 = KMeans(n_clusters=10, max_iter=10).fit(x_sample)
kmeans_100 = KMeans(n_clusters=10, max_iter=100).fit(x_sample)
kmeans_1000 = KMeans(n_clusters=10, max_iter=1000).fit(x_sample)

In [5]:
def calculate_scores(model):
    scores = []
    count = 0
    
    for i in range(10):
        scores.append([])
    
    for value in model.labels_:
        scores[value].append(y_sample[count])
        count += 1
        
    return scores

kmeans_10_scores = calculate_scores(kmeans_10)
kmeans_100_scores = calculate_scores(kmeans_100)
kmeans_1000_scores = calculate_scores(kmeans_1000)


In [6]:
kmeans_10_clusters = []
kmeans_100_clusters = []
kmeans_1000_clusters = []
for i in range(10):
    # 10 iterations
    local_array = np.bincount(kmeans_10_scores[:][i])
    kmeans_10_clusters.append(local_array)
    # 100 iterations
    local_array = np.bincount(kmeans_100_scores[:][i])
    kmeans_100_clusters.append(local_array)
    #1000 iterations
    local_array = np.bincount(kmeans_1000_scores[:][i])
    kmeans_1000_clusters.append(local_array)

In [7]:
def calc_cluster_percentage(cluster):

    percentages = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            percentages.append(max(array) / sum(array))
        else:
            percentages.append(-1)

    return(percentages)

def get_majority_labels(cluster):
    
    labels = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            labels.append(max_idx[0])
        else:
            labels.append(-1)
    
    return labels
    
def get_missing_labels(labels):
    classes = np.arange(10)
    return np.where(np.isin(classes, labels) == False)[0]

cluster_10_perc = calc_cluster_percentage(kmeans_10_clusters)
cluster_10_labels = get_majority_labels(kmeans_10_clusters)

print(cluster_10_labels)
print(get_missing_labels(cluster_10_labels))
print(cluster_10_perc)

#perc_maj_cluster_100 = calc_cluster_percentage(kmeans_100_clusters)
#perc_maj_cluster_1000 = calc_cluster_percentage(kmeans_1000_clusters)


[1, 4, 2, 0, 7, 3, 1, 3, 0, 6]
[5 8 9]
[0.6296296296296297, 0.3611111111111111, 0.5596330275229358, 0.95, 0.3787878787878788, 0.43636363636363634, 0.75, 0.43010752688172044, 0.5595238095238095, 0.7045454545454546]


In [8]:
combined_percentages = [perc_maj_cluster_10, perc_maj_cluster_100, perc_maj_cluster_1000]
df1 = pd.DataFrame(np.array(combined_percentages).T, columns=["10","100","1000"])
np.array(combined_percentages).T
df1

NameError: name 'perc_maj_cluster_10' is not defined