# C K-means MNIST

- What is the majority class of each cluster?
- What is the percentage of the majority class in each cluster?
- Does each number have a cluster?
- If not, which hasn’t?

Do this for 10, 100, 1000 iterations

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import scipy.spatial.distance

from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
import pandas as pd

In [2]:
#load dataset

train = tf.keras.datasets.mnist.load_data()[0]
X_train, Y_train = train[0], train[1]

n = X_train.shape[0]
m = X_train.shape[1]**2

X_train = X_train.reshape([n, m])

In [98]:
# create KMeans models

kmeans_10 = KMeans(n_clusters=10, max_iter=10).fit(X_train)
kmeans_100 = KMeans(n_clusters=10, max_iter=100).fit(X_train)
kmeans_1000 = KMeans(n_clusters=10, max_iter=1000).fit(X_train)

In [100]:
def calculate_scores(model):
    scores = []
    count = 0
    
    for i in range(10):
        scores.append([])
    
    for value in model.labels_:
        scores[value].append(Y_train[count])
        count += 1
        
    return scores

kmeans_10_scores = calculate_scores(kmeans_10)
kmeans_100_scores = calculate_scores(kmeans_100)
kmeans_1000_scores = calculate_scores(kmeans_1000)


In [101]:
kmeans_10_clusters = []
kmeans_100_clusters = []
kmeans_1000_clusters = []
for i in range(10):
    # 10 iterations
    local_array = np.bincount(kmeans_10_scores[:][i])
    kmeans_10_clusters.append(local_array)
    # 100 iterations
    local_array = np.bincount(kmeans_100_scores[:][i])
    kmeans_100_clusters.append(local_array)
    #1000 iterations
    local_array = np.bincount(kmeans_1000_scores[:][i])
    kmeans_1000_clusters.append(local_array)

In [102]:
def calc_cluster_percentage(cluster):

    percentages = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            percentages.append(max(array) / sum(array))
        else:
            percentages.append(-1)

    return(percentages)

def get_majority_labels(cluster):
    
    labels = []
    
    for array in cluster:
        max_idx = np.where(array == max(array))[0]
        if len(max_idx) == 1:    
            labels.append(max_idx[0])
        else:
            labels.append(-1)
    
    return labels
    
def get_missing_labels(labels):
    classes = np.arange(10)
    return np.where(np.isin(classes, labels) == False)[0]

cluster_10_perc = calc_cluster_percentage(kmeans_10_clusters)
cluster_10_labels = get_majority_labels(kmeans_10_clusters)

cluster_100_perc = calc_cluster_percentage(kmeans_100_clusters)
cluster_100_labels = get_majority_labels(kmeans_100_clusters)

cluster_1000_perc = calc_cluster_percentage(kmeans_1000_clusters)
cluster_1000_labels = get_majority_labels(kmeans_1000_clusters)

In [103]:
combined_percentages = [cluster_10_perc, cluster_100_perc, cluster_1000_perc]
df1 = pd.DataFrame(np.array(combined_percentages).T, columns=["10","100","1000"])
np.array(combined_percentages).T
df1

Unnamed: 0,10,100,1000
0,0.872086,0.531311,0.896007
1,0.513204,0.426955,0.427117
2,0.644104,0.622717,0.53046
3,0.66606,0.527256,0.904793
4,0.83967,0.895411,0.79354
5,0.298385,0.357151,0.623135
6,0.398093,0.790205,0.525602
7,0.545896,0.904454,0.526726
8,0.421045,0.526309,0.859266
9,0.93845,0.859793,0.357247


In [105]:
print(cluster_10_labels)

print("Missing labels for 10 iterations:" , get_missing_labels(cluster_10_labels))
print("Missing labels for 100 iterations:" , get_missing_labels(cluster_100_labels))
print("Missing labels for 1000 iterations:" , get_missing_labels(cluster_1000_labels))

# Vernuenftige Ausgabe der Label pro Cluster und Antwortsatz


[2, 8, 1, 1, 6, 7, 4, 3, 7, 0]
Missing labels for 10 iterations: [5 9]
Missing labels for 100 iterations: [5 9]
Missing labels for 1000 iterations: [5 9]
