Take two real data sets that carry label information and have a dimensionality higher
than 50 (e.g. MNIST, Fashion-MNIST, Kuzushiji-MNIST, ...).
Test the following:
To what extent do clustering algorithms provide reasonable clusters that align with the
given classes if (i) the clustering methods are applied directly, or (ii) clustering is applied
after dimensionality reduction (e.g. UMAP)? Evaluate this quantitatively!

In [2]:
from sklearn import datasets
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering

import numpy as np
import umap

# Silence warnings emitted while this cell runs (the intent was to hide the
# sklearn AffinityPropagation warnings).
# NOTE(review): called without a category or module filter, this suppresses
# every Python warning in the process, not only sklearn's.
import warnings
warnings.filterwarnings("ignore")

# Load the digits dataset that ships with scikit-learn.
digits_data = datasets.load_digits()

# Load the Fashion-MNIST dataset from OpenML. cache=True reuses a previously
# downloaded copy; as_frame=False requests array data instead of pandas objects.
# NOTE(review): data_id=44698 is presumed to be a Fashion-MNIST variant — confirm on OpenML.
fashion_data = datasets.fetch_openml(data_id=44698, cache=True, as_frame=False)


def calculate_accuracy(X_prediction, y_correct):
    """Return the purity of a clustering with respect to the true labels.

    Every cluster is mapped to the true label that occurs most often among
    its members; the score is the fraction of samples whose true label
    equals the label assigned to their cluster.

    Args:
        X_prediction: Array-like of integer cluster ids, one per sample.
            Negative ids (e.g. the ``-1`` noise label) are not treated as
            clusters; their samples count as misclassified.
        y_correct: Array-like of true class labels, one per sample. Labels
            may be of any dtype that ``np.unique`` can handle (ints, strings).

    Returns:
        The purity as a float in ``[0, 1]``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    cluster_ids = np.asarray(X_prediction).ravel()
    true_labels = np.asarray(y_correct).ravel()

    n_samples = cluster_ids.shape[0]
    if n_samples != true_labels.shape[0]:
        raise ValueError(
            "X_prediction and y_correct must contain the same number of samples"
        )
    if n_samples == 0:
        raise ValueError("cannot compute the accuracy of an empty prediction")

    correct_classifications = 0

    # Iterate over the ids that actually occur instead of range(max + 1):
    # an unused id would yield an empty cluster and break the majority vote.
    for cluster_id in np.unique(cluster_ids):
        if cluster_id < 0:
            continue

        # The two arrays are kept separate (not stacked into one matrix) so
        # that string labels never force the cluster ids to another dtype.
        labels_in_cluster = true_labels[cluster_ids == cluster_id]
        _, label_counts = np.unique(labels_in_cluster, return_counts=True)

        # The size of the majority label is the number of correctly
        # classified samples in this cluster.
        correct_classifications += int(label_counts.max())

    return correct_classifications / n_samples


# (data, display name) pairs to evaluate. Deliberately not named `datasets`,
# so the `sklearn.datasets` module imported above is not shadowed.
labelled_datasets = [(digits_data, 'digits'), (fashion_data, 'Fashion-MNIST')]

# Number of clusters requested from the algorithms that take one.
n_cluster = 10

# Target dimensionality of the UMAP embedding.
n_components = 10

for dataset, dataset_name in labelled_datasets:
    print(f"Dataset: {dataset_name}")
    X, y = dataset.data, dataset.target

    X_umap = umap.UMAP(random_state=42, n_components=n_components).fit_transform(X)

    # Cluster the raw features first, then the UMAP embedding. A separate loop
    # variable is used so that the raw matrix `X` is not overwritten.
    for features, is_umap in [(X, False), (X_umap, True)]:
        umap_str = f" UMAP({n_components} dimensions)" if is_umap else ""

        # Fresh estimators for every feature set.
        # NOTE: AffinityPropagation chooses its own number of clusters, so its
        # score is not directly comparable with the two 10-cluster methods
        # (the majority-vote score tends to grow with the number of clusters).
        clusterers = [
            ("KMeans", KMeans(n_clusters=n_cluster, random_state=42)),
            ("AgglomerativeClustering", AgglomerativeClustering(n_clusters=n_cluster)),
            ("AffinityPropagation", AffinityPropagation()),
        ]

        for clusterer_name, clusterer in clusterers:
            prediction = clusterer.fit_predict(features)
            score = calculate_accuracy(prediction, y)
            print(f"{clusterer_name}{umap_str}: {score}")

    print()

Dataset: digits
KMeans: 0.7902058987200891
AgglomerativeClustering: 0.8619922092376182
AffinityPropagation: 0.9693934335002783
KMeans UMAP(10 dimensions): 0.8809126321647189
AgglomerativeClustering UMAP(10 dimensions): 0.8809126321647189
AffinityPropagation UMAP(10 dimensions): 0.9794101279910963

Dataset: Fashion-MNIST
KMeans: 0.5425
AgglomerativeClustering: 0.5465
AffinityPropagation: 0.7055
KMeans UMAP(10 dimensions): 0.6775
AgglomerativeClustering UMAP(10 dimensions): 0.654
AffinityPropagation UMAP(10 dimensions): 0.697

