In [14]:
import torchvision.datasets

from sklearn.manifold import TSNE
from sklearn.metrics.cluster import adjusted_rand_score

import plotly.express as px
import plotly.graph_objects as go

MNIST_train = torchvision.datasets.MNIST("./", download=True, train=True)
MNIST_test = torchvision.datasets.MNIST("./", download=True, train=False)

In [4]:
X_train = MNIST_train.data
y_train = MNIST_train.targets
X_test = MNIST_test.data
y_test = MNIST_test.targets

In [9]:
X_train.shape, X_test.shape

(torch.Size([60000, 28, 28]), torch.Size([10000, 28, 28]))

In [8]:
# Stretch the pictures
X_train = X_train.reshape([-1, 28 * 28])
X_test = X_test.reshape([-1, 28 * 28])

In [14]:
X_train.shape, X_test.shape

(torch.Size([60000, 784]), torch.Size([10000, 784]))

In [10]:
fig = px.imshow(X_train[0, :, :])#, binary_format="jpeg", binary_compression_level=0)
fig.show()

## Apply t-SNE to MNIST to 3 components to visualize it in 3D
# Choose parameters (`perplexity`, `n_iter`, etc. so the results of Rand Index of t-SNE should be larger than 95 %)

> 17 slide lecture calculate rand index for 3d transformation if its 95%+

1. t-SNE on MNIST datasat
2. Calculate Rand Index for 3D

## `t-SNE` t-distributed stochastic neighbor embedding 
The`t-SNE` algorithm seeks to obtain a mapping $X \rightarrow Y$ such that the new low dimensional d
space $(d<< N)$ reflects the similarities $p_{ij}$ as much as possible. To do this the algorithm
measures the similarity $q$ between two points $y_i$ and $y_j$ in a similar way

$q_{i/i}=\dfrac{\exp(-||y_i-y_j||^2)}{\sum_{k \ne i}\exp(-||y_i-y_k||^2)}$

In [15]:
from sklearn.decomposition import PCA
pca_50 = PCA(n_components=50)
pca_result_50 = pca_50.fit_transform(X_train)

In [19]:
pca_tsne = TSNE(
    random_state = 42,
    n_components=3,
    verbose=0, 
    perplexity=40, 
    n_iter=300)\
        .fit_transform(pca_result_50)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [20]:
x=pca_tsne[:, 0]
y=pca_tsne[:, 1]
z=pca_tsne[:, 2]

fig = go.Figure(data=[go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode="markers",
    marker=dict(
        size=12,
        color=x,                 # set color to an array/list of desired values
        colorscale="Spectral",   # choose a colorscale
        opacity=0.8
    )
)])

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

**The Rand index** evaluates how many of those pairs of elements that were in the same class, and those pairs of elements that were in different classes, retained this state after clustering.

$Rand=\dfrac{TP+FN}{TP+TN+FP+FN}$

In [2]:
import numpy as np

n_classes = 10 # np.unique(X_train).size
n_classes

10

In [13]:
tsne = TSNE(n_components=3, n_iter=250).fit_transform(X_train)



In [15]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=n_classes, n_init=100, random_state=42).fit(tsne)

In [19]:
X_test = MNIST_test.data
X_test = X_test.reshape([-1, 28 * 28])

In [21]:
np.shape(X_test)

torch.Size([10000, 784])

In [25]:
#X_dr = tsne.fit(X_test)
y_preds = k_means.predict(X_test)

ValueError: ValueError: X has 784 features, but KMeans is expecting 3 features as input.

In [20]:
cluster_labels = k_means.predict(X_test)

ValueError: ValueError: X has 784 features, but KMeans is expecting 3 features as input.

In [25]:
print(f"ARI': {adjusted_rand_score(labels_true=y_test, labels_pred=cluster_labels)}")

AttributeError: AttributeError: 'numpy.ndarray' object has no attribute 'labels_'

Ref
https://www.kaggle.com/code/aayush9753/4-dimensionality-reduction-and-clustering#tSNE

https://umap.scikit-tda.org/clustering.html

https://notebook.community/Diyago/Machine-Learning-scripts/clustering/ods_unsupervised_learning

https://www.kaggle.com/code/parulpandey/visualizing-kannada-mnist-with-t-sne/notebook

In [None]:
algorithm = KMeans(n_clusters=10, random_state=1)

algorithm.fit(X)

print(f"ARI': {adjusted_rand_score(labels_true=y, labels_pred=algorithm.labels_)}")