# t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
import numpy as np
import sklearn
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns

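Lade den Digits-Datensatz aus scikit-learn (8×8-Pixel-Bilder handgeschriebener Ziffern, eine kleine MNIST-ähnliche Variante)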

In [None]:
# Load the digits dataset bundled with scikit-learn and report the shape of
# its feature matrix.
digits = load_digits()
print(f'Digits shape: {digits.data.shape}')
# Uncomment to print the dataset description shipped with the bunch object:
# print(digits['DESCR'])

In [None]:
# Preview the first nrows * ncols samples as an image grid, one panel per
# sample, titled with the sample's target label.
nrows, ncols = 2, 5
# squeeze=False keeps `axs` two-dimensional even for a single row or column,
# so the grid size can be changed here without touching the loop below.
fig, axs = plt.subplots(nrows, ncols, figsize=(30, 10), squeeze=False)

# `axs.flat` walks the grid row by row, so the k-th panel shows the k-th sample
# (the same order as the index i * ncols + j).
for ax, image, label in zip(axs.flat, digits.images, digits.target):
    ax.matshow(image)
    ax.set(xticks=[], yticks=[])
    ax.set_title(label)

`digits` enthält sowohl die Daten der Ziffern (`digits.data`) als auch die echten Target-Labels (`digits.target`):

In [None]:
# Print the shapes of the feature matrix and of the label vector side by side.
print(f'Shapes: Data-{digits.data.shape}, Target-{digits.target.shape}')

Wenden wir jetzt t-SNE auf den Datensatz an und versuchen eine Dimensions-Reduktion von $\mathbb{R}^{64}$ auf $\mathbb{R}^{2}$:

In [None]:
# Embed the digit feature vectors into two dimensions. The fixed random_state
# makes the embedding reproducible across runs.
digits_tsne_model = TSNE(n_components=2, random_state=42)
tsne = digits_tsne_model.fit_transform(digits.data)

In [None]:
def scatter(x, colors):
    """Draw a 2-D embedding as a scatter plot coloured and annotated by class.

    Parameters
    ----------
    x : array-like of shape (n_samples, 2)
        Embedded points; column 0 is drawn on the x-axis, column 1 on the y-axis.
    colors : array-like of shape (n_samples,)
        Class label of every point. Values are cast to ``int`` and used as
        indices into a 10-colour "cubehelix" palette, so they must lie in 0..9.

    Returns
    -------
    tuple
        ``(figure, axes, scatter_collection, texts)``. ``texts`` holds one text
        artist per class that has at least one point, in ascending label order.
    """
    n_classes = 10
    points = np.asarray(x)
    # Use the builtin ``int``: the ``np.int`` alias was removed in NumPy 1.24.
    # Converting to a plain array first also makes list and pandas inputs
    # behave positionally, independent of any index they carry.
    labels = np.asarray(colors).astype(int)
    palette = np.array(sns.color_palette("cubehelix", n_classes))

    # Create the scatter plot.
    f = plt.figure(figsize=(10, 10))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(points[:, 0], points[:, 1], lw=0, s=40,
                    c=palette[labels], alpha=0.5)
    ax.axis('off')
    # 'tight' rescales the view to the data, so setting fixed x/y limits
    # before this call would have no effect on the final plot.
    ax.axis('tight')

    # Annotate every class at the median position of its points.
    txts = []
    for digit in range(n_classes):
        members = labels == digit
        if not members.any():
            # A median over zero points would be NaN; skip absent classes.
            continue
        xtext, ytext = np.median(points[members, :], axis=0)
        txt = ax.text(xtext, ytext, str(digit), fontsize=16)
        # White outline keeps the label readable on top of the coloured points.
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts

In [None]:
# Plot the 2-D embedding coloured by the true digit labels. The trailing
# semicolon suppresses the echo of the tuple returned by scatter().
scatter(tsne, digits.target);

Schauen wir mal, wie der Algorithmus mit unseren "hochaufgelösten" Bildern umgeht...

In [None]:
# Read the CSV, then separate the "label" column (targets) from the
# remaining columns (features).
mnist_raw = pd.read_csv('./data/mnist/train.csv')
target = mnist_raw["label"]
train = mnist_raw.drop(columns="label")

# m = number of rows (samples), n = number of remaining columns (features).
m, n = train.shape
print(f'Wir haben {m} Datenpunkte mit {n} Features')

In [None]:
# Embed the feature frame loaded from the CSV into two dimensions, with the
# same settings as for the small digits dataset.
# NOTE(review): presumably this is the full training set; t-SNE may take a
# long time on it — confirm the size and consider subsampling.
mnist_tsne_model = TSNE(n_components=2, random_state=42)
tsne = mnist_tsne_model.fit_transform(train)

In [None]:
# Plot the embedding of the CSV data coloured by its "label" column. The
# trailing semicolon suppresses the echo of the tuple returned by scatter().
scatter(tsne, target);