In [29]:
# Based on docs:
# https://docs.lightly.ai/self-supervised-learning/tutorials/package/tutorial_simclr_clothing.html

# Also, see:
# https://github.com/giakoumoglou/classification/blob/main/notebooks/main_simclr.ipynb
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

## Imports

In [30]:
import os
from typing import Union, List, Tuple
from pathlib import Path
import shutil

from GenerateEmbeddingsTrain import GenerateEmbeddingsTrain
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torchvision
import torchvision.datasets as datasets
from PIL import Image

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import normalize

from lightly.data import LightlyDataset
from lightly.transforms import SimCLRTransform, utils

from utils import (
    get_image_as_np_array,
    plot_knn_clusters,
    get_distance_between_points_in_cluster,
    get_distances_between_centroids,
    plot_clusters,
    generate_embeddings_and_fnames_simclr,
    check_labels_correspondence,
    plot_knn_examples,
    plot_clusters_3d,
)

## Configuration

We set some configuration parameters for our experiment.
Feel free to change them and analyze the effect.

The default configuration with a batch size of 256 and input resolution of 128
requires 6GB of GPU memory.



In [31]:
# DEBUG: the device is pinned to the CPU. Re-enable the commented line below
# to select CUDA automatically whenever it is available.
device = "cpu"
# device = "cuda" if torch.cuda.is_available() else "cpu"

# The Lightning Trainer below is given "gpu" for CUDA and "cpu" otherwise.
if device == "cuda":
    accelerator = "gpu"
else:
    accelerator = "cpu"

print(f"Using device: {device}")
print(f"Using accelerator: {accelerator}")

Using device: cpu
Using accelerator: cpu


In [32]:
# Image side length (pixels) and batch size used by the transforms/loaders.
# Laptop-friendly alternative: input_size = 32, batch_size = 64.
input_size, batch_size = 128, 256  # PC

num_workers = 8  # worker processes for every DataLoader below
seed = 1  # handed to pl.seed_everything
max_epochs = 20  # Trainer budget and cosine-annealing horizon
num_ftrs = 32

Let's set the seed for our experiments



In [33]:
# Seed Python, NumPy and PyTorch. workers=True additionally lets Lightning
# seed the DataLoader worker processes (num_workers > 0), so the random
# augmentations are reproducible as well.
pl.seed_everything(seed, workers=True)

Seed set to 1


1

In [34]:
# Make sure the dataset root and both split folders exist (no-op if present).
for mnist_dir in (
    "./datasets/mnist",
    "./datasets/mnist/train",
    "./datasets/mnist/test",
):
    os.makedirs(mnist_dir, exist_ok=True)

In [35]:
# Fetch both MNIST splits (downloaded only if not already present under root).
# No transform is attached, so indexing these datasets yields PIL images.
mnist_root = "./datasets/mnist"
train_dataset, test_dataset = (
    datasets.MNIST(root=mnist_root, train=is_train_split, download=True)
    for is_train_split in (True, False)
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./datasets/mnist\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:01<00:00, 5482309.91it/s]


Extracting ./datasets/mnist\MNIST\raw\train-images-idx3-ubyte.gz to ./datasets/mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./datasets/mnist\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 235878.15it/s]


Extracting ./datasets/mnist\MNIST\raw\train-labels-idx1-ubyte.gz to ./datasets/mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./datasets/mnist\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:02<00:00, 649764.63it/s]


Extracting ./datasets/mnist\MNIST\raw\t10k-images-idx3-ubyte.gz to ./datasets/mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./datasets/mnist\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 4531524.45it/s]

Extracting ./datasets/mnist\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./datasets/mnist\MNIST\raw






In [36]:
# Restrict the training split to the digits 7, 2 and 0.
train_targets = train_dataset.targets
idx_train = (train_targets == 7) | (train_targets == 2) | (train_targets == 0)
train_dataset.targets, train_dataset.data = (
    train_targets[idx_train],
    train_dataset.data[idx_train],
)

In [37]:
# Restrict the test split to the same digits as the training split (7, 2, 0).
test_targets = test_dataset.targets
idx_test = (test_targets == 7) | (test_targets == 2) | (test_targets == 0)
test_dataset.targets, test_dataset.data = (
    test_targets[idx_test],
    test_dataset.data[idx_test],
)

In [38]:
# Sanity-check the filtered test split: container type, size, first labels,
# per-class counts and the shape of a single image.
filtered_targets = test_dataset.targets
for summary in (
    type(filtered_targets),
    filtered_targets.shape,
    filtered_targets[:10],
    filtered_targets.unique(return_counts=True),
    test_dataset.data[10].shape,
):
    print(summary)

<class 'torch.Tensor'>
torch.Size([3040])
tensor([7, 2, 0, 0, 0, 7, 0, 7, 0, 7])
(tensor([0, 2, 7]), tensor([ 980, 1032, 1028]))
torch.Size([28, 28])


In [39]:
# Write every sample to disk as "<label>_<index>.png" so that the label can
# later be recovered from the filename.
for i in range(len(train_dataset)):
    image, label = train_dataset[i]
    image.save(f"./datasets/mnist/train/{label}_{i}.png")

for i in range(len(test_dataset)):
    image, label = test_dataset[i]
    image.save(f"./datasets/mnist/test/{label}_{i}.png")

In [40]:
# Drop the raw download folder, if present, to prevent errors later on.
raw_mnist_dir = Path("./datasets/mnist/MNIST")
if raw_mnist_dir.exists():
    shutil.rmtree(raw_mnist_dir)

In [41]:
# Dataset root (kept as a plain string) and the two split folders (Path objects).
path_to_data = r"./datasets/mnist"
path_to_train_data, path_to_test_data = (
    Path(path_to_data) / split_name for split_name in ("train", "test")
)

## Setup data augmentations and loaders

The images from the original tutorial's dataset were taken from above while the
clothing was lying on a table, bed or floor. Therefore, we can make use of
additional augmentations such as vertical flip or random rotation (90 degrees).
By adding these augmentations we teach our model invariance to the
orientation of the clothing piece. E.g. we don't care if a shirt is upside down,
but rather about the structure which makes it a shirt.
(In this notebook the same augmentations are applied to the MNIST digits saved above.)

You can learn more about the different augmentations and learned invariances
here: `lightly-advanced`.



In [42]:
# Training-time augmentation: SimCLR views of size `input_size`, with vertical
# flips and random rotations each applied with probability 0.5.
transform = SimCLRTransform(input_size=input_size, vf_prob=0.5, rr_prob=0.5)

# Deterministic transform used to embed the dataset after training.
# It resizes to the training resolution and applies the ImageNet statistics
# that SimCLRTransform uses by default, so the backbone sees inputs on the
# same scale and with the same normalization as during training.
test_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize((input_size, input_size)),
        torchvision.transforms.ToTensor(),  # Convert the image to a PyTorch tensor
        torchvision.transforms.Normalize(
            mean=utils.IMAGENET_NORMALIZE["mean"],
            std=utils.IMAGENET_NORMALIZE["std"],
        ),
    ]
)

In [43]:
# Loader over the in-memory torchvision test split, in fixed order.
# NOTE(review): test_dataset was created without a transform, so it yields PIL
# images; iterating this loader with the default collate function will
# presumably fail. It is only referenced in a commented-out cell - confirm
# whether it is still needed.
dataloader_test = torch.utils.data.DataLoader(
    dataset=test_dataset,
    shuffle=False,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [44]:
# Number of test samples and the shape of one raw image tensor.
for value in (len(test_dataset), test_dataset.data[11].shape):
    print(value)

3040
torch.Size([28, 28])


In [45]:
# Lightly datasets built from the exported PNG folders: the training folder
# gets the SimCLR augmentation, the test folder the deterministic transform.
# (Earlier variant pointed both datasets at `path_to_data` instead.)
train_dataset_simclr = LightlyDataset(
    input_dir=path_to_train_data,
    transform=transform,
)
test_dataset_simclr = LightlyDataset(
    input_dir=path_to_test_data,
    transform=test_transform,
)

In [46]:
# Sample counts of the folder-backed train and test datasets.
for folder_dataset in (train_dataset_simclr, test_dataset_simclr):
    print(len(folder_dataset))

18146
3040


In [47]:
# Settings shared by the training and the embedding loader.
shared_loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)

# Training: shuffled, and the last incomplete batch is dropped.
dataloader_train_simclr = torch.utils.data.DataLoader(
    train_dataset_simclr,
    shuffle=True,
    drop_last=True,
    **shared_loader_kwargs,
)

# Embedding/evaluation: fixed order, every sample kept.
dataloader_test_simclr = torch.utils.data.DataLoader(
    test_dataset_simclr,
    shuffle=False,
    drop_last=False,
    **shared_loader_kwargs,
)

## Create the SimCLR Model
Now we create the SimCLR model. We implement it as a PyTorch Lightning Module
and use a ResNet-18 backbone from Torchvision. Lightly provides implementations
of the SimCLR projection head and loss function in the `SimCLRProjectionHead`
and `NTXentLoss` classes. We can simply import them and combine the building
blocks in the module.



In [48]:
from lightly.loss import NTXentLoss
from lightly.models.modules.heads import SimCLRProjectionHead


class SimCLRModel(pl.LightningModule):
    """SimCLR model: ResNet-18 backbone, SimCLR projection head, NT-Xent loss.

    The backbone is a torchvision ResNet-18 without its final fully-connected
    layer. The projection head maps the backbone features to 128 dimensions.
    """

    def __init__(self):
        super().__init__()

        # Build a ResNet-18 and keep every child module except the classifier.
        resnet = torchvision.models.resnet18()
        backbone_layers = list(resnet.children())[:-1]
        self.backbone = nn.Sequential(*backbone_layers)

        # The head's input and hidden width equal the classifier's input width.
        feature_dim = resnet.fc.in_features
        self.projection_head = SimCLRProjectionHead(feature_dim, feature_dim, 128)

        self.criterion = NTXentLoss()

    def forward(self, x):
        """Return the projection-head output for a batch of images `x`."""
        features = self.backbone(x).flatten(start_dim=1)
        return self.projection_head(features)

    def training_step(self, batch, batch_idx):
        """Compute and log the NT-Xent loss between the two views of a batch."""
        # A batch is (views, <ignored>, <ignored>), where views holds two
        # augmented versions of the same images.
        views, _, _ = batch
        first_view, second_view = views
        z0 = self.forward(first_view)
        z1 = self.forward(second_view)
        loss = self.criterion(z0, z1)
        # TODO: add further metrics that check whether the training 'makes
        # sense', e.g. whether samples of a class moved closer together and
        # away from the other classes
        self.log("train_loss_ssl", loss)
        return loss

    def configure_optimizers(self):
        """Return SGD with momentum plus a cosine-annealing LR schedule.

        The schedule length is read from the notebook-level `max_epochs`.
        """
        optim = torch.optim.SGD(
            self.parameters(),
            lr=6e-2,
            momentum=0.9,
            weight_decay=5e-4,
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, max_epochs)
        return [optim], [scheduler]

Train the module using the PyTorch Lightning Trainer on a single device (a GPU when `device` is `"cuda"`; the configuration above currently pins it to the CPU).



In [49]:
# Fresh, untrained model used for training (training may take some time).
# The constructor takes no arguments.

model = SimCLRModel()


In [50]:
# Callback from the local GenerateEmbeddingsTrain module; it is handed the
# model, the training loader and the device.
# NOTE(review): presumably it writes embeddings to ./embeddings during
# training - confirm in GenerateEmbeddingsTrain.
embeddings_callback = GenerateEmbeddingsTrain(
    save_dir="./embeddings",
    model=model,
    dataloader=dataloader_train_simclr,
    device=device,
)

# Single-device trainer on the accelerator chosen in the configuration cell.
trainer = pl.Trainer(
    accelerator=accelerator,
    devices=1,
    max_epochs=max_epochs,
    callbacks=embeddings_callback,
)

SyntaxError: invalid syntax (2669290670.py, line 9)

In [None]:
# Run SimCLR training on the augmented training loader (no validation loader is passed).
trainer.fit(model, dataloader_train_simclr)


Next we use a helper function (`generate_embeddings_and_fnames_simclr`, imported from `utils`)
to generate embeddings from our test images using the model we just trained.
Note that only the backbone is needed to generate embeddings;
the projection head is only required for training.
Make sure to put the model into eval mode for this part!



In [None]:
# For eval only

# PC-trained model. The path points at one specific earlier run; load it only
# when it is actually present, so that a fresh Restart & Run All falls back to
# the model trained above instead of crashing on a missing file.
checkpoint_path = "./lightning_logs/version_8/checkpoints/epoch=9-step=2340.ckpt"
if os.path.exists(checkpoint_path):
    model = SimCLRModel.load_from_checkpoint(checkpoint_path)
else:
    print(f"Checkpoint not found: {checkpoint_path} - keeping the model already in memory.")

# laptop-trained model
# maybe not-OK, as it was trained on 70k images (both train and test)
# model = SimCLRModel.load_from_checkpoint(
#     "./lightning_logs/version_0/checkpoints/epoch=9-step=10930.ckpt"
# )

In [None]:
# Switch the model to evaluation mode before generating embeddings.
# As the last expression of the cell, this also displays the module summary.
model.eval()

In [None]:
# Embed the test images with the helper from utils; it returns the embeddings
# together with the filenames they belong to.
# NOTE(review): assumes embeddings[i] corresponds to filenames[i] - confirm in utils.
embeddings, filenames = generate_embeddings_and_fnames_simclr(
    model, dataloader_test_simclr
)

In [None]:
# embeddings = generate_embeddings_and_fnames(model, dataloader_test)

## Visualize Clusters in Embedding Space
Let's look at how the trained embeddings are clustered in latent space.

In [None]:
# A fixed random_state makes the clustering repeatable when this cell is
# re-run on its own, instead of depending on NumPy's global RNG state.
# NOTE(review): the dataset was filtered to three digits (0, 2, 7) above;
# confirm that 10 clusters is still the intended setting.
kmeans = KMeans(n_clusters=10, random_state=seed)

In [None]:
# Fit k-means on the embeddings and get one cluster index per embedding.
labels = kmeans.fit_predict(embeddings)

In [None]:
# Cluster indices assigned to the first 20 embeddings.
labels[0:20]

In [None]:
# First 20 ground-truth digits of the torchvision test split.
# NOTE(review): these are in MNIST order, whereas `labels` follows the order of
# `embeddings` (files read from disk); position i may not refer to the same
# image in both - confirm before comparing them side by side.
test_dataset.targets[0:20]

In [None]:
# Distinct cluster indices that were actually assigned.
np.unique(labels)

In [None]:
# Helper from utils, called with the embeddings and their k-means labels.
# NOTE(review): presumably reports distances between points within each cluster - confirm in utils.
get_distance_between_points_in_cluster(embeddings, labels)

In [None]:
# Helper from utils; only the embeddings are passed in.
# NOTE(review): presumably clusters internally and returns a centroid-to-centroid distance matrix - confirm in utils.
distances = get_distances_between_centroids(embeddings)

In [None]:
# Report shape, matrix rank and contents of the centroid distance matrix.
print(f"Distances shape: {distances.shape}")
print(f"Distances rank: {np.linalg.matrix_rank(distances)}")
print(f"Distances:\n {distances}")

In [None]:
# Show the distance matrix as a heat map.
plt.matshow(distances, cmap="viridis")

In [None]:
# Same distance matrix as a DataFrame, printed in plain text.
visualization_df = pd.DataFrame(data=distances)
print(visualization_df)

## Visualize Nearest Neighbors
Let's look at the trained embedding and visualize the nearest neighbors for
a few random samples.

We create some helper functions to simplify the work



In [None]:
def plot_knn_examples(
    embeddings, filenames, path_to_data, n_neighbors=3, num_examples=6
):
    """Plot random query images next to their nearest neighbours.

    One figure is created per query; each figure holds one row with the
    `n_neighbors` closest images, titled with their distance to the query.

    Note: this local definition shadows the `plot_knn_examples` imported from
    `utils` in the imports cell.

    Args:
        embeddings: 2D array-like with one row per image, as accepted by
            `sklearn.neighbors.NearestNeighbors.fit`.
        filenames: Sequence where `filenames[i]` is the image file of
            `embeddings[i]`, relative to `path_to_data`.
        path_to_data: Directory the filenames are joined onto.
        n_neighbors: Number of neighbours shown per query. The neighbours are
            searched within `embeddings` itself, so the query image is
            typically the first one, at distance 0.
        num_examples: Number of random queries to plot. Clamped to the number
            of available embeddings.
    """
    # Nearest-neighbour search over all embeddings (sklearn).
    nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(embeddings)
    distances, indices = nbrs.kneighbors(embeddings)

    # Sampling without replacement raises if more samples are requested than
    # exist, so never ask for more queries than there are embeddings.
    num_examples = min(num_examples, len(indices))
    samples_idx = np.random.choice(len(indices), size=num_examples, replace=False)

    for idx in samples_idx:
        fig = plt.figure()
        neighbor_indices = indices[idx]
        for plot_x_offset, neighbor_idx in enumerate(neighbor_indices):
            # One subplot per neighbour, laid out in a single row.
            ax = fig.add_subplot(1, len(neighbor_indices), plot_x_offset + 1)
            fname = os.path.join(path_to_data, filenames[neighbor_idx])
            # Draw on the explicit axes rather than on pyplot's "current" axes.
            ax.imshow(get_image_as_np_array(fname))
            ax.set_title(f"d={distances[idx][plot_x_offset]:.3f}")
            ax.axis("off")

In [None]:
# plot_knn_examples(
#     embeddings, filenames, n_neighbors=7, num_examples=10, path_to_data=path_to_data
# )

## Visualize Nearest Neighbours on 2D Plane
Let's look at the trained embedding, perform KNN, then PCA and visualize clusters in 2D space.

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

plot_knn_clusters(embeddings, embedding_labels, n_neighbors=5, num_examples=10)

## Visualize Clusters on 2D Plane
Let's look at the trained embedding, perform PCA and visualize clusters in 2D space.

In [None]:
# MNIST 'labels' created by simclr
simclr_targets = test_dataset_simclr.dataset.targets
print(type(simclr_targets))
unique_targets = np.unique(simclr_targets)
print(unique_targets)
first_simclr_target = simclr_targets[0]
print(first_simclr_target)
print(type(first_simclr_target))

In [None]:
# original MNIST labels
mnist_targets = test_dataset.targets
print(type(mnist_targets))
unique_targets = np.unique(mnist_targets)
print(unique_targets)
first_mnist_target = mnist_targets[0]
print(first_mnist_target)
print(type(first_mnist_target))

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.01,
    alpha=0.1,
    plot_centroids=True,
    specific_labels=list(range(10)),
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# labels 1 and 3 have no points - confirm this selection is still intended.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[0, 1, 2, 3],
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# label 8 has no points - confirm this selection is still intended.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[0, 8],
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# neither label 1 nor label 4 has points - confirm this selection is still intended.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[1, 4],
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# labels 3, 5 and 8 have no points - confirm this selection is still intended.
plot_clusters_3d(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[2, 3, 5, 8],
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# labels 3, 5 and 8 have no points - confirm this selection is still intended.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[2, 3, 5, 8],
)

In [None]:
# Labels are parsed from the saved filenames ("<label>_<index>.png") so that
# they line up with `embeddings`. `test_dataset.targets` keeps the original
# MNIST order, which is not the order in which the image files are read.
embedding_labels = np.array(
    [int(os.path.basename(str(fname)).split("_")[0]) for fname in filenames]
)

# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# neither label 1 nor label 8 has points - confirm this selection is still intended.
plot_clusters(
    embeddings,
    embedding_labels,
    proportion_of_points_to_plot=0.5,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[1, 8],
)

## Check Labels Corresponding to Clusters
Let's check if the clusters in the embedding space correspond to the labels of the images.

In [None]:
# Helper from utils, given the embeddings, the test image folder and the
# filenames that belong to the embeddings.
# NOTE(review): presumably compares cluster assignments with the labels encoded in the filenames - confirm in utils.
check_labels_correspondence(
    embeddings=embeddings, base_path=path_to_test_data, filenames=filenames
)

## Simple clustering on MNIST
Let's check if it's possible to create good clusters from MNIST, working in pixel space.

In [None]:
# First 10 ground-truth digits of the (filtered) test split.
test_dataset.targets[0:10]

In [None]:
# Number of test images. `test_data` is a deprecated torchvision alias of
# `data` that emits a warning; `data` is also what the other cells use.
len(test_dataset.data)

In [None]:
# Shape of a single raw image tensor.
test_dataset.data[0].shape

In [None]:
# K-means directly on the flattened 28x28 pixel vectors. A fixed random_state
# makes the result repeatable when this cell is re-run on its own.
# NOTE(review): the dataset was filtered to three digits (0, 2, 7) above;
# confirm that 10 clusters is still the intended setting.
mnist_clusters = KMeans(n_clusters=10, random_state=seed).fit(
    test_dataset.data.reshape(-1, 28 * 28)
)

In [None]:
# Cluster indices found in pixel space together with their sizes.
np.unique(mnist_clusters.labels_, return_counts=True)

In [None]:
# Pixel-space plot: flattened images and their targets come from the same
# dataset object, so they are in the same order.
# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# label 4 has no points - confirm this selection is still intended.
plot_clusters(
    test_dataset.data.reshape(-1, 28 * 28),
    np.array(test_dataset.targets),
    proportion_of_points_to_plot=0.4,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[0, 4, 7],
)

In [None]:
# 3D variant of the pixel-space plot, called with the same arguments.
# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# label 4 has no points - confirm this selection is still intended.
plot_clusters_3d(
    test_dataset.data.reshape(-1, 28 * 28),
    np.array(test_dataset.targets),
    proportion_of_points_to_plot=0.4,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[0, 4, 7],
)

In [None]:
# Settings for the manual 3D plot below.
num_clusters = 20  # number of k-means clusters
proportion_of_points_to_plot = 0.4  # fraction of embeddings to draw
alpha = 0.3  # marker opacity

In [None]:
# Re-cluster the embeddings into `num_clusters` groups. A fixed random_state
# makes the result repeatable when this cell is re-run on its own.
# Note: this overwrites the `kmeans` and `labels` defined earlier.
kmeans = KMeans(n_clusters=num_clusters, random_state=seed)
labels = kmeans.fit_predict(embeddings)
centroids = kmeans.cluster_centers_

In [None]:
# Draw a random subset of embedding indices, without repetition.
num_points_to_plot = int(len(embeddings) * proportion_of_points_to_plot)
sampled_indices = np.random.choice(
    embeddings.shape[0], num_points_to_plot, replace=False
)

In [None]:
# Switch matplotlib to the interactive widget backend for the 3D plot below.
%matplotlib widget

In [None]:
# Subset of the embeddings and their k-means cluster indices.
sampled_embeddings = embeddings[sampled_indices]
sampled_labels = labels[sampled_indices]

print(f"Plotting {len(sampled_embeddings)} points out of {len(embeddings)}")

ax = plt.figure().add_subplot(projection="3d")

# Project to 3 dimensions; the PCA is fitted on the sampled points only.
pca = PCA(n_components=3)
print(pca)

to_plot_embeddings = pca.fit_transform(sampled_embeddings)
print(to_plot_embeddings.shape)

# The centroids go through the same projection so they share the axes.
to_plot_centroids = pca.transform(centroids)
print(to_plot_centroids.shape)

# `cmap` only takes effect when per-point values are passed via `c`, so the
# points are coloured by their cluster index.
ax.scatter(
    to_plot_embeddings[:, 0],
    to_plot_embeddings[:, 1],
    to_plot_embeddings[:, 2],
    c=sampled_labels,
    alpha=alpha,
    cmap="viridis",
)

# Draw the projected centroids, which were previously computed but never shown.
ax.scatter(
    to_plot_centroids[:, 0],
    to_plot_centroids[:, 1],
    to_plot_centroids[:, 2],
    c="red",
    marker="x",
    s=80,
    label="centroids",
)

ax.set_title("Embeddings (PCA, 3 components) coloured by k-means cluster")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.legend()

plt.show()

In [None]:
# Pixel-space plot: flattened images and their targets come from the same
# dataset object, so they are in the same order.
# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# labels 1 and 4 have no points - confirm this selection is still intended.
plot_clusters(
    test_dataset.data.reshape(-1, 28 * 28),
    np.array(test_dataset.targets),
    proportion_of_points_to_plot=0.4,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[1, 4, 7],
)

In [None]:
# Pixel-space plot: flattened images and their targets come from the same
# dataset object, so they are in the same order.
# NOTE(review): only digits 0, 2 and 7 remain after the filtering above, so
# labels 3, 5 and 8 have no points - confirm this selection is still intended.
plot_clusters(
    test_dataset.data.reshape(-1, 28 * 28),
    np.array(test_dataset.targets),
    proportion_of_points_to_plot=0.4,
    alpha=0.3,
    plot_centroids=True,
    specific_labels=[2, 3, 5, 8],
)