In [6]:
# Based on docs:
# https://docs.lightly.ai/self-supervised-learning/tutorials/package/tutorial_simclr_clothing.html

# Also, see:
# https://github.com/giakoumoglou/classification/blob/main/notebooks/main_simclr.ipynb
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

## Imports

In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from pathlib import Path
import mlflow
import numpy as np

import os

import matplotlib.pyplot as plt

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
import torch
import torch.nn as nn
import torchvision

from sklearn.decomposition import PCA

from lightly.data import LightlyDataset
from lightly.transforms import SimCLRTransform, utils

from itertools import product

from utils import (
    generate_embeddings,
    prepare_mnist_images,
)
from models import SimCLRModel

## Setup


In [8]:
# DEBUG
# device = "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

accelerator = "gpu" if device == "cuda" else "cpu"

print(f"Using device: {device}")
print(f"Using accelerator: {accelerator}")

Using device: cuda
Using accelerator: gpu


In [9]:
# input_size = 32  # laptop
input_size = 128  # PC
# batch_size = 64  # laptop
batch_size = 256  # PC

num_workers = 8
seed = 1
max_epochs = 20
num_ftrs = 32

path_to_data = Path("datasets/MNIST")
path_to_train_data = path_to_data / "train"
path_to_test_data = path_to_data / "test"

experiment_name = "simclr_mnist"

Let's set the seed for our experiments



In [10]:
pl.seed_everything(seed)

Seed set to 1


1

In [11]:
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///d:/__repos/ml_concepts/ideas/ssl/mlruns/521104266412162370', creation_time=1719482649711, experiment_id='521104266412162370', last_update_time=1719482649711, lifecycle_stage='active', name='simclr_mnist', tags={}>

## Config

Here, we define the configuration for our experiment, to be logged in mlflow.

In [12]:
classes_list_of_lists = [[0, 4, 9], [0, 2, 4, 6, 9], [0, 1, 2, 4, 5, 7, 8]]
initial_num_components_pca_list = [50, 20, 7, 3]
embedding_sizes_list = [8, 16, 32, 512]

# Main loop

In [13]:
for classes_list, initial_num_components_pca, embedding_size in product(
    classes_list_of_lists, initial_num_components_pca_list, embedding_sizes_list
):
    with mlflow.start_run():
        
        # Log parameters
        mlflow.log_param("classes_list", classes_list)
        mlflow.log_param("initial_num_components_pca", initial_num_components_pca)
        mlflow.log_param("embedding_size", embedding_size)
        mlflow.log_param("max_epochs", max_epochs)
        mlflow.log_param("batch_size", batch_size)
        
        # Prepare the data
        prepare_mnist_images(classes_list, path_to_data)

        ## Train
        train_transform = SimCLRTransform(
            input_size=input_size, vf_prob=0.5, rr_prob=0.5)
        train_dataset_simclr = LightlyDataset(
            input_dir=path_to_train_data, transform=train_transform
        )
        dataloader_train_simclr = torch.utils.data.DataLoader(
            train_dataset_simclr,
            batch_size=batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=num_workers,
        )
        mlflow.log_metric("train_dataset_length", (len(train_dataset_simclr)))

        ## Test
        test_transform = torchvision.transforms.Compose(
            [
                torchvision.transforms.Resize((input_size, input_size)),
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize(
                    mean=utils.IMAGENET_NORMALIZE["mean"],
                    std=utils.IMAGENET_NORMALIZE["std"],
                ),
            ]
        )
        test_dataset_simclr = LightlyDataset(
            input_dir=path_to_test_data, transform=test_transform
        )
        dataloader_test_simclr = torch.utils.data.DataLoader(
            test_dataset_simclr,
            batch_size=batch_size,
            shuffle=False,
            drop_last=True,
            num_workers=num_workers,
        )
        mlflow.log_metric("test_dataset_length", (len(test_dataset_simclr)))
        
        # Create model and callbacks
        model = SimCLRModel()
        early_stopping_callback = EarlyStopping(
            monitor="train_loss_ssl", mode="min", patience=5
        )

        model_checkpoint_callback = ModelCheckpoint(
            monitor="train_loss_ssl", mode="min", save_top_k=-1
        )

        # Train the model
        trainer = pl.Trainer(
            log_every_n_steps=10,
            max_epochs=max_epochs,
            devices=1,
            accelerator=accelerator,
            callbacks=[early_stopping_callback, model_checkpoint_callback],
            logger=CSVLogger(save_dir="lightning_logs", name=experiment_name),
        )
        trainer.fit(model, dataloader_train_simclr)

        # Generate embeddings
        model.eval()
        embeddings, filenames = generate_embeddings(model, dataloader_test_simclr)

        # Visualize embeddings
        pca = PCA(n_components=initial_num_components_pca)
        data_pca = pca.fit_transform(embeddings.cpu().numpy())

        tsne = TSNE(n_components=2)
        data_tsne = tsne.fit_transform(data_pca)

        targets = [int(fname.split("\\")[0]) for fname in filenames]  # windows
        os.makedirs("outputs", exist_ok=True)
        
        # Save artifacts
        explained_variance_ratio = pca.explained_variance_ratio_
        explained_variance_path = f"outputs/explained_variance_{classes_list}_{initial_num_components_pca}_{embedding_size}.png"
        plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, alpha=0.5, align='center')
        plt.step(range(1, len(explained_variance_ratio) + 1), np.cumsum(explained_variance_ratio), where='mid')
        plt.xlabel('Principal Component Index')
        plt.ylabel('Explained Variance Ratio')
        plt.title('PCA Explained Variance Ratio')
        plt.savefig(explained_variance_path)
        plt.close()
        mlflow.log_artifact(explained_variance_path)

        tsne_path = f"outputs/tsne_{classes_list}_{initial_num_components_pca}_{embedding_size}.png"
        plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=targets, cmap="viridis")
        plt.title("t-SNE visualization")
        plt.colorbar()
        plt.savefig(tsne_path)
        plt.close()
        mlflow.log_artifact(tsne_path)

        pca_path = f"outputs/pca_{classes_list}_{initial_num_components_pca}_{embedding_size}.png"
        plt.scatter(data_pca[:, 0], data_pca[:, 1], c=targets, cmap="viridis")
        plt.title("PCA visualization")
        plt.colorbar()
        plt.savefig(pca_path)
        plt.close()
        mlflow.log_artifact(pca_path)
    
    break

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type                 | Params
---------------------------------------------------------
0 | backbone        | Sequential           | 11.2 M
1 | projection_head | SimCLRProjectionHead | 328 K 
2 | criterion       | NTXentLoss           | 0     
---------------------------------------------------------
11.5 M    Trainable params
0         Non-trainable params
11.5 M    Total params
46.022    Total estimated model para

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])
torch.Size([256, 512, 1, 1])


[WinError 2] Nie można odnaleźć określonego pliku
  File "d:\__repos\ml_concepts\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\Maciek\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\Maciek\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 966, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Maciek\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1435, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
