<a href="https://colab.research.google.com/github/JonasMiksch/Research_Project_SimCLR/blob/main/Classifier_SimCLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Standard libraries
import os
from copy import deepcopy

## Imports for plotting
import matplotlib.pyplot as plt
plt.set_cmap('cividis')
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.set()

## tqdm for loading bars
from tqdm.notebook import tqdm
#from torchvision.datasets import STL10
## PyTorch
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from math import sqrt, ceil, floor
from statistics import mean

from torchvision.io import read_image
from random import randint

from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torchvision.utils import make_grid
from torch.nn import functional as F
import torch

# Install the Weights & Biases client on the fly if it is missing from the current environment.
try:
    import wandb
except ModuleNotFoundError: 
    !pip3 install wandb
    import wandb

# Same on-demand install for torchinfo.
try:
    import torchinfo
except ModuleNotFoundError: # torchinfo is not available in this environment yet, so install it here
    !pip3 install torchinfo
    import torchinfo

## Torchvision
import torchvision
from torchvision import transforms
from torchvision import models
import numpy as np
# PyTorch Lightning
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip3 install --quiet pytorch-lightning>=1.4 # type : ignore
    import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
import pandas as pd
import pdb
from torch.utils.data import DataLoader,Dataset
# Import tensorboard
%reload_ext tensorboard
from sklearn.metrics import classification_report

# In this notebook, we use data loaders with heavier computational processing. It is recommended to use as many
# workers as possible in a data loader, which corresponds to the number of CPU cores
NUM_WORKERS = os.cpu_count()

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility.
# The flag is spelled `deterministic`; assigning to a misspelled attribute is accepted
# silently and leaves cuDNN in its default (non-deterministic) mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Single device handle used by all later cells
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Global seed set to 42


Device: cuda:0


In [None]:
from google.colab import drive
drive.mount("/content/gdrive")
!unzip gdrive/My\ Drive/Seminararbeit/JPEGImages.zip > /dev/null


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
replace JPEGImages/__EX_1000788252_100912.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# --- Dataset locations (TODO: change these to match your own setup) ---
WORKING_DIRECTORY = '/content/gdrive/My Drive/Seminararbeit'
ARTDL_CSV_PATH = WORKING_DIRECTORY
img_folder = 'JPEGImages'

# --- Checkpoints ---
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = os.path.join(WORKING_DIRECTORY, "checkpoints")
# File name of the checkpoint to load (relative to CHECKPOINT_PATH)
modeltype = "Original_SimCLR2best.ckpt"
#filename = "SimCLR_FInalbest_new_FInetuning"



In [None]:
class ImageDataset(Dataset):
  """Map-style dataset that pairs image files with label rows taken from a metadata table.

  Args:
    csv: DataFrame with an 'item' column (image file stem), the bookkeeping columns
      'set' and 'artdl_label', and one or more remaining columns used as labels.
    img_folder: Directory that contains the '<item>.jpg' files.
    transform: Callable applied to the decoded image tensor.
  """

  def __init__(self, csv, img_folder, transform):
    self.csv = csv
    self.transform = transform
    self.img_folder = img_folder

    # File stems of the images; the '.jpg' suffix is appended on access.
    self.image_names = self.csv[:]['item']
    # Every column that is not bookkeeping is treated as part of the label row.
    label_frame = self.csv.drop(['item', 'set', 'artdl_label'], axis=1)
    self.labels = np.array(label_frame)

  def __len__(self):
    return len(self.image_names)

  def __getitem__(self, index):
    file_name = self.image_names.iloc[index] + '.jpg'
    raw_image = read_image(os.path.join(self.img_folder, file_name))
    return self.transform(raw_image), self.labels[index]

In [None]:
# Preprocessing applied to every image: convert to PIL, resize to 256x256, convert back to a
# tensor and normalise with mean 0.5 / std 0.5.
img_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Metadata tables of the three splits, read from WORKING_DIRECTORY (train, test, valid order).
train_data, test_data, val_data = [
    pd.read_csv(os.path.join(WORKING_DIRECTORY, csv_name))
    for csv_name in ('artdl_train.csv', 'artdl_test.csv', 'artdl_valid.csv')
]

# NOTE(review): later cells use `train_data_class`, `val_data_class`, `test_data_class` (plus
# `...2` / `...3` variants) that are not defined anywhere in the visible notebook. Presumably
# they are ImageDataset instances built from these frames - confirm and define them here so
# that Restart & Run All works.


KeyError: ignored

In [None]:
# Sanity check: show the currently configured image folder name.
print(img_folder)

JPEGImages


In [None]:
# Visual sanity check: show the first NUM_IMAGES samples of the training dataset in one row.
pl.seed_everything(42)
NUM_IMAGES = 6
sample_images = [train_data_class[idx][0] for idx in range(NUM_IMAGES)]
imgs = torch.stack(sample_images, dim=0)
print(imgs.shape)

# Tile the batch into a grid and move the channel axis last, as expected by imshow.
img_grid = torchvision.utils.make_grid(imgs, nrow=6, normalize=True, pad_value=0.9).permute(1, 2, 0)

plt.figure(figsize=(10,5))
plt.title('Image_tests')
plt.imshow(img_grid)
plt.axis('off')
plt.show()
plt.close()

In [None]:
class SimCLR(pl.LightningModule):
    """SimCLR model: a ResNet-50 base network whose `fc` layer is extended to an MLP head.

    In this notebook the class is needed to restore a checkpoint, so attribute names and the
    layout of `convnet.fc` should stay as they are to remain loadable.

    Args:
        hidden_dim: Output size of the head; the intermediate layer has 4*hidden_dim units.
        lr: Learning rate for AdamW.
        temperature: Divisor applied to the cosine similarities; must be > 0.
        weight_decay: Weight decay for AdamW.
        max_epochs: Used as T_max of the cosine annealing schedule.
    """

    def __init__(self, hidden_dim, lr, temperature, weight_decay, max_epochs=50):
        super().__init__()
        self.save_hyperparameters('hidden_dim','lr','temperature','weight_decay',"max_epochs")
        assert self.hparams.temperature > 0.0, 'The temperature must be a positive float!'
        # Base model f(.)
        self.convnet = torchvision.models.resnet50(pretrained=False,
                                                   num_classes=4*hidden_dim)  # Output of last linear layer
        # The MLP for g(.) consists of Linear->ReLU->Linear
        self.convnet.fc = nn.Sequential(
            self.convnet.fc,  # Linear(ResNet output, 4*hidden_dim)
            nn.ReLU(inplace=True),
            nn.Linear(4*hidden_dim, hidden_dim)
        )

    def configure_optimizers(self):
        """Return AdamW plus a cosine annealing schedule that decays down to lr/50."""
        optimizer = optim.AdamW(self.parameters(),
                                lr=self.hparams.lr,
                                weight_decay=self.hparams.weight_decay)
        lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                            T_max=self.hparams.max_epochs,
                                                            eta_min=self.hparams.lr/50)
        return [optimizer], [lr_scheduler]

    def info_nce_loss(self, batch, mode='train'):
        """Compute the InfoNCE loss for one batch and return it as a scalar tensor.

        `batch` is unpacked as `(imgs, _)`; `imgs` is a sequence of tensors that is
        concatenated along dim 0. The positive of each sample is looked up half the
        concatenated batch away, which presumably means `imgs` holds two views of the same
        images - TODO confirm against the data pipeline used for pre-training.
        `mode` is only referenced by the logging calls, which are currently commented out.
        """
        imgs, _ = batch
        imgs = torch.cat(imgs, dim=0)

        # Encode all images
        feats = self.convnet(imgs)
        # Calculate cosine similarity
        cos_sim = F.cosine_similarity(feats[:,None,:], feats[None,:,:], dim=-1)
        # Mask out cosine similarity to itself
        self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, device=cos_sim.device)
        cos_sim.masked_fill_(self_mask, -9e15)
        # Find positive example -> batch_size//2 away from the original example
        pos_mask = self_mask.roll(shifts=cos_sim.shape[0]//2, dims=0)
        # InfoNCE loss
        cos_sim = cos_sim / self.hparams.temperature
        nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1)
        nll = nll.mean()

        # Logging loss
        #self.log(mode+'_loss', nll)
        # Get ranking position of positive example
        # NOTE(review): `comb_sim` / `sim_argsort` are only consumed by the commented-out
        # wandb logging below, so this computation currently has no effect on the result.
        comb_sim = torch.cat([cos_sim[pos_mask][:,None],  # First position positive example
                              cos_sim.masked_fill(pos_mask, -9e15)],
                             dim=-1)
        sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1)
        # Logging ranking metrics
        #wandb.log({mode+' accuracy_top1': (sim_argsort == 0).float().mean(),
                   #mode+' accuracy_top5': (sim_argsort <5).float().mean(),
                   #mode+ ' loss': nll})
        return nll

    def training_step(self, batch, batch_idx):
        """Return the InfoNCE loss of a training batch."""
        return self.info_nce_loss(batch, mode='train')

    def validation_step(self, batch, batch_idx):
        """Return the InfoNCE loss of a validation batch."""
        return self.info_nce_loss(batch, mode='val')




In [None]:
@torch.no_grad()
def prepare_data_features(model, dataset):
    """Encode `dataset` with the base network of `model` and return features plus labels.

    A deep copy of `model.convnet` is used with its `fc` head replaced by an identity, so
    `model` itself is left untouched. Labels are flattened over their first two dimensions
    and all samples are re-ordered by ascending label.

    Args:
        model: Object exposing a `convnet` module with an `fc` attribute.
        dataset: Dataset yielding `(image, label)` pairs.

    Returns:
        A `TensorDataset` of (features, labels), sorted by label.
    """
    # Copy of the base network without the projection head g(.)
    encoder = deepcopy(model.convnet)
    encoder.fc = nn.Identity()
    encoder.eval()
    encoder.to(device)

    loader = data.DataLoader(dataset, batch_size=64, num_workers=NUM_WORKERS, shuffle=False, drop_last=False)
    feature_chunks = []
    label_chunks = []
    for images, targets in tqdm(loader):
        encoded = encoder(images.to(device))
        # Keep the results on the CPU so they do not pile up in GPU memory
        feature_chunks.append(encoded.detach().cpu())
        label_chunks.append(targets)

    all_feats = torch.cat(feature_chunks, dim=0)
    # Merge the first two label dimensions into one
    all_labels = torch.cat(label_chunks, dim=0).flatten(0,1)

    # Order the samples by label
    sorted_labels, order = all_labels.sort()
    return data.TensorDataset(all_feats[order], sorted_labels)

In [None]:

# Restore the SimCLR model from the checkpoint file selected via `modeltype`.
simclr_checkpoint_file = os.path.join(CHECKPOINT_PATH,modeltype)
simclr_model = SimCLR.load_from_checkpoint(checkpoint_path=simclr_checkpoint_file)

In [None]:
print(modeltype)
# Encode every dataset once with the restored SimCLR base network.
# NOTE(review): the `*_data_class`, `*_data_class2` and `*_data_class3` datasets are not defined
# in the visible part of the notebook - confirm they exist before running this cell.
train_feats_simclr = prepare_data_features(simclr_model, train_data_class)
val_feats_simclr = prepare_data_features(simclr_model, val_data_class)
test_feats_simclr = prepare_data_features(simclr_model, test_data_class)

train_feats_simclr2 = prepare_data_features(simclr_model, train_data_class2)
val_feats_simclr2 = prepare_data_features(simclr_model, val_data_class2)

train_feats_simclr3 = prepare_data_features(simclr_model, train_data_class3)
# Fixed: the argument was misspelled `val_data_clas3`, breaking the naming used by all siblings.
val_feats_simclr3 = prepare_data_features(simclr_model, val_data_class3)

In [None]:

# def flatten(original_dataset):
#      new_dataset = data.TensorDataset(
#          original_dataset.tensors[0],original_dataset.tensors[1].flatten(0,1)
#      )
#      return new_dataset

# train_feats_simclr2 = flatten(train_feats_simclr)
# val_feats_simclr2 = flatten(val_feats_simclr)
# test_feats_simclr2 = flatten(test_feats_simclr)




In [None]:
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import WeightedRandomSampler
train_labels = train_feats_simclr2.tensors[1].cpu().detach().numpy()

# class_weights = compute_class_weight(class_weight ='balanced',classes = np.unique(train_labels),y =train_labels)

# class_weights=torch.tensor(class_weights,dtype=torch.float)
# class_weights = class_weights.to(device)
# print(class_weights)


# Count the samples per class and, for every sample, get the position of its class in the
# sorted list of unique labels. Looking the weight up through that position (instead of using
# the raw label value as an index) stays correct when the label ids are not exactly 0..K-1,
# and it replaces one full pass over the labels per class with a single np.unique call.
_, class_index, class_sample_count = np.unique(train_labels, return_inverse=True, return_counts=True)
print(class_sample_count)

# Inverse-frequency weighting: every class gets the same total sampling mass.
weight = 1. / class_sample_count
samples_weight = torch.from_numpy(weight[class_index])
# NOTE: a sampler cannot be passed to a DataLoader together with shuffle=True.
weighted_random_sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))


[ 234  784  939  943  115  727  419  949  448 9515]


In [None]:
class LogisticRegression(pl.LightningModule):
    """Single linear layer trained with cross-entropy on pre-computed feature vectors.

    Args:
        feature_dim: Size of the input feature vectors.
        num_classes: Number of output classes.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        max_epochs: Reference length of training; the learning rate is multiplied by 0.1
            at 60% and at 80% of this value.
    """

    def __init__(self, feature_dim, num_classes, lr, weight_decay, max_epochs=100):
        super().__init__()
        self.save_hyperparameters("feature_dim","num_classes","lr","weight_decay","max_epochs")
        # Mapping from representation h to classes
        self.model = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        hp = self.hparams
        optimizer = optim.AdamW(self.parameters(), lr=hp.lr, weight_decay=hp.weight_decay)
        decay_epochs = [int(hp.max_epochs*0.6), int(hp.max_epochs*0.8)]
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=decay_epochs, gamma=0.1)
        return [optimizer], [lr_scheduler]

    def _loss_and_accuracy(self, batch):
        """Return (cross-entropy loss, accuracy) for a `(feats, labels)` batch."""
        feats, labels = batch
        preds = self.model(feats)
        # Class weighting (weight=class_weights) is intentionally not applied here.
        loss = F.cross_entropy(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        return loss, acc

    def training_step(self, batch, batch_idx):
        loss, acc = self._loss_and_accuracy(batch)
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc = self._loss_and_accuracy(batch)
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        return loss

    def test_step(self, batch, batch_idx):
        loss, acc = self._loss_and_accuracy(batch)
        metrics = {'test_loss': loss, 'test_acc': acc}
        self.log_dict(metrics)
        return metrics

In [None]:
class CheckpointEveryNSteps(pl.Callback):
    """
    Save a checkpoint every N steps, instead of Lightning's default that checkpoints
    based on validation loss.

    Checkpoints are written as "<global_step>best.ckpt".
    """

    def __init__(
        self,
        save_step_frequency,
        prefix="N-Step-Checkpoint",
        use_modelcheckpoint_filename=False,
        dirpath=None,
    ):
        """
        Args:
            save_step_frequency: how often to save in steps
            prefix: stored on the instance but currently not used for the file name
            use_modelcheckpoint_filename: stored on the instance but currently not used
            dirpath: directory the checkpoints are written to. When left as None the
                notebook-level global `folder` is used (the previous behaviour), which
                must then be defined before training starts.
        """
        super().__init__()
        self.save_step_frequency = save_step_frequency
        self.prefix = prefix
        self.use_modelcheckpoint_filename = use_modelcheckpoint_filename
        self.dirpath = dirpath

    def on_batch_end(self, trainer: pl.Trainer, _):
        """ Check if we should save a checkpoint after every train batch """
        global_step = trainer.global_step
        if global_step % self.save_step_frequency != 0:
            return
        # Prefer the explicitly configured directory; fall back to the global for
        # backward compatibility with existing notebook cells.
        target_dir = folder if self.dirpath is None else self.dirpath
        ckpt_path = os.path.join(target_dir, str(global_step)+"best.ckpt")
        trainer.save_checkpoint(ckpt_path)

In [None]:
from pytorch_lightning.loggers import WandbLogger


def train_logreg( batch_size, train_feats_data, val_feats_data,test_feats_data, max_epochs, **kwargs):
    """Train (or load) a LogisticRegression model on pre-computed features and evaluate it.

    If `<CHECKPOINT_PATH>/<filename>.ckpt` exists, that checkpoint is loaded and training is
    skipped. Otherwise a new model is fitted and the checkpoint with the best `val_acc` is
    loaded. In both cases `trainer.test` is then run on the training and validation loaders.

    Relies on the notebook globals `filename`, `CHECKPOINT_PATH`, `NUM_WORKERS` and `device`.

    Args:
        batch_size: Batch size of all data loaders.
        train_feats_data: Dataset of (features, labels) used for fitting.
        val_feats_data: Dataset of (features, labels) used for validation / model selection.
        test_feats_data: Accepted for interface compatibility; currently not evaluated.
        max_epochs: Number of training epochs. Also forwarded to the model so that its
            learning-rate milestones (60% / 80% of max_epochs) match the actual run length.
        **kwargs: Remaining constructor arguments of LogisticRegression.

    Returns:
        The loaded LogisticRegression model.
    """
    loader_kwargs = dict(batch_size=batch_size, drop_last=False, pin_memory=True, num_workers=NUM_WORKERS)
    # NOTE: to use weighted_random_sampler here, shuffle has to be dropped (they are mutually exclusive).
    train_loader = data.DataLoader(train_feats_data, shuffle=True, **loader_kwargs)
    val_loader = data.DataLoader(val_feats_data, shuffle=False, **loader_kwargs)

    wandb.init(project=filename,settings=wandb.Settings(start_method="thread"))
    wandb_logger = WandbLogger()
    trainer = pl.Trainer(default_root_dir=os.path.join(CHECKPOINT_PATH, filename),
                         gpus=1 if str(device)=="cuda:0" else 0,
                         max_epochs=max_epochs,
                         logger = wandb_logger,
                         callbacks=[ModelCheckpoint(filename='{epoch}-{step}-{val_loss:.2f}-{val_acc:.2f}',
                                    save_top_k=1,save_weights_only=True, mode='max', monitor='val_acc',save_on_train_epoch_end=False),
                                    LearningRateMonitor("epoch")],
                         progress_bar_refresh_rate=0,
                         check_val_every_n_epoch=1)

    pretrained_ckpt = os.path.join(CHECKPOINT_PATH,filename+".ckpt")
    if os.path.exists(pretrained_ckpt):
        print("dont_train")
        model = LogisticRegression.load_from_checkpoint(pretrained_ckpt)
    else:
        print("start_training")
        pl.seed_everything(42)

        # Pass max_epochs on to the model: otherwise its scheduler falls back to the default
        # of 100 epochs and the milestones no longer line up with the trainer's schedule.
        model = LogisticRegression(max_epochs=max_epochs, **kwargs)
        trainer.fit(model, train_loader, val_loader)

        # Continue with the best checkpoint (highest val_acc) rather than the last epoch
        model = LogisticRegression.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    #wandb.finish()
    trainer.test(model, dataloaders=train_loader, verbose=True)
    trainer.test(model, dataloaders=val_loader, verbose=True)
    return model

In [None]:
# Size of dimension 1 of the stored test feature tensor.
print(test_feats_simclr.tensors[0].shape[1])


In [None]:
# Settings of this run. `filename` and `folder` are module-level names on purpose:
# train_logreg and CheckpointEveryNSteps read them as globals.
learning_rate = 0.0025
#filename = "ArtDL_FInetuning"
filename = f"SImi{learning_rate}"
folder = os.path.join(CHECKPOINT_PATH, filename)

resnet_model = train_logreg(
    batch_size=64,
    train_feats_data=train_feats_simclr,
    val_feats_data=val_feats_simclr,
    test_feats_data=test_feats_simclr,
    feature_dim=train_feats_simclr.tensors[0].shape[1],
    num_classes=10,
    lr=learning_rate,
    weight_decay=1e-3,
    max_epochs=80,
)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr-AdamW,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_acc,█▁
test_loss,▁█
train_acc,▂▁▅▃▅▃▄▄▆▆▄▅▄▅▆▆▇▄▅▇▆▇▇▅▇▆▆▃▄▇▆▇▆▆▅▅█▆▆▆
train_loss,▇█▄▆▄▆▅▄▃▃▄▄▄▃▃▃▂▄▄▂▃▂▂▃▂▂▃▄▃▂▂▁▃▃▂▃▁▂▃▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▅▄▆▅▃▃▇█▄▆▄▅▄▄▂▃▄▄▆▃▅▂▅▅█▄▇▆▅▂▄▆
val_loss,▇▃▄▃▄▅▅▁▁▅▃▅▃▅▅█▆▄▅▃▅▄█▃▄▁▄▂▃▄▇▅▂

0,1
epoch,33.0
lr-AdamW,0.0025
test_acc,0.49221
test_loss,1.48529
train_acc,0.53125
train_loss,1.2694
trainer/global_step,7800.0
val_acc,0.41805
val_loss,1.66309


  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Global seed set to 42
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params
---------------------------------
0 | model | Linear | 20.5 K
---------------------------------
20.5 K    Trainable params
0         Non-trainable params
20.5 K    Total params
0.082     Total estimated model params size (MB)


start_training


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7548596858978271
        test_loss           0.7830480933189392
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc             0.673293948173523
        test_loss            1.138407826423645
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [None]:
@torch.no_grad()
def prepare_data_features2(model, dataset):
    """Return the predicted class index for every sample in `dataset`.

    Note that `model` is switched to eval mode and moved to `device` in place.

    Args:
        model: Callable module mapping a feature batch to per-class scores.
        dataset: Dataset yielding `(features, label)` pairs; the labels are ignored.

    Returns:
        1-D CPU tensor with the argmax class per sample, in dataset order
        (an empty long tensor if the dataset yields no batches).
    """
    model.eval()
    model.to(device)

    data_loader = data.DataLoader(dataset, batch_size=64, num_workers=NUM_WORKERS, shuffle=False, drop_last=False)
    batch_predictions = []
    for batch_feats, _ in tqdm(data_loader):
        scores = model(batch_feats.to(device))
        batch_predictions.append(scores.argmax(dim=-1).cpu())

    if not batch_predictions:
        # torch.cat raises on an empty list; return a well-typed empty result instead
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions, dim=0)

# Predicted class ids for the test, validation and training features (in that order).
pred, pred1, pred2 = [
    prepare_data_features2(resnet_model, split_feats)
    for split_feats in (test_feats_simclr, val_feats_simclr, train_feats_simclr)
]


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

In [None]:
# Show all test predictions that were assigned to class 5.
nx = pred.numpy()
y = np.where(nx == 5)
#print(y)
print(pred[y])

# Ground-truth labels of the three feature datasets
labels = test_feats_simclr.tensors[1]
labels1 = val_feats_simclr.tensors[1]
labels2 = train_feats_simclr.tensors[1]
print(pred.size(dim=0))

# Count hits and misses on the test split with tensor operations instead of a Python loop
# over single elements. `wrong_9` counts the misses in which class 9 was predicted.
mismatch = pred != labels
wrong_predict = int(mismatch.sum())
wrong_9 = int((mismatch & (pred == 9)).sum())
right_predict = int((~mismatch).sum())

print(wrong_predict)
print(wrong_9)
print(right_predict)

tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5])
1864
541
320
1323


In [None]:

# Per-class precision / recall / F1 for the test, validation and training split (in that order).
for true_labels, predicted in ((labels, pred), (labels1, pred1), (labels2, pred2)):
    print(classification_report(true_labels, predicted))

              precision    recall  f1-score   support

           0       0.64      0.24      0.35        29
           1       0.38      0.16      0.23        98
           2       0.52      0.47      0.50       118
           3       0.37      0.37      0.37        99
           4       0.25      0.14      0.18        14
           5       0.65      0.22      0.33        90
           6       0.11      0.04      0.06        52
           7       0.54      0.24      0.33       119
           8       0.60      0.46      0.53        56
           9       0.78      0.95      0.86      1189

    accuracy                           0.71      1864
   macro avg       0.48      0.33      0.37      1864
weighted avg       0.67      0.71      0.67      1864

              precision    recall  f1-score   support

           0       0.60      0.30      0.40        30
           1       0.35      0.14      0.20        98
           2       0.48      0.29      0.36       117
           3       0.37 