In [1]:
import numpy as np
import torch
from pkg_resources import packaging

print("Torch version:", torch.__version__)

Torch version: 2.1.1


  from pkg_resources import packaging


In [2]:
import clip

clip.available_models()


['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
model, preprocess = clip.load("ViT-B/32")
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [4]:
from pyexpat import features
import copy
import math
from sys import prefix
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
import copy

from transformers import CLIPModel, AutoConfig, AutoModel

Duplicate key in file '/Users/naziultalukder/.matplotlib/matplotlibrc', line 2 ('backend: PyQt5')
Duplicate key in file '/Users/naziultalukder/.matplotlib/matplotlibrc', line 3 ('backend: TkAgg')


In [5]:
import os

import pandas as pd
import torch
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset
from transformers import CLIPTokenizer, CLIPProcessor, AutoTokenizer


class HatefulMemesDataset(Dataset):
    def __init__(self, root_folder, image_folder, split='train', labels='original', image_size=224):
        super(HatefulMemesDataset, self).__init__()
        self.root_folder = root_folder
        self.image_folder = image_folder
        self.split = split
        self.labels = labels
        self.image_size = image_size
        self.info_file = f"{self.split}.csv"

        print("data here: ", self.info_file)
        self.df = pd.read_csv(self.info_file)
        # float_cols = self.df.select_dtypes(float).columns
        # self.df[float_cols] = self.df[float_cols].fillna(-1).astype('Int64')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        item = {}
        image_fn = row['img'].split('/')[1]
        item['image'] = Image.open(f"{self.image_folder}/{image_fn}").convert('RGB').resize((self.image_size, self.image_size))
        item['text'] = row['text']
        item['label'] = row['label']
        item['idx_meme'] = row['id']
        return item

In [6]:
class CustomCollator(object):

    def __init__(self):
        pre_trained_model = 'openai/clip-vit-base-patch32'
        self.image_processor = CLIPProcessor.from_pretrained(pre_trained_model)
        self.text_processor = CLIPTokenizer.from_pretrained(pre_trained_model)

    def __call__(self, batch):
        pixel_values = self.image_processor(images=[item['image'] for item in batch], return_tensors="pt")['pixel_values']
        text_output = self.text_processor([item['text'] for item in batch], padding=True, return_tensors="pt", truncation=True)
        labels = torch.LongTensor([item['label'] for item in batch])
        idx_memes = torch.LongTensor([item['idx_meme'] for item in batch])

        batch_new = {}
        batch_new['pixel_values'] = pixel_values,
        batch_new['input_ids'] = text_output['input_ids']
        batch_new['attention_mask'] = text_output['attention_mask']

        batch_new['labels'] = labels
        batch_new['idx_memes'] = idx_memes

        return batch_new

collator = CustomCollator()

In [7]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores
multilingual_tokenizer_path = 'none'
fine_grained_labels = []
compute_fine_grained_metrics = True

# Load Datasets

In [8]:
labels = "original"
image_folder = 'data/img'
image_size = 224

dataset_train = HatefulMemesDataset(root_folder='data/', image_folder=image_folder, split='train',
            labels=labels, image_size=image_size)

dataset_val = HatefulMemesDataset(root_folder='data/', image_folder=image_folder, split='dev',
            labels=labels, image_size=image_size)

data here:  train.csv
data here:  dev.csv


In [9]:
dataset_train[0]

{'image': <PIL.Image.Image image mode=RGB size=224x224>,
 'text': 'its their character not their color that matters',
 'label': 0,
 'idx_meme': 42953}

In [10]:
from torch.utils.data import DataLoader

dataloader_train = DataLoader(dataset_train, batch_size=16, shuffle=True, num_workers=0, collate_fn=collator)
dataloader_val = DataLoader(dataset_val, batch_size=16, shuffle=False, num_workers=0, collate_fn=collator)

In [11]:
compute_fine_grained_metrics

True

In [12]:
dataloader_train

<torch.utils.data.dataloader.DataLoader at 0x2c840d8b0>

# Train The Model

In [13]:
class CLIPClassifier(pl.LightningModule):

    def __init__(self, args, fine_grained_labels, compute_fine_grained_metrics):
        super().__init__()

        # self.caption_mode = args.caption_mode
        self.use_pretrained_map = args['use_pretrained_map']
        self.num_mapping_layers = args['num_mapping_layers']
        self.map_dim = args['map_dim']
        self.fusion = args['fusion']
        self.num_pre_output_layers = args['num_pre_output_layers']
        self.lr = args['lr']
        self.weight_decay = args['weight_decay']
        self.weight_image_loss = args['weight_image_loss']
        self.weight_text_loss = args['weight_text_loss']
        self.weight_fine_grained_loss = args['weight_fine_grained_loss']
        self.weight_super_loss = args['weight_super_loss']
        self.fine_grained_labels = fine_grained_labels
        self.compute_fine_grained_metrics = compute_fine_grained_metrics

        self.acc = torchmetrics.Accuracy(task="binary")
        self.auroc = torchmetrics.AUROC(task="binary")
        # self.precision_score = torchmetrics.Precision()
        # self.recall = torchmetrics.Recall()
        # self.f1 = torchmetrics.F1Score()

        self.validation_step_outputs = []

       

        self.clip = CLIPModel.from_pretrained(args['clip_pretrained_model'])
        self.image_encoder = copy.deepcopy(self.clip.vision_model)
        self.text_encoder = copy.deepcopy(self.clip.text_model)
        self.image_map = nn.Sequential(
                copy.deepcopy(self.clip.visual_projection),
                nn.ReLU(),
                nn.Linear(self.clip.projection_dim, self.map_dim)
                )
        self.text_map = nn.Sequential(
            copy.deepcopy(self.clip.text_projection),
            nn.ReLU(),
            nn.Linear(self.clip.projection_dim, self.map_dim)
            )


        

        if args['fusion'] in ['align', 'align_shuffle']:
            pre_output_input_dim = self.map_dim
        elif args['fusion'] == 'concat':
            pre_output_input_dim = self.map_dim*2
        elif args['fusion'].startswith('cross'):
            pre_output_input_dim = self.map_dim**2
        elif args['fusion'] == 'align_concat':
            pre_output_input_dim = self.map_dim*3
        elif args['fusion'] == 'attention_m':
            self.gen_query = nn.Linear(self.map_dim, self.map_dim//4)
            self.gen_key = nn.Linear(self.map_dim, self.map_dim//4)
            self.soft = nn.Softmax(dim=1)
            pre_output_input_dim = self.map_dim*2

        pre_output_layers = [nn.Dropout(p=args['drop_probs'])]
        output_input_dim = pre_output_input_dim


        if self.num_pre_output_layers >= 1: # first pre-output layer
            pre_output_layers.extend([nn.Linear(pre_output_input_dim, self.map_dim), nn.ReLU(), nn.Dropout(p=args['drop_probs'])])
            output_input_dim = self.map_dim
        for _ in range(1, self.num_pre_output_layers): # next pre-output layers
            pre_output_layers.extend([nn.Linear(self.map_dim, self.map_dim), nn.ReLU(), nn.Dropout(p=args['drop_probs'])])

        self.pre_output = nn.Sequential(*pre_output_layers)
        self.output = nn.Linear(output_input_dim, 1)
        self.output_image = nn.Linear(output_input_dim, 1)
        self.output_text = nn.Linear(output_input_dim, 1)

        if self.weight_image_loss > 0:
            pre_output_layers = [nn.Dropout(p=args['drop_probs'])]
            for _ in range(self.num_pre_output_layers): # next pre-output layers
                pre_output_layers.extend([nn.Linear(self.map_dim, self.map_dim), nn.ReLU(), nn.Dropout(p=args['drop_probs'])])
            self.pre_output_image = nn.Sequential(*pre_output_layers)

        if self.weight_text_loss > 0:
            pre_output_layers = [nn.Dropout(p=args['drop_probs'])]
            for _ in range(self.num_pre_output_layers): # next pre-output layers
                pre_output_layers.extend([nn.Linear(self.map_dim, self.map_dim), nn.ReLU(), nn.Dropout(p=args['drop_probs'])])
            self.pre_output_text = nn.Sequential(*pre_output_layers)

        if self.fine_grained_labels:
            # if self.dataset in ['original', 'masked', 'inpainted']:
            self.output_pc1 = nn.Linear(output_input_dim, 1)
            self.output_pc2 = nn.Linear(output_input_dim, 1)
            self.output_pc3 = nn.Linear(output_input_dim, 1)
            self.output_pc4 = nn.Linear(output_input_dim, 1)
            self.output_pc5 = nn.Linear(output_input_dim, 1)
            self.output_pc6 = nn.Linear(output_input_dim, 1)
            self.output_attack1 = nn.Linear(output_input_dim, 1)
            self.output_attack2 = nn.Linear(output_input_dim, 1)
            self.output_attack3 = nn.Linear(output_input_dim, 1)
            self.output_attack4 = nn.Linear(output_input_dim, 1)
            self.output_attack5 = nn.Linear(output_input_dim, 1)
            self.output_attack6 = nn.Linear(output_input_dim, 1)
            self.output_attack7 = nn.Linear(output_input_dim, 1)
            self.output_attack8 = nn.Linear(output_input_dim, 1)
            self.outputs_fine_grained = [self.output_pc1, self.output_pc2, self.output_pc3, self.output_pc4, self.output_pc5, self.output_pc6,
                self.output_attack1, self.output_attack2, self.output_attack3, self.output_attack4, self.output_attack5, self.output_attack6, self.output_attack7, self.output_attack8]
            self.output_super = nn.Linear(15, 1)

        self.cross_entropy_loss = torch.nn.BCEWithLogitsLoss(reduction='mean')

        if args['freeze_image_encoder']:
            for _, p in self.image_encoder.named_parameters():
                p.requires_grad_(False)

        if args['freeze_text_encoder']:
            for _, p in self.text_encoder.named_parameters():
                p.requires_grad_(False)

        del self.clip
        # if self.caption_mode == 'replace_image':
        #     del self.image_encoder, self.image_map

    def forward(self, batch):

        image_features = self.image_encoder(pixel_values=batch['pixel_values'][0]).pooler_output
        image_features = self.image_map(image_features)
        text_features = self.text_encoder(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).pooler_output

        image_features = F.normalize(image_features, p=2, dim=1) # [batch_size, d]
        text_features = F.normalize(text_features, p=2, dim=1) # [batch_size, d]

        features = torch.mul(image_features, text_features)  # [batch_size, d]

        features = self.pre_output(features)
        logits = self.output(features)
        preds = (torch.sigmoid(logits) >= 0.5).long()

        return preds

    def common_step(self, batch, batch_idx, calling_function='validation'):
        image_features = self.image_encoder(pixel_values=batch['pixel_values'][0]).pooler_output
        image_features = self.image_map(image_features)

        text_features = self.text_encoder(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).pooler_output
        text_features = self.text_map(text_features)

        image_features = F.normalize(image_features, p=2, dim=1)
        text_features = F.normalize(text_features, p=2, dim=1)

        output = {}

        if self.weight_image_loss > 0:
            features_pre_output = self.pre_output_image(image_features)
            logits = self.output_image(features_pre_output).squeeze(dim=1) # [batch_size, 1]
            preds_proxy = torch.sigmoid(logits)
            preds = (preds_proxy >= 0.5).long()

            output['image_loss'] = self.cross_entropy_loss(logits, batch['labels'].float())
            output['image_accuracy'] = self.acc(preds, batch['labels'])
            output['image_auroc'] = self.auroc(preds_proxy, batch['labels'])

        if self.weight_text_loss > 0:
            features_pre_output = self.pre_output_text(text_features)
            logits = self.output_text(features_pre_output).squeeze(dim=1) # [batch_size, 1]
            preds_proxy = torch.sigmoid(logits)
            preds = (preds_proxy >= 0.5).long()

            output['text_loss'] = self.cross_entropy_loss(logits, batch['labels'].float())
            output['text_accuracy'] = self.acc(preds, batch['labels'])
            output['text_auroc'] = self.auroc(preds_proxy, batch['labels'])


        features = torch.mul(image_features, text_features)

        features_pre_output = self.pre_output(features)
        logits = self.output(features_pre_output).squeeze(dim=1) # [batch_size, 1(or)n]
        if self.fine_grained_labels and self.dataset in ['original', 'masked', 'inpainted']:
            logits_for_super = [torch.relu(logits)]
        preds_proxy = torch.sigmoid(logits)
        preds = (preds_proxy >= 0.5).long()

        output['loss'] = self.cross_entropy_loss(logits, batch['labels'].float())
        output['accuracy'] = self.acc(preds, batch['labels'])
        output['auroc'] = self.auroc(preds_proxy, batch['labels'])



        if calling_function == 'training' and self.fine_grained_labels and self.outputs_fine_grained:
            for fine_grained_label, output_fine_grained in zip(self.fine_grained_labels, self.outputs_fine_grained):
                logits = output_fine_grained(features_pre_output).squeeze(dim=1)
                logits_for_super.append(torch.relu(logits))
                preds_proxy = torch.sigmoid(logits)
                preds = (preds_proxy >= 0.5).long()
                output[f'{fine_grained_label}_loss'] = self.cross_entropy_loss(logits, batch[fine_grained_label].float())
            logits_for_super = torch.stack(logits_for_super, dim=1) # [batch_size, 15]
            logits = self.output_super(logits_for_super).squeeze(dim=1)
            preds_proxy = torch.sigmoid(logits)
            preds = (preds_proxy >= 0.5).long()
            output['super_loss'] = self.cross_entropy_loss(logits, batch['labels'].float())
            output['super_accuracy'] = self.acc(preds, batch['labels'])
            output['super_auroc'] = self.auroc(preds_proxy, batch['labels'])


        elif calling_function == 'validation' and self.fine_grained_labels and self.outputs_fine_grained:
            for fine_grained_label, output_fine_grained in zip(self.fine_grained_labels, self.outputs_fine_grained):
                logits = output_fine_grained(features_pre_output).squeeze(dim=1)
                logits_for_super.append(torch.relu(logits))
                preds_proxy = torch.sigmoid(logits)
                preds = (preds_proxy >= 0.5).long()
                output[f'{fine_grained_label}_loss'] = self.cross_entropy_loss(logits, batch[fine_grained_label].float())
                output[f'{fine_grained_label}_accuracy'] = self.acc(preds, batch[fine_grained_label])
                output[f'{fine_grained_label}_auroc'] = self.auroc(preds_proxy, batch[fine_grained_label])
                
            logits_for_super = torch.stack(logits_for_super, dim=1) # [batch_size, 15]
            logits = self.output_super(logits_for_super).squeeze(dim=1)
            preds_proxy = torch.sigmoid(logits)
            preds = (preds_proxy >= 0.5).long()
            output[f'super_loss'] = self.cross_entropy_loss(logits, batch['labels'].float())
            output[f'super_accuracy'] = self.acc(preds, batch['labels'])
            output[f'super_auroc'] = self.auroc(preds_proxy, batch['labels'])

        elif calling_function == 'visualisation-v1':
            return image_features, text_features

        elif calling_function == 'visualisation-v2':
            return features

        return output

    def training_step(self, batch, batch_idx):
        output = self.common_step(batch, batch_idx, calling_function='training')

        if self.weight_image_loss > 0:
            image_loss = output['image_loss']
        else:
            image_loss = 0

        if self.weight_text_loss > 0:
            text_loss = output['text_loss']
        else:
            text_loss = 0

        if self.fine_grained_labels and self.outputs_fine_grained:
            fine_grained_loss = 0
            for fine_grained_label in self.fine_grained_labels:
                fine_grained_loss += output[f'{fine_grained_label}_loss']
            fine_grained_loss /= len(self.fine_grained_labels)
            super_loss = output['super_loss']
        else:
            fine_grained_loss = 0.0
            super_loss = 0.0

        total_loss = output['loss'] + self.weight_image_loss * image_loss + self.weight_text_loss * text_loss + self.weight_fine_grained_loss * fine_grained_loss + self.weight_super_loss * super_loss

        self.log('train/total_loss', total_loss)
        self.log('train/loss', output['loss'])
        self.log('train/accuracy', output['accuracy'])
        self.log('train/auroc', output['auroc'])

        if self.weight_image_loss > 0:
            self.log('train/image_loss', image_loss)
        if self.weight_text_loss > 0:
            self.log('train/text_loss', text_loss)

        self.log('train/fine_grained_loss', fine_grained_loss)
        self.log('train/super_loss', super_loss)

        return total_loss

    def validation_step(self, batch, batch_idx):
        output = self.common_step(batch, batch_idx, calling_function='validation')

        if self.weight_image_loss > 0:
            image_loss = output['image_loss']
        else:
            image_loss = 0

        if self.weight_text_loss > 0:
            text_loss = output['text_loss']
        else:
            text_loss = 0

        if self.fine_grained_labels and self.outputs_fine_grained:
            fine_grained_loss = torch.mean(torch.Tensor([output[f'{fine_grained_label}_loss'] for fine_grained_label in self.fine_grained_labels]))
            super_loss = output['super_loss']
        else:
            fine_grained_loss = 0.0
            super_loss = 0.0

        total_loss = output['loss'] + self.weight_image_loss * image_loss + self.weight_text_loss * text_loss + self.weight_fine_grained_loss * fine_grained_loss + self.weight_super_loss * super_loss

        self.log(f'val/total_loss', total_loss)
        self.log(f'val/loss', output['loss'])
        self.log(f'val/accuracy', output['accuracy'])
        self.log(f'val/auroc', output['auroc'])

        if self.weight_image_loss > 0:
            self.log(f'val/image_loss', image_loss)
        if self.weight_text_loss > 0:
            self.log(f'val/text_loss', text_loss)


        # TODO include this logic if needed
        if self.fine_grained_labels and self.compute_fine_grained_metrics:
            self.log(f'val/fine_grained_loss', fine_grained_loss)
            self.log(f'val/super_loss', super_loss)

            for fine_grained_label in self.fine_grained_labels:
                self.log(f'val-fine-grained/{fine_grained_label}_accuracy', output[f'{fine_grained_label}_accuracy'])
                self.log(f'val-fine-grained/{fine_grained_label}_auroc', output[f'{fine_grained_label}_auroc'])
                # self.log(f'val-fine-grained/{fine_grained_label}_precision', output[f'{fine_grained_label}_precision'])
                # self.log(f'val-fine-grained/{fine_grained_label}_recall', output[f'{fine_grained_label}_recall'])
                # self.log(f'val-fine-grained/{fine_grained_label}_f1', output[f'{fine_grained_label}_f1'])

            self.log(f'val/super_loss', output['super_loss'])
            self.log(f'val/super_accuracy', output['super_accuracy'])
            self.log(f'val/super_auroc', output['super_auroc'])

        self.validation_step_outputs.append(total_loss)
        return total_loss

    # def on_train_epoch_end(self, validation_step_outputs):
    def on_train_epoch_end(self):
        self.acc.reset()
        self.auroc.reset()
        # self.precision_score.reset()
        # self.recall.reset()
        # self.f1.reset()

    # def on_validation_epoch_end(self, validation_step_outputs):
    def on_validation_epoch_end(self):

        self.acc.reset()
        self.auroc.reset()
        # self.precision_score.reset()
        # self.recall.reset()
        # self.f1.reset()

        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    # def test_epoch_end(self, validation_step_outputs):
    #     self.acc.reset()
    #     self.auroc.reset()
    #     self.precision_score.reset()
    #     self.recall.reset()
    #     self.f1.reset()

    def configure_optimizers(self):
        param_dicts = [
            {"params": [p for n, p in self.named_parameters() if p.requires_grad]}
            ]
        # print("what are params ", param_dicts)
        optimizer = torch.optim.AdamW(param_dicts, lr=self.lr, weight_decay=self.weight_decay)

        return optimizer


def create_model(args, fine_grained_labels):
    compute_fine_grained_metrics = True
    model = CLIPClassifier(args=args, fine_grained_labels=fine_grained_labels, compute_fine_grained_metrics = compute_fine_grained_metrics)

    return model

In [14]:
# setup params
clip_model = "openai/clip-vit-base-patch32"

default_param = {
    "use_pretrained_map": False,
    "num_mapping_layers": 1,
    "map_dim": 768,
    "fusion": "align",
    "num_pre_output_layers": 1,
    "lr": 1e-4,
    "weight_decay": 1e4,
    "weight_image_loss": 1.0,
    "weight_text_loss": 1.0,
    "weight_fine_grained_loss": 1.0,
    "weight_super_loss": 1.0,
    "fine_grained_labels": [],
    "clip_pretrained_model": clip_model,
    "drop_probs": 0.1,
    "freeze_image_encoder": True,
    "freeze_text_encoder": True
}

default_param = {
    "use_pretrained_map": False,
    "num_mapping_layers": 1,
    "map_dim": 768,
    "fusion": "align",
    "num_pre_output_layers": 1,
    "lr": 1e-4,
    "weight_decay": 1e4,
    "weight_image_loss": 1.0,
    "weight_text_loss": 1.0,
    "weight_fine_grained_loss": 1.0,
    "weight_super_loss": 1.0,
    "fine_grained_labels": [],
    "clip_pretrained_model": clip_model,
    "drop_probs": 0.1,
    "freeze_image_encoder": True,
    "freeze_text_encoder": True
}


from pytorch_lightning import Trainer, seed_everything


model = create_model(default_param, fine_grained_labels=[])

In [15]:
model

CLIPClassifier(
  (acc): BinaryAccuracy()
  (auroc): BinaryAUROC()
  (image_encoder): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_f

In [16]:
# from pytorch_lightning.callbacks import ModelCheckpoint

# monitor="val/auroc"
# project="meme-v2"

# checkpoint_callback = ModelCheckpoint(dirpath='checkpoints', filename=project,  monitor=monitor, 
#                                       mode='max', verbose=True, save_weights_only=True, save_top_k=3, save_last=False)

In [17]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint

seed_everything(42)
model = create_model(default_param, fine_grained_labels=[])


# #TODO add GPU later
# gpus=args.gpus

max_steps = -1
gradient_clip_val = 0.1
log_every_n_steps = 50
max_epochs = -1
val_check_interval = 1.0
limit_train_batches = 1.0
limit_val_batches = 1.0

monitor="val/auroc"
project="meme-v2"


checkpoint_callback = ModelCheckpoint(dirpath='checkpoints', filename='checkpointFile',  
                                      monitor=monitor, mode='max', verbose=True, save_weights_only=True, save_top_k=3, save_last=False)

# accelerator="cpu", devices=2
# accelerator="gpu", devices=1

trainer = Trainer(max_epochs=max_epochs, max_steps=max_steps, gradient_clip_val=gradient_clip_val, 
        log_every_n_steps=log_every_n_steps, val_check_interval=val_check_interval, accelerator="cpu", devices=1,
        strategy="auto", callbacks=[checkpoint_callback],
        limit_train_batches=limit_train_batches, limit_val_batches=limit_val_batches,
        deterministic=True)

Seed set to 42
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/naziultalukder/miniconda3/envs/test-env4/lib/python3.8/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/naziultalukder/miniconda3/envs/test-env4/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches pe

In [None]:
trainer.fit(model, train_dataloaders=dataloader_train, val_dataloaders=dataloader_val)

/Users/naziultalukder/miniconda3/envs/test-env4/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory checkpoints exists and is not empty.

   | Name               | Type                  | Params
--------------------------------------------------------------
0  | acc                | BinaryAccuracy        | 0     
1  | auroc              | BinaryAUROC           | 0     
2  | image_encoder      | CLIPVisionTransformer | 87.5 M
3  | text_encoder       | CLIPTextTransformer   | 63.2 M
4  | image_map          | Sequential            | 787 K 
5  | text_map           | Sequential            | 656 K 
6  | pre_output         | Sequential            | 590 K 
7  | output             | Linear                | 769   
8  | output_image       | Linear                | 769   
9  | output_text        | Linear                | 769   
10 | pre_output_image   | Sequential            | 590 K 
11 | pre_output_text    | Sequential            | 590 K 
12 | cro

Sanity Checking: |                                                                                            …

/Users/naziultalukder/miniconda3/envs/test-env4/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/naziultalukder/miniconda3/envs/test-env4/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Epoch 0, global step 532: 'val/auroc' reached 0.44800 (best 0.44800), saving model to 'checkpoints/checkpointFile-v3.ckpt' as top 3


Validation: |                                                                                                 …

Epoch 1, global step 1064: 'val/auroc' reached 0.44800 (best 0.44800), saving model to 'checkpoints/checkpointFile-v4.ckpt' as top 3


Validation: |                                                                                                 …

Epoch 2, global step 1596: 'val/auroc' reached 0.44800 (best 0.44800), saving model to 'checkpoints/checkpointFile-v5.ckpt' as top 3


Validation: |                                                                                                 …

Epoch 3, global step 2128: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 4, global step 2660: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 5, global step 3192: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 6, global step 3724: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 7, global step 4256: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 8, global step 4788: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 9, global step 5320: 'val/auroc' was not in top 3


Validation: |                                                                                                 …

Epoch 10, global step 5852: 'val/auroc' was not in top 3


In [None]:
dataloader_val

- Added 0 worker for dataloader and 1 device for trainer. The auroc is 0.448 and it is unchanged through different epochs.
- <the issue could be that configure_optimizer is not taking the right parameters and updating the right thing