In [None]:
#!pip install --upgrade transformers

In [None]:
#!pip install wandb


In [None]:
import wandb

wandb.login()

In [7]:
!rm -rf /kaggle/working/*

  pid, fd = os.forkpty()


In [9]:
from torch import nn


class MisogynyCls(nn.Module):
    def __init__(self, num_linear_layers, task_a_out=1, task_b_out=4, input_dim=1024, hidden_dim=512, drop_value=0.2):
        super().__init__()
        self.head_task_a = nn.Linear(hidden_dim, task_a_out)
        self.head_task_b = nn.Linear(hidden_dim, task_b_out)
        self.sigmoid = nn.Sigmoid()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Check if CUDA is available
        
        # Pretrained CLIP loading...
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map='cuda')
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


        self.layers = nn.ModuleList()

        for i in range(num_linear_layers):
            if i == 0:
                self.layers.append(nn.Linear(input_dim, hidden_dim))
            else:
                self.layers.append(nn.Linear(hidden_dim, hidden_dim))
                
            self.layers.append(nn.BatchNorm1d(hidden_dim))
            self.layers.append(nn.Dropout(drop_value))
            self.layers.append(nn.ReLU())

    def forward(self, text_list, image_list):
        clip_inputs = self.clip_processor(text=text_list, images=image_list, return_tensors="pt", padding=True, truncation=True)
        clip_inputs['input_ids'] = clip_inputs['input_ids'].to(self.device)
        clip_inputs['attention_mask'] = clip_inputs['attention_mask'].to(self.device)
        clip_inputs['pixel_values'] = clip_inputs['pixel_values'].to(self.device)
        clip_outputs = self.clip_model(**clip_inputs)
        
        x = torch.cat([clip_outputs['text_embeds'], clip_outputs['image_embeds']], dim=1).to(self.device) # model input is the concatenation of the two modalities !
            
        for layer in self.layers:
            x = layer(x)
            
        pred_taskA = self.sigmoid(self.head_task_a(x))
        pred_taskB = self.sigmoid(self.head_task_b(x))
        
        return pred_taskA.squeeze(1), pred_taskB

## IN CASE WE NEED TO TRANSFORM A TSV/CSV INTO A JSON FORMAT (last two cells are only for testing purposese... the first instead is the one needed for the conversion from tsv to json)

In [10]:
import csv
import json
import os

images_path = "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING"

file_and_dest = [('/kaggle/input/dataset-wow/train_image_text.tsv','/kaggle/working/train_image_text.json'),
                    ('/kaggle/input/dataset-wow/test_image_text.tsv','/kaggle/working/test_image_text.json')]


for file in file_and_dest: 
    counter = 0
    data = []

    with open(file[0], newline='', encoding='utf-8') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        
        for row in reader:
            counter += 1
            data.append(row)
        print(f"counter: {counter}")
    if not os.path.exists(file[1]):
        with open(file[1], 'w', encoding='utf-8') as jsonfile:
            json.dump([], jsonfile, ensure_ascii=False, indent=4)
        print(f"File JSON vuoto creato come {file[1]}")

        with open(file[1], 'w', encoding='utf-8') as jsonfile:
            json.dump(data, jsonfile, ensure_ascii=False, indent=4)

        print(f"File JSON salvato come {file[1]}")



counter: 9000
File JSON vuoto creato come /kaggle/working/train_image_text.json
File JSON salvato come /kaggle/working/train_image_text.json
counter: 1000
File JSON vuoto creato come /kaggle/working/test_image_text.json
File JSON salvato come /kaggle/working/test_image_text.json


In [None]:
'''
import csv
import json
import os

images_path = "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING"

file_source = '/kaggle/input/dataset-wow/train_image_text.tsv'
file_train = '/kaggle/working/train_image_text.json'
file_test = '/kaggle/working/test_image_text.json'
data = []

with open(file_source, newline='', encoding='utf-8') as tsvfile:
    reader = csv.DictReader(tsvfile, delimiter='\t')
    

    for row in reader:
        file_path = os.path.join(images_path, row["file_name"])

        if os.path.exists(file_path):
            data.append(row)
    
    split = int(len(data)*0.8)

    with open(file_train, 'w', encoding='utf-8') as jsonfile:
        json.dump([], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON vuoto creato come {file_train}")

    with open(file_test, 'w', encoding='utf-8') as jsonfile:
        json.dump([], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON vuoto creato come {file_test}")
    
    with open(file_train, 'w', encoding='utf-8') as jsonfile:
        json.dump(data[:split], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON salvato come {file_train}")
        
    with open(file_test, 'w', encoding='utf-8') as jsonfile:
        json.dump(data[split:], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON salvato come {file_test}")
'''

## Multimodal Dataset definition

In [11]:
from torch.utils.data import Dataset
import os
import wandb

class MultimodalDataset(Dataset): # Dataset for handling multimodal data
    def __init__(self, images_dir, json_file_path): # dir_path -> directory path where images are stored / json_file_path -> file path for metadata (including labels)   
        file_paths, text_list, labels_misogyny, shaming_label_list, stereotype_label_list, objectification_label_list, violence_label_list = load_json_file(json_file_path)
   
        self.file_paths = file_paths
        self.images_dir = images_dir
        self.text_list = text_list
        self.labels_misogyny = labels_misogyny
        self.shaming_label_list = shaming_label_list
        self.stereotype_label_list = stereotype_label_list
        self.objectification_label_list = objectification_label_list
        self.violence_label_list = violence_label_list
        
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        return self.file_paths[idx], self.text_list[idx], self.labels_misogyny[idx], self.shaming_label_list[idx], self.stereotype_label_list[idx], self.objectification_label_list[idx], self.violence_label_list[idx]
    
def load_json_file(json_file_path):
    with open(json_file_path,"r") as f:
        data = json.load(f)
        text_list = [] # list of the text related to each image
        image_list = [] # list of images path
        labels_misogyny = [] # list of TASK A labels (misogyny classification)

        ### list of TASK B labels ###
        shaming_label_list = [] 
        stereotype_label_list = []
        objectification_label_list = []
        violence_label_list = []


        for item in tqdm(data): 
            image_list.append(item['file_name'])
            text_list.append(item["text"])
            labels_misogyny.append(float(item["label"]))
            shaming_label_list.append(float(item["shaming"]))
            stereotype_label_list.append(float(item["stereotype"]))
            objectification_label_list.append(float(item["objectification"]))
            violence_label_list.append(float(item["violence"]))

        #print(f"{type(labels_misogyny)}")
        return image_list, text_list, torch.tensor(labels_misogyny, dtype=torch.float32), torch.tensor(shaming_label_list, dtype=torch.float32), torch.tensor(stereotype_label_list, dtype=torch.float32), torch.tensor(objectification_label_list, dtype=torch.float32), torch.tensor(violence_label_list, dtype=torch.float32)
    
def accuracy(preds, labels, thresh):
    num_samples = labels.shape[0]
    preds = preds > thresh
    matching_rows = torch.eq(labels.bool(), preds)
    
    # in case we're dealing with the prediction of task B/task A (they've different number of dimensions)
    num_correct = matching_rows.all(dim=1).sum().item() if preds.ndim!=1 else matching_rows.sum().item()
    return 100*(num_correct/num_samples)

## LOADING THE DATASET AND TRAINING THE NETWORK
# Thanks to a class used for handling the argument of the network training (still room for modifying it !)

In [69]:
### import torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image # per debug
import os
import json
from tqdm import tqdm
from PIL import Image
import torch.nn.functional as F
import torch.optim as optim
import wandb



class Trainer():
    def __init__(self, train_images_dir,
                       test_image_dir,
                       json_train_path,
                       json_test_path,
                       num_linear_layers=5,
                       drop_value=0.2,
                       train_data_split=0.8,
                       batch_size=256, 
                       lr=0.001, 
                       num_epochs=10,
                       threshold=0.5,
                       weight_taskA=1,
                       weight_taskB=1):
        
        # Check if CUDA is available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f"Training on: {self.device}")
        
        # Loading the Dataset
        self.train_images_dir = train_images_dir
        self.test_image_dir = test_image_dir
        self.num_epochs = num_epochs
        self.threshold = threshold
        self.weight_taskA = weight_taskA
        self.weight_taskB = weight_taskB
        
        train_data = MultimodalDataset(train_images_dir, json_train_path)
        test_data = MultimodalDataset(test_image_dir, json_test_path)
        
        print(f"training on samples:{train_data.__len__()}")
        print(f"testing on samples:{test_data.__len__()}")
    
        self.train_dataloader = DataLoader(train_data, batch_size, shuffle=True, pin_memory=True)
        self.test_dataloader = DataLoader(test_data, batch_size, shuffle=True, pin_memory=True)

        # Defining the Model
        self.classifier = MisogynyCls(num_linear_layers=num_linear_layers, drop_value=drop_value).to(self.device)
        self.optimizer = optim.Adam(self.classifier.parameters(), lr)
        self.loss_taskA = F.binary_cross_entropy 
        self.loss_taskB = F.binary_cross_entropy

        # Pretrained CLIP loading...
        #self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map='cuda')
        #self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
    def get_test_dataset(self,size=100):
        batch = next(iter(self.test_dataloader))
        images, _,_,_,_,_,_ = batch
        return images[:size]
    
    def get_model(self):
        return self.classifier
    
        
    def train_model(self):
        for epoch in range(self.num_epochs):
            print(f'Epoch [{epoch+1}/{self.num_epochs}]')
            train_loss_list, test_loss_list, train_acc_taskA_list, train_acc_taskB_list,  test_acc_taskA_list, test_acc_taskB_list= self.train_epoch()
            
            train_loss_avg = sum(train_loss_list) / len(train_loss_list)
            test_loss_avg = sum(test_loss_list) / len(test_loss_list)
            train_acc_taskA_avg = sum(train_acc_taskA_list) / len(train_acc_taskA_list)
            train_acc_taskB_avg = sum(test_acc_taskA_list) / len(test_acc_taskA_list)
            test_acc_taskA_avg = sum(train_acc_taskB_list) / len(train_acc_taskB_list)
            test_acc_taskB_avg = sum(test_acc_taskB_list) / len(test_acc_taskB_list)
            
            print(f'Average Train Loss: {train_loss_avg: .4f}, Average Test Loss: {test_loss_avg: .4f}')
            print(f'Average Accuracy Train (task A): {train_acc_taskA_avg: .4f}%, Average Accuracy Test (task A): {train_acc_taskB_avg: .4f}%')
            print(f'Average Accuracy Train (task B): {test_acc_taskA_avg: .4f}%, Average Accuracy Test (task B): {test_acc_taskB_avg: .4f}%')
                    
    def train_epoch(self):
        train_loss_list = []
        test_loss_list = []
        train_acc_taskA_list = []
        train_acc_taskB_list = []
        test_acc_taskA_list = []
        test_acc_taskB_list = []
        
        for batch in tqdm(self.train_dataloader):
            self.optimizer.zero_grad() # ZEROING OUT THE GRADIENTS
            self.classifier.train() # TRAINING MODE
            
            # CREATING THE CLIP EMBEDDINGS
            image_list, text_list, labels_misogyny, shaming_labels, stereotype_labels, objectification_labels, violence_labels = batch
            image_list = [Image.open(f"{os.path.join(self.train_images_dir, img)}") for img in image_list] # per poterlo usare poi con CLIP

            labels_misogyny = labels_misogyny.to(self.device)
            labels_taskB = torch.stack([shaming_labels, stereotype_labels, objectification_labels, violence_labels],  dim=1).to(self.device)
    
            pred_taskA, pred_taskB = self.classifier(text_list, image_list)

            loss_A = self.loss_taskA(pred_taskA, labels_misogyny)
            loss_B = self.loss_taskB(pred_taskB, labels_taskB, reduction='mean')
            loss = (self.weight_taskA * loss_A) + (self.weight_taskB * loss_B)
            train_loss_list.append(loss)
            
            loss.backward()
            self.optimizer.step()

            accuracy_taskA = accuracy(pred_taskA, labels_misogyny, self.threshold)
            accuracy_taskB = accuracy(pred_taskB, labels_taskB, self.threshold)
            
            train_acc_taskA_list.append(accuracy_taskA)
            train_acc_taskB_list.append(accuracy_taskB)
        
        
        with torch.no_grad():
            self.classifier.eval()

            for batch in tqdm(self.test_dataloader):
                # CREATING THE CLIP EMBEDDINGS
                image_list, text_list, labels_misogyny, shaming_labels, stereotype_labels, objectification_labels, violence_labels = batch

                image_list = [Image.open(f"{os.path.join(self.test_image_dir, img)}") for img in image_list] # per poterlo usare poi con CLIP
                labels_misogyny = labels_misogyny.to(self.device)
                labels_taskB = torch.stack([shaming_labels, stereotype_labels, objectification_labels, violence_labels],  dim=1).to(self.device)
                
                pred_taskA, pred_taskB = self.classifier(text_list, image_list)

                loss_A = self.loss_taskA(pred_taskA, labels_misogyny)
                loss_B = self.loss_taskB(pred_taskB, labels_taskB, reduction='mean')
                loss = (self.weight_taskA * loss_A) + (self.weight_taskB * loss_B)
                test_loss_list.append(loss)

                accuracy_taskA = accuracy(pred_taskA, labels_misogyny, self.threshold)
                accuracy_taskB = accuracy(pred_taskB, labels_taskB, self.threshold)

                test_acc_taskA_list.append(accuracy_taskA)
                test_acc_taskB_list.append(accuracy_taskB)
            
            
            
        return train_loss_list, test_loss_list, train_acc_taskA_list, train_acc_taskB_list, test_acc_taskA_list, test_acc_taskB_list

            

## Training WITHOUT wandb

In [70]:
model_trainer = Trainer("/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING",# train_images_dir
                        "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/test", #test_images_dir
                        "/kaggle/working/train_image_text.json",
                        "/kaggle/working/test_image_text.json",
                        batch_size=64,
                        num_epochs=3) # json_file as data source


Training on: cuda:0


100%|██████████| 9000/9000 [00:00<00:00, 457466.17it/s]
100%|██████████| 1000/1000 [00:00<00:00, 375027.18it/s]

training on samples:9000
testing on samples:1000





In [26]:
model_trainer.train_model()

Training on: cuda:0


100%|██████████| 9000/9000 [00:00<00:00, 454393.45it/s]
100%|██████████| 1000/1000 [00:00<00:00, 306019.55it/s]

training on samples:9000
testing on samples:1000





Epoch [1/3]


  3%|▎         | 4/141 [00:08<04:58,  2.18s/it]


KeyboardInterrupt: 

## Training with wandb

In [None]:
import wandb


def train(config=None):
    with wandb.init(config=config):
    
        config = wandb.config
        model_trainer = Trainer("/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING",# train_images_dir
                            "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/test", #test_images_dir
                            "/kaggle/working/train_image_text.json",
                            "/kaggle/working/test_image_text.json",
                            batch_size=64,
                            num_linear_layers=config.num_layers,
                            lr=config.learning_rate,
                            threshold=config.threshold,
                            drop_value=config.dropout) # json_file as data source

        model_trainer.train_model()
    
    
# Log in to W&B using the API key
sweep_config = {
    'method': 'random'
}
parameters_dict = {
    'learning_rate': {
        'values': [0.1, 0.01, 0.001, 0.0001]
        },
    'threshold': {
          'values': [0.5, 0.6, 0.7, 0.8]
        },
    'dropout': {
          'values': [0.2, 0.3, 0.5]
        },
    'num_layers': {
          'values': [5, 7, 10]
        },
    }

sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="Multimodal_xai")   
wandb.agent(sweep_id, train, count=20)
    

ShapSingle Modalities Explaination

In [71]:
test_img=model_trainer.get_test_dataset(100) 
background_dataset= test_img[:100]

image_to_explain = test_img[100:103]



In [72]:
import shap
e = shap.DeepExplainer(model_trainer.get_model(), background_dataset)
shap_values = e.shap_values(image_to_explain)

TypeError: MisogynyCls.forward() takes 3 positional arguments but 65 were given