In [1]:
#!pip install --upgrade transformers

In [2]:
#!pip install wandb


In [3]:
#!wandb login


In [12]:
!rm -rf /kaggle/working/*

In [13]:
from torch import nn


class MisogynyCls(nn.Module):
    def __init__(self, num_linear_layers, task_a_out=1, task_b_out=4, input_dim=1024, hidden_dim=512, drop_value=0.2):
        super().__init__()
        self.head_task_a = nn.Linear(hidden_dim, task_a_out)
        self.head_task_b = nn.Linear(hidden_dim, task_b_out)
        self.sigmoid = nn.Sigmoid()


        self.layers = nn.ModuleList()

        for i in range(num_linear_layers):
            if i == 0:
                self.layers.append(nn.Linear(input_dim, hidden_dim))
            else:
                self.layers.append(nn.Linear(hidden_dim, hidden_dim))
                
            self.layers.append(nn.Dropout(drop_value))
            self.layers.append(nn.ReLU())

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        pred_taskA = self.sigmoid(self.head_task_a(x))
        pred_taskB = self.sigmoid(self.head_task_b(x))
        
        return pred_taskA.squeeze(1), pred_taskB

## IN CASE WE NEED TO TRANSFORM A TSV/CSV INTO A JSON FORMAT (last two cells are only for testing purposese... the first instead is the one needed for the conversion from tsv to json)

In [14]:
import csv
import json
import os

images_path = "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING"

file_and_dest = [('/kaggle/input/dataset-wow/train_image_text.tsv','/kaggle/working/train_image_text.json'),
                    ('/kaggle/input/dataset-wow/test_image_text.tsv','/kaggle/working/test_image_text.json')]


for file in file_and_dest: 
    data = []

    with open(file[0], newline='', encoding='utf-8') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        
        for row in reader:
            data.append(row)
                
    if not os.path.exists(file[1]):
        with open(file[1], 'w', encoding='utf-8') as jsonfile:
            json.dump([], jsonfile, ensure_ascii=False, indent=4)
        print(f"File JSON vuoto creato come {file[1]}")

        with open(file[1], 'w', encoding='utf-8') as jsonfile:
            json.dump(data, jsonfile, ensure_ascii=False, indent=4)

        print(f"File JSON salvato come {file[1]}")



File JSON vuoto creato come /kaggle/working/train_image_text.json
File JSON salvato come /kaggle/working/train_image_text.json
File JSON vuoto creato come /kaggle/working/test_image_text.json
File JSON salvato come /kaggle/working/test_image_text.json


In [5]:
'''
import csv
import json
import os

images_path = "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING"

file_source = '/kaggle/input/dataset-wow/train_image_text.tsv'
file_train = '/kaggle/working/train_image_text.json'
file_test = '/kaggle/working/test_image_text.json'
data = []

with open(file_source, newline='', encoding='utf-8') as tsvfile:
    reader = csv.DictReader(tsvfile, delimiter='\t')
    

    for row in reader:
        file_path = os.path.join(images_path, row["file_name"])

        if os.path.exists(file_path):
            data.append(row)
    
    split = int(len(data)*0.8)

    with open(file_train, 'w', encoding='utf-8') as jsonfile:
        json.dump([], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON vuoto creato come {file_train}")

    with open(file_test, 'w', encoding='utf-8') as jsonfile:
        json.dump([], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON vuoto creato come {file_test}")
    
    with open(file_train, 'w', encoding='utf-8') as jsonfile:
        json.dump(data[:split], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON salvato come {file_train}")
        
    with open(file_test, 'w', encoding='utf-8') as jsonfile:
        json.dump(data[split:], jsonfile, ensure_ascii=False, indent=4)
    print(f"File JSON salvato come {file_test}")
'''

File JSON vuoto creato come /kaggle/working/train_image_text.json
File JSON vuoto creato come /kaggle/working/test_image_text.json
File JSON salvato come /kaggle/working/train_image_text.json
File JSON salvato come /kaggle/working/test_image_text.json


## Multimodal Dataset definition

In [15]:
from torch.utils.data import Dataset
import os
import wandb

class MultimodalDataset(Dataset): # Dataset for handling multimodal data
    def __init__(self, images_dir, json_file_path): # dir_path -> directory path where images are stored / json_file_path -> file path for metadata (including labels)   
        file_paths, text_list, labels_misogyny, shaming_label_list, stereotype_label_list, objectification_label_list, violence_label_list = load_json_file(json_file_path)
   
        self.file_paths = file_paths
        self.images_dir = images_dir
        self.text_list = text_list
        self.labels_misogyny = labels_misogyny
        self.shaming_label_list = shaming_label_list
        self.stereotype_label_list = stereotype_label_list
        self.objectification_label_list = objectification_label_list
        self.violence_label_list = violence_label_list
        
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        return self.file_paths[idx], self.text_list[idx], self.labels_misogyny[idx], self.shaming_label_list[idx], self.stereotype_label_list[idx], self.objectification_label_list[idx], self.violence_label_list[idx]
    
def load_json_file(json_file_path):
    with open(json_file_path,"r") as f:
        data = json.load(f)
        text_list = [] # list of the text related to each image
        image_list = [] # list of images path
        labels_misogyny = [] # list of TASK A labels (misogyny classification)

        ### list of TASK B labels ###
        shaming_label_list = [] 
        stereotype_label_list = []
        objectification_label_list = []
        violence_label_list = []


        for item in tqdm(data): 
            image_list.append(item['file_name'])
            text_list.append(item["text"])
            labels_misogyny.append(float(item["label"]))
            shaming_label_list.append(float(item["shaming"]))
            stereotype_label_list.append(float(item["stereotype"]))
            objectification_label_list.append(float(item["objectification"]))
            violence_label_list.append(float(item["violence"]))

        #print(f"{type(labels_misogyny)}")
        return image_list, text_list, torch.tensor(labels_misogyny, dtype=torch.float32), torch.tensor(shaming_label_list, dtype=torch.float32), torch.tensor(stereotype_label_list, dtype=torch.float32), torch.tensor(objectification_label_list, dtype=torch.float32), torch.tensor(violence_label_list, dtype=torch.float32)
    
def accuracy(preds, labels, thresh):
    num_samples = labels.shape[0]
    preds = preds > thresh
    matching_rows = torch.eq(labels.bool(), preds)
    
    # in case we're dealing with the prediction of task B/task A (they've different number of dimensions)
    num_correct = matching_rows.all(dim=1).sum().item() if preds.ndim!=1 else matching_rows.sum().item()
    return 100*(num_correct/num_samples)

## LOADING THE DATASET AND TRAINING THE NETWORK
# Thanks to a class used for handling the argument of the network training (still room for modifying it !)

In [21]:
### import torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image # per debug
import os
import json
from tqdm import tqdm
from PIL import Image
import torch.nn.functional as F
import torch.optim as optim
import wandb



class Trainer():
    def __init__(self, train_images_dir,
                       test_image_dir,
                       json_train_path,
                       json_test_path,
                       train_data_split=0.8,
                       batch_size=256, 
                       lr=0.001, 
                       num_epochs=15,
                       threshold=0.5,
                       weight_taskA=0.7,
                       weight_taskB=0.3):
        
        # Check if CUDA is available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f"Training on: {self.device}")
        
        # Loading the Dataset
        self.train_images_dir = train_images_dir
        self.test_image_dir = test_image_dir
        self.num_epochs = num_epochs
        self.threshold = threshold
        self.weight_taskA = weight_taskA
        self.weight_taskB = weight_taskB
        
        train_data = MultimodalDataset(train_images_dir, json_train_path)
        test_data = MultimodalDataset(test_image_dir, json_test_path)

        self.train_dataloader = DataLoader(train_data, batch_size, shuffle=True, pin_memory=True)
        self.test_dataloader = DataLoader(test_data, batch_size, shuffle=True, pin_memory=True)

        # Defining the Model
        self.classifier = MisogynyCls(5).to(self.device)
        self.optimizer = optim.Adam(self.classifier.parameters(), lr)
        self.loss_taskA = F.binary_cross_entropy 
        self.loss_taskB = F.binary_cross_entropy

        # Pretrained CLIP loading...
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map='cuda')
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
        '''
        wandb.init(  # set the wandb project where this run will be logged
                    project="Multimodal_xai", 
                    # track hyperparameters and run metadata   
                    config={
                        #"architecture": args.model,    "optimizer": "SGD" if args.use_sgd else "ADAM",
                        #"scheduler" : args.scheduler,    "learning_rate": args.lr * 100 if args.use_sgd else args.lr,
                        #"min_learning_rate": args.lrmin,    "epochs": args.epochs,
                        #"filename": args.filename,    "batch": args.batch_size,
                        #"momentum": args.momentum,    "weightdecay": args.wd,
                        #"tanh": args.tanh,    "invert_lr": args.invert_lr,
                        #"max_learning_rate": args.lrmax
                    })
        '''
        
    def train_model(self):
        for epoch in range(self.num_epochs):
            print(f'Epoch [{epoch+1}/{self.num_epochs}]')
            train_loss_list, test_loss_list, train_acc_taskA_list, train_acc_taskB_list,  test_acc_taskA_list, test_acc_taskB_list= self.train_epoch()
            
            print(f'Average Train Loss: {sum(train_loss_list) / len(train_loss_list): .4f}, Average Test Loss: {sum(test_loss_list) / len(test_loss_list): .4f}')
            print(f'Average Accuracy Train (task A): {sum(train_acc_taskA_list) / len(train_acc_taskA_list): .4f}%, Average Accuracy Test (task A): {sum(test_acc_taskA_list) / len(test_acc_taskA_list): .4f}%')
            print(f'Average Accuracy Train (task B): {sum(train_acc_taskB_list) / len(train_acc_taskB_list): .4f}%, Average Accuracy Test (task B): {sum(test_acc_taskB_list) / len(test_acc_taskB_list): .4f}%')

            model_path = '/kaggle/working/model_' + str(epoch+1) + '.pth'
            torch.save(self.classifier.state_dict(), model_path);
            
            

    def train_epoch(self):
        train_loss_list = []
        test_loss_list = []
        train_acc_taskA_list = []
        train_acc_taskB_list = []
        test_acc_taskA_list = []
        test_acc_taskB_list = []
        
        for batch in tqdm(self.train_dataloader):
            self.optimizer.zero_grad() # ZEROING OUT THE GRADIENTS
            self.classifier.train() # TRAINING MODE
            
            # CREATING THE CLIP EMBEDDINGS
            image_list, text_list, labels_misogyny, shaming_labels, stereotype_labels, objectification_labels, violence_labels = batch
            
            image_list = [Image.open(f"{os.path.join(self.train_images_dir, img)}") for img in image_list] # per poterlo usare poi con CLIP
            labels_misogyny = labels_misogyny.to(self.device)
            labels_taskB = torch.stack([shaming_labels, stereotype_labels, objectification_labels, violence_labels],  dim=1).to(self.device)

            clip_inputs = self.clip_processor(text=text_list, images=image_list, return_tensors="pt", padding=True, truncation=True)
            clip_inputs['input_ids'] = clip_inputs['input_ids'].to(self.device)
            clip_inputs['attention_mask'] = clip_inputs['attention_mask'].to(self.device)
            clip_inputs['pixel_values'] = clip_inputs['pixel_values'].to(self.device)
    
            clip_outputs = self.clip_model(**clip_inputs)
            model_input = torch.cat([clip_outputs['text_embeds'], clip_outputs['image_embeds']], dim=1).to(self.device)
    
            pred_taskA, pred_taskB = self.classifier(model_input)

            loss_A = self.loss_taskA(pred_taskA, labels_misogyny)
            loss_B = self.loss_taskB(pred_taskB, labels_taskB, reduction='mean')
            loss = (self.weight_taskA * loss_A) + (self.weight_taskB * loss_B)
            train_loss_list.append(loss)
            
            loss.backward()
            self.optimizer.step()

            accuracy_taskA = accuracy(pred_taskA, labels_misogyny, self.threshold)
            accuracy_taskB = accuracy(pred_taskB, labels_taskB, self.threshold)
            
            train_acc_taskA_list.append(accuracy_taskA)
            train_acc_taskB_list.append(accuracy_taskB)
        
        
        with torch.no_grad():
            self.classifier.eval()

            for batch in tqdm(self.test_dataloader):
                # CREATING THE CLIP EMBEDDINGS
                image_list, text_list, labels_misogyny, shaming_labels, stereotype_labels, objectification_labels, violence_labels = batch

                image_list = [Image.open(f"{os.path.join(self.test_images_dir, img)}") for img in image_list] # per poterlo usare poi con CLIP
                labels_misogyny = labels_misogyny.to(self.device)
                labels_taskB = torch.stack([shaming_labels, stereotype_labels, objectification_labels, violence_labels],  dim=1).to(self.device)

                clip_inputs = self.clip_processor(text=text_list, images=image_list, return_tensors="pt", padding=True, truncation=True)
                clip_inputs['input_ids'] = clip_inputs['input_ids'].to(self.device)
                clip_inputs['attention_mask'] = clip_inputs['attention_mask'].to(self.device)
                clip_inputs['pixel_values'] = clip_inputs['pixel_values'].to(self.device)

                clip_outputs = self.clip_model(**clip_inputs)
                model_input = torch.cat([clip_outputs['text_embeds'], clip_outputs['image_embeds']], dim=1).to(self.device)

                pred_taskA, pred_taskB = self.classifier(model_input)

                loss_A = self.loss_taskA(pred_taskA, labels_misogyny)
                loss_B = self.loss_taskB(pred_taskB, labels_taskB, reduction='mean')
                loss = (self.weight_taskA * loss_A) + (self.weight_taskB * loss_B)
                test_loss_list.append(loss)

                accuracy_taskA = accuracy(pred_taskA, labels_misogyny, self.threshold)
                accuracy_taskB = accuracy(pred_taskB, labels_taskB, self.threshold)

                test_acc_taskA_list.append(accuracy_taskA)
                test_acc_taskB_list.append(accuracy_taskB)

            
            '''
            wandb.log({"accuracy_taskA": accuracy_taskA,
                       "accuracy_taskB": accuracy_taskB,
                       "loss_A": loss_A.item(),
                       "loss_B": loss_B.item(),
                       "learning_rate": self.optimizer.param_groups[0]['lr']})
            
            '''
            
        return train_loss_list, test_loss_list, train_acc_taskA_list, train_acc_taskB_list, test_acc_taskA_list, test_acc_taskB_list

            

In [None]:

model_trainer = Trainer("/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/training/TRAINING",# train_images_dir
                        "/kaggle/input/dataset-wow/MAMI DATASET/MAMI DATASET/test", #test_images_dir
                        "/kaggle/working/train_image_text.json",
                        "/kaggle/working/test_image_text.json",
                        batch_size=64,
                        weight_taskA=1,
                        weight_taskB=1) # json_file as data source


model_trainer.train_model()

Training on: cuda:0


100%|██████████| 9000/9000 [00:00<00:00, 439439.55it/s]
100%|██████████| 1000/1000 [00:00<00:00, 345039.82it/s]


Epoch [1/15]


 48%|████▊     | 67/141 [01:58<02:11,  1.78s/it]