In [None]:
!pip install --upgrade transformers

In [None]:
from torch import nn


class MisogynyCls(nn.Module):
    """Shared MLP trunk followed by two linear heads (task A and task B).

    The trunk is ``num_linear_layers`` repetitions of Linear -> Dropout -> ReLU.
    Both heads read the trunk output and return raw, unnormalised scores.

    Args:
        num_linear_layers: number of Linear/Dropout/ReLU groups in the trunk.
            With 0 (or less) the trunk is empty and the heads are applied
            directly to the input.
        task_a_out: output size of the task A head.
        task_b_out: output size of the task B head.
        input_dim: size of the last dimension of the input features.
        hidden_dim: width of every trunk layer.
        drop_value: dropout probability used after each trunk Linear.
    """

    def __init__(self, num_linear_layers, task_a_out=1, task_b_out=4, input_dim=1024, hidden_dim=512, drop_value=0.2):
        super().__init__()

        # When the trunk is empty the heads receive the raw input, so they must be
        # sized to input_dim; otherwise forward() fails with a shape mismatch.
        head_in_dim = hidden_dim if num_linear_layers > 0 else input_dim
        self.head_task_a = nn.Linear(head_in_dim, task_a_out)
        self.head_task_b = nn.Linear(head_in_dim, task_b_out)

        self.layers = nn.ModuleList()
        for i in range(num_linear_layers):
            # Only the first layer sees the input features; the rest are hidden -> hidden.
            in_dim = input_dim if i == 0 else hidden_dim
            self.layers.append(nn.Linear(in_dim, hidden_dim))
            self.layers.append(nn.Dropout(drop_value))
            self.layers.append(nn.ReLU())

    def forward(self, x):
        """Run ``x`` through the trunk and return ``(task_a_scores, task_b_scores)``."""
        for layer in self.layers:
            x = layer(x)
        return self.head_task_a(x), self.head_task_b(x)

## IN CASE WE NEED TO TRANSFORM A TSV/CSV INTO A JSON FORMAT

In [None]:
import csv
import json
import os

# Read the TSV file and convert it to JSON (easier to load later on)
input_file = '/kaggle/input/mydataset-wow/train_image_text.tsv'
output_file = '/kaggle/working/train_image_text.json'

# Each TSV row becomes a dict keyed by the column names of the header line.
with open(input_file, newline='', encoding='utf-8') as tsvfile:
    data = list(csv.DictReader(tsvfile, delimiter='\t'))

# Only convert when the JSON file does not exist yet, so re-running the cell
# never overwrites an existing conversion.
if not os.path.exists(output_file):
    # Write to a temporary file and rename it into place: an interrupted dump can
    # then never leave behind a partial (or empty) JSON that later runs would
    # mistake for a finished conversion.
    tmp_file = output_file + '.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as jsonfile:
        json.dump(data, jsonfile, ensure_ascii=False, indent=4)
    os.replace(tmp_file, output_file)

    print(f"File JSON salvato come {output_file}")

In [None]:
# Smoke test: push one image/text pair through CLIP and then through the classifier heads.
import io
import urllib.request

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load CLIP in this cell so it runs on a fresh kernel instead of relying on
# `model` / `processor` left over from an earlier, no longer present cell.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Download the whole image into memory first: PIL needs a seekable file object.
with urllib.request.urlopen(url) as response:
    image = Image.open(io.BytesIO(response.read()))

inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)
with torch.no_grad():  # inference only, no autograd graph needed
    outputs = model(**inputs)

#print(f"text_embeds shape: {outputs['text_embeds'].shape}")
#print(f"image_embeds shape: {outputs['image_embeds'].shape}")

# The classifier input is the text embedding concatenated with the image embedding.
model_input = torch.cat([outputs['text_embeds'], outputs['image_embeds']], dim=1)
# `MyModel` is not defined anywhere in this notebook; the classifier class is MisogynyCls.
# NOTE(review): the old positional arguments (2, 4, 10) belonged to that unknown class;
# here we use 2 trunk layers and the default head sizes — confirm this is what is wanted.
model_to_test = MisogynyCls(2, input_dim=model_input.shape[1])

miao, miaomiao = model_to_test(model_input)

print(f"head_task_a: {miao}")
print(f"head_task_b: {miaomiao}")


In [None]:
# CLIP image processing and embedding

import json
import os

import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor

# Upper bound on the number of image/text pairs embedded in this cell.
# (The previous `i > 5000` check let 5001 items through.)
MAX_PAIRS = 5000
IMAGES_DIR = "/kaggle/input/mydataset-wow/MAMI DATASET-20240614T064502Z-001/MAMI DATASET/training/TRAINING"

# Load CLIP here so the cell does not depend on kernel state from other cells.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# The JSON was written as UTF-8; read it the same way and close the file right away.
with open("/kaggle/working/train_image_text.json", "r", encoding="utf-8") as f:
    data = json.load(f)

text_list = []
image_list = []

for item in tqdm(data[:MAX_PAIRS]):
    file_path = os.path.join(IMAGES_DIR, item['file_name'])
    text_list.append(item["text"])
    # convert() loads the pixels into memory, so the file handle can be closed
    # immediately instead of keeping thousands of files open at once.
    with Image.open(file_path) as img:
        image_list.append(img.convert("RGB"))

print(f"len image_list: {len(image_list)}")
print(f"len text_list: {len(text_list)}")

# truncation=True cuts each text to the tokenizer's maximum length; without it,
# texts longer than CLIP's context window make the text encoder fail.
inputs = processor(text=text_list, images=image_list, return_tensors="pt", padding=True, truncation=True)
# NOTE(review): this still pushes every pair through CLIP as one batch; split it
# into smaller batches if memory becomes a problem.
with torch.no_grad():  # embeddings only, no autograd graph needed
    outputs = model(**inputs)


In [None]:
from torch.utils.data import Dataset
import os

class MultimodalDataset(Dataset):
    """Dataset over the records of a JSON metadata file.

    Every item is a 7-tuple: image file name, text, task A label and the four
    task B labels (shaming, stereotype, objectification, violence). Images are
    not opened here; only the file names returned by ``load_json_file`` are kept.

    Args:
        images_dir: directory where the images are stored (kept as an attribute).
        json_file_path: path of the JSON metadata file, labels included.
    """

    def __init__(self, images_dir, json_file_path):
        # load_json_file returns the seven parallel columns in exactly this order.
        (
            self.file_paths,
            self.text_list,
            self.labels_misogyny,
            self.shaming_label_list,
            self.stereotype_label_list,
            self.objectification_label_list,
            self.violence_label_list,
        ) = load_json_file(json_file_path)
        self.images_dir = images_dir

    def __len__(self):
        """Number of records, i.e. number of image file names."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return the ``idx``-th entry of every column as one tuple."""
        columns = (
            self.file_paths,
            self.text_list,
            self.labels_misogyny,
            self.shaming_label_list,
            self.stereotype_label_list,
            self.objectification_label_list,
            self.violence_label_list,
        )
        return tuple(column[idx] for column in columns)
    
def load_json_file(json_file_path):
    """Load the JSON metadata file and split it into parallel columns.

    Args:
        json_file_path: path of a JSON file holding a list of records. Every
            record must provide the keys ``file_name``, ``text``, ``label``,
            ``shaming``, ``stereotype``, ``objectification`` and ``violence``.

    Returns:
        A 7-tuple ``(image_list, text_list, labels_misogyny, shaming,
        stereotype, objectification, violence)``. The first two are Python
        lists; the five label columns are 1-D ``torch.float32`` tensors.

    Raises:
        KeyError: if a record lacks one of the expected keys.
        ValueError: if a label value cannot be converted to ``float``.
    """
    # The notebook writes this file as UTF-8 with ensure_ascii=False, so read it
    # with the same encoding instead of the platform default.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # "label" is the task A target; the other four keys are the task B targets.
    label_keys = ("label", "shaming", "stereotype", "objectification", "violence")

    image_list = []  # image file names
    text_list = []  # text attached to each image
    label_columns = {key: [] for key in label_keys}

    for item in tqdm(data):
        image_list.append(item['file_name'])
        text_list.append(item["text"])
        for key in label_keys:
            label_columns[key].append(float(item[key]))

    label_tensors = tuple(
        torch.tensor(label_columns[key], dtype=torch.float32) for key in label_keys
    )
    return (image_list, text_list, *label_tensors)


In [None]:
def accuracy(out, preds):
    """Return the fraction of elements on which ``out`` and ``preds`` are equal.

    The comparison is exact and element-wise, so both tensors are expected to
    hold discrete values (e.g. 0/1 labels), not raw scores.

    Args:
        out: tensor of values to score.
        preds: tensor to compare against. If the shapes differ but the element
            counts match (e.g. ``(B, 1)`` vs ``(B,)``), ``out`` is reshaped to
            the shape of ``preds``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the tensors are empty.

    Raises:
        ValueError: if the two tensors do not hold the same number of elements.
    """
    if out.shape != preds.shape:
        if out.numel() != preds.numel():
            raise ValueError(
                f"accuracy: incompatible shapes {tuple(out.shape)} and {tuple(preds.shape)}"
            )
        # Align the shapes explicitly: comparing (B, 1) with (B,) would otherwise
        # broadcast to a (B, B) matrix and count meaningless matches.
        out = out.reshape(preds.shape)

    # Divide by the number of compared elements, not by the batch size, so that
    # multi-label inputs of shape (B, K) cannot score above 1.
    total = preds.numel()
    if total == 0:
        return 0.0

    correct = (preds == out).sum().item()
    return correct / total

## LOADING THE DATASET AND TRAINING THE NETWORK

In [None]:

# NOTE: the text of each sample is truncated so that it fits CLIP's 77-token context window.
# An alternative is to keep only the image/text pairs whose text is already short enough (< 77 tokens).

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from PIL import Image # per debug
import os
import json
from tqdm import tqdm
from PIL import Image
import torch.nn.functional as F
import torch.optim as optim



class Trainer():
    """Trains ``MisogynyCls`` on top of frozen CLIP text and image embeddings.

    For every batch the images and texts are embedded with a pretrained CLIP
    model (no gradients), the two embeddings are concatenated and fed to the
    classifier, and the classifier is optimised on the sum of the task A and
    task B losses.

    Args:
        images_dir: directory that contains the image files.
        json_file_path: JSON metadata file passed to ``MultimodalDataset``.
        batch_size: number of samples per training batch.
        lr: learning rate of the Adam optimiser.
        num_linear_layers: number of trunk layers of the classifier.
        clip_name: Hugging Face identifier of the pretrained CLIP checkpoint.
    """

    def __init__(self, images_dir, json_file_path, batch_size=64, lr=0.001, num_linear_layers=5,
                 clip_name="openai/clip-vit-base-patch32"):
        # Use the first CUDA device when available, otherwise fall back to the CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print(f"Using device: {self.device}")
        else:
            self.device = torch.device("cpu")
            print("CUDA is not available. Using CPU.")

        self.images_dir = images_dir
        self.json_file_path = json_file_path

        # Dataset and loader
        train_data = MultimodalDataset(images_dir, json_file_path)
        self.train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

        # Pretrained CLIP, used only as a frozen feature extractor.
        self.clip_model = CLIPModel.from_pretrained(clip_name).to(self.device)
        self.clip_model.eval()
        self.clip_model.requires_grad_(False)
        self.clip_processor = CLIPProcessor.from_pretrained(clip_name)

        # The classifier input is [text_embeds, image_embeds], i.e. twice CLIP's projection size.
        embed_dim = self.clip_model.config.projection_dim
        self.classifier = MisogynyCls(num_linear_layers, input_dim=2 * embed_dim).to(self.device)
        self.optimizer = optim.Adam(self.classifier.parameters(), lr=lr)

        # Task A is a single-logit binary decision and task B is four independent
        # 0/1 labels, so both need a per-logit sigmoid loss, not softmax cross-entropy.
        self.loss_taskA = nn.BCEWithLogitsLoss()
        self.loss_taskB = nn.BCEWithLogitsLoss()

    def train_model(self, num_epochs=10):
        """Run ``num_epochs`` training epochs, printing loss and accuracy after each one."""
        for epoch in range(num_epochs):
            loss_taskA, loss_taskB, accuracy_taskA, accuracy_taskB = self.train_epoch()
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss task A: {loss_taskA: .4f}, Loss task B: {loss_taskB: .4f}')
            print(f'Accuracy task A: {accuracy_taskA}, Accuracy task B: {accuracy_taskB}')

    def train_epoch(self):
        """Train on every batch of the loader once.

        Returns:
            ``(loss_task_a, loss_task_b, accuracy_task_a, accuracy_task_b)``,
            each averaged over all samples seen in the epoch (all ``0.0`` when
            the loader is empty).
        """
        self.classifier.train()

        sum_loss_a = sum_loss_b = sum_acc_a = sum_acc_b = 0.0
        num_samples = 0

        for batch in tqdm(self.train_dataloader):
            self.optimizer.zero_grad()

            image_names, text_list, labels_misogyny, shaming_labels, stereotype_labels, objectification_labels, violence_labels = batch
            labels_misogyny = labels_misogyny.to(self.device)
            labels_taskB = torch.stack(
                [shaming_labels, stereotype_labels, objectification_labels, violence_labels], dim=1
            ).to(self.device)

            # Load the pixels and close each file right away; convert() gives the
            # RGB in-memory image that CLIP's image processor can consume.
            image_list = []
            for name in image_names:
                with Image.open(os.path.join(self.images_dir, name)) as img:
                    image_list.append(img.convert("RGB"))

            # truncation=True cuts at the tokenizer's maximum length (in tokens);
            # slicing the string would cut characters and discard far more text.
            inputs = self.clip_processor(
                text=list(text_list), images=image_list,
                return_tensors="pt", padding=True, truncation=True,
            ).to(self.device)

            # CLIP is frozen, so no autograd graph is needed for the embeddings.
            with torch.no_grad():
                outputs = self.clip_model(**inputs)

            model_input = torch.cat([outputs['text_embeds'], outputs['image_embeds']], dim=1)
            pred_taskA, pred_taskB = self.classifier(model_input)
            logits_taskA = pred_taskA.squeeze(1)

            loss_A = self.loss_taskA(logits_taskA, labels_misogyny)
            loss_B = self.loss_taskB(pred_taskB, labels_taskB)
            loss = loss_A + loss_B

            loss.backward()
            self.optimizer.step()

            # A logit > 0 is a sigmoid probability > 0.5. Compare hard 0/1 decisions
            # (not raw logits) with the labels, on tensors of identical shape.
            decisions_taskA = (logits_taskA.detach() > 0).float()
            decisions_taskB = (pred_taskB.detach() > 0).float()

            batch_len = labels_misogyny.shape[0]
            sum_loss_a += loss_A.item() * batch_len
            sum_loss_b += loss_B.item() * batch_len
            sum_acc_a += accuracy(decisions_taskA, labels_misogyny) * batch_len
            sum_acc_b += accuracy(decisions_taskB.reshape(-1), labels_taskB.reshape(-1)) * batch_len
            num_samples += batch_len

        if num_samples == 0:
            return 0.0, 0.0, 0.0, 0.0

        # Report only after the whole epoch: returning inside the loop would stop
        # training after the very first batch.
        return (
            sum_loss_a / num_samples,
            sum_loss_b / num_samples,
            sum_acc_a / num_samples,
            sum_acc_b / num_samples,
        )


In [None]:
# Directory with the training images and JSON metadata file used as data source.
IMAGES_DIR = "/kaggle/input/mydataset-wow/MAMI DATASET-20240614T064502Z-001/MAMI DATASET/training/TRAINING/"
JSON_FILE_PATH = "/kaggle/working/train_image_text.json"
# train_model needs the number of epochs; 10 is a starting value to tune.
NUM_EPOCHS = 10

model_trainer = Trainer(IMAGES_DIR, JSON_FILE_PATH)

model_trainer.train_model(NUM_EPOCHS)