In [12]:
import torch
from torch import nn
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

class MisogynyCls(nn.Module):
    """Two-headed text+image classifier on top of a pretrained CLIP encoder.

    CLIP's text and image embeddings are concatenated and pushed through a
    trunk of ``Linear -> BatchNorm1d -> Dropout -> ReLU`` blocks; two linear
    heads followed by a sigmoid produce the task A and task B predictions.

    Args:
        num_linear_layers: Number of blocks in the trunk.
        task_a_out: Number of outputs of the task A head.
        task_b_out: Number of outputs of the task B head.
        input_dim: Size of the concatenated text+image embedding fed to the
            first trunk layer.
        hidden_dim: Width of every trunk layer and input size of both heads.
        drop_value: Dropout probability used inside each block.

    Raises:
        ValueError: If the trunk would be empty while ``input_dim`` differs
            from ``hidden_dim`` (the heads could never accept the features).
    """

    def __init__(self, num_linear_layers, task_a_out=1, task_b_out=4, input_dim=1024, hidden_dim=512, drop_value=0.2):
        super().__init__()

        # Fail early: without a trunk the heads receive the raw `input_dim`
        # features, which only works when both sizes coincide.
        if num_linear_layers < 1 and input_dim != hidden_dim:
            raise ValueError(
                "num_linear_layers must be >= 1 when input_dim != hidden_dim "
                f"(got num_linear_layers={num_linear_layers}, input_dim={input_dim}, hidden_dim={hidden_dim})"
            )

        self.head_task_a = nn.Linear(hidden_dim, task_a_out)
        self.head_task_b = nn.Linear(hidden_dim, task_b_out)
        self.sigmoid = nn.Sigmoid()
        # Kept for backward compatibility only: this is a construction-time guess
        # and says nothing about where the weights actually live, so forward()
        # does not rely on it.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Pretrained CLIP backbone and its matching pre-processor.
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        self.layers = nn.ModuleList()
        for i in range(num_linear_layers):
            in_features = input_dim if i == 0 else hidden_dim
            self.layers.append(nn.Linear(in_features, hidden_dim))
            self.layers.append(nn.BatchNorm1d(hidden_dim))
            self.layers.append(nn.Dropout(drop_value))
            self.layers.append(nn.ReLU())

    def forward(self, text_list, image_list):
        """Score a batch of (text, image) pairs.

        Args:
            text_list: Texts, forwarded to the CLIP processor as ``text=``.
            image_list: Images, forwarded to the CLIP processor as ``images=``.

        Returns:
            Tuple ``(pred_taskA, pred_taskB)`` of sigmoid-activated head outputs.
        """
        # Follow the weights: whatever device the module was moved to (or left
        # on) is the device every input tensor has to be on.
        device = next(self.parameters()).device

        clip_inputs = self.clip_processor(text=text_list, images=image_list, return_tensors="pt", padding=True, truncation=True)
        # The processor builds its tensors on the CPU; move them next to the model.
        clip_inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in clip_inputs.items()
        }
        clip_outputs = self.clip_model(**clip_inputs)

        # The trunk input is the concatenation of the two modalities.
        x = torch.cat([clip_outputs['text_embeds'], clip_outputs['image_embeds']], dim=1)

        for layer in self.layers:
            x = layer(x)

        pred_taskA = self.sigmoid(self.head_task_a(x))
        pred_taskB = self.sigmoid(self.head_task_b(x))

        return pred_taskA, pred_taskB

In [13]:
import shap
import torch
import numpy as np
from PIL import Image
import os, copy, sys
import math, json
import random
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from nltk.tokenize import word_tokenize





class MMSHAP:
    """Multimodality score of a text+image classifier, computed from SHAP values.

    Each sample is described by a flat feature vector: the word tokens of the
    text followed by one placeholder token per image patch (a p x p grid).
    SHAP perturbs that vector through `custom_masker`; `get_model_prediction`
    turns every perturbed vector back into a (text, image) pair and queries the
    classifier. The share of absolute SHAP mass falling on the text features is
    the text score of the sample.
    """

    # Marker written over a hidden feature. For patch tokens it tells
    # `get_model_prediction` which image regions to blank out.
    MASK_TOKEN = "UNK"

    def __init__(self,
                 classifier):
        """
        Args:
            classifier: Callable ``classifier(texts, images)`` returning a
                ``(task_a, task_b)`` pair and exposing ``eval()``.
        """
        self.classifier = classifier
        # Per-sample state, filled in by `wrapper_mmscore` before SHAP runs.
        self.img = None
        self.num_txt_tokens = None
        self.patch_size = None

    def custom_masker(self, mask, x):
        """Replace every feature hidden by `mask` with ``MASK_TOKEN``.

        Args:
            mask: Boolean vector, ``True`` = keep the feature, ``False`` = hide it.
            x: String tokens of one sample (text tokens then patch tokens).

        Returns:
            A new string array of shape ``(1, n_features)``.
        """
        tokens = np.array(x, dtype=str).reshape(1, -1)
        # NumPy string arrays are fixed-width: writing the marker into an array
        # whose longest token is shorter than the marker would silently truncate
        # it, and the truncated marker would never match MASK_TOKEN downstream.
        width = max(tokens.dtype.itemsize // np.dtype("U1").itemsize, len(self.MASK_TOKEN))
        masked_X = tokens.astype(f"U{width}")

        keep = np.asarray(mask, dtype=bool).reshape(1, -1)
        masked_X[~keep] = self.MASK_TOKEN
        return masked_X

    def get_model_prediction(self, x):
        """Run the classifier on the (text, image) pairs encoded by `x`.

        Args:
            x: 2-D array of string tokens, one perturbed sample per row. The
                first ``self.num_txt_tokens`` entries of a row are the text
                tokens, the remaining ones form a square, row-major grid of
                image-patch tokens.

        Returns:
            The task A output of the classifier for every row of `x`.

        Raises:
            RuntimeError: If no sample has been set up yet.
            ValueError: If the patch tokens of a row do not form a square grid.
        """
        if self.img is None or self.num_txt_tokens is None:
            raise RuntimeError("no sample set: assign img and num_txt_tokens (see wrapper_mmscore) first")

        self.classifier.eval()
        num_txt_tokens = self.num_txt_tokens
        # Patch borders follow the real image size (last two axes = height, width).
        height, width = self.img.shape[-2:]

        perturbed_txts = []
        perturbed_imgs = []
        for row in x:
            perturbed_txts.append(' '.join(str(token) for token in row[:num_txt_tokens]))

            patch_tokens = row[num_txt_tokens:]
            grid = math.isqrt(len(patch_tokens))
            if grid * grid != len(patch_tokens):
                raise ValueError(f"expected a square number of image tokens, got {len(patch_tokens)}")

            # The masker only flags the patches; the pixels are blanked here,
            # on a private copy so the reference image stays intact.
            perturbed_img = copy.deepcopy(self.img)
            for k, token in enumerate(patch_tokens):
                if token != self.MASK_TOKEN:
                    continue
                m, n = divmod(k, grid)  # row-major patch index -> (grid row, grid column)
                # Integer borders tile the whole image, including the remainder
                # pixels when the side is not a multiple of the grid.
                perturbed_img[:, m * height // grid:(m + 1) * height // grid,
                              n * width // grid:(n + 1) * width // grid] = 0
            perturbed_imgs.append(perturbed_img)

        with torch.no_grad():
            outputs_taskA, _ = self.classifier(perturbed_txts, perturbed_imgs)

        return outputs_taskA

    def compute_mmscore(self, num_txt_tokens, shap_values):
        """Compute the multimodality score of one explained sample.

        Args:
            num_txt_tokens: Number of leading features that belong to the text.
            shap_values: Object whose ``values`` array holds, in row 0, one SHAP
                value per feature (text features first, then image patches).

        Returns:
            ``(text_score, image_score)``: each modality's share of the total
            absolute SHAP mass. Both are NaN when every SHAP value is zero.
        """
        print(shap_values.values.shape)  # trace kept from the original notebook output

        text_contrib = np.abs(shap_values.values[0, :num_txt_tokens]).sum()
        image_contrib = np.abs(shap_values.values[0, num_txt_tokens:]).sum()
        total = text_contrib + image_contrib
        if total == 0:
            # No feature moved the prediction: the shares are undefined.
            return float("nan"), float("nan")
        text_score = text_contrib / total
        image_score = image_contrib / total  # equals 1 - text_score with two modalities
        return text_score, image_score

    def wrapper_mmscore(self, txt_to_explain, img_to_explain):
        """Explain every (text, image) pair and aggregate the text scores.

        Args:
            txt_to_explain: Iterable of raw texts.
            img_to_explain: Iterable of images, paired with the texts by position.
                Each image must expose ``.shape`` and support
                ``img[:, rows, cols]`` assignment (channel-first tensor/array).

        Returns:
            ``(mean, variance)`` of the per-sample text scores. The image score
            is not returned: it is the complement of the text score.
        """
        mmscore_list = []

        for txt, img in zip(txt_to_explain, img_to_explain):
            txt_tokens = word_tokenize(txt)
            num_txt_tokens = len(txt_tokens)
            # p x p patches, so the image gets at least as many features as the
            # text; an empty text still needs one patch (avoids dividing by 0).
            p = max(1, int(math.ceil(np.sqrt(num_txt_tokens))))
            img_tokens = [" "] * (p * p)
            features = np.array(txt_tokens + img_tokens).reshape(1, -1)

            self.img = img
            self.num_txt_tokens = num_txt_tokens
            self.patch_size = min(img.shape[-2:]) // p  # nominal patch side, informational

            explainer = shap.Explainer(self.get_model_prediction, self.custom_masker, silent=True)

            # NOTE(review): shap's permutation explainer refuses to run when
            # max_evals < 2 * n_features + 1 and its default budget is 500;
            # raise the budget only when the sample actually needs it.
            n_features = features.shape[1]
            shap_values = explainer(features, max_evals=max(500, 2 * n_features + 1))

            text_score, image_score = self.compute_mmscore(num_txt_tokens, shap_values)
            mmscore_list.append(text_score)

        mmscore_array = np.array(mmscore_list)
        mmshap_mean = np.mean(mmscore_array)
        mmshap_variance = np.var(mmscore_array)

        return mmshap_mean, mmshap_variance

In [14]:
from torchvision import transforms

# Restore the trained weights on the CPU so the cell also runs without a GPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load('/kaggle/input/model-params/model_3.pth', map_location='cpu')
classifier = MisogynyCls(num_linear_layers=5)
classifier.load_state_dict(checkpoint)

mmshap_analyzer = MMSHAP(classifier=classifier)

# Smoke-test inputs: three dummy captions, each paired with a blank white image.
txt = [
    "mi chiamo silvio e sono bellissimo",
    "miao miao miao",
    "yo yo yu ghdhd ",
]
to_tensor = transforms.ToTensor()
image = [to_tensor(Image.new('RGB', (224, 224), color='white')) for _ in txt]  # TODO: real images still need a resize to (224, 224)

# Mean and variance of the per-sample text scores.
text_score_mean, test_score_variance = mmshap_analyzer.wrapper_mmscore(
    txt_to_explain=txt,
    img_to_explain=image,
)

print(f"text_score: {text_score_mean} - test_score_variance: {test_score_variance}")


(1, 15)
(1, 7)
(1, 8)
text_score: 0.9998103516728108 - test_score_variance: 1.296362378938283e-08
