## Import packages

In [1]:
from __future__ import print_function
import torch.utils.data
from scipy import misc
from torch import optim
from torchvision.utils import save_image
import numpy as np
import pickle
import time
import random
import os
import torch
from torch import nn
from torch.nn import functional as F
from tqdm.auto import tqdm, trange
from torchvision import transforms
import pandas as pd
import torchvision.datasets as datasets
import torch.utils.data as data
import copy
from torch.autograd import Variable
from torch.utils.data import Dataset
from skimage import io
from PIL import Image

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from transformers import AutoTokenizer, BertTokenizer, BertModel, BertForSequenceClassification
from collections import namedtuple
import torchvision.models as models
from sentence_transformers import SentenceTransformer
from laserembeddings import Laser

## Enable gpu device

In [2]:
device = torch.device('cuda:0')

## Set random seed

In [3]:
SEED = 8888

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Load data

In [4]:
train_dir = os.path.join('../', 'data', 'TRAINING','images')
trial_dir = os.path.join('../', 'data', 'Users', 'fersiniel', 'Desktop', 'MAMI - TO LABEL/TRIAL DATASET', 'images')

# load training label
train_df = pd.read_csv('../data/TRAINING/training.csv', sep='\t')
# load trial label
trial_df = pd.read_csv('../data/Users/fersiniel/Desktop/MAMI - TO LABEL/TRIAL DATASET/trial.csv', sep='\t')
# load test label
test_df = pd.read_csv('../data/test/Test.csv', sep='\t')

In [3]:
train_df.describe()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.5,0.1274,0.281,0.2202,0.0953
std,0.500025,0.333437,0.44951,0.414402,0.293644
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.5,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [4]:
train_df.head(5)

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,Milk Milk.zip
1,10.jpg,1,0,0,0,1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,1000.jpg,0,0,0,0,0,BREAKING NEWS: Russia releases photo of DONALD...
3,10000.jpg,0,0,0,0,0,MAN SEEKING WOMAN Ignad 18 O
4,10006.jpg,0,0,0,0,0,Me explaining the deep lore of. J.R.R. Tolkein...


## Define image transform

In [66]:
# pretrained_size = 128
# pretrained_size = 256
pretrained_size = 224
pretrained_means = [0.485, 0.456, 0.406]
pretrained_stds= [0.229, 0.224, 0.225]

# train_transforms = transforms.Compose([
#                            transforms.Resize(pretrained_size),
#                            transforms.RandomRotation(5),
#                            transforms.RandomHorizontalFlip(0.5),
#                            transforms.RandomCrop(pretrained_size, padding = 10),
#                            transforms.ToTensor(),
#                            transforms.Normalize(mean = pretrained_means, 
#                                                 std = pretrained_stds)
#                        ])

train_transforms = transforms.Compose([
                           transforms.ToTensor()
                       ])

# trial_transforms = transforms.Compose([
#                            transforms.Resize(pretrained_size),
#                            transforms.CenterCrop(pretrained_size),
#                            transforms.ToTensor(),
#                            transforms.Normalize(mean = pretrained_means, 
#                                                 std = pretrained_stds)
#                        ])

trial_transforms = train_transforms

## Construct ResNet class

In [6]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim):
        super().__init__()
                
        block, n_blocks, channels = config
        self.in_channels = channels[0]
            
        assert len(n_blocks) == len(channels) == 4
        
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size = 7, stride = 2, padding = 3, bias = False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace = True)
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)
        
        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride = 2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride = 2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride = 2)
        
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(self.in_channels, output_dim)
        
    def get_resnet_layer(self, block, n_blocks, channels, stride = 1):
    
        layers = []
        
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        
        layers.append(block(self.in_channels, channels, stride, downsample))
        
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
            
        return nn.Sequential(*layers)
    
    def forward(self, x):
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        
        return x, h

class BasicBlock(nn.Module):
    
    expansion = 1
    
    def __init__(self, in_channels, out_channels, stride = 1, downsample = False):
        super().__init__()
                
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 3, 
                               stride = stride, padding = 1, bias = False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, 
                               stride = 1, padding = 1, bias = False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        self.relu = nn.ReLU(inplace = True)
        
        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size = 1, 
                             stride = stride, bias = False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        
        self.downsample = downsample
        
    def forward(self, x):
        
        i = x
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        
        if self.downsample is not None:
            i = self.downsample(i)
                        
        x += i
        x = self.relu(x)
        
        return x

class Bottleneck(nn.Module):
    
    expansion = 4
    
    def __init__(self, in_channels, out_channels, stride = 1, downsample = False):
        super().__init__()
    
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 1, 
                               stride = 1, bias = False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, 
                               stride = stride, padding = 1, bias = False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        self.conv3 = nn.Conv2d(out_channels, self.expansion * out_channels, kernel_size = 1,
                               stride = 1, bias = False)
        self.bn3 = nn.BatchNorm2d(self.expansion * out_channels)
        
        self.relu = nn.ReLU(inplace = True)
        
        if downsample:
            conv = nn.Conv2d(in_channels, self.expansion * out_channels, kernel_size = 1, 
                             stride = stride, bias = False)
            bn = nn.BatchNorm2d(self.expansion * out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
            
        self.downsample = downsample
        
    def forward(self, x):
        
        i = x
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
                
        if self.downsample is not None:
            i = self.downsample(i)
            
        x += i
        x = self.relu(x)
    
        return x

## Fix parameters

In [5]:
def set_parameter_requires_grad_false(model):
    for param in model.parameters():
        param.requires_grad = False

## Load pre-trained model

In [6]:
# Bert pretrained model

bert_pretrained = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
        
set_parameter_requires_grad_false(bert_pretrained)

bert_pretrained.cuda()
bert_pretrained.eval()

# ResNet pretrained model
    
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])
resnet50_config = ResNetConfig(block = Bottleneck,
                               n_blocks = [3, 4, 6, 3],
                               channels = [64, 128, 256, 512])
pretrained_model = models.resnet50(pretrained = True)
        
resnet_pretrained = ResNet(resnet50_config, 1000)
resnet_pretrained.load_state_dict(pretrained_model.state_dict())
        
set_parameter_requires_grad_false(resnet_pretrained)

resnet_pretrained.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: name 'Bottleneck' is not defined

In [5]:
# Clip pretrained model for image encoding

clip_pretrained = SentenceTransformer('clip-ViT-B-32')

# Laser model for text encoding

laser_model = Laser()

In [11]:
emb = laser_model.embed_sentences("Hello World!", lang='en')
emb.shape

(1, 1024)

## Self-define Dataset class

In [6]:
class MAMIDataset(Dataset):
    """MAMI dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = pd.read_csv(csv_file, sep='\t')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.df.iloc[idx, 0])
        meme = Image.open(img_name)#.convert("RGB")   # convert to RGB is important
        meme = torch.Tensor(clip_pretrained.encode(meme))
        labels = self.df.iloc[idx, 1:-1]   # multi-labels
        labels = np.array(labels)
        labels = labels.astype('long')
        
        text = self.df.iloc[idx, -1]   # Text transcription
        text = torch.Tensor(laser_model.embed_sentences(text, lang='en'))
#         text = "[CLS] " + text + " [SEP]"   # Add special tokens
        
#         tokenized_text = tokenizer.tokenize(text)
#         # Map the token strings to their vocabulary indeces.
#         indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#         segments_ids = [1] * len(tokenized_text)

#         tokens_tensor = torch.tensor([indexed_tokens])
#         segments_tensor = torch.tensor([segments_ids])

#         text_ids = tokenizer(text, return_tensors="pt", padding='max_length', max_length=512, truncation=True)
        
        sample = {'meme': meme, 'labels': labels, 'text': text}

        if self.transform:
            sample['meme'] = self.transform(meme)

        return sample

In [7]:
class MAMITestset(Dataset):
    """MAMI dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = pd.read_csv(csv_file, sep='\t')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.df.iloc[idx, 0])
        meme = Image.open(img_name)#.convert("RGB")   # convert to RGB is important
        meme = torch.Tensor(clip_pretrained.encode(meme))
        
        text = self.df.iloc[idx, -1]   # Text transcription
        text = torch.Tensor(laser_model.embed_sentences(text, lang='en'))
        
#         text = "[CLS] " + text + " [SEP]"   # Add special tokens
        
#         tokenized_text = tokenizer.tokenize(text)
#         # Map the token strings to their vocabulary indeces.
#         indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#         segments_ids = [1] * len(tokenized_text)

#         tokens_tensor = torch.tensor([indexed_tokens])
#         segments_tensor = torch.tensor([segments_ids])
        
        sample = {'meme': meme, 'text': text}

        if self.transform:
            sample['meme'] = self.transform(meme)

        return sample

## Instantiate the train and trial dataset

In [8]:
trial_root_dir = '../data/Users/fersiniel/Desktop/MAMI - TO LABEL/TRIAL DATASET/'
# trial_data = MAMIDataset(trial_root_dir + 'trial.csv', trial_root_dir, trial_transforms)
trial_data = MAMIDataset(trial_root_dir + 'trial.csv', trial_root_dir)

train_root_dir = '../data/TRAINING/'
# train_data = MAMIDataset(train_root_dir + 'training.csv', train_root_dir, train_transforms)
train_data = MAMIDataset(train_root_dir + 'training.csv', train_root_dir)

In [9]:
test_root_dir = '../data/test/'
# train_data = MAMIDataset(train_root_dir + 'training.csv', train_root_dir, train_transforms)
test_data = MAMITestset(test_root_dir + 'Test.csv', test_root_dir)

## Train valid split

In [10]:
VALID_RATIO = 0.9

n_train_examples = int(len(train_data) * VALID_RATIO)
n_valid_examples = len(train_data) - n_train_examples

train_data, valid_data = data.random_split(train_data, 
                                           [n_train_examples, n_valid_examples])

In [11]:
valid_data = copy.deepcopy(valid_data)
# valid_data.dataset.transform = trial_transforms

In [12]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(trial_data)}')

Number of training examples: 9000
Number of validation examples: 1000
Number of testing examples: 100


In [17]:
length = len(valid_data)
num_misogynous = 0
for i in range(length):
    if valid_data[i]['labels'][0] == 1:
        num_misogynous += 1

print(num_misogynous)

512


## Create batch iterators

In [13]:
BATCH_SIZE = 64

In [14]:
train_iterator = data.DataLoader(train_data, 
                                 shuffle = True, 
                                 batch_size = BATCH_SIZE)

valid_iterator = data.DataLoader(valid_data, 
                                 batch_size = BATCH_SIZE)

trial_iterator = data.DataLoader(trial_data, 
                                batch_size = BATCH_SIZE)

In [15]:
test_iterator = data.DataLoader(test_data, 
                                batch_size = BATCH_SIZE)

In [42]:
len(train_iterator)

141

## Playground

In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [18]:
max_length = 0
for index, row in train_df.iterrows():
    txt = row['Text Transcription']
    txt = "[CLS] " + txt + " [SEP]"
    length = len(tokenizer.tokenize(txt))
    if max_length < length:
        max_length = length
    
max_length

557

In [23]:
# Average length of Text Transcription
avg_length

38.5096

In [19]:
txt = "Hello, my dog is cute"
txt = "[CLS] " + txt + " [SEP]"
tokenized_txt = tokenizer.tokenize(txt)
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_txt)
segments_ids = [1] * len(tokenized_txt)

tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segments_ids])
print(segments_tensor)

tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [20]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [21]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensor)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [22]:
a = hidden_states[-1][:, 0, :]
a

tensor([[-4.7624e-01,  1.1550e-01,  1.1242e-01, -5.7556e-01, -2.4503e-01,
          8.4391e-02,  5.3658e-01,  4.9711e-01, -4.0918e-01, -1.6279e-01,
         -2.8653e-02,  5.8704e-02,  2.1708e-01, -1.1108e-02,  7.4637e-02,
          3.4948e-01,  2.0475e-01,  1.8479e-02,  2.1207e-01, -4.7727e-02,
         -1.3609e-01, -2.6399e-01, -2.1642e-02, -3.0580e-01,  3.2172e-01,
         -2.0820e-01, -2.3615e-01, -1.6109e-01,  1.0481e-01, -2.9942e-01,
         -1.0333e-01,  5.3430e-01, -1.8657e-01, -4.1273e-01,  2.6009e-02,
          8.9587e-02,  1.0829e-01, -2.5235e-02,  2.0665e-01,  5.0784e-01,
         -3.6589e-01,  1.5930e-01,  1.8801e-01,  2.6812e-02, -1.4873e-01,
         -1.5996e-01, -2.4484e+00, -2.2372e-01, -4.6786e-01, -3.4599e-01,
         -6.4978e-02,  1.4295e-01,  1.6354e-01,  1.7121e-01, -1.9327e-01,
          3.7022e-01, -5.4412e-01,  2.0715e-01,  5.2038e-01,  1.3504e-01,
         -2.7291e-02, -5.9998e-02, -9.9373e-02,  1.2187e-01, -2.5336e-02,
          4.9320e-01, -5.6170e-01,  8.

In [23]:
# two methods learn the same last hidden state
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", padding="max_length")   # 512
print(inputs)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

{'input_ids': tensor([[  101,  8667,   117,  1139,  3676,  1110, 10509,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [24]:
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, output_hidden_states=True)
# outputs[1]
b = outputs[1][-1][:, 0, :]
b

tensor([[-4.7624e-01,  1.1550e-01,  1.1242e-01, -5.7556e-01, -2.4503e-01,
          8.4391e-02,  5.3658e-01,  4.9711e-01, -4.0918e-01, -1.6279e-01,
         -2.8653e-02,  5.8704e-02,  2.1708e-01, -1.1108e-02,  7.4637e-02,
          3.4948e-01,  2.0475e-01,  1.8479e-02,  2.1207e-01, -4.7727e-02,
         -1.3610e-01, -2.6399e-01, -2.1643e-02, -3.0580e-01,  3.2172e-01,
         -2.0820e-01, -2.3615e-01, -1.6109e-01,  1.0481e-01, -2.9942e-01,
         -1.0333e-01,  5.3430e-01, -1.8657e-01, -4.1273e-01,  2.6009e-02,
          8.9587e-02,  1.0829e-01, -2.5235e-02,  2.0665e-01,  5.0784e-01,
         -3.6589e-01,  1.5930e-01,  1.8801e-01,  2.6812e-02, -1.4873e-01,
         -1.5996e-01, -2.4484e+00, -2.2372e-01, -4.6786e-01, -3.4599e-01,
         -6.4978e-02,  1.4295e-01,  1.6354e-01,  1.7121e-01, -1.9327e-01,
          3.7022e-01, -5.4412e-01,  2.0715e-01,  5.2038e-01,  1.3504e-01,
         -2.7290e-02, -5.9998e-02, -9.9373e-02,  1.2187e-01, -2.5336e-02,
          4.9320e-01, -5.6170e-01,  8.

In [25]:
# last hidden state
outputs[1][-1].shape

torch.Size([1, 512, 768])

## Playground

In [21]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [22]:
resnet50_config = ResNetConfig(block = Bottleneck,
                               n_blocks = [3, 4, 6, 3],
                               channels = [64, 128, 256, 512])

In [23]:
pretrained_model = models.resnet50(pretrained = True)

In [24]:
model = ResNet(resnet50_config, 1000)

In [25]:
model.load_state_dict(pretrained_model.state_dict())

<All keys matched successfully>

In [33]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 25,557,032 trainable parameters


## Playground

In [6]:
clip_model = SentenceTransformer('clip-ViT-B-32')

In [12]:
sentences = ["This is an example sentence", "Hello world"]

embeddings = clip_model.encode(sentences)
print(embeddings.shape)

(2, 512)


## Construct VAE class

In [17]:
class VAE(nn.Module):
    def __init__(self, zsize, output_dim=2):
        super(VAE, self).__init__()
        
        self.zsize = zsize
        self.fc1 = nn.Linear(zsize, zsize)   # 4 * 4 is the current size of the image
        self.fc2 = nn.Linear(zsize, zsize)

        ######
        # multi-tasks sub-networks
        self.fc_misogynous = nn.Linear(zsize, output_dim)
        self.fc_shaming = nn.Linear(zsize, output_dim)
        self.fc_stereotype = nn.Linear(zsize, output_dim)
        self.fc_objectification = nn.Linear(zsize, output_dim)
        self.fc_violence = nn.Linear(zsize, output_dim)
        
        # language pre-trained model
#         self.bert_pretrained = BertForSequenceClassification.from_pretrained('bert-base-uncased')
#         self.tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
        
#         set_parameter_requires_grad_false(self.bert_pretrained)
        
        # image pre-trained model
#         ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])
#         resnet50_config = ResNetConfig(block = Bottleneck,
#                                n_blocks = [3, 4, 6, 3],
#                                channels = [64, 128, 256, 512])
#         pretrained_model = models.resnet50(pretrained = True)
        
#         self.resnet_pretrained = ResNet(resnet50_config, 1000)
#         self.resnet_pretrained.load_state_dict(pretrained_model.state_dict())
        
#         set_parameter_requires_grad_false(self.resnet_pretrained)
        
        # encoder layers
        self.enc_txt_fc = nn.Linear(1024, int(0.5 * zsize))
        self.enc_img_fc1 = nn.Linear(512, int(0.5 * zsize))
#         self.enc_img_fc2 = nn.Linear(1024, int(0.5 * zsize))
        
        # decoder layers
        self.dec_txt_fc = nn.Linear(zsize, 1024)
        self.dec_img_fc1 = nn.Linear(zsize, 512)
#         self.dec_img_fc2 = nn.Linear(1024, 2048)

        # batch normalizations
        self.enc_txt_bn = nn.BatchNorm1d(num_features=int(0.5 * zsize))
        self.enc_img_bn1 = nn.BatchNorm1d(num_features=int(0.5 * zsize))
#         self.enc_img_bn2 = nn.BatchNorm1d(num_features=int(0.5 * zsize))
        
        self.dec_txt_bn = nn.BatchNorm1d(num_features=1024)
        self.dec_img_bn1 = nn.BatchNorm1d(num_features=512)
#         self.dec_img_bn2 = nn.BatchNorm1d(num_features=2048)
        
        # dropout
        self.dropout_txt_enc = nn.Dropout(0.2)
        self.dropout_img_enc = nn.Dropout(0.2)
        self.dropout_txt_dec = nn.Dropout(0.2)
        self.dropout_img_dec = nn.Dropout(0.2)
        
        
    def img_encode(self, x_img):
#         _, x_img = self.resnet_pretrained(x_img)
        x_img = F.relu(self.dropout_img_enc(self.enc_img_bn1(self.enc_img_fc1(x_img))))
#         x_img = F.relu(self.enc_img_fc2(x_img))
        
        return x_img   # [bs, 2048]

    def txt_encode(self, x_txt):

#         inputs = self.tokenizer(x_txt, return_tensors="pt", padding="max_length")   # 512
#         x_txt = self.bert_pretrained(**inputs, output_hidden_states=True)
#         # last hidden state
#         x_txt = x_txt[1][-1]   # [bs, max_length, 768]
#         x_txt = x_txt[:, 0, :]   # [CLS] token embedding represent the whole sentence
        x_txt = x_txt.view(x_txt.shape[0], 1024)
        x_txt = F.relu(self.dropout_txt_enc(self.enc_txt_bn(self.enc_txt_fc(x_txt))))
        return x_txt   # [bs, 0.5 * zsize]

    def encode(self, x_img, x_txt):
        
        x_img = self.img_encode(x_img)
        
        x_txt = self.txt_encode(x_txt)
        
        # concate x_img and x_txt
        x = torch.cat((x_txt, x_img), 1)
        
        h1 = self.fc1(x)   # mu
        h2 = self.fc2(x)   # logvar
        return h1, h2
    
    def subtask_misogynous(self, z):
        
        h = self.fc_misogynous(z)
        return h
    
    def subtask_shaming(self, z):
        
        h = self.fc_shaming(z)
        return h
    
    def subtask_stereotype(self, z):

        h = self.fc_stereotype(z)
        return h
    
    def subtask_objectification(self, z):

        h = self.fc_objectification(z)
        return h
    
    def subtask_violence(self, z):

        h = self.fc_violence(z)
        return h

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def decode(self, x):
#         x = x.view(x.shape[0], self.zsize)   # flatten

        # Decoding txt
        dec_x_txt = F.relu(self.dropout_txt_dec(self.dec_txt_bn(self.dec_txt_fc(x))))
        
        # Decoding img
        dec_x_img = F.relu(self.dropout_img_dec(self.dec_img_bn1(self.dec_img_fc1(x))))
#         dec_x_img = F.relu(self.dec_img_fc2(dec_x_img))
        
        return dec_x_img, dec_x_txt

    def forward(self, x_img, x_txt):
        mu, logvar = self.encode(x_img, x_txt)
        mu = mu.squeeze()
        logvar = logvar.squeeze()
        z = self.reparameterize(mu, logvar)

        y_misogynous = self.subtask_misogynous(z)
        y_shaming = self.subtask_shaming(z)
        y_stereotype = self.subtask_stereotype(z)
        y_objectification = self.subtask_objectification(z)
        y_violence = self.subtask_violence(z)
        
        y_pred = dict()
        y_pred["misogynous"] = y_misogynous
        y_pred["shaming"] = y_shaming
        y_pred["stereotype"] = y_stereotype
        y_pred["objectification"] = y_objectification
        y_pred["violence"] = y_violence
        
        dec_x_img, dec_x_txt = self.decode(z.view(-1, self.zsize))
        
        return dec_x_img, dec_x_txt, mu, logvar, y_pred

    def weight_init(self, mean, std):
        for m in self._modules:
            normal_init(self._modules[m], mean, std)


In [24]:
vae = VAE(1024)   # 5 layers
# vae = VAE(1024, 5)
print(vae)

VAE(
  (fc1): Linear(in_features=1024, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_misogynous): Linear(in_features=1024, out_features=2, bias=True)
  (fc_shaming): Linear(in_features=1024, out_features=2, bias=True)
  (fc_stereotype): Linear(in_features=1024, out_features=2, bias=True)
  (fc_objectification): Linear(in_features=1024, out_features=2, bias=True)
  (fc_violence): Linear(in_features=1024, out_features=2, bias=True)
  (enc_txt_fc): Linear(in_features=1024, out_features=512, bias=True)
  (enc_img_fc1): Linear(in_features=512, out_features=512, bias=True)
  (dec_txt_fc): Linear(in_features=1024, out_features=1024, bias=True)
  (dec_img_fc1): Linear(in_features=1024, out_features=512, bias=True)
  (enc_txt_bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (enc_img_bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dec_txt_bn): BatchNorm1d(1024, e

In [25]:
def normal_init(m, mean, std):
    if isinstance(m, nn.ConvTranspose2d) or isinstance(m, nn.Conv2d):
        m.weight.data.normal_(mean, std)
        m.bias.data.zero_()

In [16]:
# im_size = 128
# im_size = 256
im_size = 224

In [26]:
def loss_function(recon_x_img, recon_x_txt, x_img, x_txt, mu, logvar):
    BCE_img = torch.mean((recon_x_img - x_img)**2)
    BCE_txt = torch.mean((recon_x_txt - x_txt)**2)

    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * torch.mean(torch.mean(1 + logvar - mu.pow(2) - logvar.exp(), 1))
    return BCE_img, BCE_txt, KLD * 0.1

In [27]:
f = open("VAE_result_clip_laser_dropout_saved_model.txt", "w")

In [28]:
def main():
    
    name_dict = dict()
    name_dict["misogynous"] = 0
    name_dict["shaming"] = 1
    name_dict["stereotype"] = 2
    name_dict["objectification"] = 3
    name_dict["violence"] = 4
    
    #batch_size = 32
    z_size = 512
#     z_size = 1024
    vae = VAE(z_size)
    vae.cuda()
    vae.train()
    vae.weight_init(mean=0, std=0.02)
#     input_dim = 128 * 128
#     input_dim = 256 * 256
#     input_dim = 224 * 224

#     lr = 0.0005
    lr = 0.0001

    vae_optimizer = optim.Adam(vae.parameters(), lr=lr, betas=(0.5, 0.999), weight_decay=1e-5)
    
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
 
    train_epoch = 30

    
    dataloader = train_iterator
    
    f1_max = 0
    max_acc = 0
    
    for epoch in range(train_epoch):
        vae.train()

#         with open('data_fold_%d.pkl' % (epoch % 5), 'rb') as pkl:
#             data_train = pickle.load(pkl)

#         print("Train set size:", len(data_train))

#         random.shuffle(data_train)

#         batches = batch_provider(data_train, batch_size, process_batch, report_progress=True)

        rec_txt_loss = 0
        rec_img_loss = 0
        kl_loss = 0
        subtask_misogynous_loss = 0
        subtask_shaming_loss = 0
        subtask_stereotype_loss = 0
        subtask_objectification_loss = 0
        subtask_violence_loss = 0

        epoch_start_time = time.time()

        if (epoch + 1) % 8 == 0:
            vae_optimizer.param_groups[0]['lr'] /= 4
#             print("learning rate change!")
            f.write("learning rate change! The learning rate is %1.4f now\n" % (lr))

#         i = 0
        acc = 0
        num = 0
        for i, data in tqdm(enumerate(dataloader, 0), desc='iterations'):
        #for x in batches:
            vae.train()
            
            #inputs, classes = data
            img_inputs = data['meme']
            img_inputs = img_inputs.to(device)
#             _, img_inputs = resnet_pretrained(img_inputs)
#             img_inputs = clip_pretrained.encode(img_inputs)
#             print(img_inputs.shape)
            
#             txt_ids = data['text_ids']
#             tokens_tensor = txt_ids['input_ids']
#             segments_tensor = txt_ids['attention_mask']
            
#             tokens_tensor = tokens_tensor.to(device).squeeze(axis=1)
#             segments_tensor = segments_tensor.to(device).squeeze(axis=1)
            txt_inputs = data["text"]
            txt_inputs = txt_inputs.to(device)
            
            
#             with torch.no_grad():
#                 txt_inputs = bert_pretrained(tokens_tensor, segments_tensor, output_hidden_states=True)
                
#                 # last hidden state
#                 txt_inputs = txt_inputs[1][-1]   # [bs, max_length, 768]
#                 txt_inputs = txt_inputs[:, 0, :]   # [CLS] token embedding represent the whole sentence
            
            classes = data['labels']
            
            # multi-task labels
            classes_misogynous = classes[:, 0]
            classes_shaming = classes[:, 1]
            classes_stereotype = classes[:, 2]
            classes_objectification = classes[:, 3]
            classes_violence = classes[:, 4]
            #print(classes)
#             inputs, classes = Variable(inputs.resize_(batch_size, input_dim)), Variable(classes)
            
            img_inputs, txt_inputs, classes_misogynous = Variable(img_inputs), Variable(txt_inputs), Variable(classes_misogynous)
            classes_shaming = Variable(classes_shaming)
            classes_stereotype = Variable(classes_stereotype)
            classes_objectification = Variable(classes_stereotype)
            classes_violence = Variable(classes_violence)
        
            img_inputs = img_inputs.to(device)
            txt_inputs = txt_inputs.to(device)
            classes_misogynous = classes_misogynous.to(device)
            classes_shaming = classes_shaming.to(device)
            classes_stereotype = classes_stereotype.to(device)
            classes_objectification = classes_objectification.to(device)
            classes_violence = classes_violence.to(device)
            
            vae.zero_grad()
#             rec, mu, logvar = vae(x)
            rec_img, rec_txt, mu, logvar, y_pred = vae(img_inputs, txt_inputs)

            loss_re_img, loss_re_txt, loss_kl = loss_function(rec_img, rec_txt, img_inputs, txt_inputs, mu, logvar)
            loss_subtask_misogynous = criterion(y_pred["misogynous"], classes_misogynous)
            loss_subtask_shaming = criterion(y_pred["shaming"], classes_shaming)
            loss_subtask_stereotype = criterion(y_pred["stereotype"], classes_stereotype)
            loss_subtask_objectification = criterion(y_pred["objectification"], classes_objectification)
            loss_subtask_violence = criterion(y_pred["violence"], classes_violence)
            
            (loss_re_img + loss_re_txt + loss_kl + loss_subtask_misogynous \
             + loss_subtask_shaming + loss_subtask_stereotype + loss_subtask_objectification\
             + loss_subtask_violence).backward()
            
            vae_optimizer.step()
            rec_img_loss += loss_re_img.item()
            rec_txt_loss += loss_re_txt.item()
            
            kl_loss += loss_kl.item()
            subtask_misogynous_loss += loss_subtask_misogynous.item()
            subtask_shaming_loss += loss_subtask_shaming.item()
            subtask_stereotype_loss += loss_subtask_stereotype.item()
            subtask_objectification_loss += loss_subtask_objectification.item()
            subtask_violence_loss += loss_subtask_violence.item()
            
            # Calculate batch accuracy
            _, top_pred = y_pred["misogynous"].topk(1, 1)
            y = classes_misogynous.cpu()
            batch_size = y.shape[0]
            top_pred = top_pred.cpu().view(batch_size)
            acc += sum(top_pred == y).item()
            num += batch_size

            #############################################

#             os.makedirs('results_rec_64', exist_ok=True)
#             os.makedirs('results_gen_64', exist_ok=True)

            epoch_end_time = time.time()
            per_epoch_ptime = epoch_end_time - epoch_start_time

            # report losses and save samples each 60 iterations
            m = len(dataloader)
            i += 1
            if i % m == 0:
                rec_txt_loss /= m
                rec_img_loss /= m
                kl_loss /= m
                subtask_misogynous_loss /= m
                subtask_shaming_loss /= m
                subtask_stereotype_loss /= m
                subtask_objectification_loss /= m
                subtask_violence_loss /= m
                
#                 print('\n[%d/%d] - ptime: %.2f, rec img loss: %.9f, rec txt loss: %.9f, KL loss: %.9f, misogynous loss: %.9f, shaming loss: %.9f, stereotype loss: %.9f, objectification loss: %.9f, violence loss: %.9f' % (
#                     (epoch + 1), train_epoch, per_epoch_ptime, rec_img_loss, rec_txt_loss, kl_loss, subtask_misogynous_loss, subtask_shaming_loss, subtask_stereotype_loss, subtask_objectification_loss, subtask_violence_loss))

                f.write('\n[%d/%d] - ptime: %.2f, rec img loss: %.9f, rec txt loss: %.9f, KL loss: %.9f, misogynous loss: %.9f, shaming loss: %.9f, stereotype loss: %.9f, objectification loss: %.9f, violence loss: %.9f\n' % (
                    (epoch + 1), train_epoch, per_epoch_ptime, rec_img_loss, rec_txt_loss, kl_loss, subtask_misogynous_loss, subtask_shaming_loss, subtask_stereotype_loss, subtask_objectification_loss, subtask_violence_loss))
                rec_txt_loss = 0
                rec_img_loss = 0
                kl_loss = 0
                with torch.no_grad():
#                     test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, valid_iterator, criterion, device, "misogynous")
                    test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, valid_iterator, criterion, device)
                    f.write(f'Test subtask misogynous Loss: {test_loss["misogynous"]:.3f} | Test Acc @1: {test_acc["misogynous"]*100:6.2f}%\n')
                    f.write(f'Test subtask misogynous accuracy: {test_accuracy["misogynous"]*100:6.2f}%\n')
                    f.write(f'Test subtask misogynous f1: {test_f1["misogynous"]*100:6.2f}%\n')
                    f.write(f'Test subtask misogynous recall: {test_recall["misogynous"]*100:6.2f}%\n')
                    f.write(f'Test subtask misogynous precision: {test_precision["misogynous"]*100:6.2f}%\n')
                    
#                     test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "shaming")
                    f.write(f'Test subtask shaming Loss: {test_loss["shaming"]:.3f} | Test Acc @1: {test_acc["shaming"]*100:6.2f}%\n')
                    f.write(f'Test subtask shaming accuracy: {test_accuracy["shaming"]*100:6.2f}%\n')
                    f.write(f'Test subtask shaming f1: {test_f1["shaming"]*100:6.2f}%\n')
                    f.write(f'Test subtask shaming recall: {test_recall["shaming"]*100:6.2f}%\n')
                    f.write(f'Test subtask shaming precision: {test_precision["shaming"]*100:6.2f}%\n')
                    
#                     test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "stereotype")
                    f.write(f'Test subtask stereotype Loss: {test_loss["stereotype"]:.3f} | Test Acc @1: {test_acc["stereotype"]*100:6.2f}%\n')
                    f.write(f'Test subtask stereotype accuracy: {test_accuracy["stereotype"]*100:6.2f}%\n')
                    f.write(f'Test subtask stereotype f1: {test_f1["stereotype"]*100:6.2f}%\n')
                    f.write(f'Test subtask stereotype recall: {test_recall["stereotype"]*100:6.2f}%\n')
                    f.write(f'Test subtask stereotype precision: {test_precision["stereotype"]*100:6.2f}%\n')
                    
#                     test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "objectification")
                    f.write(f'Test subtask objectification Loss: {test_loss["objectification"]:.3f} | Test Acc @1: {test_acc["objectification"]*100:6.2f}%\n')
                    f.write(f'Test subtask objectification accuracy: {test_accuracy["objectification"]*100:6.2f}%\n')
                    f.write(f'Test subtask objectification f1: {test_f1["objectification"]*100:6.2f}%\n')
                    f.write(f'Test subtask objectification recall: {test_recall["objectification"]*100:6.2f}%\n')
                    f.write(f'Test subtask objectification precision: {test_precision["objectification"]*100:6.2f}%\n')
                    
#                     test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "violence")
                    f.write(f'Test subtask violence Loss: {test_loss["violence"]:.3f} | Test Acc @1: {test_acc["violence"]*100:6.2f}%\n')
                    f.write(f'Test subtask violence accuracy: {test_accuracy["violence"]*100:6.2f}%\n')
                    f.write(f'Test subtask violence f1: {test_f1["violence"]*100:6.2f}%\n')
                    f.write(f'Test subtask violence recall: {test_recall["violence"]*100:6.2f}%\n')
                    f.write(f'Test subtask violence precision: {test_precision["violence"]*100:6.2f}%\n')
                    
                    acc /= num
                    print(f'num_correct: {acc}\n')
                    print(f'total_num: {num}\n')
                    f.write(f'Training accuracy: {acc*100:6.2f}%\n')
                    
                    if test_f1["misogynous"]*100 >= f1_max:
                        
                        torch.save(vae.state_dict(), "VAEmodel-clip-laser-bn-dropout-epoch-%d.pkl" % (epoch+1))
                        f.write("Epoch [%d/%d]: test f1 on misogynous improves, saving training results\n" % (epoch+1, train_epoch))
                        f1_max = test_f1["misogynous"]*100
                    
#                     vae.eval()
#                     x_rec, _, _, _ = vae(inputs)
#                     resultsample = torch.cat([inputs, x_rec]) * 0.5 + 0.5
#                     resultsample = resultsample.cpu()
#                     save_image(resultsample.view(-1, 3, im_size, im_size),
#                                'results_rec_64/sample_' + str(epoch) + "_" + str(i) + '.png')
#                     x_rec = vae.decode(sample1)
#                     resultsample = x_rec * 0.5 + 0.5
#                     resultsample = resultsample.cpu()
#                     save_image(resultsample.view(-1, 3, im_size, im_size),
#                                'results_gen_64/sample_' + str(epoch) + "_" + str(i) + '.png')

        f.flush()

    f.write("Training finish!... save training results\n")
    return vae

In [29]:
def calculate_accuracy(y_pred, y):
    with torch.no_grad():
        batch_size = y.shape[0]
        _, top_pred = y_pred.topk(1, 1)
        top_pred = top_pred.t()
        correct = top_pred.eq(y.view(1, -1).expand_as(top_pred))
        correct_1 = correct[:1].reshape(-1).float().sum(0, keepdim = True)
        acc_1 = correct_1 / batch_size
    
    top_pred = top_pred.cpu().view(batch_size)
    y = y.cpu()
    
#     print(y.shape)
#     print(y)
#     print(top_pred.shape)
#     print(top_pred)
    accuracy = accuracy_score(y, top_pred)
    #print("accuracy: {}".format(accuracy))

    f1 = f1_score(y, top_pred)
#     print(top_pred)
    #print("f1: {}".format(f1))

    recall = recall_score(y, top_pred)
    #print("recall: {}".format(recall))

    precision = precision_score(y, top_pred)
    #print("precision: {}".format(precision))

    cm = confusion_matrix(y, top_pred)
    #print("cm: {}".format(cm))
    return acc_1, accuracy, f1, recall, precision, cm

In [30]:
def evaluate(model, iterator, criterion, device, subtask_name="misogynous"):
    
    epoch_loss = dict()
    epoch_loss["misogynous"] = 0
    epoch_loss["shaming"] = 0
    epoch_loss["stereotype"] = 0
    epoch_loss["objectification"] = 0
    epoch_loss["violence"] = 0
    
    epoch_acc = dict()
    epoch_acc["misogynous"] = 0
    epoch_acc["shaming"] = 0
    epoch_acc["stereotype"] = 0
    epoch_acc["objectification"] = 0
    epoch_acc["violence"] = 0
    
    epoch_accuracy = dict()
    epoch_accuracy["misogynous"] = 0
    epoch_accuracy["shaming"] = 0
    epoch_accuracy["stereotype"] = 0
    epoch_accuracy["objectification"] = 0
    epoch_accuracy["violence"] = 0
    
    epoch_f1 = dict()
    epoch_f1["misogynous"] = 0
    epoch_f1["shaming"] = 0
    epoch_f1["stereotype"] = 0
    epoch_f1["objectification"] = 0
    epoch_f1["violence"] = 0
    
    epoch_recall = dict()
    epoch_recall["misogynous"] = 0
    epoch_recall["shaming"] = 0
    epoch_recall["stereotype"] = 0
    epoch_recall["objectification"] = 0
    epoch_recall["violence"] = 0
    
    epoch_precision = dict()
    epoch_precision["misogynous"] = 0
    epoch_precision["shaming"] = 0
    epoch_precision["stereotype"] = 0
    epoch_precision["objectification"] = 0
    epoch_precision["violence"] = 0
    
    epoch_cm = dict()
    epoch_cm["misogynous"] = 0
    epoch_cm["shaming"] = 0
    epoch_cm["stereotype"] = 0
    epoch_cm["objectification"] = 0
    epoch_cm["violence"] = 0
    
    model.eval()
    
    name_dict = dict()
    name_dict["misogynous"] = 0
    name_dict["shaming"] = 1
    name_dict["stereotype"] = 2
    name_dict["objectification"] = 3
    name_dict["violence"] = 4
    
    with torch.no_grad():
        
        #for (x, y) in iterator:
        for i, data in tqdm(enumerate(iterator, 0), desc='iterations'):

            x_img = data['meme']
            x_img = x_img.to(device)
            
            x_txt = data['text']
            x_txt = x_txt.to(device)
            
#             txt_ids = data['text_ids']

#             tokens_tensor = txt_ids['input_ids']
#             segments_tensor = txt_ids['attention_mask']
            
#             tokens_tensor = tokens_tensor.to(device).squeeze(axis=1)
#             segments_tensor = segments_tensor.to(device).squeeze(axis=1)
#             with torch.no_grad():
#                 x_txt = bert_pretrained(tokens_tensor, segments_tensor, output_hidden_states=True)
#                 # last hidden state
#                 x_txt = x_txt[1][-1]   # [bs, max_length, 768]
#                 x_txt = x_txt[:, 0, :]   # [CLS] token embedding represent the whole sentence
            
            y = data['labels']
#             y = y[:, name_dict[subtask_name]]

#             _, x_img = resnet_pretrained(x_img)
#             x_img = clip_pretrianed(x_img)
            
#             inputs = tokenizer(x_txt, return_tensors="pt", padding="max_length")   # 512
#             x_txt = bert_pretrained(**inputs, output_hidden_states=True)
#             # last hidden state
#             x_txt = x_txt[1][-1]   # [bs, max_length, 768]
#             x_txt = x_txt[:, 0, :]   # [CLS] token embedding represent the whole sentence
            
            x_img, x_txt = x_img.to(device), x_txt.to(device)
            
#             y = y.to(device)

            _, _, _, _, y_pred = model(x_img, x_txt)
            
#             loss = criterion(y_pred[subtask_name], y)

#             acc, accuracy, f1, recall, precision, cm = calculate_accuracy(y_pred[subtask_name], y)

#             epoch_loss += loss.item()
#             epoch_acc += acc.item()
#             epoch_accuracy += accuracy.item()
#             epoch_f1 += f1.item()
#             epoch_recall += recall.item()
#             epoch_precision += precision.item()
#             # epoch_cm += cm.item()
            
            for subtask_name, subtask_index in name_dict.items():
                subtask_y = y[:, subtask_index]
                subtask_y = subtask_y.to(device)
                loss = criterion(y_pred[subtask_name], subtask_y)
                acc, accuracy, f1, recall, precision, cm = calculate_accuracy(y_pred[subtask_name], subtask_y)
                
                epoch_loss[subtask_name] += loss.item()
                epoch_acc[subtask_name] += acc.item()
                epoch_accuracy[subtask_name] += accuracy.item()
                epoch_f1[subtask_name] += f1.item()
                epoch_recall[subtask_name] += recall.item()
                epoch_precision[subtask_name] += precision.item()
                # epoch_cm += cm.item()
                
                
    for subtask_name, subtask_index in name_dict.items():
        epoch_loss[subtask_name] /= len(iterator)
        epoch_acc[subtask_name] /= len(iterator)
        epoch_accuracy[subtask_name] /= len(iterator)
        epoch_f1[subtask_name] /= len(iterator)
        epoch_recall[subtask_name] /= len(iterator)
        epoch_precision[subtask_name] /= len(iterator)
                
        
#     epoch_loss /= len(iterator)
#     epoch_acc /= len(iterator)
#     epoch_accuracy /= len(iterator)
#     epoch_f1 /= len(iterator)
#     epoch_recall /= len(iterator)
#     epoch_precision /= len(iterator)
#     # epoch_cm /= len(iterator)
        
    return epoch_loss, epoch_acc, epoch_accuracy, epoch_f1, epoch_recall, epoch_precision

In [None]:
vae = main()
criterion = nn.CrossEntropyLoss()
criterion.to(device)
test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "misogynous")

f.write(f'Trial subtask misogynous Loss: {test_loss:.3f} | Test Acc @1: {test_acc*100:6.2f}%\n')
f.write(f'Trial subtask misogynous accuracy: {test_accuracy*100:6.2f}%\n')
f.write(f'Trial subtask misogynous f1: {test_f1*100:6.2f}%\n')
f.write(f'Trial subtask misogynous recall: {test_recall*100:6.2f}%\n')
f.write(f'Trial subtask misogynous precision: {test_precision*100:6.2f}%\n')

test_loss, test_acc, test_accuracy, test_f1, test_recall, test_precision = evaluate(vae, trial_iterator, criterion, device, "shaming")

f.write(f'Trial subtask shaming Loss: {test_loss:.3f} | Test Acc @1: {test_acc*100:6.2f}%\n')
f.write(f'Trial subtask shaming accuracy: {test_accuracy*100:6.2f}%\n')
f.write(f'Trial subtask shaming f1: {test_f1*100:6.2f}%\n')
f.write(f'Trial subtask shaming recall: {test_recall*100:6.2f}%\n')
f.write(f'Trial subtask shaming precision: {test_precision*100:6.2f}%\n')

f.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='iterations', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='iterations', max=1.0, style=ProgressSty…

  _warn_prf(average, modifier, msg_start, len(result))



num_correct: 0.7284444444444444

total_num: 9000




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='iterations', max=1.0, style=ProgressSty…

In [None]:
torch.save(vae.state_dict(), "VAEmodel.pkl")

In [103]:
%%capture cap --no-stderr
print('123139013')
with open('output.txt', 'w') as f:
    f.write(cap.stdout)

In [18]:
def test(model, iterator, device):
    
    name_dict = dict()
    name_dict["misogynous"] = 0
    name_dict["shaming"] = 1
    name_dict["stereotype"] = 2
    name_dict["objectification"] = 3
    name_dict["violence"] = 4
    
    y_test = dict()
    y_test["misogynous"] = []
    y_test["shaming"] = []
    y_test["stereotype"] = []
    y_test["objectification"] = []
    y_test["violence"] = []
    
    model.cuda()
    model.eval()
    
    with torch.no_grad():
        
        for i, data in tqdm(enumerate(iterator, 0), desc='iterations'):

            x_img = data['meme']
            x_img = x_img.to(device)
            
            x_txt = data['text']
            
            x_img, x_txt = x_img.to(device), x_txt.to(device)

            _, _, _, _, y_pred = model(x_img, x_txt)
            
            
            for subtask_name, subtask_index in name_dict.items():
                subtask_y = y_pred[subtask_name].cpu()
                for dp in subtask_y:
                    if dp[0] >= dp[1]:
                        y_test[subtask_name].append(0)
                    else:
                        y_test[subtask_name].append(1)
        
    return y_test

In [33]:
best_VAE = VAE(512)
best_VAE.load_state_dict(torch.load("VAEmodel-clip-laser-bn-dropout-epoch-6.pkl"))

<All keys matched successfully>

In [34]:
y_test = test(best_VAE, test_iterator, device)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='iterations', max=1.0, style=ProgressSty…




In [35]:
len(y_test["misogynous"])

1000

In [36]:
prediction_df = test_df.copy()

prediction_df["misogynous"] = y_test["misogynous"]
prediction_df["shaming"] = y_test["shaming"]
prediction_df["stereotype"] = y_test["stereotype"]
prediction_df["objectification"] = y_test["objectification"]
prediction_df["violence"] = y_test["violence"]

In [37]:
prediction_df = prediction_df.drop('Text Transcription', 1)
prediction_df

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence
0,15236.jpg,1,0,0,0,0
1,15805.jpg,1,0,1,1,0
2,16254.jpg,1,0,0,0,0
3,16191.jpg,1,0,0,0,0
4,15952.jpg,1,0,0,0,0
...,...,...,...,...,...,...
995,15591.jpg,1,0,1,1,0
996,15049.jpg,1,0,0,0,0
997,15363.jpg,1,0,0,0,0
998,15199.jpg,1,0,1,1,0


In [38]:
# Subtask A and subtask B
with open("answer.txt", "w") as f:
    for i, row in prediction_df.iterrows():
        f.write(row["file_name"] + '\t' + str(row["misogynous"]) + '\t' + str(row["shaming"]) + '\t' + str(row["stereotype"]) + '\t' + str(row["objectification"]) + '\t' + str(row["violence"]) + '\n')

In [39]:
!zip submission.zip answer.txt

  adding: answer.txt (deflated 84%)


In [23]:
# Subtask A and subtask B
results_7 = []
with open("answer7.txt", "r") as f_7:
    lines = f_7.readlines()
    for line in lines:
        items = line.strip().split('\t')
        results_7.append(items)

In [20]:
# Subtask A and subtask B
results_18 = []
with open("answer18.txt", "r") as f_18:
    lines = f_18.readlines()
    for line in lines:
        items = line.strip().split('\t')
        results_18.append(items)

In [24]:
with open("answer.txt", "w") as f:
    for i in range(len(results_18)):
        f.write(results_18[i][0] + '\t' + results_18[i][1] + '\t' + results_7[i][2] + '\t' + results_7[i][3] + '\t' + results_7[i][4] + '\t' + results_7[i][5] + '\n')
        

In [25]:
!zip submission.zip answer.txt

  adding: answer.txt (deflated 84%)


In [None]:
for i, data in tqdm(enumerate(trial_iterator, 0), desc='iterations'):
        #for x in batches:
            
        inputs = data['meme']
        classes = data['labels']
        print(classes[:, 0])

In [None]:
!nvidia-smi

In [None]:
!sudo fuser -v /dev/nvidia*

In [None]:
!sudo kill -9 4103