# Memotion 7k Visual Bert

## Install

In [1]:
%%capture
! pip install transformers
! pip install datasets 
! pip install --upgrade tqdm
! pip install pytorch-lightning

## Import

In [2]:
import os
import time
import logging
import argparse
from pathlib import Path
from string import punctuation

import pandas as pd
import numpy as np
from PIL import Image, ImageFile

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from sklearn.model_selection import train_test_split
from sklearn.utils import resample, class_weight
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)

from transformers import (
    AdamW,
    get_scheduler,
    BertTokenizer, AutoTokenizer,
    ViTImageProcessor, ViTModel,
    TrainingArguments, Trainer,
    VisualBertModel, VisualBertConfig
)

from memotion_utility import load_dataset_text_only,load_data



In [3]:
torch.__version__

'2.0.0'

## Configs

In [4]:
ImageFile.LOAD_TRUNCATED_IMAGES = True
CSV_FILE = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/labels.csv'
ROOT_DIR = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/images'
MAX_LEN = 128
LR = 1e-6
ADAM_E = 1e-8
batch_size = 16
IMAGE_SIZE = (224,224)
dropout_prob = 0.01
weight_decay = 0.01
epochs = 10
NUM_WARMUP_STEPS = 12
NUM_TRAINING_STEPS = 1230
downsample = False

tokenizer_name = 'bert-base-uncased'
img_feature_name = 'google/vit-base-patch16-224-in21k'
multi_model_name = 'uclanlp/visualbert-nlvr2-coco-pre'

## Dataset

In [5]:
df = pd.read_csv(CSV_FILE)
df.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [6]:
# def preprocess(df):
#     df = df.drop('Unnamed: 0', axis=1)
#     df = df.sample(frac=1,random_state=123).reset_index(drop=True)
#     df['offensive'] = np.where(df['offensive'] == 'not_offensive', 'not_offensive', 'offensive')

#     df['label'] = df['offensive'].map({
#         'not_offensive': 0, 
#         'offensive': 1
#     })
    
#     new_df = df[['image_name','text_corrected','label']]
#     return new_df

In [7]:
# df = preprocess(df)
# df.head()

In [8]:
# df.loc[df['text_corrected'].isna(),'text_corrected'] = 'nan'

In [9]:
# df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
# df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42)

In [10]:
# label_counts = df_train.label.value_counts()
# # Convert the value counts to a list
# nSamples = label_counts.tolist()

# # Print the list
# print(nSamples)

In [11]:
# # downsampling for balanced class in training data

# df_nof = df_train[df_train['label'] == 0]
# df_of = df_train[df_train['label'] == 1]

# df_of_downsampled = resample(df_of,
#                     replace=False, 
#                     n_samples=len(df_nof),  
#                     random_state=123)

# df_train = pd.concat([df_of_downsampled, df_nof])


# label_counts = df_train.label.value_counts()
# # Convert the value counts to a list
# nSamples = label_counts.tolist()

# # Print the list
# print(nSamples)

In [12]:
df_train,df_val,df_test = load_data(CSV_FILE, downsample = downsample)

train : 
 label
1    3080
0    1953
Name: count, dtype: int64
val : 
 label
1    343
0    217
Name: count, dtype: int64
test : 
 label
1    856
0    543
Name: count, dtype: int64


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [13]:
y_train = df_train["label"].values.tolist()
class_weights = class_weight.compute_class_weight(class_weight ='balanced',
                                                 classes = np.unique(y_train),
                                                 y = y_train)
print(class_weights)

[1.28853047 0.81704545]


In [14]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
feature_extractor = ViTImageProcessor.from_pretrained(img_feature_name)
feature_model = ViTModel.from_pretrained(img_feature_name)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [15]:
class HatefulMemesData(Dataset):
    def __init__(self, df, tokenizer,print_text=False):        
        self.tokenizer = tokenizer
        self.print_text = print_text

        texts = df["text"].values.tolist()
        labels = df["label"].values.tolist()
        images = df["image_name"].values.tolist()
        
        self.transform = transforms.Compose([transforms.Resize(IMAGE_SIZE),transforms.ToTensor()])

        self.dataset = []
        for i, inp in enumerate(texts):
            self.dataset.append({"text": inp, "label": labels[i], 'image_name': images[i]})
  
    def __len__(self):
        return len(self.dataset)


    def tokenize_data(self, example):
        
        # Tokenize text
        name = example['image_name']
        encoded_dict = tokenizer(example['text'], padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt')
        tokens = encoded_dict['input_ids']
        token_type_ids = encoded_dict['token_type_ids']
        attn_mask = encoded_dict['attention_mask']
        targets = torch.tensor(example['label']).type(torch.int64)

        ## Get Visual Embedding     
        img = Image.open(os.path.join(ROOT_DIR,name)).convert("RGB")
        if self.transform:
            img = self.transform(img)
        inputs = feature_extractor(images=img, return_tensors="pt")
        outputs = feature_model(**inputs)
        visual_embeds = outputs.last_hidden_state
        visual_embeds = visual_embeds.cpu()
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

        inputs={"input_ids": tokens.squeeze(),
            "attention_mask": attn_mask.squeeze(),
            "token_type_ids": token_type_ids.squeeze(),
            "visual_embeds": visual_embeds.squeeze(),
            "visual_token_type_ids": visual_token_type_ids.squeeze(),
            "visual_attention_mask": visual_attention_mask.squeeze(),
            "label": targets.squeeze()
        }
        
        return inputs
  
    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])
        
        if self.print_text:
            for k in inputs.keys():
                print(k, inputs[k].shape, inputs[k].dtype)

        return inputs

In [16]:
dataset = HatefulMemesData(df_val, tokenizer,True)

In [17]:
example1 = dataset[100]

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


input_ids torch.Size([128]) torch.int64
attention_mask torch.Size([128]) torch.int64
token_type_ids torch.Size([128]) torch.int64
visual_embeds torch.Size([197, 768]) torch.float32
visual_token_type_ids torch.Size([197]) torch.int64
visual_attention_mask torch.Size([197]) torch.int64
label torch.Size([]) torch.int64


## Define the Model

In [18]:
class VisualBERTClassifier(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(VisualBERTClassifier, self).__init__()
        
        configuration = VisualBertConfig.from_pretrained(multi_model_name,
                                                         hidden_dropout_prob=dropout_prob, 
                                                         attention_probs_dropout_prob=dropout_prob)
        
        self.visualbert = VisualBertModel.from_pretrained(multi_model_name, config=configuration)
        self.embed_cls = nn.Linear(768, 1024)
        self.num_labels = 2
        self.dropout = nn.Dropout(dropout_prob)
        self.cls=  nn.Linear(768, self.num_labels)
        
        #normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
        #weight=torch.FloatTensor(normedWeights)
        self.loss_fct = CrossEntropyLoss()
        
    
    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask,
                visual_token_type_ids, labels):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        visual_embeds_cls = self.embed_cls(visual_embeds)
        outputs = self.visualbert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                visual_embeds=visual_embeds_cls,
                visual_attention_mask=visual_attention_mask,
                visual_token_type_ids=visual_token_type_ids,
            )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.cls(pooled_output)
        reshaped_logits = logits.view(-1, self.num_labels)

        loss = self.loss_fct(reshaped_logits, labels.view(-1))
      
        return loss, reshaped_logits

## Fine Tuning using HuggingFace Trainer

In [19]:
model = VisualBERTClassifier()

Downloading config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [20]:
# Initialize scheduler

optimizer = AdamW(model.parameters(), lr=LR, eps= ADAM_E)

scheduler = get_scheduler(
    name="linear",                      
    optimizer=optimizer,                
    num_warmup_steps= NUM_WARMUP_STEPS, 
    num_training_steps= NUM_TRAINING_STEPS 
)




In [21]:
metric_name = "f1_macro"

args = TrainingArguments(
    output_dir = "model-checkpoint",
    seed = 110, 
    evaluation_strategy = "steps",
    learning_rate=LR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs= epochs,
    weight_decay = weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    eval_steps = 100,
    save_steps = 100,
    save_total_limit= 2,
    fp16 = False,
    gradient_accumulation_steps = 2,
    report_to="none"
)

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    #print(labels,predictions)
    
    # Accuracy
    acc = accuracy_score(labels, predictions)

    # Compute per-class metrics
    f1_per_class = f1_score(labels, predictions, average=None)
    precision_per_class = precision_score(labels, predictions, average=None)
    recall_per_class = recall_score(labels, predictions, average=None)

    # Compute macro-average metrics
    f1_macro = f1_score(labels, predictions, average='macro')
    precision_macro = precision_score(labels, predictions, average='macro')
    recall_macro = recall_score(labels, predictions, average='macro')

    return {
        "accuracy": acc,
        "f1_class_0": f1_per_class[0],
        "f1_class_1": f1_per_class[1],
        "precision_class_0": precision_per_class[0],
        "precision_class_1": precision_per_class[1],
        "recall_class_0": recall_per_class[0],
        "recall_class_1": recall_per_class[1],
        "f1_macro": f1_macro,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro
    }


In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset = HatefulMemesData(df_train,tokenizer=tokenizer),
    eval_dataset =  HatefulMemesData(df_val,tokenizer=tokenizer),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [24]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1 Class 0,F1 Class 1,Precision Class 0,Precision Class 1,Recall Class 0,Recall Class 1,F1 Macro,Precision Macro,Recall Macro
100,No log,0.670141,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
200,No log,0.669764,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
300,No log,0.668025,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
400,No log,0.667473,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
500,0.668700,0.667561,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
600,0.668700,0.668022,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
700,0.668700,0.668414,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
800,0.668700,0.667547,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
900,0.668700,0.667787,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5
1000,0.668700,0.667632,0.6125,0.0,0.75969,0.0,0.6125,0.0,1.0,0.379845,0.30625,0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=1570, training_loss=0.66829406835471, metrics={'train_runtime': 17939.1944, 'train_samples_per_second': 2.806, 'train_steps_per_second': 0.088, 'total_flos': 0.0, 'train_loss': 0.66829406835471, 'epoch': 9.97})

In [25]:
trainer.evaluate(eval_dataset=HatefulMemesData(df_test,tokenizer=tokenizer))  # Evaluate on test data

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6706590056419373,
 'eval_accuracy': 0.6118656182987848,
 'eval_f1_class_0': 0.0,
 'eval_f1_class_1': 0.7592017738359201,
 'eval_precision_class_0': 0.0,
 'eval_precision_class_1': 0.6118656182987848,
 'eval_recall_class_0': 0.0,
 'eval_recall_class_1': 1.0,
 'eval_f1_macro': 0.37960088691796007,
 'eval_precision_macro': 0.3059328091493924,
 'eval_recall_macro': 0.5,
 'eval_runtime': 409.7337,
 'eval_samples_per_second': 3.414,
 'eval_steps_per_second': 0.215,
 'epoch': 9.97}