In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support
import json
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support
from transformers import BertTokenizer,BertModel
from utils import *

In [3]:
# Build the CSV path of every hateful-memes split, then read each split
datafolder = '../../data/hateful_memes/'
train, dev, test = (
    datafolder + filename
    for filename in ('train_with_features.csv', 'dev_with_features.csv', 'test_with_features.csv')
)
# skip_blank_lines=False: blank lines are read as NaN rows instead of being skipped
df_train, df_dev, df_test = (
    pd.read_csv(csv_path, skip_blank_lines=False) for csv_path in (train, dev, test)
)

In [4]:
def get_embeddings(df, column, tokenizer, model, batch_size=50, max_length=64,
                   output_device=None):
    """Encode a text column into per-token hidden states, one batch at a time.

    Args:
        df: DataFrame that holds the texts.
        column: Name of the text column in ``df``.
        tokenizer: Hugging Face-style tokenizer, called on a list of strings
            with ``padding='max_length'`` and ``truncation=True``.
        model: Encoder called as ``model(input_ids, attention_mask=...)``;
            the first element of its output is taken as the embeddings.
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Every text is padded/truncated to exactly this many tokens.
        output_device: Where each batch of embeddings is stored once computed.
            ``None`` (default) leaves them on the model's device. Passing
            ``'cpu'`` stops results from accumulating in GPU memory.

    Returns:
        Tensor with one entry per row of ``df`` (first dimension ``len(df)``,
        second dimension ``max_length``), in the original row order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``df`` has no rows.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Missing values become empty strings so the tokenizer only ever sees str.
    texts = df[column].fillna('').astype(str).tolist()
    if not texts:
        raise ValueError("cannot compute embeddings for an empty DataFrame")

    # Run on whichever device the model weights live on, instead of relying
    # on a notebook-global `device` that may not be defined yet.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings_list = []
    for start in range(0, len(texts), batch_size):
        # The last batch may be shorter; no dummy rows are added, so the
        # output has exactly one entry per input text.
        batch_texts = texts[start:start + batch_size]

        # Tokenize the whole batch in one call, with explicit padding and
        # truncation (replaces the deprecated `pad_to_max_length`).
        encoded = tokenizer(
            batch_texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(model_device)
        attention_masks = encoded['attention_mask'].to(model_device)

        # Inference only: no autograd graph is recorded
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
            embeddings = outputs[0]

        if output_device is not None:
            embeddings = embeddings.to(output_device)
        embeddings_list.append(embeddings)

    # Stack all batches along the example dimension
    return torch.cat(embeddings_list, dim=0)

In [4]:
# Compute device: the GPU when CUDA is usable, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pre-trained cased BERT tokenizer and encoder; the encoder is placed on `device`
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased').to(device)

# Training labels and the text embeddings of the training split
Y_train = df_train['label'].values
train_vectors = get_embeddings(df_train, 'text', tokenizer, model)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max le

In [5]:
# Load the saved image features and stack them into a single tensor.
# NOTE: torch.load unpickles the file, so only load feature files you trust.
train_img = torch.stack(torch.load('train_img_tensors.pt'))

In [6]:
# Shape check: view the image features as (-1, 2048, 7, 7), then as 8500 flat rows
train_img.view(-1, 2048, 7, 7).view(8500, -1).shape

torch.Size([8500, 100352])

In [7]:
# Shape check: the text embeddings viewed as 8500 flat rows
train_vectors.view(8500, -1).shape

torch.Size([8500, 49152])

In [8]:
# Flatten each image's (2048, 7, 7) feature map into one row of 2048*7*7 values.
# reshape with an inferred row count (instead of a hard-coded 8500) keeps this
# cell re-runnable after `train_img` has already been flattened and truncated.
train_img = train_img.reshape(-1, 2048 * 7 * 7)

# Flatten each text's token embeddings into one row per example
train_vectors = train_vectors.flatten(start_dim=1)

# Both modalities must describe the same examples, row for row
assert train_img.shape[0] == train_vectors.shape[0], "image/text row counts differ"

# Keep the first 6000 examples and hold them in host memory: the features are
# only consumed on the CPU afterwards, so they need not occupy GPU memory.
train_img = train_img[:6000].cpu()
train_vectors = train_vectors[:6000].cpu()

# Concatenate text and image features along the feature axis (dim=1)
fusion_representation = torch.cat([train_vectors, train_img], dim=1)

# # Pad the fusion_representation to have a fixed length along the first dimension
# # This is necessary if you want to batch the representations later
# fusion_representation = pad_sequence([fusion_representation], batch_first=True)

In [9]:
# Shape check of the fused feature matrix
fusion_representation.shape

torch.Size([6000, 149504])

In [10]:

# Linear SVM on the fused features; parameter C was selected based on grid search
clf_svc = SVC(kernel='linear', C=10)

# Take as many labels as there are fused rows, so features and targets stay
# aligned without repeating the subset size as a magic number.
n_train = fusion_representation.shape[0]
clf_svc.fit(fusion_representation.detach().cpu().numpy(), Y_train[:n_train])

In [11]:
# Text embeddings and labels of the dev split. The embeddings are moved to host
# memory as soon as they are returned, since they are only consumed on the CPU.
dev_vectors = get_embeddings(df_dev, 'text', tokenizer, model).cpu()
Y_dev = df_dev.label.values

# NOTE: torch.load unpickles the file, so only load feature files you trust.
# map_location='cpu' deserializes the image features directly into host memory.
dev_img = torch.stack(torch.load('dev_img_tensors.pt', map_location='cpu'))

# Flatten each image's (2048, 7, 7) feature map into one row of 2048*7*7 values
dev_img = dev_img.reshape(-1, 2048 * 7 * 7)

# Flatten each text's token embeddings into one row per example
dev_vectors = dev_vectors.flatten(start_dim=1)

# Features of both modalities and the labels must line up row for row
assert dev_img.shape[0] == dev_vectors.shape[0] == len(Y_dev), "dev row counts differ"

# Concatenate text and image features along the feature axis (dim=1)
fusion_representation_dev = torch.cat([dev_vectors, dev_img], dim=1)



In [13]:
# Predict the dev split and summarise macro-averaged precision / recall / F1
Y_pred = clf_svc.predict(fusion_representation_dev.cpu())
metric_names = ['precision', 'recall', 'F1']
scores = precision_recall_fscore_support(Y_dev, Y_pred, average='macro')
# zip stops after the three named metrics, dropping the trailing support entry
results = pd.DataFrame([dict(zip(metric_names, scores))])
results

Unnamed: 0,precision,recall,F1
0,0.562121,0.542646,0.505556


In [6]:
# Compute device: the GPU when CUDA is usable, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder from the dehatebert checkpoint; the encoder is placed on `device`
tokenizer = BertTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
model = BertModel.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english").to(device)

# Training labels and the text embeddings of the training split
Y_train = df_train['label'].values
train_vectors = get_embeddings(df_train, 'text', tokenizer, model)


Some weights of the model checkpoint at Hate-speech-CNERG/dehatebert-mono-english were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Load the saved image features and stack them into a single tensor.
# NOTE: torch.load unpickles the file, so only load feature files you trust.
train_img = torch.stack(torch.load('train_img_tensors.pt'))

In [8]:
# Shape check: view the image features as (-1, 2048, 7, 7), then as 8500 flat rows
train_img.view(-1, 2048, 7, 7).view(8500, -1).shape

torch.Size([8500, 100352])

In [9]:
# Shape check: the text embeddings viewed as 8500 flat rows
train_vectors.view(8500, -1).shape

torch.Size([8500, 49152])

In [10]:
# Flatten each image's (2048, 7, 7) feature map into one row of 2048*7*7 values.
# reshape with an inferred row count (instead of a hard-coded 8500) keeps this
# cell re-runnable after `train_img` has already been flattened and truncated.
train_img = train_img.reshape(-1, 2048 * 7 * 7)

# Flatten each text's token embeddings into one row per example
train_vectors = train_vectors.flatten(start_dim=1)

# Both modalities must describe the same examples, row for row
assert train_img.shape[0] == train_vectors.shape[0], "image/text row counts differ"

# Keep the first 6000 examples and hold them in host memory: the features are
# only consumed on the CPU afterwards, so they need not occupy GPU memory.
train_img = train_img[:6000].cpu()
train_vectors = train_vectors[:6000].cpu()

# Concatenate text and image features along the feature axis (dim=1)
fusion_representation = torch.cat([train_vectors, train_img], dim=1)

# # Pad the fusion_representation to have a fixed length along the first dimension
# # This is necessary if you want to batch the representations later
# fusion_representation = pad_sequence([fusion_representation], batch_first=True)

In [11]:
# Shape check of the fused feature matrix
fusion_representation.shape

torch.Size([6000, 149504])

In [12]:

# Linear SVM on the fused features; parameter C was selected based on grid search
clf_svc = SVC(kernel='linear', C=10)

# Take as many labels as there are fused rows, so features and targets stay
# aligned without repeating the subset size as a magic number.
n_train = fusion_representation.shape[0]
clf_svc.fit(fusion_representation.detach().cpu().numpy(), Y_train[:n_train])

In [13]:
# Text embeddings and labels of the dev split. The embeddings are moved to host
# memory as soon as they are returned, since they are only consumed on the CPU.
dev_vectors = get_embeddings(df_dev, 'text', tokenizer, model).cpu()
Y_dev = df_dev.label.values

# NOTE: torch.load unpickles the file, so only load feature files you trust.
# map_location='cpu' deserializes the image features directly into host memory.
dev_img = torch.stack(torch.load('dev_img_tensors.pt', map_location='cpu'))

# Flatten each image's (2048, 7, 7) feature map into one row of 2048*7*7 values
dev_img = dev_img.reshape(-1, 2048 * 7 * 7)

# Flatten each text's token embeddings into one row per example
dev_vectors = dev_vectors.flatten(start_dim=1)

# Features of both modalities and the labels must line up row for row
assert dev_img.shape[0] == dev_vectors.shape[0] == len(Y_dev), "dev row counts differ"

# Concatenate text and image features along the feature axis (dim=1)
fusion_representation_dev = torch.cat([dev_vectors, dev_img], dim=1)



In [14]:
# Predict the dev split and summarise macro-averaged precision / recall / F1
Y_pred = clf_svc.predict(fusion_representation_dev.cpu())
metric_names = ['precision', 'recall', 'F1']
scores = precision_recall_fscore_support(Y_dev, Y_pred, average='macro')
# zip stops after the three named metrics, dropping the trailing support entry
results = pd.DataFrame([dict(zip(metric_names, scores))])
results

Unnamed: 0,precision,recall,F1
0,0.519391,0.515218,0.489058


In [18]:
# Text embeddings and labels of the test split. The embeddings are moved to host
# memory as soon as they are returned, since they are only consumed on the CPU.
# NOTE(review): the CUDA out-of-memory error recorded for this cell reports the
# GPU as almost fully allocated already; tensors from earlier cells that still
# live on the GPU may have to be released before this cell can run.
test_vectors = get_embeddings(df_test, 'text', tokenizer, model).cpu()
Y_test = df_test.label.values

# NOTE: torch.load unpickles the file, so only load feature files you trust.
# map_location='cpu' deserializes the image features directly into host memory.
test_img = torch.stack(torch.load('test_img_tensors.pt', map_location='cpu'))

# Flatten each image's (2048, 7, 7) feature map into one row of 2048*7*7 values
test_img = test_img.reshape(-1, 2048 * 7 * 7)

# Flatten each text's token embeddings into one row per example
test_vectors = test_vectors.flatten(start_dim=1)

# Features of both modalities and the labels must line up row for row
assert test_img.shape[0] == test_vectors.shape[0] == len(Y_test), "test row counts differ"

# Concatenate text and image features along the feature axis (dim=1)
fusion_representation_test = torch.cat([test_vectors, test_img], dim=1)

# Predict the test split and report macro-averaged precision / recall / F1
Y_pred_test = clf_svc.predict(fusion_representation_test.cpu())
results = pd.DataFrame(
    [list(precision_recall_fscore_support(Y_test, Y_pred_test, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results



RuntimeError: CUDA out of memory. Tried to allocate 38.00 MiB (GPU 0; 12.00 GiB total capacity; 11.17 GiB already allocated; 0 bytes free; 11.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF