In [3]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import hstack
import spacy

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, pipeline
from utils import *
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# importing the data
datafolder = '../../data/hateful_memes/'
train = datafolder+'train_with_features.csv'
test = datafolder+'test_with_features.csv'
dev = datafolder+'dev_with_features.csv'
df_train = pd.read_csv(train, skip_blank_lines=False)
df_dev = pd.read_csv(dev, skip_blank_lines=False)
df_test = pd.read_csv(test, skip_blank_lines=False)

# Bert

### using wordembeddings from bert Hate-speech-CNERG/dehatebert-mono-english

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
model = AutoModel.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english").to(device)

# Set the model to evaluation mode
model.eval()

# get word embeddings of the sentences in the the text column of text
train_vectors = utils.get_vectors(df_train.text.to_list(), tokenizer, model)
dev_vectors = utils.get_vectors(df_dev.text.to_list(), tokenizer, model)
test_vectors = utils.get_vectors(df_test.text.to_list(), tokenizer, model)


Some weights of the model checkpoint at Hate-speech-CNERG/dehatebert-mono-english were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

In [19]:
clf_svc = SVC(kernel='linear', max_iter=100000) # parameter C was selected based on grid search
clf_svc.fit(train_vectors, Y_train)
Y_pred = clf_svc.predict(dev_vectors)
results = pd.DataFrame(
    [list(precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results



Unnamed: 0,precision,recall,F1
0,0.603766,0.558024,0.505507


In [21]:
Y_pred2 = clf_svc.predict(test_vectors)
df_dev['hatebert_vectors'] = Y_pred
df_test['hatebert_vectors'] = Y_pred2

df_dev.to_csv(datafolder+'dev_with_features.csv', index=False)
df_test.to_csv(datafolder+'test_with_features.csv', index=False)

### Trial 2: using Hate-speech-CNERG/dehatebert-mono-english directly

In [1]:
model = pipeline("text-classification", model="Hate-speech-CNERG/dehatebert-mono-english")

In [12]:
# df_train['predicted'] = df_train.text.apply(lambda x: 0 if model(x)[0]['label'] == 'NON_HATE' else 1)

In [None]:
df_dev['hatebert_direct'] = df_dev.text.apply(lambda x: 0 if model(x)[0]['label'] == 'NON_HATE' else 1)
df_test['hatebert_direct'] = df_test.text.apply(lambda x: 0 if model(x)[0]['label'] == 'NON_HATE' else 1)

In [None]:
df_dev.to_csv(datafolder+'dev_with_features.csv', index=False)
df_test.to_csv(datafolder+'test_with_features.csv', index=False)

In [None]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(df_dev['label'], df_dev['hatebert_direct'], average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

### trial 3: fine tuning bert_base_cased

##### First we just test the performance of the pre-trained model without fine-tuning

In [22]:
# Set up GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model.to(device)

# # Load dataframe
# df_train = pd.read_csv('train.csv')

# Define function to classify text
def classify_text(text):
    # Tokenize input text
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt').to(device)

    # Make prediction with model
    model.eval()
    with torch.no_grad():
        output = model(input_ids)

    # Get predicted label
    predicted_label = torch.argmax(output[0], dim=1).item()

    # Return predicted label
    return predicted_label

# Apply function to df_train['text'] column
df_train['predicted_label'] = df_train['text'].apply(lambda x: classify_text(x))

# Print results
print(df_train.head())

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

      id            img  label  \
0  42953  img/42953.png      0   
1  23058  img/23058.png      0   
2  13894  img/13894.png      0   
3  37408  img/37408.png      0   
4  82403  img/82403.png      0   

                                                text  \
0   its their character not their color that matters   
1  don't be afraid to love again everyone is not ...   
2                           putting bows on your pet   
3  i love everything and everybody! except for sq...   
4  everybody loves chocolate chip cookies, even h...   

                                              tokens  \
0   its their character not their color that matters   
1  do n't be afraid to love again everyone is not...   
2                           putting bows on your pet   
3  i love everything and everybody ! except for s...   
4  everybody loves chocolate chip cookies , even ...   

                                              lemmas  \
0    its their character not their color that matter   
1  do not

In [24]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(df_train['label'], df_train['predicted_label'], average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,F1
0,0.322412,0.5,0.392032


##### And here we tune it

In [27]:
# Set up GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model.to(device)

# Load dataframe
# df_train = pd.read_csv('train.csv')

# Tokenize input texts and create input tensors
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_train['text'], df_train['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Create dataset and dataloader
dataset = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=32)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune model
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)
        model.zero_grad()
        loss, logits = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks, labels=batch_labels, return_dict=False)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Finished epoch {epoch+1} with average training loss of {avg_train_loss}.")



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Finished epoch 1 with average training loss of 0.5794730981937924.
Finished epoch 2 with average training loss of 0.47785264040742603.
Finished epoch 3 with average training loss of 0.40099139349128965.
Finished epoch 4 with average training loss of 0.3310820721650034.
Finished epoch 5 with average training loss of 0.28724317031359314.
Finished epoch 6 with average training loss of 0.24667913720179768.
Finished epoch 7 with average training loss of 0.2228365904910672.
Finished epoch 8 with average training loss of 0.20607491077056952.
Finished epoch 9 with average training loss of 0.19099854277376843.
Finished epoch 10 with average training loss of 0.177815996467563.


In [28]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Evaluate model on validation data
# df_val = pd.read_csv('val.csv')
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_dev['text'], df_dev['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
val_sampler = SequentialSampler(dataset)
val_dataloader = DataLoader(dataset, sampler=val_sampler, batch_size=32)

model.eval()
total_val_accuracy = 0
for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)
    with torch.no_grad():
        _, logits = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks, labels=batch_labels, return_dict=False)
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    # print(logits, label_ids)
    total_val_accuracy += flat_accuracy(logits, label_ids)
avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f"Validation accuracy: {avg_val_accuracy}")

Validation accuracy: 0.562890625


In [35]:
from sklearn.metrics import classification_report

# Evaluate model on validation data
# df_val = pd.read_csv('val.csv')
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_dev['text'], df_dev['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
val_sampler = SequentialSampler(dataset)
val_dataloader = DataLoader(dataset, sampler=val_sampler, batch_size=32)

# Evaluate model on validation data
model.eval()
total_val_accuracy = 0
preds = []
for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)
    with torch.no_grad():
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    preds.extend(np.argmax(logits, axis=1))
    total_val_accuracy += flat_accuracy(logits, label_ids)

avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f"Validation accuracy: {avg_val_accuracy}")

# Print precision, recall, and F1-score
print(classification_report(labels, preds))

Validation accuracy: 0.562890625
              precision    recall  f1-score   support

           0       0.54      0.84      0.66       253
           1       0.62      0.26      0.37       247

    accuracy                           0.56       500
   macro avg       0.58      0.55      0.51       500
weighted avg       0.58      0.56      0.52       500



In [36]:
from sklearn.metrics import classification_report

# Evaluate model on validation data
# df_val = pd.read_csv('val.csv')
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_test['text'], df_test['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
val_sampler = SequentialSampler(dataset)
val_dataloader = DataLoader(dataset, sampler=val_sampler, batch_size=32)

# Evaluate model on validation data
model.eval()
total_val_accuracy = 0
preds2 = []
for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)
    with torch.no_grad():
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    preds2.extend(np.argmax(logits, axis=1))
    total_val_accuracy += flat_accuracy(logits, label_ids)

avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f"Validation accuracy: {avg_val_accuracy}")

# Print precision, recall, and F1-score
print(classification_report(labels, preds2))



Validation accuracy: 0.5888671875
              precision    recall  f1-score   support

           0       0.56      0.83      0.67       510
           1       0.64      0.32      0.43       490

    accuracy                           0.58      1000
   macro avg       0.60      0.57      0.55      1000
weighted avg       0.60      0.58      0.55      1000



In [38]:
df_dev['bert_base_cased_finetuned'] = preds
df_test['bert_base_cased_finetuned'] = preds2

df_dev.to_csv(datafolder+'dev_with_features.csv', index=False)
df_test.to_csv(datafolder+'test_with_features.csv', index=False)