In [1]:
import json
from PIL import Image
from IPython.display import display
import numpy as np
import pandas as pd
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
# from torch.nn.utils.rnn import pad_sequence
# import torch

import nltk, csv, collections
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import hstack
import spacy

nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# importing the data
datafolder = '../../data/hateful_memes/'
train = datafolder+'train.jsonl'
test = datafolder+'test_seen.jsonl'
dev = datafolder+'dev_seen.jsonl'
# Load the data from the JSON file
df_train = pd.read_json(train, lines = True)
df_dev = pd.read_json(dev, lines = True)
df_test = pd.read_json(test, lines = True)

In [15]:
def preprocess_row(row):
    text = row['text']
    doc = nlp(text)
    tokens = []
    for token in doc:
        pos = token.pos_
        lemma = token.lemma_
        tokens.append((token.text, lemma, pos))
    row['tokens'] = " ".join([t[0] for t in tokens])
    row['lemmas'] = " ".join([t[1] for t in tokens])
    row['upos'] = " ".join([t[2] for t in tokens])
    return row

In [28]:
df_train = df_train.apply(preprocess_row, axis=1)
df_dev = df_dev.apply(preprocess_row, axis=1)
df_test = df_test.apply(preprocess_row, axis=1)


# basline

In [119]:
vectorizer01 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 1)) # to build n-grams (n=1-5) from the word ==> BoW
                
X_train = vectorizer01.fit_transform(df_train.tokens)
X_dev = vectorizer01.transform(df_dev.tokens) 
X_test = vectorizer01.transform(df_test.tokens)

Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

In [120]:
clf_svc = LinearSVC(max_iter=100000) # parameter C was selected based on grid search
clf_svc.fit(X_train, Y_train)
Y_pred = clf_svc.predict(X_dev)

In [121]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
print("Text classifier results using only n1 BoW")
results

Text classifier results using only n1 BoW


Unnamed: 0,precision,recall,F1
0,0.54845,0.539414,0.518258


# advanced SVM 
#### - pos_fw_emo = representation of the text through POS tags, function words, and emotion words (from this representation n-grams (n=1-3) are built, see vectorize below)
#### - count = number of emotion words in a text
#### - emotion_associations = emotion associations from the NRC emotion lexicon

In [29]:
# load the NRC emotion lexicon into a dictionary with emotion words and corresponding associations
lexicon = '../../data/hateful_memes/nrc-lexicon-en.txt' # path to the NRC emotion lexicon on Google drive
emotions = {}
for line in open(lexicon).read().split('\n'):	
    emotion_word = line.split('\t')[0]
    emotion = line.split('\t')[1]
    association = line.split('\t')[2]
    if association == "1":
        if emotion_word in emotions:
            emotions[emotion_word].append(emotion)
        else:
            emotions[emotion_word] = [emotion] 

list(emotions.items())[:3] # print first 3 entries

[('smut', ['disgust', 'fear', 'negative']),
 ('expletive', ['anger', 'negative']),
 ('greeting', ['positive', 'surprise'])]

In [30]:
# extract features as described in the paper:
# - pos_fw_emo = representation of the text through POS tags, function words, and emotion words (from this representation n-grams (n=1-3) are built, see vectorize below)
# - count = number of emotion words in a text
# - emotion_associations = emotion associations from the NRC emotion lexicon

fw_list = ['ADP', 'AUX', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ'] # POS tags that correspond to function words

def get_feats_en(upos, lemmas):	
    pos_fw_emo = []
    count = 0
    emotion_associations = []
    for i, lemma in enumerate(lemmas.split()):		
        if lemma.lower() in emotions:
            pos_fw_emo.append(lemma)
            count += 1
            emotion_associations.append(emotions[lemma.lower()])     
        else:
            if upos.split()[i] in fw_list:
                pos_fw_emo.append(lemma)
            else:
                pos_fw_emo.append(upos.split()[i])
    emotion_associations = [emo for sublist in emotion_associations for emo in sublist]
    return pd.Series([' '.join(pos_fw_emo), count, ' '.join(emotion_associations)])

df_train[['pos_fw_emo', 'count', 'emotion_associations']] = df_train.apply(lambda x: get_feats_en(x['upos'], x['lemmas']), axis=1) 
df_dev[['pos_fw_emo', 'count', 'emotion_associations']] = df_dev.apply(lambda x: get_feats_en(x['upos'], x['lemmas']), axis=1) 
df_test[['pos_fw_emo', 'count', 'emotion_associations']] = df_test.apply(lambda x: get_feats_en(x['upos'], x['lemmas']), axis=1) 

In [116]:
vectorizer1 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 3)) # to build n-grams (n=1-3) from the pos_fw_emo representation
vectorizer2 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 1)) # unigrams of emotion associations
vectorizer3 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 1)) 


# combine the features
X_train = hstack((vectorizer1.fit_transform(df_train.pos_fw_emo), vectorizer2.fit_transform(df_train.emotion_associations), df_train[['count']].values, vectorizer03.fit_transform(df_train.tokens)), format='csr') 
X_dev = hstack((vectorizer1.transform(df_dev.pos_fw_emo), vectorizer2.transform(df_dev.emotion_associations), df_dev[['count']].values, vectorizer03.transform(df_dev.tokens)), format='csr') 
X_test = hstack((vectorizer1.transform(df_test.pos_fw_emo), vectorizer2.transform(df_test.emotion_associations), df_test[['count']].values, vectorizer03.transform(df_test.tokens)), format='csr') 

Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

In [117]:
clf_svc = LinearSVC(max_iter=100000) # parameter C was selected based on grid search
clf_svc.fit(X_train, Y_train)
Y_pred = clf_svc.predict(X_dev)

In [118]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

Unnamed: 0,precision,recall,F1
0,0.543518,0.528476,0.485714


# Bert

### using wordembeddings from bert Hate-speech-CNERG/dehatebert-mono-english

In [123]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.nn.utils.rnn import pad_sequence
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
model = AutoModel.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english").to(device)

# Set the model to evaluation mode
model.eval()

# Define a function to get the vector representation of a text
def get_text_vector(text, tokenizer = tokenizer, model = model):
    # Tokenize the input text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Move the inputs to the device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get the output of the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the last hidden state of the BERT model
    last_hidden_state = outputs.last_hidden_state

    # Get the mean of the last hidden state across all tokens
    mean_last_hidden_state = torch.mean(last_hidden_state, dim=1)

    # Move the mean_last_hidden_state to the cpu and convert to a numpy array
    return mean_last_hidden_state.cpu().numpy()


# Get the vectors for each string in the list
def get_vectors(dataframe):
    vector_list = []
    for text in dataframe.text.to_list():
        text_vector = get_text_vector(text)
        vector_list.append(text_vector)
    return vector_list


# Convert the vector_list to a numpy array and print the shape
# vectors = np.array(vector_list)
# print(vectors.shape)

Some weights of the model checkpoint at Hate-speech-CNERG/dehatebert-mono-english were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [112]:
train_vectors = get_vectors(df_train)
dev_vectors = get_vectors(df_dev)
test_vectors = get_vectors(df_test)


In [113]:
Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

In [115]:
clf_svc = LinearSVC(max_iter=100000) # parameter C was selected based on grid search
clf_svc.fit(X_train, Y_train)
Y_pred = clf_svc.predict(X_dev)
results = pd.DataFrame(
    [list(precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

Unnamed: 0,precision,recall,F1
0,0.543518,0.528476,0.485714


### Trial 2: using Hate-speech-CNERG/dehatebert-mono-english directly

In [129]:
from transformers import pipeline
model = pipeline("text-classification", model="Hate-speech-CNERG/dehatebert-mono-english")

In [135]:
model(df_train.text[0])[0]['label']

'NON_HATE'

In [136]:
df_train['predicted'] = df_train.text.apply(lambda x: 0 if model(x)[0]['label'] == 'NON_HATE' else 1)

In [137]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(df_train['label'], df_train['predicted'], average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

Unnamed: 0,precision,recall,F1
0,0.655363,0.580117,0.566793


### trial 3: fine tuning bert_base_cased

##### First we just test the performance of the pre-trained model without fine-tuning

In [9]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Set up GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model.to(device)

# # Load dataframe
# df_train = pd.read_csv('train.csv')

# Define function to classify text
def classify_text(text):
    # Tokenize input text
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt').to(device)

    # Make prediction with model
    model.eval()
    with torch.no_grad():
        output = model(input_ids)

    # Get predicted label
    predicted_label = torch.argmax(output[0], dim=1).item()

    # Return predicted label
    return predicted_label

# Apply function to df_train['text'] column
df_train['predicted_label'] = df_train['text'].apply(lambda x: classify_text(x))

# Print results
print(df_train.head())

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

      id            img  label  \
0  42953  img/42953.png      0   
1  23058  img/23058.png      0   
2  13894  img/13894.png      0   
3  37408  img/37408.png      0   
4  82403  img/82403.png      0   

                                                text  predicted_label  
0   its their character not their color that matters                0  
1  don't be afraid to love again everyone is not ...                0  
2                           putting bows on your pet                0  
3  i love everything and everybody! except for sq...                0  
4  everybody loves chocolate chip cookies, even h...                0  


In [11]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(df_train['label'], df_train['predicted_label'], average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,F1
0,0.322412,0.5,0.392032


##### And here we tune it

In [22]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Set up GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model.to(device)

# Load dataframe
# df_train = pd.read_csv('train.csv')

# Tokenize input texts and create input tensors
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_train['text'], df_train['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Create dataset and dataloader
dataset = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=32)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune model
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)
        model.zero_grad()
        loss, logits = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks, labels=batch_labels, return_dict=False)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Finished epoch {epoch+1} with average training loss of {avg_train_loss}.")



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Finished epoch 1 with average training loss of 0.5795393059576365.
Finished epoch 2 with average training loss of 0.46517806304128545.
Finished epoch 3 with average training loss of 0.3902210938583191.
Finished epoch 4 with average training loss of 0.3300486235018063.


FileNotFoundError: [Errno 2] No such file or directory: 'val.csv'

In [29]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Evaluate model on validation data
# df_val = pd.read_csv('val.csv')
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_dev['text'], df_dev['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
val_sampler = SequentialSampler(dataset)
val_dataloader = DataLoader(dataset, sampler=val_sampler, batch_size=32)

model.eval()
total_val_accuracy = 0
for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)
    with torch.no_grad():
        _, logits = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks, labels=batch_labels, return_dict=False)
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    # print(logits, label_ids)
    total_val_accuracy += flat_accuracy(logits, label_ids)
avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f"Validation accuracy: {avg_val_accuracy}")

Validation accuracy: 0.545703125


In [30]:
from sklearn.metrics import classification_report

# Evaluate model on validation data
# df_val = pd.read_csv('val.csv')
input_ids = []
attention_masks = []
labels = []
for text, label in zip(df_dev['text'], df_dev['label']):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(int(label))

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
val_sampler = SequentialSampler(dataset)
val_dataloader = DataLoader(dataset, sampler=val_sampler, batch_size=32)

# Evaluate model on validation data
model.eval()
total_val_accuracy = 0
preds = []
for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)
    with torch.no_grad():
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    preds.extend(np.argmax(logits, axis=1))
    total_val_accuracy += flat_accuracy(logits, label_ids)

avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f"Validation accuracy: {avg_val_accuracy}")

# Print precision, recall, and F1-score
print(classification_report(labels, preds))

Validation accuracy: 0.545703125
              precision    recall  f1-score   support

           0       0.53      0.85      0.65       253
           1       0.59      0.23      0.33       247

    accuracy                           0.54       500
   macro avg       0.56      0.54      0.49       500
weighted avg       0.56      0.54      0.49       500

