In [1]:
import json
import numpy as np
import pandas as pd
import torch
from utils import *

from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from sklearn.svm import LinearSVC, SVC


# Preprocessing

In [9]:
# Paths of the "unseen" dev/test splits inside the data folder.
datafolder = '../../data/hateful_memes/'
dev, test = (f'{datafolder}{split}_unseen.jsonl' for split in ('dev', 'test'))

# Both files are read in JSON Lines mode (lines=True).
df_dev, df_test = (pd.read_json(path, lines=True) for path in (dev, test))

In [None]:
# Run `preprocess_row` (not defined in this notebook; presumably provided by
# `from utils import *`) over every row of both splits.
df_dev, df_test = (frame.apply(preprocess_row, axis=1) for frame in (df_dev, df_test))

In [None]:
# Load the NRC emotion lexicon into a dictionary that maps every emotion word
# to the list of emotions it is associated with.
lexicon = '../../data/hateful_memes/nrc-lexicon-en.txt' # path to the NRC emotion lexicon
emotions = {}
# The context manager guarantees the file handle is closed after reading.
with open(lexicon) as lexicon_file:
    for line in lexicon_file:
        fields = line.rstrip('\n').split('\t')
        # Skip blank or malformed lines (e.g. an empty trailing line) instead
        # of failing with IndexError when the tab-separated fields are missing.
        if len(fields) < 3:
            continue
        emotion_word, emotion, association = fields[:3]
        # Only rows whose third field is "1" are kept as associations.
        if association == "1":
            emotions.setdefault(emotion_word, []).append(emotion)

# POS tags that correspond to function words. Tokens carrying one of these tags
# keep their lemma in the pos_fw_emo representation built by get_feats_en.
fw_list = [
    'ADP', 'AUX', 'CCONJ', 'DET',
    'NUM', 'PART', 'PRON', 'SCONJ',
]

# T5 checkpoint used to generate an intent phrase for a piece of text.
INTENT_CHECKPOINT = "mrm8488/t5-base-finetuned-e2m-intent"

# A dedicated tokenizer name keeps get_intent() correct even after later cells
# rebind the generic `tokenizer` global to a different checkpoint.
intent_tokenizer = AutoTokenizer.from_pretrained(INTENT_CHECKPOINT)
tokenizer = intent_tokenizer  # alias kept so `tokenizer` is still defined by this cell
# T5 is a sequence-to-sequence model, so AutoModelForSeq2SeqLM is the matching
# auto class (AutoModelWithLMHead is deprecated in transformers).
intent_model = AutoModelForSeq2SeqLM.from_pretrained(INTENT_CHECKPOINT)

def get_intent(event):
    """Generate the intent text for one event description.

    Parameters
    ----------
    event : str
        Text to feed to the intent model.

    Returns
    -------
    str
        The decoded generation with special tokens removed and surrounding
        whitespace stripped.
    """
    input_text = "%s </s>" % event
    features = intent_tokenizer([input_text], return_tensors='pt')

    output = intent_model.generate(input_ids=features['input_ids'],
               attention_mask=features['attention_mask'])

    # Let the tokenizer drop <pad> / </s> instead of slicing fixed character
    # offsets, which would cut into real text whenever the generated sequence
    # does not end with a closing </s>.
    return intent_tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Sentiment classification pipeline queried by senti().
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)
def senti(text):
    """Return a positivity score for `text`.

    The first result of the `sentiment_analysis` pipeline is used: its score is
    returned unchanged when the label is 'POSITIVE'; for any other label the
    complement (1 - score) is returned.
    """
    first = sentiment_analysis(text)[0]
    confidence = first['score']
    return confidence if first['label'] == 'POSITIVE' else 1 - confidence
    
def get_feats_en(upos, lemmas, text):
    """Build the emotion / function-word feature set for one text.

    Parameters
    ----------
    upos : str
        Whitespace-separated POS tags, aligned position by position with `lemmas`.
    lemmas : str
        Whitespace-separated lemmas.
    text : str
        Text passed to `senti` and `get_intent`.

    Returns
    -------
    pd.Series
        Five values, in this order:
        the pos_fw_emo representation (emotion words and function words are
        kept as lemmas, every other token is replaced by its POS tag),
        the number of emotion words, the space-joined emotion associations of
        those words, the sentiment score, and the intent.
    """
    sentiment_score = senti(text)
    intent = get_intent(text)

    # Split the tag string once instead of re-splitting it for every token.
    tags = upos.split()

    pos_fw_emo = []
    count = 0
    emotion_associations = []
    for i, lemma in enumerate(lemmas.split()):
        key = lemma.lower()  # the lexicon lookup is case-insensitive
        if key in emotions:
            # Emotion words are kept in their original (non-lowercased) lemma form.
            pos_fw_emo.append(lemma)
            count += 1
            emotion_associations.extend(emotions[key])
        else:
            tag = tags[i]
            # Function words keep their lemma; all other tokens become their tag.
            pos_fw_emo.append(lemma if tag in fw_list else tag)
    return pd.Series([' '.join(pos_fw_emo), count, ' '.join(emotion_associations), sentiment_score, intent])

# Add the five get_feats_en outputs as new columns on both splits.
feature_columns = ['pos_fw_emo', 'count', 'emotion_associations', 'sentiment_score', 'intent']
for frame in (df_dev, df_test):
    frame[feature_columns] = frame.apply(
        lambda row: get_feats_en(row['upos'], row['lemmas'], row['text']), axis=1
    )


In [None]:
# Write both feature-augmented splits to CSV, leaving out the index column.
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

## Importing

In [2]:
# Build the training frame by stacking the train, test and dev feature files,
# in that order, under a fresh 0..n-1 index.
datafolder = '../../data/hateful_memes/'
train1, train2, train3 = (
    datafolder + name
    for name in ('train_with_features.csv', 'test_with_features.csv', 'dev_with_features.csv')
)
df_train1, df_train2, df_train3 = (
    pd.read_csv(path, skip_blank_lines=False) for path in (train1, train2, train3)
)
df_train = pd.concat([df_train1, df_train2, df_train3], ignore_index=True, axis=0)

In [3]:
# Reload the unseen dev/test splits together with their stored feature columns.
# keep_default_na=False stops pandas from turning empty cells into NaN.
datafolder = '../../data/hateful_memes/'
dev, test = (f'{datafolder}{split}_unseen_with_features.csv' for split in ('dev', 'test'))
df_dev, df_test = (pd.read_csv(path, keep_default_na=False) for path in (dev, test))

# BERTs

## HateBert word embeddings

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and encoder both come from the same hate-speech checkpoint.
hatebert_checkpoint = "Hate-speech-CNERG/dehatebert-mono-english"
tokenizer = AutoTokenizer.from_pretrained(hatebert_checkpoint)
model = AutoModel.from_pretrained(hatebert_checkpoint)
model = model.to(device)

# Switch the model to evaluation mode before extracting embeddings.
model.eval()

# Embed the `text` column of each split with the `get_vectors` helper.
train_vectors, dev_vectors, test_vectors = (
    get_vectors(frame.text.to_list(), tokenizer, model)
    for frame in (df_train, df_dev, df_test)
)
# Labels of each split as NumPy arrays.
Y_train, Y_dev, Y_test = (frame.label.values for frame in (df_train, df_dev, df_test))

Some weights of the model checkpoint at Hate-speech-CNERG/dehatebert-mono-english were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Linear-kernel SVM on the HateBert embeddings; parameter C was selected based on grid search.
clf_svc = SVC(C=10, kernel='linear', max_iter=1000000)
clf_svc.fit(train_vectors, Y_train)

# Store the predictions for both unseen splits ...
for frame, vectors in ((df_dev, dev_vectors), (df_test, test_vectors)):
    frame['hatebert_vectors'] = clf_svc.predict(vectors)

# ... and write the updated frames back to disk.
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(datafolder + split + '_unseen_with_features.csv', index=False)



## HateBert Direct

In [6]:
# Use the hate-speech checkpoint directly as a text classifier.
model = pipeline("text-classification", model="Hate-speech-CNERG/dehatebert-mono-english")

def _hatebert_label(text):
    """Return 0 when the pipeline's first label is 'NON_HATE', otherwise 1."""
    return int(model(text)[0]['label'] != 'NON_HATE')

# Classify every text of both unseen splits.
df_dev['hatebert_direct'] = df_dev.text.apply(_hatebert_label)
df_test['hatebert_direct'] = df_test.text.apply(_hatebert_label)

# Save the frames with the new prediction column.
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

## FineTune Bert

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification

In [5]:
# Pick the compute device: CUDA when available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Start from the pre-trained 'bert-base-cased' checkpoint with a two-label
# classification head and move the model to the chosen device.
bert_checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2).to(device)

# Fine-tune on the training frame; the helper returns the tokenizer and model to use afterwards.
tokenizer, model = fine_tune(df_train, tokenizer, model)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Finished epoch 1 with average training loss of 0.6018809530491265.
Finished epoch 2 with average training loss of 0.5053156623349022.
Finished epoch 3 with average training loss of 0.4280216864789256.
Finished epoch 4 with average training loss of 0.3582127199481471.
Finished epoch 5 with average training loss of 0.312639360753492.
Finished epoch 6 with average training loss of 0.2812319396022029.
Finished epoch 7 with average training loss of 0.2606272713873333.
Finished epoch 8 with average training loss of 0.24544338860546058.
Finished epoch 9 with average training loss of 0.23105390569843803.
Finished epoch 10 with average training loss of 0.21971434107222876.


In [6]:
# Label both unseen splits with the fine-tuned BERT model.
for frame in (df_dev, df_test):
    frame['bert_base_cased_finetuned'] = predict_from_fine_tuned(frame, tokenizer, model)

# Save the frames with the new prediction column.
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

# SVMs

## Baseline: BoW

In [4]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC


In [5]:
# Bag-of-words baseline: unigram counts over the whitespace-split `tokens` column.
vectorizer = CountVectorizer(analyzer='word', tokenizer=str.split, ngram_range=(1, 1))

# The vocabulary is fitted on the training split only and reused for dev/test.
X_train = vectorizer.fit_transform(df_train.tokens)
X_dev, X_test = (vectorizer.transform(frame.tokens) for frame in (df_dev, df_test))

Y_train, Y_dev, Y_test = (frame.label.values for frame in (df_train, df_dev, df_test))

clf_svc = LinearSVC(C=10, max_iter=1000000, random_state=456)  # TODO: Hyperparms tuning
clf_svc.fit(X_train, Y_train)
Y_pred, Y_pred2 = clf_svc.predict(X_dev), clf_svc.predict(X_test)

# Store the predictions and save both frames.
df_dev['svm_BoW_baseline_linear_C10'] = Y_pred
df_test['svm_BoW_baseline_linear_C10'] = Y_pred2
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

## Character n-grams

In [6]:
# Character n-gram baseline: counts of character 1- to 3-grams over the `tokens` column.
# No `tokenizer` is passed: scikit-learn only applies a custom tokenizer when
# analyzer == 'word', so it has no effect on a 'char' analyzer.
vectorizer2 = CountVectorizer(analyzer='char', ngram_range=(1, 3))

# The n-gram vocabulary is fitted on the training split only and reused for dev/test.
X_train = vectorizer2.fit_transform(df_train.tokens)
X_dev = vectorizer2.transform(df_dev.tokens)
X_test = vectorizer2.transform(df_test.tokens)

Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

clf_svc = LinearSVC(max_iter=1000000, C=10, random_state=456)
clf_svc.fit(X_train, Y_train)
Y_pred = clf_svc.predict(X_dev)
Y_pred2 = clf_svc.predict(X_test)

# Store the predictions and save both frames.
df_dev['baseline_svm_char_kernelC10'] = Y_pred
df_test['baseline_svm_char_kernelC10'] = Y_pred2
df_dev.to_csv(datafolder+'dev_unseen_with_features.csv', index=False)
df_test.to_csv(datafolder+'test_unseen_with_features.csv', index=False)



## Advanced SVM 

In [9]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC


In [14]:
# One count vectorizer per textual feature; each splits its input on whitespace.
vectorizer1 = CountVectorizer(analyzer='word', tokenizer=str.split, ngram_range=(1, 3))  # 1- to 3-grams of the pos_fw_emo representation
vectorizer2 = CountVectorizer(analyzer='word', tokenizer=str.split, ngram_range=(1, 1))  # unigrams of emotion associations
vectorizer3 = CountVectorizer(analyzer='word', tokenizer=str.split, ngram_range=(1, 1))  # unigrams of tokens (BoW)
# 4-grams of the intent text (ngram_range=(4, 4)).
# NOTE(review): confirm that 4-grams rather than unigrams are intended here.
vectorizer4 = CountVectorizer(analyzer='word', tokenizer=str.split, ngram_range=(4, 4))

# Missing values in the training frame become empty strings before vectorizing.
df_train = df_train.fillna('')

# (vectorizer, column) pairs in the order in which they are fitted/applied.
text_features = (
    (vectorizer1, 'pos_fw_emo'),
    (vectorizer2, 'emotion_associations'),
    (vectorizer3, 'tokens'),
    (vectorizer4, 'intent'),
)

def _combine_features(frame, fit=False):
    """Stack all feature blocks of `frame` into one CSR matrix.

    Column order: pos_fw_emo n-grams, emotion-association unigrams, `count`,
    `sentiment_score`, token unigrams, intent n-grams. With fit=True the
    vectorizers are fitted on `frame`; otherwise they are only applied.
    """
    encoded = [
        (vec.fit_transform if fit else vec.transform)(frame[column])
        for vec, column in text_features
    ]
    pos_fw_emo_counts, emotion_counts, token_counts, intent_counts = encoded
    blocks = (
        pos_fw_emo_counts,
        emotion_counts,
        frame[['count']].values,
        frame[['sentiment_score']].values,
        token_counts,
        intent_counts,
    )
    return hstack(blocks, format='csr')

# combine the features
X_train = _combine_features(df_train, fit=True)
X_dev = _combine_features(df_dev)
X_test = _combine_features(df_test)

Y_train, Y_dev, Y_test = (frame.label.values for frame in (df_train, df_dev, df_test))

clf_svc = LinearSVC(C=10, max_iter=1000000, random_state=456)
clf_svc.fit(X_train, Y_train)
Y_pred, Y_pred2 = clf_svc.predict(X_dev), clf_svc.predict(X_test)

# Store the predictions and save both frames.
df_dev['Advanced_svm_linear_100K_C10'] = Y_pred
df_test['Advanced_svm_linear_100K_C10'] = Y_pred2
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

# ResNet50

### create image tensors

In [4]:
from datasets import load_dataset, Image

In [5]:
# Paths of the unseen dev/test feature files.
datafolder = '../../data/hateful_memes/'
dev, test = (f'{datafolder}{split}_unseen_with_features.csv' for split in ('dev', 'test'))

def _load_csv_dataset(csv_path):
    """Load one CSV with `load_dataset` and cast its `img` column to Image(decode=False)."""
    loaded = load_dataset('csv', data_files=csv_path, num_proc=8)
    return loaded.cast_column("img", Image(decode=False))

dev_data = _load_csv_dataset(dev)
test_data = _load_csv_dataset(test)

# 'train' is the key passed to the helper; the loaded CSV data is also read via
# ['train'] elsewhere in this notebook.
dev_img = get_image_vectors(dev_data, 'train', datafolder)
test_img = get_image_vectors(test_data, 'train', datafolder)

# Store the image vectors in the current working directory for later reuse.
for vectors, filename in ((dev_img, 'dev_unseen_img_tensors.pt'), (test_img, 'test_unseen_img_tensors.pt')):
    torch.save(vectors, filename)

Found cached dataset csv (C:/Users/Hisha/.cache/huggingface/datasets/csv/default-98dec141641a1da8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (C:/Users/Hisha/.cache/huggingface/datasets/csv/default-0c869af6f3d44c59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at microsoft/resnet-50 were not used when initializing ResNetModel: ['classifier.1.weight', 'classifier.1.bias']
- This IS expected if you are initializing ResNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ResNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at microsoft

In [6]:
# Image tensors of the train / dev / test splits already exist on disk: load
# each file and stack its contents into one tensor per split.
# NOTE: torch.load relies on pickle, so only load tensor files from a trusted source.
train_img1, train_img2, train_img3 = (
    torch.stack(torch.load(f'{split}_img_tensors.pt')) for split in ('train', 'dev', 'test')
)
# The three tensors are combined into one big training set in the next cell.

In [14]:
# Concatenate in the order train, test, dev so the image rows line up with
# df_train, which is built as train + test + dev and supplies Y_train.
# train_img2 holds the dev tensors and train_img3 the test tensors, hence the
# swapped order below.
# NOTE(review): assumes '<split>_img_tensors.pt' follows the row order of
# '<split>_with_features.csv' — confirm.
train_img = torch.cat((train_img1, train_img3, train_img2), dim=0)

In [15]:
# Training labels come from df_train; dev/test labels are read from the
# 'train' entry of the loaded CSV datasets and converted to NumPy arrays.
Y_train = df_train.label
Y_dev, Y_test = (np.asarray(dataset['train']['label']) for dataset in (dev_data, test_data))

In [16]:
def _flatten_to_numpy(tensor):
    """Move a tensor to the CPU and return it as a flat 1-D NumPy array."""
    return np.array(tensor.cpu()).flatten()

# One flat feature vector per image, for each split.
train_X, dev_X, test_X = (
    [_flatten_to_numpy(tensor) for tensor in tensors]
    for tensors in (train_img, dev_img, test_img)
)

### train model

In [17]:
from sklearn.svm import SVC
# RBF-kernel SVM trained on the flattened image vectors.
clf_svc = SVC(random_state=456, kernel='rbf').fit(train_X, Y_train)
Y_pred, Y_pred2 = (clf_svc.predict(features) for features in (dev_X, test_X))

# Store the predictions and save both frames.
for frame, predictions in ((df_dev, Y_pred), (df_test, Y_pred2)):
    frame['ResNet_svm_rbf_kernel'] = predictions

for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

In [18]:
# Linear-kernel SVM (C=10) trained on the same flattened image vectors.
clf_svc = SVC(random_state=456, C=10, kernel='linear').fit(train_X, Y_train)
Y_pred, Y_pred2 = (clf_svc.predict(features) for features in (dev_X, test_X))

# Store the predictions and save both frames.
for frame, predictions in ((df_dev, Y_pred), (df_test, Y_pred2)):
    frame['ResNet_svm_linear_kernelC10'] = predictions

for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)

# Ensemble

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
# Training data for the ensemble: the test and dev feature files stacked, in
# that order, under a fresh 0..n-1 index.
datafolder = '../../data/hateful_memes/'
train1, train2 = (
    datafolder + name for name in ('test_with_features.csv', 'dev_with_features.csv')
)
df_train1, df_train2 = (
    pd.read_csv(path, skip_blank_lines=False) for path in (train1, train2)
)
df_train = pd.concat([df_train1, df_train2], ignore_index=True, axis=0)

In [21]:
# Reload the unseen dev/test frames, which now carry the base models' prediction columns.
# keep_default_na=False stops pandas from turning empty cells into NaN.
datafolder = '../../data/hateful_memes/'
dev, test = (f'{datafolder}{split}_unseen_with_features.csv' for split in ('dev', 'test'))
df_dev, df_test = (pd.read_csv(path, keep_default_na=False) for path in (dev, test))

In [31]:
# Prediction columns of the base models that feed the ensemble, one per line.
models = [
    'hatebert_vectors',
    'hatebert_direct',
    'bert_base_cased_finetuned',
    'svm_BoW_baseline_linear_C10',
    'baseline_svm_char_kernelC10',
    'Advanced_svm_linear_100K_C10',
    'ResNet_svm_rbf_kernel',
    'ResNet_svm_linear_kernelC10',
]

In [32]:
# Stacking ensemble: a gradient-boosting classifier trained on the base models'
# predictions (the columns listed in `models`).
# random_state is fixed to the seed used by the other classifiers in this
# notebook so that re-running the cell reproduces the same ensemble.
classifier = GradientBoostingClassifier(random_state=456)
# train the simple classifier on the stacked features
classifier.fit(X = df_train[models], y = df_train['label'])

# make predictions on the unseen dev and test sets
y_dev = classifier.predict(df_dev[models])
y_test = classifier.predict(df_test[models])

In [46]:
# Attach the ensemble predictions to both frames.
for frame, predictions in ((df_dev, y_dev), (df_test, y_test)):
    frame['GBensemble'] = predictions

# Save the frames with the new prediction column.
for frame, split in ((df_dev, 'dev'), (df_test, 'test')):
    frame.to_csv(f'{datafolder}{split}_unseen_with_features.csv', index=False)