In [1]:
import json

import numpy as np
import pandas as pd
import spacy
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import LinearSVC
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer, pipeline

# Load spaCy's small English pipeline once; preprocess_row calls it to obtain
# the token text, lemma and POS tag of every token in a meme's text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Locate the train split and the "seen" dev/test splits of the dataset
datafolder = '../../data/hateful_memes/'
train, dev, test = (
    datafolder + filename
    for filename in ('train.jsonl', 'dev_seen.jsonl', 'test_seen.jsonl')
)
# Each file is JSON Lines (one JSON record per line); read them in train, dev, test order
df_train, df_dev, df_test = (
    pd.read_json(path, lines=True) for path in (train, dev, test)
)

In [2]:
# Switch dev/test to the "unseen" splits: this re-binds `dev`, `test`,
# `df_dev` and `df_test`, replacing whatever an earlier cell assigned to them.
datafolder = '../../data/hateful_memes/'
test = datafolder + 'test_unseen.jsonl'
dev = datafolder + 'dev_unseen.jsonl'
# Each file is JSON Lines (one JSON record per line); dev is read before test
df_dev, df_test = (pd.read_json(path, lines=True) for path in (dev, test))

In [3]:
def preprocess_row(row):
    """Annotate one row with space-joined tokens, lemmas and POS tags.

    Runs the module-level spaCy pipeline ``nlp`` on ``row['text']`` and stores
    three parallel, space-separated strings on the row:

    - ``row['tokens']``: surface form of every token
    - ``row['lemmas']``: lemma of every token
    - ``row['upos']``:   coarse POS tag (``token.pos_``) of every token

    Whitespace-only tokens are skipped. Their text and lemma are pure
    whitespace, so after joining with " " they disappear on a later
    ``.split()`` while their POS tag would remain, which would leave the three
    strings with different lengths and shift every following tag by one.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``'text'`` entry.

    Returns:
        The same ``row`` object, with the three entries above added.
    """
    doc = nlp(row['text'])
    # Keep only tokens that survive a whitespace split, so that position i in
    # tokens / lemmas / upos always refers to the same token.
    kept = [token for token in doc if not token.is_space]
    row['tokens'] = " ".join(token.text for token in kept)
    row['lemmas'] = " ".join(token.lemma_ for token in kept)
    row['upos'] = " ".join(token.pos_ for token in kept)
    return row

In [4]:
# Add the 'tokens', 'lemmas' and 'upos' columns to every row of the dev and
# test frames (the same step for the train frame is currently commented out).
# df_train = df_train.apply(preprocess_row, axis=1)
df_dev = df_dev.apply(preprocess_row, axis=1)
df_test = df_test.apply(preprocess_row, axis=1)

In [5]:
# Load the NRC emotion lexicon into a dictionary that maps each emotion word
# to the list of emotion labels it is associated with.
# Every non-blank line is read as "<word>\t<emotion>\t<association>"; only
# lines whose association field is "1" are kept.
lexicon = '../../data/hateful_memes/nrc-lexicon-en.txt' # path to the NRC emotion lexicon
emotions = {}
with open(lexicon) as lexicon_file:  # context manager closes the file handle
    for line in lexicon_file:
        line = line.rstrip('\n')
        if not line.strip():
            # A blank line (e.g. the one after a trailing newline) has no
            # tab-separated fields and would otherwise raise IndexError.
            continue
        fields = line.split('\t')  # split once instead of once per field
        emotion_word, emotion, association = fields[0], fields[1], fields[2]
        if association == "1":
            emotions.setdefault(emotion_word, []).append(emotion)

list(emotions.items())[:10] # show the first 10 entries

[('smut', ['disgust', 'fear', 'negative']),
 ('expletive', ['anger', 'negative']),
 ('greeting', ['positive', 'surprise']),
 ('measles', ['disgust', 'fear', 'negative', 'sadness']),
 ('proven', ['trust']),
 ('inept', ['anger', 'disgust', 'negative']),
 ('perverted', ['disgust', 'negative']),
 ('inconsequential', ['negative', 'sadness']),
 ('unfulfilled', ['anger', 'anticipation', 'negative', 'sadness', 'surprise']),
 ('tantalizing', ['anticipation', 'joy', 'negative', 'positive', 'surprise'])]

In [6]:
# - pos_fw_emo = representation of the text through POS tags, function words, and emotion words (from this representation n-grams (n=1-3) are built, see vectorize below)
# - count = number of emotion words in a text
# - emotion_associations = emotion associations from the NRC emotion lexicon
# - sentiment_score = positive-class score of the whole text from siebert/sentiment-roberta-large-english (Hugging Face); for a NEGATIVE prediction it is 1 - score, so a higher value always means more positive
# - intent = intent of the text, generated with mrm8488/t5-base-finetuned-e2m-intent

fw_list = ['ADP', 'AUX', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ'] # POS tags that correspond to function words

# Tokenizer and model used by get_intent to generate an intent phrase for a text.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-e2m-intent")
# AutoModelWithLMHead is deprecated in Transformers; AutoModelForSeq2SeqLM is
# the documented replacement for encoder-decoder checkpoints (this one is
# T5-based, per its name).
intent_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-e2m-intent")
def get_intent(event):
    """Generate an intent phrase for ``event`` with the module-level intent model.

    The text is suffixed with " </s>", tokenized with ``tokenizer``, passed to
    ``intent_model.generate`` and the first generated sequence is decoded.

    Args:
        event: the text to describe.

    Returns:
        The decoded generation as a string, without special tokens and
        without surrounding whitespace.
    """
    input_text = "%s </s>" % event
    features = tokenizer([input_text], return_tensors='pt')

    output = intent_model.generate(input_ids=features['input_ids'],
               attention_mask=features['attention_mask'])

    # Let the tokenizer remove special tokens instead of slicing fixed
    # character offsets ([6:-4]): the slice assumed the decoded string always
    # looks like "<pad> ...</s>" and cut off four real characters whenever
    # generation stopped without emitting "</s>".
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Hugging Face "sentiment-analysis" pipeline; senti() reads the 'label' and
# 'score' fields of its first result.
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
def senti(text):
    """Return a single sentiment score for ``text``.

    Calls the module-level ``sentiment_analysis`` pipeline and looks at its
    first result: when the label is 'POSITIVE' the reported score is returned
    unchanged, for any other label ``1 - score`` is returned.
    """
    prediction = sentiment_analysis(text)[0]
    confidence = prediction['score']
    return confidence if prediction['label'] == 'POSITIVE' else 1 - confidence
    
def get_feats_en(upos, lemmas, text):
    """Build the stylometric/emotion features for one text.

    Args:
        upos: space-separated POS tags; tag i is used for lemma i.
        lemmas: space-separated lemmas of the text.
        text: the raw text, passed to ``senti`` and ``get_intent``.

    Returns:
        A ``pd.Series`` with five values, in this order:
        0. the text re-written token by token: emotion words (lemmas whose
           lower-cased form is a key of ``emotions``) and function words
           (POS tag in ``fw_list``) keep their lemma, everything else is
           replaced by its POS tag; joined with spaces
        1. the number of emotion words
        2. the emotion labels of all emotion words, joined with spaces
        3. the sentiment score returned by ``senti(text)``
        4. the intent returned by ``get_intent(text)``
    """
    sentiment_score = senti(text)
    intent = get_intent(text)

    tags = upos.split()  # split once; the loop only indexes into the result
    pos_fw_emo = []
    count = 0
    emotion_associations = []
    for i, lemma in enumerate(lemmas.split()):
        key = lemma.lower()  # the lexicon lookup ignores case
        if key in emotions:
            pos_fw_emo.append(lemma)
            count += 1
            emotion_associations.extend(emotions[key])
        elif tags[i] in fw_list:
            pos_fw_emo.append(lemma)
        else:
            pos_fw_emo.append(tags[i])
    return pd.Series([' '.join(pos_fw_emo), count, ' '.join(emotion_associations), sentiment_score, intent])

# Compute the five get_feats_en features for every row and store them as new
# columns, first on the dev frame and then on the test frame.
# df_train[['pos_fw_emo', 'count', 'emotion_associations', 'sentiment_score', 'intent']] = df_train.apply(lambda x: get_feats_en(x['upos'], x['lemmas'], x['text']), axis=1) 
for split_df in (df_dev, df_test):
    split_df[['pos_fw_emo', 'count', 'emotion_associations', 'sentiment_score', 'intent']] = split_df.apply(
        lambda row: get_feats_en(row['upos'], row['lemmas'], row['text']),
        axis=1,
    )

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
# df_train['sentiment_score'] = df_train.apply(lambda x: senti(x['text']), axis=1)
# df_dev['sentiment_score'] = df_dev.apply(lambda x: senti(x['text']), axis=1)
# df_test['sentiment_score'] = df_test.apply(lambda x: senti(x['text']), axis=1)

In [7]:
# Write the dev and test frames, including the feature columns, to CSV files
# in the data folder (row index omitted). The train export is commented out.
# NOTE(review): 'dev_uneen_with_features.csv' looks like a typo for
# 'dev_unseen_with_features.csv' (compare the test file name below) — confirm
# that nothing downstream reads the misspelled name before renaming it.
# df_train.to_csv(datafolder+'train_with_features.csv', index=False)
df_dev.to_csv(datafolder+'dev_uneen_with_features.csv', index=False)
df_test.to_csv(datafolder+'test_unseen_with_features.csv', index=False)