# Data Pre-processing and dataloaders

In [1]:
# Data source download: https://drive.google.com/file/d/1y7yjshepNRPhnh-Qz5MTRbnopGn7KzUm/view?usp=sharing
# Originally from: https://github.com/sarnthil/unify-emotion-datasets

In [2]:
from data.unified_emotion import unified_emotion

# Point the loader at the unified emotion corpus (a JSON-lines file, see the download link above).
unified = unified_emotion("./data/datasets/unified-dataset.jsonl")

# Prepare the data with the default settings.
unified.prep()
# Alternative: clean the text with the custom manual tokenizer while preparing.
#unified.prep(text_tokenizer=manual_tokenizer)

In [3]:
# Sizes keyed by source dataset name (presumably the number of examples — confirm in unified_emotion).
unified.lens

{'grounded_emotions': 2585,
 'crowdflower': 40000,
 'dailydialog': 102979,
 'tales-emotion': 14771,
 'tec': 21051,
 'emoint': 7102}

In [4]:
# Show one training batch of raw text for every source dataset.
# k=1 is forwarded to get_dataloader unchanged; the loop variable is named
# separately so it cannot be confused with that keyword.
for dataset_name in unified.lens:
    trainloader, testloader = unified.get_dataloader(dataset_name, k=1)
    _, text = next(trainloader)
    # Dataset name, its batch, then an empty line as separator.
    print(dataset_name, text, "", sep="\n")

grounded_emotions
['@NBCNewYork WTF?? SERIOUSLY?!?!', '@assforDLS na my niggas @BlizzardStorm27  and @Smoke_EvryDay got me beat']

crowdflower
["@Puddynface2 Don't know yet  Lemme know if you come up with something though.", "Wee laddie's been SO upset for about 2 hours. Tried soothing him in bed, nursing, etc. Nope. Up at 3:30am for real food. Blue Clues now.", '@themanwhofell compliment taken. Thanks. Key is to be yourself', "gosh it's anoher cloudy day  wish they would go away.. or rain..", '@turhangross Wow! Some person u are!', "TIRED! goodnight twitter  its mother's day  happy mother's day  lov my moomy &lt;3 yayy! God Bless.", "@Zaraa_x ah that's annoying", '#todo Cleaning the Apartment - again - who keeps making this mess? oh yeah .. me. $10 + hug for the person to help come clean']

dailydialog
["Everything seems to be getting worse . I don't know what to do with it .", 'I hope so . And I will definitely tell you if I can not .', "sounds good , and I don't have to queue up at 

In [5]:
from transformers import AutoTokenizer
# Pretrained uncased BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# With a tokenizer supplied, the batch below comes back as tensors of token ids
# rather than raw strings (compare with the text batches printed above).
trainloader, testloader = unified.get_dataloader('grounded_emotions', tokenizer=tokenizer, shuffle=True)
# Draw a single batch; it is displayed as a (labels, token ids) pair.
next(trainloader)

(tensor([1, 1, 1, 1, 0, 0, 0, 0]),
 tensor([[  101, 19387,  1030, 23647,  6873, 26677, 10288,  2319,  1024,  1030,
          10819, 18098,  6610, 26876,  1030,  8962,  2271,  1030, 11271, 22083,
          16257, 10695,  1030,  4702, 16558, 19253, 10262,  2017,  1005,  2128,
           2006,  8398,  2015,  2572, 11283,  2078,  2933,  1029,  2129,  2079,
           2017,  4682,  1004, 23713,  1025,  3637,  2012,  2305,  1029,  1029,
           1029,   102],
         [  101, 19387,  1030,  8902,  5302, 18752, 16150, 18891,  2015,  1024,
           1045,  2106,  7773,  2197,  5353,  1004, 23713,  1025,  2057,  1005,
           2128,  7079,  2676,  1003,  2164,  4171,  2006,  2026,  3394,  2510,
           3477,  1012,  1030,  2613,  5280, 19058, 24456,  2361,  2081,  1002,
           5018,  2213,  1004, 23713,  1025,  2006,  2721, 29649,   102,     0,
              0,     0],
         [  101,  2317,  2160, 26536, 17125,  2740,  2729,  1010,  7318,  2696,
           2361,  4366,  1011, 1322

Sample a dataset, using the square root of each dataset's size as its sampling weight

In [6]:
from data.utils.sampling import dataset_sampler

# Draw the name of one source dataset; 'sqrt' selects the square-root-of-size weighting.
source_name = dataset_sampler(unified, sampling_method='sqrt')
source_name

'crowdflower'

The loader raises `StopIteration` when there is not enough data left to generate a full N x K shot batch.
This is done to avoid overfitting on small datasets.

In [8]:
# Exhaust the test loader to demonstrate what happens when it runs out of data.
# The loader signals this by raising StopIteration; the exception is caught and
# reported here so that the demonstration does not abort a "Restart & Run All".
try:
    while True:
        next(testloader)
except StopIteration as err:
    print(f"StopIteration: {err}")

StopIteration: Some classes ran out of data.

# Custom Tokenizer
Here we define some rules for manually cleaning the imported data.
Given that all of this data is sourced from the internet, it is strongly recommended to define at least some cleaning rules.
The current manual tokenizer will:
- Correct the text encodings
- Align contractions with BERT tokenizers
- Handle emojis (using the emoji package) and Twitter handles
- Deal with some edge cases where spaCy's tokenizer fails

In [68]:
from transformers import AutoTokenizer, AutoModel

from data.unified_emotion import unified_emotion
from data.utils.tokenizer import manual_tokenizer

# Pretrained uncased BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [2]:
# Load the unified emotion corpus from its JSON-lines file.
unified = unified_emotion("./data/datasets/unified-dataset.jsonl")

# Default preparation: no custom text tokenizer is passed here.
unified.prep()

The raw data

In [5]:
# No tokenizer is passed, so the batch carries the raw text strings.
trainloader, testloader = unified.get_dataloader('grounded_emotions', shuffle=True, device='cpu')
# One batch: the labels and the matching list of texts.
labels, text = next(trainloader)
text

["RT @marcushjohnson: Recentering whiteness is what Dems should be doing? We're good. https://t.co/VPVPbWf0AG",
 'Got my new piece of jewelry today. I love Crystals! Hey I will toâ\x80¦ https://t.co/wBhS6eOQYf',
 "RT @chrislhayes: It's not just that they lie, it's that they lie the way an 8 year old lies.",
 "Seth Meyers Shuts Down Steve King Over His 'Overtly Racist' Tweet - The Huffington Post https://t.co/4J3EHUSyqS",
 "RT @Amy_Siskind: Wow, Trump sure packed 'em in, in Nashville tonight ð\x9f\x98\x82ð\x9f\x98\x85ð\x9f¤£ð\x9f\x98\x82ð\x9f\x98\x85ð\x9f¤£ https://t.co/ZJGqmWuKep",
 "RT @JYSexton: If you can watch this clip of Steve King and think he has any business in Congress, then I don't know what to say to you. httâ\x80¦",
 "RT @ChrisMurphyCT: I don't care what party you are - a State Dept run in secret is just bad for America and our allies. https://t.co/GMlnvZâ\x80¦",
 '@ChelseaClinton real question is... Is this public outrage or will @SpeakerRyan find his ð\x9f\x8f\x80???']

The same sample, but now manually tokenized

In [6]:
# Clean every text of the batch with the manual tokenizer.
[manual_tokenizer(sample) for sample in text]

['rt @USER : recentering whiteness is what dems should be doing ? we are good . HTTPURL',
 'got my new piece of jewelry today . i love crystals ! hey i will to ... HTTPURL',
 'rt @USER : it s not just that they lie , it s that they lie the way an 8 year old lies .',
 'seth meyers shuts down steve king over his overtly racist tweet - the huffington post HTTPURL',
 'rt @USER : wow , trump sure packed them in , in nashville tonight :face_with_tears_of_joy: :grinning_face_with_sweat: :rolling_on_the_floor_laughing: :face_with_tears_of_joy: :grinning_face_with_sweat: :rolling_on_the_floor_laughing: HTTPURL',
 'rt @USER : if you can watch this clip of steve king and think he has any business in congress , then i do not know what to say to you . htt ...',
 'rt @USER : i do not care what party you are - a state dept run in secret is just bad for america and our allies . HTTPURL ...',
 '@USER real question is ... is this public outrage or will @USER find his :basketball: ? ? ?']

It can easily be slotted into the data loading process.
It does take quite a lot longer, though...

In [8]:
# Optional variant: additionally limit sentences to those that overlap well with the BERT vocabulary.
# Not recommended for initial training.
#unified.prep(text_tokenizer=manual_tokenizer, text_tokenizer_kwargs={'bert_vocab': tokenizer.vocab.keys(), 'OOV_cutoff' :0.5, 'verbose':True})

# Re-run the preparation with the manual tokenizer as the text tokenizer.
unified.prep(text_tokenizer=manual_tokenizer)

In [135]:
    print('\nExample data')
    for dataset_name in unified.lens:
        # k=3 is forwarded to the loader; only the training loader is needed here.
        trainloader, _ = unified.get_dataloader(dataset_name, k=3, device='cpu')
        labels, text = next(trainloader)
        print(dataset_name)

        # Invert this dataset's label mapping so each batch label can be printed via its counterpart.
        label_map = {idx: name for name, idx in unified.label_map[dataset_name].items()}
        # Encode the batch and decode it again to see the text as the tokenizer represents it.
        tokenized_texts = [tokenizer.decode(ids) for ids in tokenizer(text)['input_ids']]
        for label, txt in zip(labels, tokenized_texts):
            print(label_map[label], txt)
        print()


Example data
grounded_emotions
sadness <s>oh yeah, they real cute, and baby so am i! # confident # i2i20proof # 2017living # strengthening # building # encouraging # happy # relationships</s>
sadness <s>rt @USER : and this... HTTPURL</s>
sadness <s>rt @USER : potus wanted bharara gone once he found out he could not control him. bharara was investigating the russian connections...</s>
joy <s>@USER _ are u # triggered :winking_face: HTTPURL</s>
joy <s>rt @USER : playing carly rae jepsen on an aux cord 2 an audience of hundreds is actually extremely important art</s>
joy <s>@USER " dining it differently " because do not have votes? ha! like obama did when gop blocked everything?! # liar</s>



{0: 'joy', 1: 'sadness'}

In [124]:
# NOTE(review): this only assigns the attribute; elsewhere in this notebook the same tokens are
# registered through tokenizer.add_special_tokens(...). Confirm whether this form also extends
# the vocabulary before relying on it.
tokenizer.additional_special_tokens = ["HTTPURL", "@USER"]

# Encode the current `text` batch and decode it again to inspect the round trip.
list(map(tokenizer.decode, tokenizer(text)['input_ids']))

['<s>@USER @USER HTTPURL tell us again how trump is working for the american people as a whole!</s>',
 '<s>rt @USER : why the health care industry is the mess that it is today : HTTPURL HTTPURL</s>',
 '<s>rt @USER : # breaking doc : the 4 ambassadors that came to trump s april policy speech were all from countries involved in russian oil deal...</s>',
 '<s>@USER @USER @USER # joshgroban doing it live # doit # workit</s>',
 '<s>rt @USER : if he s not joking : sean spicer loses it when nbc reporter asks when can we trust the HTTPURL / dj3vaya4km</s>',
 '<s>rt @USER : what did russia get in exchange for hacking our electorate?# trumprussia # russiagate # resist # trumpleaks HTTPURL...</s>']

In [60]:
# Baseline check: tokenize the lower-case placeholder and inspect the resulting ids.
tokenizer('@user', add_special_tokens=True)

{'input_ids': [101, 1030, 5310, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [77]:
# Register the URL and user placeholders as additional special tokens of the tokenizer.
tokenizer.add_special_tokens({'additional_special_tokens': ["HTTPURL", "@USER"]})

2

In [79]:
# Re-check the placeholder now that it has been registered as a special token.
tokenizer('@USER')

{'input_ids': [101, 30523, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

30524

In [72]:
# Load the pretrained uncased BERT model that matches the tokenizer checkpoint name.
model = AutoModel.from_pretrained('bert-base-uncased')

In [87]:
# Grow the embedding matrix to cover the newly added special tokens.
# len(tokenizer) is the full vocabulary size including added tokens, whereas
# the `.vocab` attribute is not guaranteed to include them for every tokenizer class.
model.resize_token_embeddings(len(tokenizer))

Embedding(30524, 768)

In [95]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. Every other token is kept as is, and the tokens
    are joined back together with single spaces.
    """
    def _mask(token):
        # A lone '@' is not a mention and is left untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping: a tab-separated file, one "<index>\t<label>" row per class
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# The response is fully read above, so parsing can happen after the connection is closed.
csvreader = csv.reader(html, delimiter='\t')
# Skip blank or malformed rows (e.g. the trailing empty line).
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Saving creates a local directory named like the hub id. Store the tokenizer
# next to the model so that this directory is a complete checkpoint; otherwise a
# later AutoTokenizer.from_pretrained(MODEL) resolves to the local directory and
# fails because the tokenizer files are missing.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Run one example through the classifier.
text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First output element, first (only) item of the batch, as a NumPy array.
scores = output[0][0].detach().numpy()
# Normalise the raw scores so that they sum to one.
scores = softmax(scores)

Downloading: 100%|██████████| 779/779 [00:00<00:00, 391kB/s]
Downloading: 100%|██████████| 899k/899k [00:03<00:00, 294kB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 529kB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 150kB/s]
Downloading: 100%|██████████| 499M/499M [02:43<00:00, 3.06MB/s]


In [None]:
# Softmax-normalised scores computed for the example text in the previous cell.
scores

In [110]:
# Round-trip one cleaned tweet (with a trailing emoji) through the tokenizer to see what decoding gives back.
list(map(tokenizer.decode, tokenizer('rt @USER : check it out : " as of march 10 , trump s odds of leaving office before his first term are even . " HTTPURL 😊', return_tensors='pt')['input_ids']))

['<s>rt @USER : check it out : " as of march 10, trump s odds of leaving office before his first term are even. " HTTPURL 😊</s>']

In [117]:
# Decode only the id at index 2 of the emoji's encoding.
# NOTE(review): the emoji appears to span several ids, so one id alone may not decode to readable text — confirm.
tokenizer.decode(tokenizer('😊')['input_ids'][2])

'�'

In [119]:
# Load the checkpoint through the generic AutoModel class and print the module
# structure for comparison with the sequence-classification variant.
model = AutoModel.from_pretrained(MODEL)
print(model)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=

In [120]:
# Load the same checkpoint with its sequence-classification wrapper and print the module structure.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN