# Imports

In [1]:
#!pip install transformers datasets pandas

In [1]:

import pandas as pd
import datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, ClassLabel, Value

  from .autonotebook import tqdm as notebook_tqdm


## Get data

In [2]:
# Load the Reddit posts; the path is relative to the notebook's working directory.
df = pd.read_csv("DATA/reddit_with_target.csv")
df["created"] = pd.to_datetime(df["created"])

# Keep the raw title column and the non-missing bodies around for inspection.
titles = df["title"]
bodies = df["body"].dropna()

# One model input per post: "<title> ### <body>"; missing values become "(Empty body)".
df["text"] = titles.str.cat(df["body"], sep=" ### ", na_rep="(Empty body)")

df.head()

Unnamed: 0.1,Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,text,sentiment,labels
0,0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,2021-01-28 19:37:41,,2021-01-28 21:37:41,"It's not about the money, it's about sending a...",0.0,negative
1,1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,2021-01-28 19:32:10,,2021-01-28 21:32:10,Math Professor Scott Steiner says the numbers ...,-0.7034,negative
2,2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,2021-01-28 19:30:35,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,Exit the system ### The CEO of NASDAQ pushed t...,-0.4199,negative
3,3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,2021-01-28 19:28:57,,2021-01-28 21:28:57,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,-0.4822,negative
4,4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,2021-01-28 19:26:56,,2021-01-28 21:26:56,"Not to distract from GME, just thought our AMC...",0.2235,neutral


We'll use the following example sentence:

In [3]:
# Take the combined title/body text of the row at position 15 as a running example.
sentence = df["text"].iat[15]
sentence

'420 wasn’t a meme. GME 🚀 🚀 🚀 ### (Empty body)'

# Calculate simple target

In [4]:
# Bin the sentiment score at its 30% and 70% quantiles into three named classes.
# NOTE(review): the printed class counts are not a 30/40/30 split, so many scores
# presumably tie at a cut point (e.g. 0.0 is labelled "negative") - confirm this is intended.
quantile_edges = [0, 0.3, 0.7, 1]
sentiment_classes = ["negative", "neutral", "positive"]
binned_sentiment = pd.qcut(df["sentiment"], quantile_edges, labels=sentiment_classes)

# Store the class names as plain strings rather than as a pandas categorical.
df["labels"] = binned_sentiment.astype(str)
df["labels"].value_counts()

negative    26964
positive    14940
neutral      8166
Name: labels, dtype: int64

## Clean data
No manual cleaning is needed: Transformer models come with their own tokenizer, which works directly on the raw text.

In [5]:
# Hold out a test set using scikit-learn's default split size; the fixed seed
# makes the split reproducible across runs.
split_frames = train_test_split(df, random_state=22141)
df_train, df_test = split_frames
df_train

Unnamed: 0.1,Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,text,sentiment,labels
30426,30426,This floppy dongus and the 69 price is the las...,19,ldvrax,https://i.redd.it/ew3dzgiwbuf61.jpg,6,2021-02-06 19:13:03,,2021-02-06 21:13:03,This floppy dongus and the 69 price is the las...,0.0000,negative
16710,16710,Brokerage discussion - finding a new broker th...,47,l7w5vv,https://www.reddit.com/r/wallstreetbets/commen...,21,2021-01-30 00:01:25,"Now that I am leaving Robinhood, I wanted to s...",2021-01-30 02:01:25,Brokerage discussion - finding a new broker th...,-0.7074,negative
35331,35331,"💎💪🏽🦍 GME, BB and RKT",56,lqkr1i,https://i.redd.it/3vad1gg2v8j61.jpg,9,2021-02-23 23:17:49,,2021-02-24 01:17:49,"💎💪🏽🦍 GME, BB and RKT ### (Empty body)",0.0000,negative
32167,32167,Tattle tale falsely accuses guy of calling wsb...,37,ley0yt,https://greenwald.substack.com/p/the-journalis...,10,2021-02-08 07:01:35,,2021-02-08 09:01:35,Tattle tale falsely accuses guy of calling wsb...,-0.3400,negative
32246,32246,Saving Private WoooooG. I'm still down here in...,14694,lf1xx2,https://i.redd.it/v6jdhcc7z5g61.png,776,2021-02-08 10:29:45,,2021-02-08 12:29:45,Saving Private WoooooG. I'm still down here in...,0.3506,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
34898,34898,hodl,56,lroq45,https://i.redd.it/74pmihmm3ij61.png,16,2021-02-25 06:21:50,,2021-02-25 08:21:50,hodl ### (Empty body),0.0000,negative
10891,10891,Just bought my first share of gme. All I could...,1,l71iuq,https://i.redd.it/p4b05xovo3e61.jpg,1,2021-01-29 00:33:33,,2021-01-29 02:33:33,Just bought my first share of gme. All I could...,0.8225,positive
47641,47641,I lost so much money this year...Only thing sa...,17,nmsxye,https://i.redd.it/xe919w2ukt171.jpg,29,2021-05-28 16:12:48,,2021-05-28 19:12:48,I lost so much money this year...Only thing sa...,-0.3182,negative
49765,49765,Swerve Into VERV,0,o244cg,https://www.reddit.com/r/wallstreetbets/commen...,17,2021-06-18 02:29:25,**Tl;dr:** I like Verve Therapeutics. It coul...,2021-06-18 05:29:25,Swerve Into VERV ### **Tl;dr:** I like Verve T...,0.9928,positive


## Transformers model 1

In [21]:
from transformers import AutoTokenizer

# Same checkpoint name as the classification model loaded later in the notebook.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


In [22]:
# tokenizer2 = AutoTokenizer.from_pretrained("flax-community/t5-recipe-generation")


In [23]:
# Display the tokenizer's repr (checkpoint name, vocabulary size, special tokens, ...).
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [31]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook's ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` key holding a list of strings, as
            passed by ``Dataset.map(..., batched=True)`` or built by hand.

    Returns:
        The tokenizer's encoding (``input_ids`` and ``attention_mask``),
        truncated to the model's maximum length and left unpadded.
    """
    # No padding here: inside a batched map, padding=True pads every row to the
    # longest text of its map batch, and those padded rows are what gets stored
    # and trained on. The DataCollatorWithPadding handed to the Trainer pads
    # each training batch dynamically instead, which is all the model needs.
    return tokenizer(examples["text"], truncation=True)

# Tokenize one example with emoji and show which word piece each token id stands for.
sentence = "100000 wasn’t a meme. GME 🚀:rocket::rocket: ### (Empty body)"
encoded = preprocess_function({"text": [sentence]})
print(sentence)
print(encoded)

# The batch holds a single sentence, so its ids are the first (and only) row.
token_ids = encoded["input_ids"][0]
for token_id in token_ids:
    print(token_id, tokenizer.decode([token_id]))

100000 wasn’t a meme. GME 🚀:rocket::rocket: ### (Empty body)
{'input_ids': [[101, 6694, 8889, 2347, 1521, 1056, 1037, 2033, 4168, 1012, 13938, 2063, 100, 1024, 7596, 1024, 1024, 7596, 1024, 1001, 1001, 1001, 1006, 4064, 2303, 1007, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
101 [CLS]
6694 1000
8889 ##00
2347 wasn
1521 ’
1056 t
1037 a
2033 me
4168 ##me
1012 .
13938 gm
2063 ##e
100 [UNK]
1024 :
7596 rocket
1024 :
1024 :
7596 rocket
1024 :
1001 #
1001 #
1001 #
1006 (
4064 empty
2303 body
1007 )
102 [SEP]


In [32]:
# Rare, misspelled and non-English words: see how they are split into word pieces.
sentence = "If you use obscure, arcane, abstruse, recondite or incorectly spelled words, or even antweirpse woorden die in het stadstheater zouden passen, ca marche hé alors."
encoded = preprocess_function({"text": [sentence]})
print(sentence)

# The batch holds a single sentence, so its ids are the first (and only) row.
token_ids = encoded["input_ids"][0]
for token_id in token_ids:
    print(token_id, tokenizer.decode([token_id]))

If you use obscure, arcane, abstruse, recondite or incorectly spelled words, or even antweirpse woorden die in het stadstheater zouden passen, ca marche hé alors.
101 [CLS]
2065 if
2017 you
2224 use
14485 obscure
1010 ,
8115 arc
7231 ##ane
1010 ,
14689 abs
16344 ##tr
8557 ##use
1010 ,
28667 rec
15422 ##ond
4221 ##ite
2030 or
4297 inc
5686 ##ore
6593 ##ct
2135 ##ly
11479 spelled
2616 words
1010 ,
2030 or
2130 even
14405 ant
19845 ##wei
14536 ##rp
3366 ##se
15854 woo
18246 ##rden
3280 die
1999 in
21770 het
2358 st
19303 ##ads
10760 ##the
24932 ##ater
1062 z
19224 ##oud
2368 ##en
3413 pass
2368 ##en
1010 ,
6187 ca
28791 marche
2002 he
2632 al
5668 ##ors
1012 .
102 [SEP]


In [34]:
# Mixed Dutch, French and Chinese input: see which characters the vocabulary covers.
sentence = "Zelfs antweirpse woorden die in het stadstheatre zouden passen, ca marche hé alors. 即使是晦澀難懂的中文單詞和概念也可以"
encoded = preprocess_function({"text": [sentence]})
print(sentence)

# The batch holds a single sentence, so its ids are the first (and only) row.
token_ids = encoded["input_ids"][0]
for token_id in token_ids:
    print(token_id, tokenizer.decode([token_id]))

Zelfs antweirpse woorden die in het stadstheatre zouden passen, ca marche hé alors. 即使是晦澀難懂的中文單詞和概念也可以
101 [CLS]
27838 ze
10270 ##lf
2015 ##s
14405 ant
19845 ##wei
14536 ##rp
3366 ##se
15854 woo
18246 ##rden
3280 die
1999 in
21770 het
2358 st
19303 ##ads
10760 ##the
4017 ##at
2890 ##re
1062 z
19224 ##oud
2368 ##en
3413 pass
2368 ##en
1010 ,
6187 ca
28791 marche
2002 he
2632 al
5668 ##ors
1012 .
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
1916 的
1746 中
1861 文
100 [UNK]
100 [UNK]
1796 和
100 [UNK]
100 [UNK]
1750 也
100 [UNK]
100 [UNK]
102 [SEP]


In [20]:
# Print every token id of the current `encoded` next to its decoded word piece.
# NOTE(review): this repeats the loop of the preceding cell, and the saved output
# looks like it comes from an earlier run with a slightly different sentence.
for token_id in encoded["input_ids"][0]:
    print(token_id, tokenizer.decode([token_id]))

101 [CLS]
27838 ze
10270 ##lf
2015 ##s
14405 ant
19845 ##wei
14536 ##rp
3366 ##se
15854 woo
18246 ##rden
3280 die
1999 in
21770 het
2358 st
19303 ##ads
10760 ##the
24932 ##ater
1062 z
19224 ##oud
2368 ##en
3413 pass
2368 ##en
1010 ,
6187 ca
28791 marche
2002 he
2632 al
5668 ##ors
1012 .
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
1916 的
1746 中
1861 文
100 [UNK]
100 [UNK]
1796 和
100 [UNK]
100 [UNK]
1750 也
100 [UNK]
100 [UNK]
102 [SEP]


In [49]:
# Schema for the Hugging Face datasets that are built from the pandas frames.
class_names = ["negative", "neutral", "positive"]
feature_spec = {
    "__index_level_0__": Value("string"),  # the DataFrame index column, stored as text
    "text": Value("string"),
    "labels": ClassLabel(names=class_names),  # class name <-> integer class id
}
emotion_features = Features(feature_spec)


In [50]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [51]:
# Only the model input and the target go into the dataset; the declared schema
# turns the string labels into ClassLabel values.
train_frame = df_train[["text", "labels"]]
train = Dataset.from_pandas(train_frame, features=emotion_features)
train

Dataset({
    features: ['__index_level_0__', 'text', 'labels'],
    num_rows: 37552
})

In [52]:
# Tokenize the whole training set; batched=True hands preprocess_function many rows at a time.
tokenized_train = train.map(function=preprocess_function, batched=True)
tokenized_train

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████████| 38/38 [00:04<00:00,  9.14ba/s]


Dataset({
    features: ['__index_level_0__', 'text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 37552
})

In [53]:

from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Pretrained DistilBERT body with a classification head for the three sentiment classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [54]:
# Fine-tuning hyperparameters: a single epoch with small batches.
hyperparameters = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
training_args = TrainingArguments(output_dir="./results", **hyperparameters)

# No eval_dataset is passed, so this Trainer only trains.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 37552
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2347


Step,Training Loss


KeyboardInterrupt: 

Training takes a long time on a local machine, so the run above was interrupted. I trained the model in the cloud instead; the results are in the lecture PowerPoint.

## Evaluate

In [None]:
from evaluate import evaluator
from datasets import load_dataset

# Build the held-out set the same way the training set was built. The original
# cell passed `tokenized_test` to the evaluator without ever defining it.
test = Dataset.from_pandas(df_test[["text", "labels"]], features=emotion_features)
tokenized_test = test.map(preprocess_function, batched=True)

task_evaluator = evaluator("text-classification")

results = task_evaluator.compute(
    model_or_pipeline=model,
    data=tokenized_test,
    metric="accuracy",
    # The model was loaded without custom label names, so predictions are expected
    # to come back as "LABEL_<n>"; map them to the integer class ids in "labels".
    label_mapping={
        "LABEL_0": 0,
        "LABEL_1": 1,
        "LABEL_2": 2,
    },
    # The evaluator reads the raw text and the reference labels from these columns.
    input_column="text",
    label_column="labels",
    n_resamples=1,
    random_state=0,
    tokenizer=tokenizer,
)

In [None]:
results