# Imports

In [6]:
!pip install transformers datasets pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [7]:

import pandas as pd
import datasets
from datasets import Dataset, Features, ClassLabel, Value

## Get data

In [8]:
df = pd.read_pickle("DATA/reddit_wsb_sentiment.pkl")
df['created'] = pd.to_datetime(df['created'], unit="s")
titles = df["title"]
bodies = df["body"].dropna()
df["text"] = df["title"].str.cat(df["body"], sep=" ### ", na_rep="(Empty body)")

df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,text,sentiment
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,2021-01-28 19:37:41,,2021-01-28 21:37:41,"It's not about the money, it's about sending a...",0.0
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,2021-01-28 19:32:10,,2021-01-28 21:32:10,Math Professor Scott Steiner says the numbers ...,-0.7034
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,2021-01-28 19:30:35,The CEO of NASDAQ pushed to halt trading ‚Äúto g...,2021-01-28 21:30:35,Exit the system ### The CEO of NASDAQ pushed t...,-0.4199
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,2021-01-28 19:28:57,,2021-01-28 21:28:57,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,-0.4822
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,2021-01-28 19:26:56,,2021-01-28 21:26:56,"Not to distract from GME, just thought our AMC...",0.2235


We'll use the following example sentence

In [8]:
sentence = df["text"].iloc[15]
sentence

'420 wasn‚Äôt a meme. GME üöÄ üöÄ üöÄ ### (Empty body)'

# Calculate simple target

In [10]:
df["labels"] = pd.qcut(df["sentiment"],[0,0.3,0.7,1], labels=["negative", "neutral", "positive"]).astype(str)
df["labels"].value_counts()

negative    26964
positive    14940
neutral      8166
Name: labels, dtype: int64

In [11]:
df.to_csv("DATA/reddit_with_target.csv")

## Clean data
Not needed, these are Transformers!

In [90]:
df_train, df_test = train_test_split(df, random_state=22141)
df_train

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,text,sentiment,sentiment_category,labels
30426,This floppy dongus and the 69 price is the las...,19,ldvrax,https://i.redd.it/ew3dzgiwbuf61.jpg,6,2021-02-06 19:13:03,,2021-02-06 21:13:03,This floppy dongus and the 69 price is the las...,0.0000,negative,negative
16710,Brokerage discussion - finding a new broker th...,47,l7w5vv,https://www.reddit.com/r/wallstreetbets/commen...,21,2021-01-30 00:01:25,"Now that I am leaving Robinhood, I wanted to s...",2021-01-30 02:01:25,Brokerage discussion - finding a new broker th...,-0.7074,negative,negative
35331,"üíéüí™üèΩü¶ç GME, BB and RKT",56,lqkr1i,https://i.redd.it/3vad1gg2v8j61.jpg,9,2021-02-23 23:17:49,,2021-02-24 01:17:49,"üíéüí™üèΩü¶ç GME, BB and RKT ### (Empty body)",0.0000,negative,negative
32167,Tattle tale falsely accuses guy of calling wsb...,37,ley0yt,https://greenwald.substack.com/p/the-journalis...,10,2021-02-08 07:01:35,,2021-02-08 09:01:35,Tattle tale falsely accuses guy of calling wsb...,-0.3400,negative,negative
32246,Saving Private WoooooG. I'm still down here in...,14694,lf1xx2,https://i.redd.it/v6jdhcc7z5g61.png,776,2021-02-08 10:29:45,,2021-02-08 12:29:45,Saving Private WoooooG. I'm still down here in...,0.3506,neutral,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
34898,hodl,56,lroq45,https://i.redd.it/74pmihmm3ij61.png,16,2021-02-25 06:21:50,,2021-02-25 08:21:50,hodl ### (Empty body),0.0000,negative,negative
10891,Just bought my first share of gme. All I could...,1,l71iuq,https://i.redd.it/p4b05xovo3e61.jpg,1,2021-01-29 00:33:33,,2021-01-29 02:33:33,Just bought my first share of gme. All I could...,0.8225,positive,positive
47641,I lost so much money this year...Only thing sa...,17,nmsxye,https://i.redd.it/xe919w2ukt171.jpg,29,2021-05-28 16:12:48,,2021-05-28 19:12:48,I lost so much money this year...Only thing sa...,-0.3182,negative,negative
49765,Swerve Into VERV,0,o244cg,https://www.reddit.com/r/wallstreetbets/commen...,17,2021-06-18 02:29:25,**Tl;dr:** I like Verve Therapeutics. It coul...,2021-06-18 05:29:25,Swerve Into VERV ### **Tl;dr:** I like Verve T...,0.9928,positive,positive


## Transformers model 1

In [23]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading tokenizer_config.json: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.0/28.0 [00:00<00:00, 13.7kB/s]
Downloading config.json: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 483/483 [00:00<00:00, 142kB/s]
Downloading vocab.txt: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

In [98]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

preprocess_function({"text":[sentence]})

{'input_ids': [[101, 17442, 2347, 1521, 1056, 1037, 2033, 4168, 1012, 13938, 2063, 100, 100, 100, 1001, 1001, 1001, 1006, 4064, 2303, 1007, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
encoded_dataset = dataset.map(lambda examples: tokenizer(examples['sentence1']), batched=True)
encoded_dataset.column_names
encoded_dataset[0]

In [91]:
class_names = ["negative", "neutral", "positive"]
emotion_features = Features({'__index_level_0__': Value('string'), 'text': Value('string'), 'labels': ClassLabel(names=class_names)})


In [92]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [99]:
train = Dataset.from_pandas(df_train[["text", "labels"]], features=emotion_features)
train

Dataset({
    features: ['__index_level_0__', 'text', 'labels'],
    num_rows: 37552
})

In [102]:
tokenized_train = train.map(preprocess_function, batched=True)
tokenized_train

 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 37/38 [00:05<00:00,  7.05ba/s]


Dataset({
    features: ['__index_level_0__', 'text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 37552
})

In [30]:
# %%time
# tokenized_data = preprocess_function(list(X_train))


CPU times: user 16.3 s, sys: 2.03 s, total: 18.3 s
Wall time: 5.76 s


In [103]:

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /Users/jospolfliet/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.3",
  "vocab_size": 30522


In [104]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 37552
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11735
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Step,Training Loss


KeyboardInterrupt: 