In [1]:
import pandas as pd

In [2]:
# Read the prepared CSV into a DataFrame.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index — confirm whether it should be dropped or used as index.
csv_path = 'data/final_data.csv'
dataset = pd.read_csv(csv_path)

# A bare last expression renders the (truncated) frame as the cell output.
dataset

Unnamed: 0,text,labels,id,author,subreddit,created_utc,cluster,topic
0,My favourite food is anything I didn't have to...,27,eebbqej,Rockzilla78,KitchenConfidential,2019-01-18,1,other
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,cocaineluna,morbidquestions,2019-01-01,0,humor
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,minnesotagoat_,timberwolves,2019-01-26,2,sport
3,To make her feel threatened,14,ed7ypvh,AlexNic1013,askwomenadvice,2019-01-04,4,love and relationship
4,Dirty Southern Wankers,3,ed0bdzj,having_a_nap,london,2019-01-01,1,other
...,...,...,...,...,...,...,...,...
54258,Thanks. I was diagnosed with BP 1 after the ho...,15,efeeasc,scaredyk4t,bipolar,2019-01-30,4,love and relationship
54259,Well that makes sense.,4,ef9c7s3,LX_Emergency,ABoringDystopia,2019-01-29,3,politics
54260,Daddy issues [NAME],27,efbiugo,your_ex_girlfriend-,SquaredCircle,2019-01-30,5,film and TV series
54261,So glad I discovered that subreddit a couple m...,0,efbvgp9,Paladin-Arda,AskALiberal,2019-01-30,3,politics


In [36]:
import torch

In [37]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)

Device: cuda


### Train-Validation-Test split

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Target is the `labels` column; features are every other column.
y = dataset['labels']
X = dataset.drop(columns=['labels'])

# Step 1: hold out 20% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: take 10% of the remaining rows as the validation set
# (the same random_state keeps both splits reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1, random_state=42
)

In [None]:
# number of rows in the training set
X_train.shape[0]

39069

In [34]:
# number of rows in the validation set
X_val.shape[0]

4341

In [35]:
# number of rows in the test set
X_test.shape[0]

10853

### Tokenization

In [15]:
from transformers import AutoTokenizer

In [17]:
# Hub checkpoint used for both the tokenizer and the model.
model_name = "monologg/bert-base-cased-goemotions-original"

tokenizer = AutoTokenizer.from_pretrained(model_name)


def _encode_texts(frame):
    """Tokenize the ``text`` column of one data split.

    Args:
        frame: DataFrame that holds the raw comments in a ``text`` column.

    Returns:
        The tokenizer output as PyTorch tensors, with truncation enabled and
        every sequence padded to the longest one found in ``frame``.
    """
    # NOTE(review): no explicit max_length is passed, so truncation relies on
    # the tokenizer's own configured limit — confirm that limit is set.
    return tokenizer(
        list(frame['text']),
        truncation=True,
        padding=True,
        return_tensors='pt',
    )


# Each split is encoded separately with identical settings, so the padded
# width may differ from one split to another.
train_encodings = _encode_texts(X_train)
val_encodings = _encode_texts(X_val)
test_encodings = _encode_texts(X_test)

In [18]:
# Sanity-check the tokenizer output on one training example.
idx = 0
separator = "-------------------------------------------"
sample_ids = train_encodings['input_ids'][idx]
sample_mask = train_encodings['attention_mask'][idx]

# raw text of the selected row
print("Original:", X_train['text'].iloc[idx])
print(separator)

# first 10 token ids together with their attention-mask entries
print("Token IDs:", sample_ids[:10])
print("Attention Mask:", sample_mask[:10])
print(separator)

# map the ids back to their token strings
tokens = tokenizer.convert_ids_to_tokens(sample_ids)
print("Tokenizzati:", tokens[:10])

Original: Chicken. Mailman.
-------------------------------------------
Token IDs: tensor([  101, 18770,   119, 11508,  1399,   119,   102,     0,     0,     0])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
-------------------------------------------
Tokenizzati: ['[CLS]', 'Chicken', '.', 'Mail', '##man', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]']


### Model

In [19]:
from transformers import AutoModelForSequenceClassification

In [20]:
# Instantiate the pretrained checkpoint referenced by `model_name`.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [28]:
import json

# Dump the model configuration as indented JSON so it is easy to read.
config_dict = model.config.to_dict()
print(json.dumps(config_dict, indent=4))

{
    "return_dict": true,
    "output_hidden_states": false,
    "output_attentions": false,
    "torchscript": false,
    "torch_dtype": "float32",
    "use_bfloat16": false,
    "tf_legacy_loss": false,
    "pruned_heads": {},
    "tie_word_embeddings": true,
    "chunk_size_feed_forward": 0,
    "is_encoder_decoder": false,
    "is_decoder": false,
    "cross_attention_hidden_size": null,
    "add_cross_attention": false,
    "tie_encoder_decoder": false,
    "max_length": 20,
    "min_length": 0,
    "do_sample": false,
    "early_stopping": false,
    "num_beams": 1,
    "num_beam_groups": 1,
    "diversity_penalty": 0.0,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 1.0,
    "typical_p": 1.0,
    "repetition_penalty": 1.0,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 0,
    "encoder_no_repeat_ngram_size": 0,
    "bad_words_ids": null,
    "num_return_sequences": 1,
    "output_scores": false,
    "return_dict_in_generate": false,
    "forced_bos_token_id": null

In [30]:
# Report the number of classes the loaded model is configured with.
print("Number of labels:", model.config.num_labels)
print("------------------------------------------")

# id2label maps a class index to its label name; list it in ascending index order.
map_labels = model.config.id2label
for idx, label_name in sorted(map_labels.items()):
    print(f"Index {idx} -> {label_name}")

Number of labels: 28
------------------------------------------
Index 0 -> admiration
Index 1 -> amusement
Index 2 -> anger
Index 3 -> annoyance
Index 4 -> approval
Index 5 -> caring
Index 6 -> confusion
Index 7 -> curiosity
Index 8 -> desire
Index 9 -> disappointment
Index 10 -> disapproval
Index 11 -> disgust
Index 12 -> embarrassment
Index 13 -> excitement
Index 14 -> fear
Index 15 -> gratitude
Index 16 -> grief
Index 17 -> joy
Index 18 -> love
Index 19 -> nervousness
Index 20 -> optimism
Index 21 -> pride
Index 22 -> realization
Index 23 -> relief
Index 24 -> remorse
Index 25 -> sadness
Index 26 -> surprise
Index 27 -> neutral


### Training

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from dataclasses import fields

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases (and the old name later stopped being accepted).
# Pick whichever keyword the installed TrainingArguments dataclass exposes so
# this cell runs on both older and newer versions.
_eval_kwarg = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in fields(TrainingArguments))
    else "evaluation_strategy"
)

# Fine-tuning hyperparameters. Evaluation and checkpointing both happen once
# per epoch; the matching cadence is what `load_best_model_at_end` expects
# (see the TrainingArguments documentation).
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    **{_eval_kwarg: "epoch"},
)
