In [1]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy

from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

<torch._C.Generator at 0x7511757c26d0>

In [22]:
EMOTION_LABEL = {
    "anger": 0,
    "joy": 1,
    "optimism": 2,
    "sadness": 3
}

class EmotionDataset(Dataset):

    def __init__(self, training):
        spacy.prefer_gpu()
        self.nlp = spacy.load("en_core_web_sm")
        self.data = None

        if training:
            self.path_text   = "./data/tweeteval/emotion/train_text.txt"
            self.path_labels = "./data/tweeteval/emotion/train_labels.txt"
        else:
            self.path_text   = "./data/tweeteval/emotion/test_text.txt"
            self.path_labels = "./data/tweeteval/emotion/test_labels.txt"

        self.data   = self._load_txt_file(self.path_text)
        self.labels = self._load_txt_file(self.path_labels, perform_nlp=False)

    def _load_txt_file(self, path, perform_nlp=True):

        with open(path, "r") as f:
            lines = f.readlines()

            print(f"Loading {path} with {len(lines)} lines")

            for i in range(len(lines)):
                if i % 500 == 0:
                    print(f"Processing line {i}/{len(lines)}")

                lines[i] = lines[i].strip()
                lines[i] = lines[i].replace("\n", "")

                if perform_nlp:
                    lines[i] = self.nlp(lines[i])

            return lines

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

    def __len__(self):
        return len(self.data)

train_dataset = EmotionDataset(training=True)
test_dataset  = EmotionDataset(training=False)

Loading ./data/tweeteval/emotion/train_text.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/train_labels.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/test_text.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421
Loading ./data/tweeteval/emotion/test_labels.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

Using device cuda


In [24]:
word_to_ix  = {}
word_counts = {}

maximum_length = 0

# Count the occurrences of each word
for sent, tags in train_dataset:

    if len(sent) > maximum_length:
        maximum_length = len(sent)

    for word in sent:
        if word.text not in word_counts:
            word_counts[word.text] = 1
        else:
            word_counts[word.text] += 1

print(f"Maximum length: {maximum_length}")

# Sort words by counts
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Assign ID to the most 5000 frequent words
for word, _ in sorted_word_counts[100:5100]:
    word_to_ix[word] = len(word_to_ix) + 1

def get_idx(word):
    if word in word_to_ix:
        return word_to_ix[word]
    else:
        return 0

def prepare_sequence(seq):
    idxs = [get_idx(w) for w in seq]

    template_tensor = torch.zeros(maximum_length, dtype=torch.long)

    for i, idx in enumerate(idxs):
        template_tensor[i] = idx

    return template_tensor

Maximum length: 85


In [30]:
class EmotionTensorDataset(Dataset):

    def __init__(self, dataset):
        self.dataset = dataset
        self.data    = []

        for sentence, label in self.dataset:
            self.data.append((prepare_sequence([word.text for word in sentence]), label))

    def __getitem__(self, index):
        input_ids = self.data[index][0]
        label     = self.data[index][1]

        return {'input_ids': input_ids, 'label': int(label)}

    def __len__(self):
        return len(self.dataset)

train_tensor_dataset = EmotionTensorDataset(train_dataset)
test_tensor_dataset  = EmotionTensorDataset(test_dataset)

{'input_ids': tensor([ 316, 2301,    0,    0,   82, 3965,    0,    0,  389,    0,  317,   46,
            0,    0,    0, 1293, 3966, 3967,    0,    0,    0, 1677,    0,  905,
            0,  149,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0]),
 'label': 2}

In [60]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", truncation=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

class EmotionBertDataset(Dataset):

    def __init__(self, dataset):
        self.dataset   = dataset

        self.preprocess()

    def preprocess(self):
        self.data = []

        for sentence, label in self.dataset:

            dict_input = tokenizer(sentence.text, padding='max_length', max_length=maximum_length, return_tensors="pt")

            dict_input["input_ids"] = dict_input["input_ids"].squeeze(0)
            dict_input["attention_mask"] = dict_input["attention_mask"].squeeze(0)

            dict_input["label"] = int(label)

            self.data.append(dict_input)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.dataset)

train_bert_dataset = EmotionBertDataset(train_dataset)
test_bert_dataset  = EmotionBertDataset(test_dataset)

train_bert_dataset[0]



{'input_ids': tensor([  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
         2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
        14354,  1001,  4105,  1001,  4737,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'label': 2

In [64]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

id2label = {0: "anger", 1: "joy", 2: "optimism", 3: "sadness"}
label2id = {"anger": 0, "joy": 1, "optimism": 2, "sadness": 3}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=30,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_bert_dataset,
    eval_dataset=test_bert_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.615939,0.788177,0.702175,0.805829,0.688677
2,No log,0.631836,0.790289,0.735336,0.79893,0.711815
3,No log,0.754719,0.784659,0.751411,0.764824,0.746746
4,No log,0.898285,0.781844,0.747633,0.764309,0.741684
5,0.355500,1.234415,0.771288,0.723272,0.773087,0.69876
6,0.355500,1.030157,0.794511,0.75231,0.777852,0.740311
7,0.355500,1.205954,0.78677,0.753967,0.746361,0.764002
8,0.355500,1.347947,0.787474,0.749131,0.762742,0.741929
9,0.355500,1.362073,0.78677,0.752075,0.768892,0.74199
10,0.039800,1.677583,0.766362,0.723693,0.743843,0.712493


TrainOutput(global_step=3060, training_loss=0.0683016069355159, metrics={'train_runtime': 996.951, 'train_samples_per_second': 98.009, 'train_steps_per_second': 3.069, 'total_flos': 2148881542995600.0, 'train_loss': 0.0683016069355159, 'epoch': 30.0})

In [66]:
trainer_prev = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tensor_dataset,
    eval_dataset=test_tensor_dataset,
    tokenizer=None,
    data_collator=None,
    compute_metrics=compute_metrics,
)

trainer_prev.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.292542,0.392681,0.14098,0.09817,0.25
2,No log,1.304042,0.392681,0.14098,0.09817,0.25
3,No log,1.288129,0.392681,0.14098,0.09817,0.25
4,No log,1.294428,0.395496,0.147274,0.154985,0.252618
5,1.243700,1.336082,0.314567,0.180018,0.193671,0.267562
6,1.243700,1.345885,0.31316,0.176735,0.199547,0.268524
7,1.243700,1.32034,0.325827,0.192744,0.192052,0.270396
8,1.243700,1.372412,0.374384,0.229429,0.342014,0.30181
9,1.243700,1.29583,0.460943,0.277697,0.285423,0.348824
10,1.080800,1.217484,0.479944,0.303855,0.294215,0.360828


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=3060, training_loss=0.6195948678683612, metrics={'train_runtime': 1052.0671, 'train_samples_per_second': 92.874, 'train_steps_per_second': 2.909, 'total_flos': 2148881542995600.0, 'train_loss': 0.6195948678683612, 'epoch': 30.0})

#### DistilBert with preprocessing from A5

Accuracy: ~51.8%

#### DistilBert with Bert Preprocessing

Accuracy: ~79.2%

#### LSTM

Accuracy: ~61.3%

#### GRU

Accuracy: ~59.75%