In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import spacy

from spacy import displacy

from torch.utils.data import Dataset, DataLoader

# Seed PyTorch's global random number generator so that code drawing from it
# (e.g. parameter initialisation) starts from the same state on every run.
torch.manual_seed(1)

<torch._C.Generator at 0x7bbce41c66f0>

In [2]:
# Emotion name -> integer class id; ids follow the order of the names below.
EMOTION_LABEL = {
    name: class_id
    for class_id, name in enumerate(("anger", "joy", "optimism", "sadness"))
}

class EmotionDataset(Dataset):
    """Tweets and integer labels read from the TweetEval emotion text files.

    Each item is a ``(doc, label)`` pair: ``doc`` is the spaCy ``Doc`` built
    from one line of the text file and ``label`` is the ``int`` parsed from
    the line with the same index in the labels file.
    """

    def __init__(self, training):
        """Load one split and run the spaCy pipeline over every text line.

        Args:
            training: if truthy the ``train_*`` files are read, otherwise the
                ``test_*`` files.

        Raises:
            ValueError: if the text file and the labels file do not contain
                the same number of lines.
        """
        spacy.prefer_gpu()
        self.nlp = spacy.load("en_core_web_sm")

        split = "train" if training else "test"
        self.path_text   = f"./data/tweeteval/emotion/{split}_text.txt"
        self.path_labels = f"./data/tweeteval/emotion/{split}_labels.txt"

        self.data   = self._load_txt_file(self.path_text)
        self.labels = self._load_txt_file(self.path_labels, perform_nlp=False)

        # A silent mismatch would pair tweets with the wrong labels (or fail
        # much later with an IndexError), so fail loudly right here.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"{self.path_text} has {len(self.data)} lines but "
                f"{self.path_labels} has {len(self.labels)}"
            )

    def _load_txt_file(self, path, perform_nlp=True):
        """Read ``path`` and return a list with one entry per line.

        Args:
            path: path of the text file to read.
            perform_nlp: when true every stripped line is passed through
                ``self.nlp``; when false the stripped strings are returned.

        Returns:
            A list of spaCy documents or of stripped strings, in file order.
        """
        # The default encoding of open() depends on the platform locale, so
        # decode explicitly to read non-ASCII text the same way everywhere.
        # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        print(f"Loading {path} with {len(lines)} lines")

        for i, raw_line in enumerate(lines):
            if i % 500 == 0:
                print(f"Processing line {i}/{len(lines)}")

            # strip() already removes the trailing newline.
            text = raw_line.strip()
            lines[i] = self.nlp(text) if perform_nlp else text

        return lines

    def __getitem__(self, index):
        """Return ``(entry, label)`` for ``index`` with the label converted to ``int``."""
        return (self.data[index], int(self.labels[index]))

    def __len__(self):
        """Return the number of loaded text entries."""
        return len(self.data)

# Build both splits up front; each constructor call reads its two files and
# runs the spaCy pipeline over every text line (training split first).
train_dataset, test_dataset = (
    EmotionDataset(training=is_training) for is_training in (True, False)
)

Loading ./data/tweeteval/emotion/train_text.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/train_labels.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/test_text.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421
Loading ./data/tweeteval/emotion/test_labels.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421


In [36]:
class LSTMTagger(nn.Module):
    """LSTM classifier over fixed-length sequences of token ids.

    Token ids are embedded, run through a single LSTM, and the LSTM outputs
    of all time steps are flattened and projected to one log-probability per
    class.
    """

    def __init__(self, embedding_dim=128, hidden_dim=256, vocab_size=5001, tagset_size=4,
                 max_length=85):
        """Create the embedding, LSTM and output layers.

        Args:
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output classes.
            max_length: number of time steps of every input sequence. It
                fixes the input width of the output layer, so ``forward``
                only accepts sequences of exactly this length.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_length = max_length

        # Layers are created in the same order as before so that a fixed seed
        # still yields the same initial parameters.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim * max_length, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities of shape ``(batch, tagset_size)``.

        Args:
            sentence: integer tensor of token ids with shape
                ``(batch, max_length)``.
        """
        batch_size = sentence.shape[0]
        embeds = self.word_embeddings(sentence)

        # Every row already has the same length, so the batch can be fed to
        # the LSTM directly. Packing with identical lengths for all rows was
        # a no-op round trip. Padded positions are still processed by the LSTM.
        lstm_out, _ = self.lstm(embeds)

        # Concatenate the outputs of all time steps into one vector per example.
        flattened = lstm_out.reshape(batch_size, -1)
        tag_space = self.hidden2tag(flattened)
        return F.log_softmax(tag_space, dim=1)

In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device {device}")

Using device cuda


In [44]:
word_to_ix  = {}
word_counts = {}

maximum_length = 0

# One pass over the training split: remember the longest sentence and tally
# how often each token text occurs.
for sent, tags in train_dataset:
    maximum_length = max(maximum_length, len(sent))

    for word in sent:
        word_counts[word.text] = word_counts.get(word.text, 0) + 1

print(f"Maximum length: {maximum_length}")

# Most frequent tokens first; the sort is stable, so ties keep first-seen order.
sorted_word_counts = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)

# The 100 most frequent tokens are skipped; the next (up to) 5000 tokens get
# the ids 1..5000 in frequency order. Id 0 is never assigned here.
for ix, (word, _) in enumerate(sorted_word_counts[100:5100], start=1):
    word_to_ix[word] = ix

def get_idx(word):
    """Return the vocabulary id of ``word``, or 0 when it is not in ``word_to_ix``."""
    return word_to_ix.get(word, 0)

def prepare_sequence(seq):
    """Convert an iterable of words into a fixed-length tensor of vocabulary ids.

    Args:
        seq: iterable of words; each one is looked up with ``get_idx``.

    Returns:
        A ``torch.long`` tensor of length ``maximum_length``. Shorter inputs
        are right-padded with zeros. Inputs longer than ``maximum_length``
        are truncated to their first ``maximum_length`` words.
    """
    # maximum_length is measured on the training split only, so other data can
    # contain longer sequences. Truncate instead of indexing past the tensor.
    idxs = [get_idx(w) for w in seq][:maximum_length]

    padded = torch.zeros(maximum_length, dtype=torch.long)

    for position, idx in enumerate(idxs):
        padded[position] = idx

    return padded

Maximum length: 85


In [45]:
class EmotionTensorDataset(Dataset):
    """Tensor version of a tokenised dataset, converted once at construction.

    Every ``(sentence, label)`` pair of the wrapped dataset is turned into
    ``(prepare_sequence(token_texts), label)`` up front and kept in
    ``self.data``.
    """

    def __init__(self, dataset):
        """Wrap ``dataset`` and eagerly convert each sentence's token texts."""
        self.dataset = dataset
        self.data = [
            (prepare_sequence([token.text for token in sentence]), label)
            for sentence, label in self.dataset
        ]

    def __getitem__(self, index):
        """Return the pre-computed ``(tensor, label)`` pair at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the length reported by the wrapped dataset."""
        return len(self.dataset)

# Convert both splits to padded id tensors once, before batching.
train_tensor_dataset = EmotionTensorDataset(train_dataset)
test_tensor_dataset  = EmotionTensorDataset(test_dataset)

# Only the training batches are shuffled. Accuracy does not depend on batch
# order, and an unshuffled evaluation loader keeps evaluation deterministic
# and does not draw from the global RNG on every evaluation pass.
train_loader = DataLoader(train_tensor_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_tensor_dataset, batch_size=64, shuffle=False)

In [46]:
def _evaluate_accuracy(model, loader, device):
    """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode and gradients are disabled while
    the loader is consumed. An empty loader yields ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for sentence, label in loader:
            sentence, label = sentence.to(device), label.to(device)

            tag_scores = model(sentence)

            # Index of the highest score per row is the predicted class.
            _, predicted = torch.max(tag_scores, 1)

            total += label.size(0)
            correct += (predicted == label).sum().item()

    return correct / total if total else 0.0


model = LSTMTagger()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# The model returns log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()

model.to(device)

MAX_EPOCHS = 100

# NOTE(review): accuracy is monitored on the test split after every epoch.
# Use a held-out validation split if this number drives model selection
# or the decision when to stop.
for epoch in range(MAX_EPOCHS):
    model.train()

    for i, (sentence, label) in enumerate(train_loader):
        sentence, label = sentence.to(device), label.to(device)
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        tag_scores = model(sentence)
        loss = loss_function(tag_scores, label)

        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f"Epoch {epoch}, iteration {i}, loss: {loss.item()}")

    accuracy = _evaluate_accuracy(model, test_loader, device)

    print(f"Epoch {epoch}, accuracy: {accuracy}")

Epoch 0, iteration 0, loss: 1.3759361505508423
Epoch 0, accuracy: 0.41379310344827586
Epoch 1, iteration 0, loss: 1.0314100980758667
Epoch 1, accuracy: 0.4644616467276566
Epoch 2, iteration 0, loss: 0.927951455116272
Epoch 2, accuracy: 0.48909218859957776
Epoch 3, iteration 0, loss: 0.7023993730545044
Epoch 3, accuracy: 0.525686136523575
Epoch 4, iteration 0, loss: 0.3833940625190735
Epoch 4, accuracy: 0.5446868402533427
Epoch 5, iteration 0, loss: 0.2607744634151459
Epoch 5, accuracy: 0.5453905700211119
Epoch 6, iteration 0, loss: 0.1475040465593338
Epoch 6, accuracy: 0.5601688951442646
Epoch 7, iteration 0, loss: 0.08462411165237427
Epoch 7, accuracy: 0.5679099225897255
Epoch 8, iteration 0, loss: 0.0878586620092392
Epoch 8, accuracy: 0.5721323011963406
Epoch 9, iteration 0, loss: 0.01772087626159191
Epoch 9, accuracy: 0.5608726249120338
Epoch 10, iteration 0, loss: 0.00876993965357542
Epoch 10, accuracy: 0.5742434904996482
Epoch 11, iteration 0, loss: 0.005613474640995264
Epoch 11, 

KeyboardInterrupt: 

In [43]:
# Despite its name, true_positive counts every correct prediction, whatever the class.
true_positive = 0
total = 0

# The training cell can be interrupted while the model is in training mode,
# so switch to evaluation mode explicitly before measuring accuracy.
model.eval()

with torch.no_grad():
    for sentence, label in test_loader:
        sentence, label = sentence.to(device), label.to(device)

        tag_scores = model(sentence)

        # Index of the highest score per row is the predicted class.
        _, predicted = torch.max(tag_scores, 1)

        total += label.size(0)
        true_positive += (predicted == label).sum().item()

# Guard against an empty loader instead of raising ZeroDivisionError.
final_accuracy = true_positive / total if total else 0.0
print(f"Final accuracy: {final_accuracy}")

Final accuracy: 0.5819845179451091
