In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import spacy

from spacy import displacy

from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

<torch._C.Generator at 0x73ba900c66f0>

In [2]:
EMOTION_LABEL = {
    "anger": 0,
    "joy": 1,
    "optimism": 2,
    "sadness": 3
}

class EmotionDataset(Dataset):

    def __init__(self, training):
        spacy.prefer_gpu()
        self.nlp = spacy.load("en_core_web_sm")
        self.data = None

        if training:
            self.path_text   = "./data/tweeteval/emotion/train_text.txt"
            self.path_labels = "./data/tweeteval/emotion/train_labels.txt"
        else:
            self.path_text   = "./data/tweeteval/emotion/test_text.txt"
            self.path_labels = "./data/tweeteval/emotion/test_labels.txt"

        self.data   = self._load_txt_file(self.path_text)
        self.labels = self._load_txt_file(self.path_labels, perform_nlp=False)

    def _load_txt_file(self, path, perform_nlp=True):

        with open(path, "r") as f:
            lines = f.readlines()

            print(f"Loading {path} with {len(lines)} lines")

            for i in range(len(lines)):
                if i % 500 == 0:
                    print(f"Processing line {i}/{len(lines)}")

                lines[i] = lines[i].strip()
                lines[i] = lines[i].replace("\n", "")

                if perform_nlp:
                    lines[i] = self.nlp(lines[i])

            return lines

    def __getitem__(self, index):
        return (self.data[index], int(self.labels[index]))

    def __len__(self):
        return len(self.data)

train_dataset = EmotionDataset(training=True)
test_dataset  = EmotionDataset(training=False)

Loading ./data/tweeteval/emotion/train_text.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/train_labels.txt with 3257 lines
Processing line 0/3257
Processing line 500/3257
Processing line 1000/3257
Processing line 1500/3257
Processing line 2000/3257
Processing line 2500/3257
Processing line 3000/3257
Loading ./data/tweeteval/emotion/test_text.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421
Loading ./data/tweeteval/emotion/test_labels.txt with 1421 lines
Processing line 0/1421
Processing line 500/1421
Processing line 1000/1421


In [3]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim=128, hidden_dim=256, vocab_size=5000, tagset_size=4):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [4]:
model = LSTMTagger()
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.NLLLoss()

word_to_ix = {}

word_counts = {}

# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in train_dataset:
    for word in sent:
        if word.text not in word_counts:
            word_counts[word.text] = 1
        else:
            word_counts[word.text] += 1

# Sort words by counts
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Assign ID to the most 5000 frequent words
for word, _ in sorted_word_counts[:5000]:
    word_to_ix[word] = len(word_to_ix) + 1

print(len(word_to_ix))

def get_idx(word):
    if word in word_to_ix:
        return word_to_ix[word]
    else:
        return 0

def prepare_sequence(seq, to_ix):
    idxs = [get_idx(w) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

with torch.no_grad():
    inputs = prepare_sequence(train_dataset[0][0], word_to_ix)
    tag_scores = model(inputs)

for epoch in range(10):

    iteration_counter = 0

    for sentence, label in train_dataset:
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor(label)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)[-1]

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

        iteration_counter += 1

        if iteration_counter % 100 == 0:
            print(f"Epoch {epoch}, iteration {iteration_counter}/{len(train_dataset)}, loss: {loss}")

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(train_dataset[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

5000
Epoch 0, iteration 100/3257, loss: 1.8801770210266113
Epoch 0, iteration 200/3257, loss: 2.9997172355651855
Epoch 0, iteration 300/3257, loss: 1.7394258975982666
Epoch 0, iteration 400/3257, loss: 0.9320502877235413
Epoch 0, iteration 500/3257, loss: 1.047295093536377
Epoch 0, iteration 600/3257, loss: 1.1256669759750366
Epoch 0, iteration 700/3257, loss: 1.338423252105713
Epoch 0, iteration 800/3257, loss: 0.943281352519989
Epoch 0, iteration 900/3257, loss: 0.6726950407028198
Epoch 0, iteration 1000/3257, loss: 0.8542113304138184
Epoch 0, iteration 1100/3257, loss: 1.6902644634246826
Epoch 0, iteration 1200/3257, loss: 1.1624033451080322
Epoch 0, iteration 1300/3257, loss: 0.918684720993042
Epoch 0, iteration 1400/3257, loss: 1.2157502174377441
Epoch 0, iteration 1500/3257, loss: 2.37111496925354
Epoch 0, iteration 1600/3257, loss: 1.147670865058899
Epoch 0, iteration 1700/3257, loss: 1.2347530126571655
Epoch 0, iteration 1800/3257, loss: 0.688170850276947
Epoch 0, iteration 190

KeyboardInterrupt: 