In [1]:
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam, RMSprop
from time import perf_counter

In [2]:
class DatasetSeq(Dataset):

    def __init__(self, data_dir='./', train_lang='en'):
        # open file
        with open(data_dir + train_lang + '.train', 'r',
                  encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]
        # init vocabs of tokens for encoding { token:  id}
        self.target_vocab = {}  # {NOUN: 1, VERB: 2, ADP: 3, NOUN: 1, PUNCT: 4}
        self.word_vocab = {}  # {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        self.char_vocab = {}  # {c: 1, a: 2, t: 3, ' ': 4, s: 5}

        # init encoded sequences lists (processed data)
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        # n=1 because first value is padding
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence) # n_seq x words_in_seq
            self.encoded_targets.append(target) # n_seq x words_in_seq
            self.encoded_char_sequences.append(chars) # n_seq x words_in_seq x word_len

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # words_in_seq
            'char': self.encoded_char_sequences[index], # words_in_seq
            'target': self.encoded_targets[index], # words_in_seq x word_len
        }

In [3]:
dataset = DatasetSeq()

In [18]:
def collate_fn_w_chars(input_data):
    data = []
    chars = []
    targets = []
    data_len = len(input_data)
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(data_len)]
                 for _ in range(max_len)] # max_seq_len x batch_len
    for j in range(data_len): # batch_len
        i = 0
        while i < len(chars[j]): # max_seq_len
            chars_seq[i][j] = torch.as_tensor(chars[j][i]) # batch_len x seq_len x word_len
            i += 1
    for i in range(max_len):
        chars_seq[i] = pad_sequence(chars_seq[i],
                                    batch_first=True,
                                    padding_value=0)
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}

In [33]:
from pprint import pprint
data = []
chars = []
targets = []
data_len = 3
max_len = 0
for i, item in enumerate(dataset):
    if len(item['data']) > max_len:
        max_len = len(item['data'])
    data.append(torch.as_tensor(item['data']))
    chars.append(item['char'])
    targets.append(torch.as_tensor(item['target']))
print(max_len)
chars_seq = [[torch.as_tensor([0]) for _ in range(data_len)]
             for _ in range(max_len)] # max_seq_len x batch_len
for j in range(data_len): # batch_len
    i = 0
    while i < len(chars[j]): # max_seq_len
        chars_seq[i][j] = torch.as_tensor(chars[j][i]) # batch_len x seq_len x word_len
        i += 1
        
for i in range(max_len):
    chars_seq[i] = pad_sequence(chars_seq[i],
                                batch_first=True,
                                padding_value=0)
    
    pprint(chars_seq)
    if i > 2:
        break

209
[tensor([[ 1,  2,  0],
        [30,  0,  0],
        [34, 35,  1]]),
 [tensor([3]), tensor([31, 19, 11, 15]), tensor([8])],
 [tensor([4, 5, 6, 5, 7]),
  tensor([16, 11,  2,  2, 11,  7, 32]),
  tensor([36, 10,  5, 25, 11])],
 [tensor([8]),
  tensor([14, 13]),
  tensor([ 5, 21, 23, 19, 14, 10, 11, 23, 11,  9, 15])],
 [tensor([ 1,  6,  9, 10, 11, 12,  5,  7]),
  tensor([5]),
  tensor([ 5,  7,  7, 14, 21,  7, 12,  9, 17])],
 [tensor([13, 14, 10, 12,  9, 15]),
  tensor([10,  9, 15, 24,  9, 12, 23,  9, 17]),
  tensor([23, 19,  5, 23])],
 [tensor([16, 11,  2,  2,  9, 17]),
  tensor([12,  2,  9, 10, 11, 12]),
  tensor([23, 19,  9, 28])],
 [tensor([18, 19,  5, 11, 16, 19]),
  tensor([26, 11,  2,  2]),
  tensor([19,  5, 17])],
 [tensor([ 1, 20, 17, 21,  2,  2,  5, 19]),
  tensor([20,  9]),
  tensor([20, 21, 15, 23,  9, 17])],
 [tensor([5, 2]), tensor([12,  5, 21, 15, 11,  7, 32]), tensor([21, 24])],
 [tensor([3]), tensor([21, 15]), tensor([37])],
 [tensor([ 1,  7, 11]),
  tensor([23, 10, 14,

[tensor([[ 1,  2,  0],
        [30,  0,  0],
        [34, 35,  1]]),
 tensor([[ 3,  0,  0,  0],
        [31, 19, 11, 15],
        [ 8,  0,  0,  0]]),
 [tensor([4, 5, 6, 5, 7]),
  tensor([16, 11,  2,  2, 11,  7, 32]),
  tensor([36, 10,  5, 25, 11])],
 [tensor([8]),
  tensor([14, 13]),
  tensor([ 5, 21, 23, 19, 14, 10, 11, 23, 11,  9, 15])],
 [tensor([ 1,  6,  9, 10, 11, 12,  5,  7]),
  tensor([5]),
  tensor([ 5,  7,  7, 14, 21,  7, 12,  9, 17])],
 [tensor([13, 14, 10, 12,  9, 15]),
  tensor([10,  9, 15, 24,  9, 12, 23,  9, 17]),
  tensor([23, 19,  5, 23])],
 [tensor([16, 11,  2,  2,  9, 17]),
  tensor([12,  2,  9, 10, 11, 12]),
  tensor([23, 19,  9, 28])],
 [tensor([18, 19,  5, 11, 16, 19]),
  tensor([26, 11,  2,  2]),
  tensor([19,  5, 17])],
 [tensor([ 1, 20, 17, 21,  2,  2,  5, 19]),
  tensor([20,  9]),
  tensor([20, 21, 15, 23,  9, 17])],
 [tensor([5, 2]), tensor([12,  5, 21, 15, 11,  7, 32]), tensor([21, 24])],
 [tensor([3]), tensor([21, 15]), tensor([37])],
 [tensor([ 1,  7, 11]),

 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor

 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor([0])],
 [tensor([0]), tensor([0]), tensor

In [5]:
class SelectItem(nn.Module):

    def __init__(self, item_index):
        super().__init__()
        self._name = 'selectitem'
        self.item_index = item_index

    def forward(self, inputs):
        return inputs[self.item_index]

In [6]:
class CharRNN(nn.Module):

    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.seq = nn.Sequential()
        self.seq.append(nn.Embedding(vocab_size, emb_dim))
        self.seq.append(nn.GRU(emb_dim, hid_dim, batch_first=True))
        self.seq.append(SelectItem(1))

    def forward(self, x):
        out = self.seq(x)  # 1 x B x Hid

        return out.squeeze().unsqueeze(1)  # B x 1 x Hid

In [7]:
class RNNPredictor(nn.Module):

    def __init__(self, vocab_size, emb_dim, hid_dim, n_classes, char_vocab,
                 char_emb, char_hid):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.seq = nn.Sequential()
        # TODO try to use other RNN archicetures, e.g. RNN and LSTM
        self.char_rnn = CharRNN(char_vocab, char_emb, char_hid)
        self.seq.append(nn.GRU(emb_dim + char_hid, hid_dim, batch_first=True))
        self.seq.append(SelectItem(0))
        self.seq.append(nn.Dropout(0.1))
        self.seq.append(nn.Linear(hid_dim, n_classes))

    def forward(self, x, chars):
        char_feat = [self.char_rnn(c.to(x.device)) for c in chars]
        char_feat = torch.cat(char_feat, dim=1)  # B x T x Hid_char
        emb = self.word_emb(x)  # B x T x Emb_dim
        pred = self.seq(torch.cat((emb, char_feat),
                                  dim=-1))  # B x T x N_classes

        return pred

In [8]:
# hyper params
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
# TODO try to use other model parameters
emb_dim = 256
hidden = 256
char_hid = 64
char_emb = 32
n_epochs = 10
batch_size = 100
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
model = RNNPredictor(vocab_size, emb_dim, hidden, n_classes, n_chars, char_emb,
                     char_hid).to(device)
optim = Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [13]:
model.load_state_dict(torch.load('RNN_char.pth'))

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [41]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, data_dict in enumerate(dataloader):
        words = data_dict['data'].to(device)
        chars = data_dict['chars']
        targets = data_dict['target'].to(device).view(-1)

        # Compute prediction error
        pred = model(words, chars).view(-1, n_classes)
        loss = loss_fn(pred, targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(words)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [42]:
for t in range(n_epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    dataloader = DataLoader(
        dataset,
        batch_size,
        shuffle=True,
        collate_fn=collate_fn_w_chars,
        drop_last=True,
    )
    train(dataloader, model, loss_fn, optim)
print("Done!")

Epoch 1
-------------------------------
loss: 2.941659  [    0/21235]
loss: 0.338969  [10000/21235]
loss: 0.077432  [20000/21235]
Epoch 2
-------------------------------
loss: 0.100163  [    0/21235]
loss: 0.095712  [10000/21235]
loss: 0.107192  [20000/21235]
Epoch 3
-------------------------------
loss: 0.069409  [    0/21235]
loss: 0.106560  [10000/21235]
loss: 0.084917  [20000/21235]
Epoch 4
-------------------------------
loss: 0.039601  [    0/21235]
loss: 0.058593  [10000/21235]
loss: 0.050783  [20000/21235]
Epoch 5
-------------------------------
loss: 0.044977  [    0/21235]
loss: 0.043461  [10000/21235]
loss: 0.043864  [20000/21235]
Epoch 6
-------------------------------
loss: 0.030705  [    0/21235]
loss: 0.048214  [10000/21235]
loss: 0.057217  [20000/21235]
Epoch 7
-------------------------------
loss: 0.041543  [    0/21235]
loss: 0.068672  [10000/21235]
loss: 0.048047  [20000/21235]
Epoch 8
-------------------------------
loss: 0.045796  [    0/21235]
loss: 0.027256  [100

In [85]:
torch.save(model.state_dict(), 'RNN_char.pth')

In [81]:
dataloader = DataLoader(
    dataset,
    batch_size,
    shuffle=True,
    collate_fn=collate_fn_w_chars,
    drop_last=True,
)
for data_dict in dataloader:
    chars = data_dict['chars']
    print(chars[:2][0].size())
    break

torch.Size([100, 11])


In [83]:
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]
max_len = max(map(len, words))
chars_seq = [[torch.as_tensor([0]) for _ in range(len(words))]
             for _ in range(max_len)]
for j in range(len(words)):
    for i in range(max_len):
        if i < len(chars[j]):
            chars_seq[i][j] = torch.as_tensor(chars[j][i])
for i in range(max_len):
    chars_seq[i] = pad_sequence(chars_seq[i],
                                batch_first=True,
                                padding_value=0)
print(chars_seq[:2][0].size())
max_len

torch.Size([10, 14])


7

In [84]:
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]
max_len = max(map(len, words))
chars_seq = [[torch.as_tensor([0]) for _ in range(len(words))]
             for _ in range(max_len)]
for j in range(len(words)):
    for i in range(max_len):
        if i < len(chars[j]):
            chars_seq[i][j] = torch.as_tensor(chars[j][i])
for i in range(max_len):
    chars_seq[i] = pad_sequence(chars_seq[i],
                                batch_first=True,
                                padding_value=0)

start = perf_counter()
with torch.no_grad():
    pred = model(torch.tensor(tokens).unsqueeze(0).to(device), chars_seq)
    labels = torch.argmax(pred, dim=-1).squeeze().cpu().detach().tolist()
    dur = perf_counter() - start

print(f'prediction time: {dur:.f3}')
target_labels = list(dataset.target_vocab.keys())
print([target_labels[l - 1] for l in labels])

RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 1 but got size 10 for tensor number 1 in the list.