In [1]:
import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import pandas as pd
from itertools import product
from IPython.display import clear_output

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# corpus files stored there can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Corpus location and language code: DatasetSeq reads the file
# `data_dir + train_lang + '.train'`, so data_dir must end with '/'.
# NOTE(review): the path is relative while Drive is mounted at /content/drive,
# so it presumably relies on the working directory being /content - confirm.
data_dir, train_lang = 'drive/My Drive/', 'en'

In [4]:
class DatasetSeq(Dataset):
    """Sequence-tagging dataset read from ``data_dir + train_lang + '.train'``.

    The file holds one ``word label`` pair per line (separated by a single
    space); sentences are separated by a blank line. Words, labels and
    characters are mapped to integer ids in order of first appearance,
    starting at 1, so that id 0 stays free for padding.

    Attributes:
        word_vocab: {word: id}, e.g. {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        target_vocab: {label: id}, e.g. {p: 1, a: 2, r: 3, pu: 4}
        char_vocab: {character: id}, e.g. {c: 1, a: 2, t: 3, s: 4}
        encoded_sequences: one list of word ids per sentence.
        encoded_targets: one list of label ids per sentence.
        encoded_char_sequences: per sentence, one list of character ids per word.
    """

    def __init__(self, data_dir, train_lang='en'):
        """Read and encode the training file.

        Args:
            data_dir: string prefix of the file path; it is concatenated as-is,
                so it must end with a path separator.
            train_lang: language code used as the file's base name.
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # Drop sentences that contain the extra '_ ' tag markup.
        train = [x for x in train if '_ ' not in x]

        # Vocabularies for encoding: {<str> token: <int> id}
        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        # Encoded (processed) data, one entry per sentence.
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []

        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item == '':
                    continue
                word, label = item.split(' ')
                # setdefault assigns the next free id (current size + 1, since
                # id 0 is reserved for padding) only on first appearance.
                sequence.append(
                    self.word_vocab.setdefault(word, len(self.word_vocab) + 1))
                target.append(
                    self.target_vocab.setdefault(label, len(self.target_vocab) + 1))
                chars.append(
                    [self.char_vocab.setdefault(char, len(self.char_vocab) + 1)
                     for char in word])

            # A trailing blank line in the file yields an empty chunk; an empty
            # sentence carries no data and would become a float tensor in
            # collate_fn, so it is skipped.
            if not sequence:
                continue

            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict of plain lists."""
        return {
            'data': self.encoded_sequences[index],        # word ids, one per word
            'char': self.encoded_char_sequences[index],   # character ids, one list per word
            'target': self.encoded_targets[index],        # label ids, one per word
        }

In [5]:
# Pass the configured language explicitly; otherwise changing `train_lang` in
# the config cell would silently have no effect (the class default is 'en').
dataset = DatasetSeq(data_dir, train_lang)

In [6]:
# Padding
# seq1 = [1, 2, 3, 4]
# seq2 = [9, 7, 6, 4, 3, 7, 5]
# Pad seq1 with zeros up to the length of seq2:
# seq1 = [1, 2, 3, 4, 0, 0, 0]
# Stacking seq1 and seq2 then gives one batch:
#   [[1, 2, 3, 4, 0, 0, 0],
#    [9, 7, 6, 4, 3, 7, 5]]

In [7]:
def collate_fn(input_data):
    """Merge a list of dataset items into one zero-padded batch.

    Args:
        input_data: list of dicts with keys 'data' (word ids), 'char'
            (one list of character ids per word) and 'target' (label ids).

    Returns:
        dict with
          'data':   tensor of word ids, batch x longest sentence, zero-padded;
          'chars':  list with one tensor per word position, each
                    batch x longest word at that position, zero-padded
                    (a sentence that has no word there contributes [0]);
          'target': tensor of label ids, padded like 'data'.
    """
    word_ids = [torch.as_tensor(sample['data']) for sample in input_data]
    label_ids = [torch.as_tensor(sample['target']) for sample in input_data]
    chars_per_sample = [sample['char'] for sample in input_data]
    longest = max((len(sample['data']) for sample in input_data), default=0)

    chars_by_position = []
    for pos in range(longest):
        # Character ids of the pos-th word of every sentence in the batch.
        column = [
            torch.as_tensor(word_chars[pos]) if pos < len(word_chars)
            else torch.as_tensor([0])
            for word_chars in chars_per_sample
        ]
        chars_by_position.append(
            pad_sequence(column, batch_first=True, padding_value=0))

    return {
        'data': pad_sequence(word_ids, batch_first=True, padding_value=0),
        'chars': chars_by_position,
        'target': pad_sequence(label_ids, batch_first=True, padding_value=0),
    }

In [8]:
class CharRNN(nn.Module):
    """Character-level word encoder.

    Embeds character ids and runs a GRU over them; the GRU's final hidden
    state is returned as the representation of the character sequence.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.char_emb = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim,
                          batch_first=True)

    def forward(self, x):
        # Shapes below assume x is a B x T batch of character ids.
        embedded = self.char_emb(x)            # B x T x Emb_dim
        # The GRU returns (all outputs, final hidden state); only the final
        # state (1 x B x Hidden) is used.
        final_state = self.rnn(embedded)[1]
        return final_state.transpose(0, 1)     # B x 1 x Hidden

# TODO: try other RNN architectures, e.g. RNN and LSTM

In [9]:
class RNNPredictor(nn.Module):
    """Per-token classifier over word embeddings plus character features.

    Each word is represented by its word embedding concatenated with the
    output of a character-level encoder (CharRNN); a recurrent layer runs over
    the sentence and a linear layer scores the classes at every position.

    Subclasses select the recurrent layer by overriding ``rnn_cls``; the layer
    is built once, so no throw-away nn.RNN is allocated for GRU/LSTM variants.
    """

    # Recurrent layer class for the word-level encoder.
    rnn_cls = nn.RNN

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes,
                 char_vocab, char_emb, char_hidden):
        """
        Args:
            vocab_size: number of word ids (size of the word embedding table).
            emb_dim: word embedding size.
            hidden_dim: hidden size of the word-level recurrent layer.
            n_classes: number of output classes.
            char_vocab: number of character ids.
            char_emb: character embedding size.
            char_hidden: hidden size of the character-level encoder.
        """
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = self.rnn_cls(emb_dim + char_hidden, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim
        self.char_rnn = CharRNN(char_vocab, char_emb, char_hidden)

    def forward(self, x, chars):
        """Score every class at every token position.

        Args:
            x: word ids; shape comments below assume B x T.
            chars: iterable with one tensor of character ids per word position
                (presumably T tensors of B x L, as produced by collate_fn).

        Returns:
            Class scores from the linear layer applied to every hidden state.
        """
        emb = self.word_emb(x)
        # Encode the characters of each word position separately.
        char_features = [self.char_rnn(c.to(x.device)) for c in chars]
        char_features = torch.cat(char_features, dim=1)  # concatenate along time: B x T x Char_hid
        emb = torch.cat((emb, char_features), dim=-1)  # concatenate the feature vectors
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))


class GRUPredictor(RNNPredictor):
    """RNNPredictor whose word-level recurrent layer is an nn.GRU."""

    rnn_cls = nn.GRU


class LSTMPredictor(RNNPredictor):
    """RNNPredictor whose word-level recurrent layer is an nn.LSTM."""

    rnn_cls = nn.LSTM


In [10]:
# Hyperparameters
# Table sizes get one extra slot because ids start at 1 and 0 is the padding id.
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
# TODO: try other model parameters
emb_dim = 256
hidden = 256
char_hid = 64
char_emb = 32
n_epochs = 10
batch_size = 100  # the earlier `batch_size = 64` was dead: it was overwritten before use
cuda_device = 0  # GPU index; -1 selects the CPU
# Fall back to the CPU when no GPU is available instead of failing later in `.to(device)`.
device = f'cuda:{cuda_device}' if cuda_device != -1 and torch.cuda.is_available() else 'cpu'

In [11]:
# One tagger per recurrent architecture, all built with the same
# hyperparameters; each entry also carries its own loss function.
# NOTE(review): targets are padded with 0 and the loss has no ignore_index, so
# padded positions contribute to the loss as a class of their own - confirm
# this is intended.
models = [
    {
        'name': model_name,
        'model': predictor_cls(vocab_size, emb_dim, hidden, n_classes,
                               n_chars, char_emb, char_hid).to(device),
        'loss_func': nn.CrossEntropyLoss(),
    }
    for model_name, predictor_cls in (
        ('GRU-model', GRUPredictor),
        ('RNN-model', RNNPredictor),
        ('LSTM-model', LSTMPredictor),
    )
]

# Attach an optimizer to every model, switch it to training mode and show it.
for mdl in models:
    mdl['optim'] = torch.optim.Adam(mdl['model'].parameters(), lr=0.001)
    mdl['model'].train()
    print(mdl['name'], mdl['model'])

GRU-model GRUPredictor(
  (word_emb): Embedding(29588, 256)
  (rnn): GRU(320, 256, batch_first=True)
  (clf): Linear(in_features=256, out_features=18, bias=True)
  (do): Dropout(p=0.1, inplace=False)
  (char_rnn): CharRNN(
    (char_emb): Embedding(168, 32)
    (rnn): GRU(32, 64, batch_first=True)
  )
)
RNN-model RNNPredictor(
  (word_emb): Embedding(29588, 256)
  (rnn): RNN(320, 256, batch_first=True)
  (clf): Linear(in_features=256, out_features=18, bias=True)
  (do): Dropout(p=0.1, inplace=False)
  (char_rnn): CharRNN(
    (char_emb): Embedding(168, 32)
    (rnn): GRU(32, 64, batch_first=True)
  )
)
LSTM-model LSTMPredictor(
  (word_emb): Embedding(29588, 256)
  (rnn): LSTM(320, 256, batch_first=True)
  (clf): Linear(in_features=256, out_features=18, bias=True)
  (do): Dropout(p=0.1, inplace=False)
  (char_rnn): CharRNN(
    (char_emb): Embedding(168, 32)
    (rnn): GRU(32, 64, batch_first=True)
  )
)


In [12]:
p_chekpoint = 100  # record loss and elapsed time every `p_chekpoint` steps

n_steps = len(dataset) // batch_size

# Log table: one row per (epoch, checkpoint step); every model adds a
# '<name>_loss' and a '<name>_time' column.
df_train_step = pd.DataFrame(columns=['epoch', 'step'], data=product(range(n_epochs), range(0, n_steps, p_chekpoint)))
df_train_step.set_index(['epoch', 'step'], inplace=True)

for mdl in models:
    model = mdl['model']
    optim = mdl['optim']
    loss_func = mdl['loss_func']

    mdl_name = mdl['name']
    loss_col = mdl_name + '_loss'
    time_col = mdl_name + '_time'
    df_train_step.insert(df_train_step.shape[1], loss_col, None)
    df_train_step.insert(df_train_step.shape[1], time_col, None)

    # Make sure dropout is active even if the model was switched to eval mode
    # by an earlier run of the inference cell.
    model.train()

    start = datetime.datetime.now()
    for epoch in range(n_epochs):
        dataloader = DataLoader(dataset,
                                batch_size,
                                shuffle=True,
                                collate_fn=collate_fn,
                                drop_last=True,  # keeps the step count equal to n_steps
                                )
        for i, batch in enumerate(dataloader):
            optim.zero_grad()

            predict = model(batch['data'].to(device), batch['chars'])
            # Flatten batch and time so every token is one classification example.
            loss = loss_func(predict.view(-1, n_classes),
                             batch['target'].to(device).view(-1),
                             )
            loss.backward()
            optim.step()
            if i % p_chekpoint == 0:
                clear_output(wait=True)
                # Single .loc[row, column] assignment: the chained form
                # `.loc[row][column] = ...` may write to a temporary copy and
                # leave the table unchanged.
                df_train_step.loc[(epoch, i), loss_col] = loss.item()
                df_train_step.loc[(epoch, i), time_col] = datetime.datetime.now() - start
                display(df_train_step)

        # Save the weights after every epoch.
        torch.save(model.state_dict(), f'./rnn_chkpt__{mdl_name}_{epoch}.pth')

Unnamed: 0_level_0,Unnamed: 1_level_0,GRU-model_loss,GRU-model_time,RNN-model_loss,RNN-model_time,LSTM-model_loss,LSTM-model_time
epoch,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,2.91841,0:00:00.606839,2.85751,0:00:00.106013,2.959137,0:00:00.124309
0,100,0.254399,0:00:15.918162,0.218428,0:00:11.911622,0.264779,0:00:12.193780
0,200,0.152588,0:00:28.582636,0.16518,0:00:23.560976,0.141908,0:00:25.141531
1,0,0.173208,0:00:30.282177,0.139539,0:00:24.837209,0.161152,0:00:26.668119
1,100,0.080576,0:00:42.311151,0.091557,0:00:36.408324,0.094041,0:00:38.829182
1,200,0.099854,0:00:54.568568,0.097285,0:00:49.874875,0.115048,0:00:50.485189
2,0,0.103359,0:00:55.982431,0.116699,0:00:51.149632,0.09718,0:00:51.840551
2,100,0.087195,0:01:08.139684,0.075914,0:01:03.259655,0.065279,0:01:04.948186
2,200,0.091689,0:01:21.554053,0.090771,0:01:14.936527,0.070495,0:01:15.901418
3,0,0.062941,0:01:23.070947,0.049428,0:01:16.075935,0.067576,0:01:17.873162


In [1]:
# Example: tag a sample phrase with every trained model.

phrase = 'I do love this magic neural networks !'
words = phrase.split(' ')
# Vocabulary ids start at 1, so 0 (the padding id) is used as a fallback for
# words and characters that never occurred in the training data instead of
# failing with KeyError.
tokens = [dataset.word_vocab.get(w, 0) for w in words]
chars = [torch.tensor([dataset.char_vocab.get(c, 0) for c in w]).unsqueeze(0).to(device) for w in words]

# Labels in id order: label ids were assigned 1, 2, ... in insertion order.
target_labels = list(dataset.target_vocab.keys())

for mdl in models:
    model = mdl['model']
    print("predict with ", mdl['name'])
    model.eval()  # disable dropout for inference
    start = datetime.datetime.now()
    with torch.no_grad():
        predict = model(torch.tensor(tokens).unsqueeze(0).to(device), chars) # 1 x T x N_classes
        # squeeze(0) removes only the batch axis, so a one-word phrase still
        # yields a list of labels.
        labels = torch.argmax(predict, dim=-1).squeeze(0).cpu().tolist()
    end = datetime.datetime.now() - start

    # Id 0 is the padding class and has no label; without this guard it would
    # be reported as the last label via index -1.
    print([target_labels[l - 1] if l > 0 else '<pad>' for l in labels])
    print("Predicted in", end, '\n')

NameError: ignored