In [19]:
import torch
import torch.nn as nn
import time
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import f1_score

In [20]:
data_dir = 'C:/Users/glukh/JupyterFiles/SBER_школа_DS/lesson 18/'
train_lang = 'en'

In [21]:
def collate_fn(input_data):
    data = []
    chars = []
    targets = []
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(len(input_data))] for _ in range(max_len)]
    for j in range(len(input_data)):
        for i in range(max_len):
            if len(chars[j]) > i:
                chars_seq[i][j] = torch.as_tensor(chars[j][i])
    for j in range(max_len):
        chars_seq[j] = pad_sequence(chars_seq[j], batch_first=True, padding_value=0)
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}

In [22]:
class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):

        with open('en.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]

        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], #  (1)
        }

In [23]:
batch_size = 100
dataset = DatasetSeq(data_dir)
data_train, data_test = train_test_split(dataset, test_size=0.2, random_state=42)
train_loader = DataLoader(
        dataset=data_train,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
test_loader = DataLoader(
        dataset=data_test,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
data_loaders = {'train' : train_loader, 'test' : test_loader}

In [24]:
#hyper params
vocab_len = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [25]:
class CharModel(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 ):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, emb_dim)

        self.hidden_dim = hidden_dim
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, x): # B x T
        x = self.char_emb(x)  # B x T x V
        _, out = self.rnn(x)

        return out # B x 1 x V

In [26]:
class GRU_predictor_Chars(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)

        self.hidden_dim = hidden_dim
        self.rnn = nn.GRU(input_size=emb_dim+char_emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.char_model = CharModel(n_chars, char_emb_dim, char_hidden_dim)

        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x, char_seq): # B x T
        x = self.word_emb(x)  # B(размер батча) x T(длина предложения после паддинга) x V(размерность эмбединга)
        chars = [self.char_model(item.to(x.device)).squeeze().unsqueeze(1) for item in char_seq] # T x 1
        chars = torch.cat(chars, dim=1)
        rnn_out, _ = self.rnn(torch.cat([x, chars], dim=-1))
        out = self.classifier(rnn_out)
        return out

In [27]:
class RNN_predictor_Chars(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)

        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(input_size=emb_dim+char_emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.char_model = CharModel(n_chars, char_emb_dim, char_hidden_dim)

        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x, char_seq): # B x T
        x = self.word_emb(x)  # B(размер батча) x T(длина предложения после паддинга) x V(размерность эмбединга)
        chars = [self.char_model(item.to(x.device)).squeeze().unsqueeze(1) for item in char_seq] # T x 1
        chars = torch.cat(chars, dim=1)
        rnn_out, _ = self.rnn(torch.cat([x, chars], dim=-1))
        out = self.classifier(rnn_out)
        return out

In [28]:
class LSTM_predictor_Chars(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)

        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_size=emb_dim+char_emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.char_model = CharModel(n_chars, char_emb_dim, char_hidden_dim)

        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x, char_seq): # B x T
        x = self.word_emb(x)  # B(размер батча) x T(длина предложения после паддинга) x V(размерность эмбединга)
        chars = [self.char_model(item.to(x.device)).squeeze().unsqueeze(1) for item in char_seq] # T x 1
        chars = torch.cat(chars, dim=1)
        rnn_out, _ = self.rnn(torch.cat([x, chars], dim=-1))
        out = self.classifier(rnn_out)
        return out

In [29]:
class LSTMBD_predictor_Chars(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)

        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_size=emb_dim+char_emb_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.char_model = CharModel(n_chars, char_emb_dim, char_hidden_dim)

        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x, char_seq): # B x T
        x = self.word_emb(x)  # B(размер батча) x T(длина предложения после паддинга) x V(размерность эмбединга)
        chars = [self.char_model(item.to(x.device)).squeeze().unsqueeze(1) for item in char_seq] # T x 1
        chars = torch.cat(chars, dim=1)
        rnn_out, _ = self.rnn(torch.cat([x, chars], dim=-1))
        out = self.classifier(rnn_out)
        return out

In [30]:
def train_eval_loop(model, optim, loss_func, num_epoches, train_loader, test_loader):
    
    
    accum_train_time = 0
    accum_test_time = 0
    for epoch in range(num_epoches):
        for k, loader in data_loaders.items():
            
            accum_f1 = 0
            accum_loss = 0
            steps = 0
            
            if k == 'train':
                start_train_time = time.time()
                model.train()
                
                for step, batch in enumerate(loader):
                    optim.zero_grad()
                    data = batch['data'].to(device)  # B x T
                    pred = model(data, batch['chars'])
                    loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1).to(device))
                    loss.backward()
                    optim.step()
                    accum_f1 += f1_score(batch['target'].view(-1), torch.max(pred, -1)[1].view(-1).detach().cpu(), average='macro')
                    steps += 1
                accum_train_time += round((time.time() - start_train_time), 2)
                f1 = accum_f1 / steps
                print(f'Loader: {k}, Эпоха: {epoch}, loss: {loss}, f1_score: {f1},' 
                  f'Время обучения: {accum_train_time}')

            else:
                start_eval_time = time.time()
                for step, batch in enumerate(loader):
                    model.eval()
                    with torch.no_grad():
                        data = batch['data'].to(device)
                        pred = model(data, batch['chars'])
                        loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1).to(device))
                        accum_f1 += f1_score(batch['target'].view(-1), torch.max(pred, -1)[1].view(-1).detach().cpu(), average='macro') 
                        steps += 1
                accum_test_time += round((time.time() - start_eval_time), 2)
                f1 = accum_f1 / steps
                print(f'Loader: {k}, Эпоха: {epoch}, loss: {loss}, f1_score: {f1},' 
                  f'Время предсказания {accum_test_time}')

### GRU

In [31]:
gru_model = GRU_predictor_Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 32)
gru_model.train()
gru_model = gru_model.to(device)
gru_optim = torch.optim.Adam(gru_model.parameters(), lr=0.001)
gru_loss_func = nn.CrossEntropyLoss()

In [32]:
train_eval_loop(gru_model, gru_optim, gru_loss_func, 4, train_loader, test_loader)

Loader: train, Эпоха: 0, loss: 0.20330633223056793, f1_score: 0.4525185900856127,Время обучения: 118.06
Loader: test, Эпоха: 0, loss: 0.18548984825611115, f1_score: 0.6405795238393539,Время предсказания 9.39
Loader: train, Эпоха: 1, loss: 0.14182676374912262, f1_score: 0.7168275362167543,Время обучения: 235.35000000000002
Loader: test, Эпоха: 1, loss: 0.09784117341041565, f1_score: 0.7635074337201269,Время предсказания 18.69
Loader: train, Эпоха: 2, loss: 0.06826258450746536, f1_score: 0.794123342628257,Время обучения: 350.86
Loader: test, Эпоха: 2, loss: 0.07977218925952911, f1_score: 0.7925495490144521,Время предсказания 28.44
Loader: train, Эпоха: 3, loss: 0.07260090112686157, f1_score: 0.8330550912896493,Время обучения: 465.84000000000003
Loader: test, Эпоха: 3, loss: 0.1072503924369812, f1_score: 0.8233202410819664,Время предсказания 37.84


### RNN

In [33]:
rnn_model = RNN_predictor_Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 32)
rnn_model.train()
rnn_model = rnn_model.to(device)
rnn_optim = torch.optim.Adam(rnn_model.parameters(), lr=0.001)
rnn_loss_func = nn.CrossEntropyLoss()

In [34]:
train_eval_loop(rnn_model, rnn_optim, rnn_loss_func, 4, train_loader, test_loader)

Loader: train, Эпоха: 0, loss: 0.18631786108016968, f1_score: 0.4613104146559024,Время обучения: 80.62
Loader: test, Эпоха: 0, loss: 0.18278342485427856, f1_score: 0.6117998573099241,Время предсказания 7.25
Loader: train, Эпоха: 1, loss: 0.13750745356082916, f1_score: 0.6985278272238395,Время обучения: 159.26
Loader: test, Эпоха: 1, loss: 0.1310531347990036, f1_score: 0.7504064275772906,Время предсказания 14.45
Loader: train, Эпоха: 2, loss: 0.09383402019739151, f1_score: 0.7880420735072994,Время обучения: 238.24
Loader: test, Эпоха: 2, loss: 0.1275968998670578, f1_score: 0.7926618603088808,Время предсказания 21.29
Loader: train, Эпоха: 3, loss: 0.06715215742588043, f1_score: 0.8255307682182854,Время обучения: 316.07
Loader: test, Эпоха: 3, loss: 0.0857565701007843, f1_score: 0.8181032595384776,Время предсказания 28.53


### LSTM

In [35]:
lstm_model = LSTM_predictor_Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 32)
lstm_model.train()
lstm_model = gru_model.to(device)
lstm_optim = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
lstm_loss_func = nn.CrossEntropyLoss()

In [36]:
train_eval_loop(lstm_model, lstm_optim, lstm_loss_func, 4, train_loader, test_loader)

Loader: train, Эпоха: 0, loss: 0.07668881863355637, f1_score: 0.8584962988414996,Время обучения: 117.38
Loader: test, Эпоха: 0, loss: 0.05088821053504944, f1_score: 0.84009405250518,Время предсказания 9.89
Loader: train, Эпоха: 1, loss: 0.03978373855352402, f1_score: 0.8783776469607943,Время обучения: 232.62
Loader: test, Эпоха: 1, loss: 0.045217856764793396, f1_score: 0.8491442898865617,Время предсказания 19.53
Loader: train, Эпоха: 2, loss: 0.05245556682348251, f1_score: 0.8970508809616826,Время обучения: 348.06
Loader: test, Эпоха: 2, loss: 0.07575657218694687, f1_score: 0.8558113227541445,Время предсказания 29.160000000000004
Loader: train, Эпоха: 3, loss: 0.030374962836503983, f1_score: 0.911372747450588,Время обучения: 462.5
Loader: test, Эпоха: 3, loss: 0.05595991387963295, f1_score: 0.8551675253439428,Время предсказания 39.150000000000006


### LSTM BD

In [37]:
lstmbd_model =LSTMBD_predictor_Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 32)
lstmbd_model.train()
lstmbd_model = gru_model.to(device)
lstmbd_optim = torch.optim.Adam(lstmbd_model.parameters(), lr=0.001)
lstmbd_loss_func = nn.CrossEntropyLoss()

In [38]:
train_eval_loop(lstm_model, lstm_optim, lstm_loss_func, 4, train_loader, test_loader)

Loader: train, Эпоха: 0, loss: 0.021105121821165085, f1_score: 0.9281630614722236,Время обучения: 116.0
Loader: test, Эпоха: 0, loss: 0.08570992946624756, f1_score: 0.8580649131113702,Время предсказания 9.47
Loader: train, Эпоха: 1, loss: 0.027329375967383385, f1_score: 0.9423272670407234,Время обучения: 231.19
Loader: test, Эпоха: 1, loss: 0.06862616539001465, f1_score: 0.8646411978670722,Время предсказания 19.36
Loader: train, Эпоха: 2, loss: 0.019916413351893425, f1_score: 0.9552020515687298,Время обучения: 346.67
Loader: test, Эпоха: 2, loss: 0.07314936071634293, f1_score: 0.8678114240852336,Время предсказания 29.259999999999998
Loader: train, Эпоха: 3, loss: 0.01105126366019249, f1_score: 0.9627370124732679,Время обучения: 462.87
Loader: test, Эпоха: 3, loss: 0.12195298820734024, f1_score: 0.8657059191300963,Время предсказания 38.98
