In [1]:
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim

from random import random, sample
from typing import List
from collections import Counter
from itertools import chain
from functools import reduce
from tqdm.auto import tqdm
from sklearn import model_selection
from torch.utils.data import DataLoader, TensorDataset

In [2]:
lemm_texts = []
orig_texts = []

In [3]:
with open('lemmatized_texts.txt', 'r', encoding='utf-8') as f:
    lemm_texts = f.read().split('\n')

In [4]:
with open('original_texts.txt', 'r', encoding='utf-8') as f:
    orig_texts = f.read().lower().split('\n')

In [5]:
df = pd.DataFrame(data=zip(lemm_texts, orig_texts), columns=['lemm_texts', 'orig_texts'])

In [6]:
df

Unnamed: 0,lemm_texts,orig_texts
0,я предлагать оригинальный подарок для малыш!,я предлагаю оригинальный подарок для малыша!
1,я обезательный перезвонить в любой случай.,я обезательно перезвоню в любом случае.
2,цена на память я не помнить.,цены на память я не помню.
3,"я не помнить , где находиться.","я не помню, где находились."
4,я работать на высококачественный американский ...,я работаю на высококачественных американских м...
...,...,...
360274,зелёный ящерица застылый на мраморный ступень.,зеленая ящерица застыла на мраморной ступени.
360275,больший ящерица шмыгнуть по песок.,большая ящерица шмыгнула по песку.
360276,домашний ящерица быстро пробежать вдоль штора.,домашняя ящерица быстро пробежала вдоль штор.
360277,крошечный ящерка сбежать с валун.,крошечная ящерка сбежала с валуна.


In [7]:
class Vocab:
    def __init__(self, tokens: List[str], unk_idx: int):
        self._tokens = tokens
        self._token_to_idx = {token: idx for idx, token in enumerate(tokens)}
        self._unk_idx = unk_idx
        
    def token_to_idx(self, token: str) -> int:
        return self._token_to_idx.get(token, self._unk_idx)
    
    def idx_to_token(self, idx: int) -> str:
        return self._tokens[idx]

In [8]:
class TextTransformer:
    def __init__(self, vocab_size: int):
        self.vocab = None
        self.vocab_size = vocab_size
        self.special_tokens_to_idx = {'<UNK>': 0, '<PAD>': 1, '<SOS>': 2, '<EOS>': 3}
        self._tokenizer = nltk.tokenize.wordpunct_tokenize
    
    def tokenize(self, text) -> List[str]:
        return self._tokenizer(text.lower())
    
    def build_vocab(self, tokens: List[str]):
        tokens_ = [special_token for special_token in self.special_tokens_to_idx.keys()]
        special_tokens_amount = len(self.special_tokens_to_idx)
        
        for token, _ in Counter(tokens).most_common(self.vocab_size - special_tokens_amount):
            tokens_.append(token)
        
        unk_idx = self.special_tokens_to_idx.get('<UNK>')
        self.vocab = Vocab(tokens_, unk_idx)
        
    def transform_text(self, text: str) -> List[int]:
        tokenized_text = self.tokenize(text)
        transformed = [self.vocab.token_to_idx(token) for token in tokenized_text]
        return transformed
    
    def fit_transform(self, texts: List[str]) -> None:
        transformed_texts = []
        
        tokenized_texts = [self.tokenize(text) for text in tqdm(texts, 'Tokenizing texts')]
        tokens = chain(*tokenized_texts)
        self.build_vocab(tokens)
        
        for tokenized_text in tqdm(tokenized_texts, 'Transforming texts'):
            transformed = [self.vocab.token_to_idx(token) for token in tokenized_text]
            transformed_texts.append(transformed)
    
    def transform_texts(self, texts: List[str]) -> List[List[int]]:
        transformed_texts = [transform_text(text) for text in tqdm(texts, 'Transforming texts')]
        return transformed_texts
    
    def text_to_tensor(self, text: str, max_seq_len=10) -> torch.tensor:
        transformed_text = self.transform_text(text)
        pad_idx = self.special_tokens_to_idx.get('<PAD>')
        sos_idx = self.special_tokens_to_idx.get('<SOS>')
        eos_idx = self.special_tokens_to_idx.get('<EOS>')
        
        if len(transformed_text) >= max_seq_len:
            transformed_text = transformed_text[:max_seq_len]
        else:
            pad_size = max_seq_len - len(transformed_text)
            transformed_text.extend([pad_idx] * pad_size)   
        transformed_text.insert(0, sos_idx)
        transformed_text.append(eos_idx)
        
        tensor = torch.tensor(transformed_text, dtype=torch.long)
        return tensor.unsqueeze(0)
    
    def texts_to_tensor(self, texts: List[str], max_seq_len=10) -> torch.tensor:
        pad_idx = self.special_tokens_to_idx.get('<PAD>')
        sos_idx = self.special_tokens_to_idx.get('<SOS>')
        eos_idx = self.special_tokens_to_idx.get('<EOS>')
        transformed_texts = []
        
        for text in tqdm(texts, 'Building tensor'):
            transformed_text = self.transform_text(text)
            if len(transformed_text) >= max_seq_len:
                transformed_text = transformed_text[:max_seq_len]
            else:
                pad_size = max_seq_len - len(transformed_text)
                transformed_text.extend([pad_idx] * pad_size)   
            transformed_text.insert(0, sos_idx)
            transformed_text.append(eos_idx)
            transformed_texts.append(transformed_text)
        
        tensor = torch.tensor(transformed_texts, dtype=torch.long)
        return tensor

In [9]:
vocab_size = 35000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
train_df, test_df = model_selection.train_test_split(df, test_size=0.1)

In [11]:
test_df, val_df = model_selection.train_test_split(test_df, test_size=0.5)

In [12]:
lemm_text_transformer = TextTransformer(vocab_size)

In [13]:
lemm_text_transformer.fit_transform(train_df.lemm_texts)

Tokenizing texts:   0%|          | 0/324251 [00:00<?, ?it/s]

Transforming texts:   0%|          | 0/324251 [00:00<?, ?it/s]

In [14]:
train_lemm_tensor = lemm_text_transformer.texts_to_tensor(list(train_df.lemm_texts))
test_lemm_tensor = lemm_text_transformer.texts_to_tensor(list(test_df.lemm_texts))
val_lemm_tensor = lemm_text_transformer.texts_to_tensor(list(val_df.lemm_texts))

Building tensor:   0%|          | 0/324251 [00:00<?, ?it/s]

Building tensor:   0%|          | 0/18014 [00:00<?, ?it/s]

Building tensor:   0%|          | 0/18014 [00:00<?, ?it/s]

In [15]:
orig_text_transformer = TextTransformer(vocab_size)

In [16]:
orig_text_transformer.fit_transform(train_df.orig_texts)

Tokenizing texts:   0%|          | 0/324251 [00:00<?, ?it/s]

Transforming texts:   0%|          | 0/324251 [00:00<?, ?it/s]

In [17]:
train_orig_tensor = orig_text_transformer.texts_to_tensor(list(train_df.orig_texts))
test_orig_tensor = orig_text_transformer.texts_to_tensor(list(test_df.orig_texts))
val_orig_tensor = orig_text_transformer.texts_to_tensor(list(val_df.orig_texts))

Building tensor:   0%|          | 0/324251 [00:00<?, ?it/s]

Building tensor:   0%|          | 0/18014 [00:00<?, ?it/s]

Building tensor:   0%|          | 0/18014 [00:00<?, ?it/s]

In [18]:
# train_lemm_tensor = torch.transpose(train_lemm_tensor, 1, 0)
# test_lemm_tensor = torch.transpose(test_lemm_tensor, 1, 0)
# val_lemm_tensor = torch.transpose(val_lemm_tensor, 1, 0)

In [19]:
# train_orig_tensor = torch.transpose(train_orig_tensor, 1, 0)
# test_orig_tensor = torch.transpose(test_orig_tensor, 1, 0)
# val_orig_tensor = torch.transpose(val_orig_tensor, 1, 0)

In [20]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int, hidden_size: int, pad_idx: int):
        super(EncoderRNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_size, pad_idx)
        self.rnn = nn.LSTM(embedding_size, hidden_size)
        
    def forward(self, x, hidden, cell):
        # x_shape: (seq_len, batch_size)
        embedding = self.embedding(x)
        
        # embedding_shape: (seq_len, batch_size, embedding_size)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # output_shape: (seq_len, batch_size, hidden_size)
        # hidden_shape: 2 tensors of (1, batch_size, hidden_size)
        return output, (hidden, cell)
    
    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [21]:
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int, hidden_size: int, output_size: int, pad_idx):
        super(DecoderRNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_size, pad_idx)
        self.rnn = nn.LSTM(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        embedding = self.embedding(x)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        output = self.fc(output)
        output = output.squeeze(0)
        output = self.softmax(output)
        return output, (hidden, cell)
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [22]:
class Seq2SeqModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, pad_idx):
        super(Seq2SeqModel, self).__init__()
        self.vocab_size = vocab_size
        self.encoder = EncoderRNN(vocab_size, embedding_size, hidden_size, pad_idx).to(device)
        self.decoder = DecoderRNN(vocab_size, embedding_size, hidden_size, output_size, pad_idx).to(device)
        
    def forward(self, input_tensor, target_tensor, hidden, cell, teacher_force_ratio=0.5):
        input_tensor = torch.transpose(input_tensor, 1, 0)
        target_tensor = torch.transpose(target_tensor, 1, 0)
        # input_tensor_shape: (seq_len, batch_size)
        # target_tensor_shape: (seq_len, batch_size)
        batch_size = input_tensor.shape[1]
        seq_len = target_tensor.shape[0]
        
        outputs = torch.zeros(seq_len, batch_size, self.vocab_size, device=device)
        
#         hidden, cell = self.encoder(input_tensor, hidden)
        _, (hidden, cell) = self.encoder(input_tensor, hidden, cell)
        # hidden_shape: 2 tensors of (1, batch_size, hidden_size)
        
        x = target_tensor[0]
        
        for t in range(1, seq_len):
            output, (hidden, cell) = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_prediction = output.argmax(1)
            x = target_tensor[t] if random() < teacher_force_ratio else best_prediction
        
        return outputs

In [23]:
lr = 0.0001
batch_size = 64
epochs = 50
vocab_size = 35000
hidden_size = 1024
embedding_size = 300
output_size = vocab_size
pad_idx = lemm_text_transformer.special_tokens_to_idx.get('<PAD>')

In [24]:
model = Seq2SeqModel(vocab_size, embedding_size, hidden_size, output_size, pad_idx).to(device)

In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [26]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [27]:
train_dataset = TensorDataset(train_orig_tensor, train_lemm_tensor)
test_dataset = TensorDataset(test_orig_tensor, test_lemm_tensor)
val_dataset = TensorDataset(val_orig_tensor, val_lemm_tensor)

In [28]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [29]:
def decode_lemm(inp):
    idx_to_token = lemm_text_transformer.vocab.idx_to_token
    tokens = map(lambda idx: idx_to_token(idx), inp)
    return reduce(lambda lhs, rhs: lhs + ' ' + rhs, tokens)

In [30]:
def decode_orig(inp):
    idx_to_token = orig_text_transformer.vocab.idx_to_token
    tokens = map(lambda idx: idx_to_token(idx), inp)
    return reduce(lambda lhs, rhs: lhs + ' ' + rhs, tokens)

In [31]:
def train(model, data_loader, val_loader, optimizer, criterion, epochs, lr, batch_size):
    for epoch in range(epochs):
        print(f'Epoch [{epoch + 1}/{epochs}]')
        hidden = model.encoder.initHidden(batch_size)
        cell = model.encoder.initHidden(batch_size)
        
        for iteration, (target_data, input_data) in enumerate(data_loader):
            target_data = target_data.to(device)
            input_data = input_data.to(device)
            try:
                output = model(input_data, target_data, hidden, cell)
            except:
                break
            target_data = torch.transpose(target_data, 0, 1)
            output = output[1:].reshape(-1, output.shape[2])
            target_data = target_data[1:].reshape(-1)
            optimizer.zero_grad()
            loss = criterion(output, target_data)
            training_loss = loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
            
            if iteration % 2500 == 0:
                print(f'\tIteration #{iteration}: training_loss = {training_loss}')
            if (iteration + 1) % 5000 == 0:
                with torch.no_grad():
                    val_hidden = model.encoder.initHidden(batch_size)
                    val_cell = model.encoder.initHidden(batch_size)
                    val_loss_list = []
                    for val_iteration, (val_target_data, val_input_data) in enumerate(val_loader):
                        val_target_data = val_target_data.to(device)
                        val_input_data = val_input_data.to(device)
                        try:
                            val_output = model(val_input_data, val_target_data, val_hidden, val_cell)
                        except:
                            break
                        val_target_data = torch.transpose(val_target_data, 0, 1)
                        val_output = val_output[1:].reshape(-1, val_output.shape[2])
                        val_target_data = val_target_data[1:].reshape(-1)
                        val_loss = criterion(val_output, val_target_data)
                        val_loss_list.append(val_loss.item())
                    print(f"Validation loss: {sum(val_loss_list) / len(val_loss_list)}")
                    
                    first_x = sample(list(iter(val_loader)), 1)[0]
                    print(f"Input: {decode_lemm(first_x[1][0])}")
                    print(f"Target: {decode_orig(first_x[0][0])}")
                    val_hidden = model.encoder.initHidden(batch_size)
                    val_cell = model.encoder.initHidden(batch_size)
                    out = model(first_x[1].to(device), first_x[0].to(device), val_hidden, val_cell, 0.5)
                    out = out.argmax(dim=2)
                    out = torch.transpose(out, 0, 1)
                    print(f"Output: {decode_orig(out[0])}")

In [32]:
# train(model, train_loader, val_loader, optimizer, criterion, epochs, lr, batch_size)

Epoch [1/50]
	Iteration #0: training_loss = 10.470785140991211
	Iteration #2500: training_loss = 2.679882764816284
Validation loss: 1.8701063526058537
Input: <SOS> он пошевелиться на сидение . <PAD> <PAD> <PAD> <PAD> <PAD> <EOS>
Target: <SOS> он пошевелился на сиденье . <PAD> <PAD> <PAD> <PAD> <PAD> <EOS>
Output: <UNK> он <UNK> на сиденье . . . <EOS> <EOS> <EOS> <EOS>
	Iteration #5000: training_loss = 1.9530543088912964
Epoch [2/50]
	Iteration #0: training_loss = 2.148900270462036
	Iteration #2500: training_loss = 1.3579150438308716
Validation loss: 1.0511523572151347
Input: <SOS> форд помахать в ответ . <PAD> <PAD> <PAD> <PAD> <PAD> <EOS>
Target: <SOS> форд помахал в ответ . <PAD> <PAD> <PAD> <PAD> <PAD> <EOS>
Output: <UNK> <UNK> помахал в ответ . . <EOS> <EOS> <EOS> <EOS> <EOS>
	Iteration #5000: training_loss = 1.0579689741134644
Epoch [3/50]
	Iteration #0: training_loss = 1.197873830795288
	Iteration #2500: training_loss = 0.6647981405258179
Validation loss: 0.6703477211270044
Input

KeyboardInterrupt: 

In [None]:
import gc
del model
del optimizer
gc.collect()
torch.cuda.empty_cache()
gc.collect()

In [34]:
def evaluate(model, sentence):
    pass