## Импорт необходимых библиотек

In [1]:
import re
import spacy
import csv
import torch
import nltk
import pandas as pd

from collections import Counter
from itertools import chain
from tqdm import tqdm
from typing import List
from sklearn import model_selection

## Подготовка данных

In [2]:
nlp = spacy.load('ru_core_news_lg')

In [3]:
data_path = "./data/some_data/processed_data/"

In [4]:
with open(data_path + "original_texts.txt", 'r', encoding='utf-8') as f:
    orig_texts = f.read().split('\n')

In [5]:
pipeline = nlp.pipe_names.copy()
pipeline

['tok2vec', 'morphologizer', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [6]:
pipeline.remove('parser')
pipeline.remove('tok2vec')
pipeline.remove('lemmatizer')
pipeline.remove('morphologizer')
pipeline

['ner', 'attribute_ruler']

In [7]:
tokenized_lemm_texts = []

In [8]:
for text in tqdm(nlp.pipe(orig_texts, disable=pipeline)):
    lemm_text = []
    for token in text:
        if token.dep_ == 'nsubj':
            lemm_text.append('<PH>')
        else:
            lemm_text.append(token.lemma_.lower())
    tokenized_lemm_texts.append(lemm_text)

360279it [09:51, 608.84it/s]


In [9]:
lemm_texts = []

In [10]:
for text in tqdm(tokenized_lemm_texts):
    assembled = ""
    for token in text:
        assembled += token + ' '
    assembled = re.sub(r'\s([?.!"](?:\s|$))', r'\1', assembled).rstrip()
    lemm_texts.append(assembled)

100%|██████████████████████████████████████████████████████████████████████| 360279/360279 [00:02<00:00, 172355.88it/s]


### Построение csv-файла с необходимыми признаками

In [11]:
pipeline = nlp.pipe_names.copy()
pipeline

['tok2vec', 'morphologizer', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [12]:
pipeline.remove('parser')
pipeline.remove('tok2vec')
pipeline.remove('morphologizer')
pipeline

['ner', 'attribute_ruler', 'lemmatizer']

In [13]:
dataset = []

for i, text in tqdm(enumerate((nlp.pipe(orig_texts, disable=pipeline)))):
    for token in text:
        if token.dep_ == 'nsubj':
            dataset.append({
                'orig_texts': orig_texts[i].lower(),
                'lemm_texts': lemm_texts[i].lower(),
                'nsubj': token.text.lower(),
            })
            break

360279it [03:11, 1883.13it/s]


In [14]:
dataset[:5]

[{'orig_texts': 'я предлагаю оригинальный подарок для малыша!',
  'lemm_texts': '<ph> предлагать оригинальный подарок для малыш!',
  'nsubj': 'я'},
 {'orig_texts': 'я обезательно перезвоню в любом случае.',
  'lemm_texts': '<ph> обезательно перезвонить в любой случай.',
  'nsubj': 'я'},
 {'orig_texts': 'цены на память я не помню.',
  'lemm_texts': 'цена на память <ph> не помнить.',
  'nsubj': 'я'},
 {'orig_texts': 'я не помню, где находились.',
  'lemm_texts': '<ph> не помнить , где находиться.',
  'nsubj': 'я'},
 {'orig_texts': 'я работаю на высококачественных американских материалах.',
  'lemm_texts': '<ph> работать на высококачественный американский материал.',
  'nsubj': 'я'}]

In [15]:
with open(data_path + 'dataset.csv', 'w', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerow(['orig_texts', 'lemm_texts', 'nsubj'])
    for item in tqdm(dataset):
        csv_writer.writerow(item.values())

100%|██████████████████████████████████████████████████████████████████████| 358502/358502 [00:01<00:00, 323956.07it/s]


In [3]:
class InputEncoderFeatures:
    def __init__(self, tokens_idx: List[int], nsubj_idx: int):
        self.tokens_idx = tokens_idx
        self.nsubj_idx = nsubj_idx

In [4]:
class Vocab:
    def __init__(self, tokens: List[str], unk_idx: int):
        self._tokens = tokens
        self.token_to_idx = {token: i for i, token in enumerate(tokens)}
        self._unk_idx = unk_idx
        
    def __len__(self):
        return len(self._tokens)
    
    def word_to_idx(self, word):
        return self.token_to_idx.get(word, self._unk_idx)
    
    def idx_to_word(self, idx):
        return self._tokens[idx]

In [5]:
class TextTransformer:
    def __init__(self, vocab_size):
        self.vocab = None
        self.vocab_size = vocab_size
        self.special_words_to_idx = {'<PH>': 0, '<UNK>': 1, '<EOS>': 2, '<SOS>': 3, '<PAD>': 4}
        self._tokenizer = nltk.tokenize.wordpunct_tokenize
        
    def tokenize(self, text):
        return self._tokenizer(text.lower())

    def build_vocab(self, tokens):
        inp_tokens = [special_word for special_word in self.special_words_to_idx.keys()]
        for token, _ in Counter(tokens).most_common(self.vocab_size - len(self.special_words_to_idx)):
            inp_tokens.append(token)
        
        self.vocab = Vocab(inp_tokens, self.special_words_to_idx['<UNK>'])
        
    def transform_single_text(self, text):
        tokens = self.tokenize(text)
        idxs = [self.vocab.word_to_idx(token) for token in tokens]
        return idxs
        
    def transform(self, texts):
        result = []
        for text in texts:
            result.append(self.transform_single_text(text))
        return result
    
    def get_vocab(self):
        return self.vocab.token_to_idx
    
    def fit_transform(self, texts):
        result = []
        tokenized_texts = [self.tokenize(text) for text in texts]
        self.build_vocab(chain(*tokenized_texts))
        for tokens in tokenized_texts:
            idxs = [self.vocab.word_to_idx(token) for token in tokens]
            result.append(idxs)
        return result

In [38]:
def build_encoder_features(token_idxs: List[int], nsubj_idx: int, special_idxs: dict, max_seq_len=10):
    inp_idxs = token_idxs.copy()
    pad_idx = special_idxs['<PAD>']
      
    if len(inp_idxs) >= max_seq_len:
        inp_idxs = inp_idxs[:max_seq_len]
    else:
        pad_completion_size = abs(max_seq_len - len(token_idxs))
        inp_idxs = inp_idxs + [pad_idx for i in range(pad_completion_size)]
    
    ph_idx = special_idxs['<PH>']
    sos_idx = special_idxs['<SOS>']
    eos_idx = special_idxs['<EOS>']
    
    inp_idxs.insert(0, ph_idx)
    inp_idxs.insert(0, sos_idx)
    inp_idxs.append(eos_idx)
    
    return InputEncoderFeatures(inp_idxs, nsubj_idx)

In [7]:
def features_to_tensor(features, for_encoder=False):
    if for_encoder:
        text_tensor = torch.tensor([feature.tokens_idx for feature in features], dtype=torch.long)
        nsubj_tensor = torch.tensor([feature.nsubj_idx for feature in features], dtype=torch.long)
        return text_tensor, nsubj_tensor
    else:
        text_tensor = torch.tensor([feature for feature in features], dtype=torch.long)
        return text_tensor

In [8]:
data_path = "./data/some_data/processed_data/dataset.csv"

In [9]:
df = pd.read_csv(data_path)

In [10]:
df.lemm_texts.str.replace('<ph>', '<PH>')

0           <PH> предлагать оригинальный подарок для малыш!
1              <PH> обезательно перезвонить в любой случай.
2                           цена на память <PH> не помнить.
3                         <PH> не помнить , где находиться.
4         <PH> работать на высококачественный американск...
                                ...                        
358497       другая <PH> медленно подбрести к свой товарка.
358498         зелёный <PH> застынуть на мраморный ступень.
358499                      большой <PH> шмыгнуть по песок.
358500          домашний <PH> быстро пробежать вдоль штора.
358501                      крошечный <PH> сбежать с валун.
Name: lemm_texts, Length: 358502, dtype: object

In [11]:
df['lemm_texts'] = df.lemm_texts.str.replace('<ph>', '<PH>')

In [12]:
df = df[df.lemm_texts.str.startswith('<PH>')]

In [13]:
df['lemm_texts'] = df.lemm_texts.str.replace('<PH> <PH>', '<PH>')

In [14]:
sum(df.lemm_texts.str.startswith('<PH> <PH>'))

2340

In [15]:
len(df)

336607

In [16]:
df[df.lemm_texts.str.find('h') != -1]

Unnamed: 0,orig_texts,lemm_texts,nsubj
245060,мы пошли по улочке hanoman,<PH> пойти по улочка hanoman,мы


In [17]:
train_df, test_df = model_selection.train_test_split(df, test_size=0.1)

In [18]:
test_df, val_df = model_selection.train_test_split(test_df, test_size=0.5)

In [48]:
text_to_id = TextTransformer(vocab_size=15000)
nsubj_to_id = TextTransformer(vocab_size=15000)

train_idx = text_to_id.fit_transform([token[4:] for token in train_df['lemm_texts']])
val_idx = text_to_id.transform(val_df['lemm_texts'])
test_idx = text_to_id.transform(test_df['lemm_texts'])

nsubj_train_idx = nsubj_to_id.fit_transform([token for token in train_df['nsubj']])
nsubj_val_idx = nsubj_to_id.transform(val_df['nsubj'])
nsubj_test_idx = nsubj_to_id.transform(test_df['nsubj'])

nsubj_train_to_id = {nsubj: i for i, nsubj in enumerate(train_df['nsubj'])}
nsubj_val_to_id = {nsubj: i for i, nsubj in enumerate(val_df['nsubj'])}
nsubj_test_to_id = {nsubj: i for i, nsubj in enumerate(test_df['nsubj'])}

In [21]:
len(train_df)

302946

In [59]:
train_features = [build_encoder_features(tokens_idx, nsubj_idx, special_idxs=text_to_id.special_words_to_idx) 
                 for tokens_idx, nsubj_idx in zip(train_idx, chain(*nsubj_train_idx))]

val_features = [build_encoder_features(tokens_idx, nsubj_idx, special_idxs=text_to_id.special_words_to_idx) 
                 for tokens_idx, nsubj_idx in zip(val_idx, chain(*nsubj_val_idx))]

test_features = [build_encoder_features(tokens_idx, nsubj_idx, special_idxs=text_to_id.special_words_to_idx) 
                 for tokens_idx, nsubj_idx in zip(val_idx, chain(*nsubj_test_idx))]

In [60]:
len(train_features)

302946

In [61]:
for example in train_features[:10]:
    print(example.__dict__)

{'tokens_idx': [3, 0, 664, 74, 7, 1548, 5, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 9}
{'tokens_idx': [3, 0, 14, 7, 57, 5, 4, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 1633}
{'tokens_idx': [3, 0, 2799, 105, 6, 665, 5, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 9}
{'tokens_idx': [3, 0, 67, 23, 16, 36, 5, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 9}
{'tokens_idx': [3, 0, 1515, 17, 7506, 5, 4, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 170}
{'tokens_idx': [3, 0, 2968, 9, 217, 5, 4, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 7}
{'tokens_idx': [3, 0, 285, 58, 5768, 5, 4, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 6}
{'tokens_idx': [3, 0, 76, 17, 15, 489, 5, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 5}
{'tokens_idx': [3, 0, 7053, 3032, 35, 850, 141, 5, 4, 4, 4, 4, 2], 'nsubj_idx': 7}
{'tokens_idx': [3, 0, 1195, 10, 150, 5, 4, 4, 4, 4, 4, 4, 2], 'nsubj_idx': 282}


In [62]:
train_text_tensor, train_nsubj_tensor = features_to_tensor(train_features, for_encoder=True)
val_text_tensor, val_nsubj_tensor = features_to_tensor(val_features, for_encoder=True)
test_text_tensor, test_nsubj_tensor = features_to_tensor(test_features, for_encoder=True)

In [63]:
print(train_text_tensor)

tensor([[   3,    0,  664,  ...,    4,    4,    2],
        [   3,    0,   14,  ...,    4,    4,    2],
        [   3,    0, 2799,  ...,    4,    4,    2],
        ...,
        [   3,    0,  365,  ...,    4,    4,    2],
        [   3,    0,   45,  ...,    4,    4,    2],
        [   3,    0, 7649,  ...,    4,    4,    2]])


In [64]:
print(train_nsubj_tensor)

tensor([   9, 1633,    9,  ...,    5,    5, 1425])


In [65]:
train_text_tensor.shape

torch.Size([302946, 13])

In [30]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size,  hidden_size)
        self.lstm = nn.LSTM(input_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [31]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [32]:
encoder = EncoderRNN()
decoder = DecoderRNN()
teacher_forcing_ratio = 0.5


TypeError: __init__() missing 2 required positional arguments: 'input_size' and 'hidden_size'

In [None]:
def train(lemm_tensor, orig_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    