### Задание
1. Скачать [датасет](https://www.manythings.org/anki/rus-eng.zip) англо-русскую пару фраз
2. Обучим seq2seq, оценим качество
3. Добавим +1 рекуррентный слой в encoder и decoder
4. Обучим модель LSTM и оценим качество модели 


### Загрузим данные

In [1]:
%matplotlib inline

In [2]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
!wget https://www.manythings.org/anki/rus-eng.zip

--2025-02-09 17:52:49--  https://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16305013 (16M) [application/zip]
Saving to: 'rus-eng.zip'

     0K .......... .......... .......... .......... ..........  0%  157K 1m41s
    50K .......... .......... .......... .......... ..........  0%  313K 76s
   100K .......... .......... .......... .......... ..........  0% 12,7M 51s
   150K .......... .......... .......... .......... ..........  1%  314K 50s
   200K .......... .......... .......... .......... ..........  1%  314K 50s
   250K .......... .......... .......... .......... ..........  1% 13,5M 42s
   300K .......... .......... .......... .......... ..........  2%  309K 43s
   350K .......... .......... .......... .......... ..........  2%  316K 44s
   400K .......... .......... .......... ..

In [21]:
import os

os.rename('rus.txt', 'eng-rus.txt')

In [7]:
import zipfile

with zipfile.ZipFile(r"D:\Progect\RNN\rus-eng.zip", 'r') as zip_ref:
    zip_ref.extractall("D:\Progect\RNN")       

In [23]:
with open(r"D:\Progect\RNN\eng-rus.txt", 'r') as file:
    lines = file.readlines()
    for line in lines[-10:]:  # Shows last 10 lines
        print(line.strip())

We need to uphold laws against discrimination — in hiring, and in housing, and in education, and in the criminal justice system. That is what our Constitution and our highest ideals require.	Нам нужно отстаивать законы против дискриминации при найме на работу, в жилищной сфере, в сфере образования и правоохранительной системе. Этого требуют наша Конституция и высшие идеалы.	CC-BY 2.0 (France) Attribution: tatoeba.org #5762728 (BHO) & #6390439 (odexed)
I've heard that you should never date anyone who is less than half your age plus seven. Tom is now 30 years old and Mary is 17. How many years will Tom need to wait until he can start dating Mary?	Я слышал, что никогда не следует встречаться с кем-то вдвое младше вас плюс семь лет. Тому 30 лет, a Мэри 17. Сколько лет Тому нужно ждать до тех пор, пока он сможет начать встречаться с Мэри?	CC-BY 2.0 (France) Attribution: tatoeba.org #10068197 (CK) & #10644473 (notenoughsun)
I do have one final ask of you as your president, the same thing I a

### Выполним предварительную обработку данных

In [9]:
SOS_token = 0   # Start of sequence
EOS_token = 1   # End of sequence

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [24]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^а-яА-Я.!?]+", r" ", s)
    return s

In [25]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [28]:
# максимальное количество слов в предложении
MAX_LENGTH = 10

# Русские префиксы для предложений
RUSSIAN_PREFIXES = (
    "я ", "мы ",
    "он ", "она ", "оно ",
    "ты ", "вы ",
    "они ",
    "это ", "эти ",
    "тот ", "та ", "те ",
    "сейчас ", "теперь ",
    "вот ", "здесь ",
    "там ", "туда ",
    "сюда ", "отсюда "
)

def filter_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(RUSSIAN_PREFIXES)

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

In [29]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filter_pairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 496059 sentence pairs
Trimmed to 183398 sentence pairs
Counting words...
Counted words:
rus 4
eng 30241
[' . . ', 'я люблю кошек .', ' .']


### Архитектура Seq2Seq-модели

In [None]:
# Encoder
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_seq, hidden):
        embedded = self.embedding(input_seq).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# Decoder
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [33]:
# преобразуем предложения в список индексов и слов
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

# преобразуем предложение в тензор PyTorch
def tensor_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

# создадим тензоры для пары предложений
def tensors_from_pair(pair):
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [34]:
teacher_forcing_ratio = 0.5

# обучение модели
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.init_hidden()

     # Инициализация
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    # Кодирование входного предложения
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # декодирование
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Обучение с teacher forcing или без него
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

     # Обратное распространение ошибки и обновление моделей
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [35]:
import time
import math


def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [36]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

### Обучение модели

In [37]:
# обучение модели
def train_iterations(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Инициализация оптимизаторов
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Подготовка данных для обучения
    training_pairs = [tensors_from_pair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    # Цикл обучения
    for iter in range(1, n_iters + 1):
        # Получение пары для обучения
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        # Обучение на текущей паре
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        # Накопление потерь
        print_loss_total += loss
        plot_loss_total += loss

         # Печать статистики
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (time_since(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        # Обновление графика
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    show_plot(plot_losses)

In [38]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        # Подготовка входных данных
        input_tensor = tensor_from_sentence(input_lang, sentence)
        input_length = input_tensor.size()[0]

        # Инициализация кодировщика
        encoder_hidden = encoder.init_hidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        # Кодирование входного предложения
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        # Инициализация декодировщика
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []

        # Генерация перевода
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [39]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

train_iterations(encoder1, decoder1, 75000, print_every=5000)

11m 9s (- 156m 19s) (5000 6%) 4.7906
22m 24s (- 145m 40s) (10000 13%) 4.5216


### Оценка модели

In [None]:
evaluateRandomly(encoder1, decoder1)

> je m en sors .
= i m managing .
< i m managing . <EOS>

> vous etes plus intelligent que moi .
= you re smarter than me .
< you re smarter than me . <EOS>

> je suis a bout de souffle .
= i m short of breath .
< i m familiar of of . <EOS>

> elles sont toutes mauvaises .
= they re all bad .
< they re all bad . <EOS>

> j en ai assez de me disputer .
= i m tired of arguing .
< i m tired of arguing . <EOS>

> je suis vraiment desole pour l erreur .
= i m very sorry about the mistake .
< i m sorry for my mistake . <EOS>

> ils vont faire des conneries .
= they re up to no good .
< they re up to no good . <EOS>

> je suis de la cote est .
= i m from the east coast .
< i m from the mood . <EOS>

> je m ennuie a en mourir .
= i am bored to death .
< i am bored to death . <EOS>

> je ne suis pas tres organise .
= i m not very organized .
< i m not very . <EOS>



### Добавим +1 рекуррентный слой в encoder и decoder

#### Архитектура модели

In [None]:
class EncoderRNN_1(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)

        # Добавлен первый GRU слой
        self.gru1 = nn.GRU(hidden_size, hidden_size)
        
        # Добавлен второй GRU слой
        self.gru2 = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_seq, hidden):
        embedded = self.embedding(input_seq).view(1, 1, -1)
        output = embedded

        # Пропуск через первый GRU слой
        output, hidden1 = self.gru1(output, hidden)
        
        # Пропуск через второй GRU слой
        output, hidden2 = self.gru2(output, hidden1)
        
        return output, hidden2

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DecoderRNN_1(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        
        # Добавлен первый GRU слой
        self.gru1 = nn.GRU(hidden_size, hidden_size)
        
        # Добавлен второй GRU слой
        self.gru2 = nn.GRU(hidden_size, hidden_size)
        
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        
        # Пропуск через первый GRU слой
        output, hidden1 = self.gru1(output, hidden)
        
        # Пропуск через второй GRU слой
        output, hidden2 = self.gru2(output, hidden1)
        
        output = self.softmax(self.out(output[0]))
        return output, hidden2
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

#### Обучим модель

In [None]:
hidden_size = 256
encoder_1 = EncoderRNN_1(input_lang.n_words, hidden_size).to(device)
decoder_1 = DecoderRNN_1(hidden_size, output_lang.n_words).to(device)

train_iterations(encoder_1, decoder_1, 75000, print_every=5000)

### Заменим GRU на LSTM

#### Архитектура модели

In [None]:
class EncoderRNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        # Заменены GRU на LSTM
        self.lstm1 = nn.LSTM(hidden_size, hidden_size)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size)
        
    def forward(self, input_seq, hidden):
        embedded = self.embedding(input_seq).view(1, 1, -1)
        output = embedded
        
        # Обработка через первичный LSTM
        output, hidden1 = self.lstm1(output, hidden)
        
        # Обработка через вторичный LSTM
        output, hidden2 = self.lstm2(output, hidden1)
        
        return output, hidden2
    def init_hidden(self):
        return (
            torch.zeros(1, 1, self.hidden_size, device=device),
            torch.zeros(1, 1, self.hidden_size, device=device)
        )

In [None]:
class DecoderRNN_LSTM(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        
        # Заменены GRU на LSTM
        self.lstm1 = nn.LSTM(hidden_size, hidden_size)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size)
        
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        
        # Обработка через первичный LSTM
        output, hidden1 = self.lstm1(output, hidden)
        
        # Обработка через вторичный LSTM
        output, hidden2 = self.lstm2(output, hidden1)
        
        output = self.softmax(self.out(output[0]))
        return output, hidden2

#### Обучим модель

In [None]:
hidden_size = 256
encoder_lstm = EncoderRNN_LSTM(input_lang.n_words, hidden_size).to(device)
decoder_lstm = DecoderRNN_LSTM(hidden_size, output_lang.n_words).to(device)

train_iterations(encoder_lstm, decoder_lstm, 75000, print_every=5000)