Подключение библиотек
==================

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Загружаем данные
==================


In [None]:
path = '' # path to the rus-eng-small.tsv file (empty here; must be set before running)
# The file is parsed as tab-separated text without a header row.
data = pd.read_csv(path, sep='\t', header=None)

In [None]:
# Drop columns 0 and 2 and rename columns 1 and 3 to 'rus' and 'eng'.
df = data.drop([0,2], axis=1).rename(columns={1:'rus', 3:'eng'})
df

Unnamed: 0,rus,eng
0,Один раз в жизни я делаю хорошее дело... И оно...,For once in my life I'm doing a good deed... A...
1,Давайте что-нибудь попробуем!,Let's try something.
2,Мне пора идти спать.,I have to go to sleep.
3,Что ты делаешь?,What are you doing?
4,Что ты делаешь?,What do you make?
...,...,...
199995,Каким было объяснение?,What was the explanation?
199996,Немногие фермы были электрифицированы.,Few farms had electricity.
199997,Людям нравился Джимми Картер.,People liked Jimmy Carter.
199998,Иракцы были окружены.,The Iraqis were surrounded.


In [None]:
# Reserved vocabulary indices; they match the two entries pre-filled in Lang.index2word.
SOS_token = 0
EOS_token = 1
# SOS - start of sentence
# EOS - end of sentence

# Vocabulary of one language
class Lang:
    """Word <-> index vocabulary with per-word occurrence counts.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token of ``sentence`` (tokens are split on single spaces)."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Assign the next free index to a new word, or bump the count of a known one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
# Strip diacritics from text while keeping the Cyrillic letter "short i" intact
_CYRILLIC_SHORT_I = {'\u0438': '\u0439', '\u0418': '\u0419'}  # и -> й, И -> Й
_COMBINING_BREVE = '\u0306'


def unicodeToAscii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with NFD and every nonspacing mark (Unicode
    category ``Mn``) is dropped, e.g. ``'café' -> 'cafe'`` and ``'ё' -> 'е'``.

    Exception: a combining breve that follows Cyrillic ``и``/``И`` is folded
    back into ``й``/``Й``. NFD splits ``й`` into ``и`` + U+0306, so dropping
    that mark would silently turn every ``й`` into ``и`` and corrupt Russian
    words.

    Despite the historical name, the result is not guaranteed to be ASCII.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            kept.append(ch)
        elif ch == _COMBINING_BREVE and kept and kept[-1] in _CYRILLIC_SHORT_I:
            # Recompose и + breve into the distinct letter й.
            kept[-1] = _CYRILLIC_SHORT_I[kept[-1]]
        # any other combining mark is discarded
    return ''.join(kept)

# Lower-case the text, separate sentence punctuation and drop special characters
def normalizeString(s):
    """Normalise a raw sentence into space-separated lower-case tokens.

    Steps:
      1. lower-case, trim and strip diacritics (``unicodeToAscii``);
      2. put a space before each of ``.``, ``!``, ``?`` so they become tokens;
      3. replace every run of characters other than Latin letters, Cyrillic
         letters in the ``А-я`` range and ``.!?`` with a single space;
      4. trim the result.

    Step 4 matters because a sentence that starts or ends with a replaced
    symbol (quote, bracket, digit, ...) would otherwise keep a leading or
    trailing space. That yields empty-string tokens on ``split(' ')`` and
    prevents ``startswith`` prefix checks from matching.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-ZА-я.!?]+", r" ", s)
    return s.strip()

In [None]:
# Build normalised sentence pairs and two empty vocabularies
def readLangs(lang1, lang2, reverse=False):
    """Normalise every row of the global ``df`` and create the two ``Lang`` objects.

    Args:
        lang1: name of the language in the first column of ``df``.
        lang2: name of the language in the second column of ``df``.
        reverse: when true, each pair is flipped and the language roles swap.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
        lists of normalised sentences. The vocabularies are still empty.
    """
    print("Reading lines...")

    # Normalise every cell of every row.
    pairs = [[normalizeString(cell) for cell in row] for row in df.values.tolist()]

    if reverse:
        pairs = [pair[::-1] for pair in pairs]
        source_name, target_name = lang2, lang1
    else:
        source_name, target_name = lang1, lang2

    input_lang = Lang(source_name)
    output_lang = Lang(target_name)
    return input_lang, output_lang, pairs

In [None]:
# filterPair keeps a pair only if both sentences have fewer than MAX_LENGTH
# space-separated tokens; it is also the default size of the attention window.
MAX_LENGTH = 10

# English sentence openings that filterPair requires. The apostrophe-free
# forms ("i m ", "he s ") correspond to normalizeString replacing the
# apostrophe with a space.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p, reverse=False):
    """Return whether pair ``p`` is short enough and its English side has a known prefix.

    Both sentences must have fewer than ``MAX_LENGTH`` space-separated tokens.
    The English sentence is ``p[1]`` when ``reverse`` equals ``False`` and
    ``p[0]`` otherwise; it must start with one of ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    english = p[1] if reverse == False else p[0]
    return english.startswith(eng_prefixes)


def filterPairs(pairs, reverse=False):
    """Return the list of pairs accepted by ``filterPair``, in their original order."""
    return list(filter(lambda candidate: filterPair(candidate, reverse), pairs))

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """Read, filter and index the corpus, then split it into train and test pairs.

    The vocabularies are built from all filtered pairs before the split, so
    every word of the test pairs is also present in the vocabularies.

    Returns:
        ``(input_lang, output_lang, pairs_train, pairs_test)``; 30% of the
        filtered pairs go to the test part, with a fixed ``random_state``.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs, reverse)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for kept in kept_pairs:
        input_lang.addSentence(kept[0])
        output_lang.addSentence(kept[1])

    print("Counted words:")
    for vocabulary in (input_lang, output_lang):
        print(vocabulary.name, vocabulary.n_words)

    pairs_train, pairs_test = train_test_split(kept_pairs, test_size=0.3, random_state=10)
    return input_lang, output_lang, pairs_train, pairs_test


In [None]:
# Build both vocabularies and the train/test split; with reverse=False the pairs stay in [rus, eng] order.
input_lang, output_lang, pairs_train, pairs_test = prepareData('rus', 'eng', False)
# Show one random training pair as a sanity check.
print(random.choice(pairs_train))

Reading lines...
Read 200000 sentence pairs
Trimmed to 12775 sentence pairs
Counting words...
Counted words:
rus 7199
eng 3567
['я высокая .', 'i am tall .']


Модель Seq2Seq
=================






 Encoder
-----------






In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        hidden_size: size of both the embedding vectors and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by one step.

        Returns:
            ``(output, hidden)`` as produced by the GRU for a ``(1, 1, hidden_size)`` input.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the layout the GRU receives here.
        step_input = self.embedding(input).view(1, 1, -1)
        gru_output, next_hidden = self.gru(step_input, hidden)
        return gru_output, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)`` on the global ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

Декодер
-----------
Декодер — это еще одна RNN, которая принимает выходной вектор (векторы) кодировщика и выводит последовательность слов для создания перевода.




In [None]:
class DecoderRNN(nn.Module):
    """Plain (attention-free) single-layer GRU decoder.

    ``forward`` takes two arguments and returns two values, so it is not a
    drop-in replacement for ``AttnDecoderRNN`` in ``train``/``evaluate``, which
    pass the encoder outputs as a third argument and unpack three results.

    Args:
        hidden_size: size of the embedding vectors and of the GRU hidden state.
        output_size: number of rows of the embedding table and of output scores
            (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape ``(1, output_size)``.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        gru_output, next_hidden = self.gru(F.relu(embedded), hidden)
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)`` on the global ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """Single-layer GRU decoder with attention over a fixed window of encoder outputs.

    The attention weights are computed from the embedded input token and the
    current hidden state, giving one weight per position in a window of
    ``max_length`` encoder outputs.

    Args:
        hidden_size: size of the embeddings and of the GRU hidden state.
        output_size: target vocabulary size.
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of encoder positions the attention layer scores.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: index of the previous target token.
            hidden: previous hidden state, indexed here as ``hidden[0]``.
            encoder_outputs: matrix of encoder outputs; it is batch-multiplied
                with the ``(1, 1, max_length)`` attention weights, so it needs
                ``max_length`` rows.

        Returns:
            ``(log_probs, hidden, attn_weights)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder position from the current token and hidden state.
        attn_query = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_query), dim=1)

        # Weighted sum of the encoder outputs (the context vector).
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        # Merge token embedding and context into the GRU input.
        merged = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_output, next_hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, next_hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)`` on the global ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


Обучение
========

Подготовка данных
-----------------------

Для обучения для каждой пары нам понадобится входной тензор (индексы
слов во входном предложении) и целевой тензор (индексы слов в целевом предложении). При создании этих тензоров мы добавим токен EOS к обеим последовательностям.




In [None]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its index in ``lang.word2index``.

    Raises:
        KeyError: if a token is not present in the vocabulary.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of token indices terminated by ``EOS_token``.

    Returns:
        A ``torch.long`` tensor of shape ``(n_tokens + 1, 1)`` on the global ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a sentence pair with the global vocabularies.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with ``output_lang``.

    Returns:
        ``(input_tensor, target_tensor)``.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

Обучение модели
------------------





In [None]:
# Probability that a training step feeds the ground-truth token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    The source sentence is encoded token by token, then the target is decoded
    either with teacher forcing (chosen with probability
    ``teacher_forcing_ratio``) or by feeding back the decoder's own greedy
    prediction. In the latter case decoding stops early once EOS is predicted.

    Args:
        input_tensor: source token indices, one per row (includes EOS).
        target_tensor: target token indices, one per row (includes EOS).
        encoder, decoder: the models; ``decoder`` is called with
            ``(input, hidden, encoder_outputs)`` and must return three values.
        encoder_optimizer, decoder_optimizer: optimisers for the two models.
        criterion: loss applied to each decoder output and target token.
        max_length: number of rows of the encoder-output buffer; the input must
            not be longer than this.

    Returns:
        The loss averaged over the decoding steps that were actually scored.
        Dividing by the full target length would under-report the loss
        whenever free-running decoding stopped early at EOS.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows beyond the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    scored_steps = 0

    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        scored_steps += 1

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    if scored_steps == 0:
        # Empty target: nothing to back-propagate.
        return 0.0

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / scored_steps

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Fractional seconds are truncated by the ``%d`` conversion.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return elapsed and estimated remaining time as ``'<elapsed> (- <remaining>)'``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero).
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

Весь тренировочный процесс выглядит так:

- Запустить таймер
- Инициализировать оптимизаторы и критерий
- Создать набор обучающих пар
- Создать пустой массив потерь для построения графика



In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder for ``n_iters`` single-pair SGD steps.

    Training pairs are sampled with replacement from the global
    ``pairs_train`` before the loop starts. The running average loss is
    printed every ``print_every`` steps and collected every ``plot_every``
    steps; the collected averages are passed to ``showPlot`` at the end.
    """
    started_at = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every steps
    plot_loss_total = 0  # reset every plot_every steps

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Draw and tensorise all training examples up front.
    training_pairs = [tensorsFromPair(random.choice(pairs_train))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]   # source-language sentence
        target_tensor = training_pair[1]  # target-language sentence

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

Выводим результаты
----------------





In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

# Plot of the loss function
def showPlot(points):
    """Plot ``points`` (loss values) as a line on a new figure.

    Only one figure is created per call: ``plt.subplots()`` already creates a
    figure, so a preceding ``plt.figure()`` would leave an extra empty figure
    open on every call.
    """
    fig, ax = plt.subplots()

    # Put a major tick every 0.2 units on the y axis.
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

Обучение
=======================


In [None]:
hidden_size = 256 # size of the embedding vectors and of the GRU hidden state
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
N_iteration = 50000 #number of training iterations

Запуск обучения

In [None]:
start = time.time()
trainIters(encoder1, attn_decoder1, N_iteration, print_every=5000)
# Wall-clock training time in seconds.
t_train = time.time()-start
# save the trained parameters
torch.save(encoder1.state_dict(), 'encoder6.pt')
torch.save(attn_decoder1.state_dict(), 'decoder6.pt')

4m 31s (- 40m 41s) (5000 10%) 3.1008
8m 41s (- 34m 47s) (10000 20%) 2.5285
12m 50s (- 29m 57s) (15000 30%) 2.2172
16m 59s (- 25m 29s) (20000 40%) 1.9486
21m 11s (- 21m 11s) (25000 50%) 1.7549
25m 20s (- 16m 53s) (30000 60%) 1.5686
29m 30s (- 12m 38s) (35000 70%) 1.4118
33m 41s (- 8m 25s) (40000 80%) 1.2723
37m 50s (- 4m 12s) (45000 90%) 1.1467
42m 2s (- 0m 0s) (50000 100%) 1.0586


In [None]:
# If the model has already been trained, its parameters can be loaded directly: comment out the previous cell and uncomment the lines below
# encoder1.load_state_dict(torch.load('encoder6.pt'))
# attn_decoder1.load_state_dict(torch.load('decoder6.pt'))

Evaluation
==========





In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate one already-normalised sentence.

    Both modules are switched to evaluation mode for the duration of the call
    so that dropout in the decoder is disabled. Without this, inference stays
    stochastic even under ``torch.no_grad()``. The previous training/eval mode
    of each module is restored afterwards.

    Args:
        encoder, decoder: trained models; ``decoder`` must return
            ``(output, hidden, attention)``.
        sentence: space-separated tokens that all exist in ``input_lang``.
        max_length: size of the encoder-output buffer and the maximum number
            of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the predicted words, ending with
        ``'<EOS>'`` only if the decoder emitted EOS within ``max_length``
        steps, and one row of attention weights per decoding step.

    Raises:
        KeyError: if ``sentence`` contains a word missing from ``input_lang``.
        IndexError: if the sentence plus EOS is longer than ``max_length``.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # Feed the greedy prediction back as the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Restore whatever mode the caller had the modules in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder,pairs,n=10):
    """Print ``n`` randomly chosen pairs together with the model's translation.

    For each sample the source is printed after ``>``, the reference after
    ``=`` and the model output after ``<``.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words, _attn = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Qualitative check on random pairs drawn from the training split.
evaluateRandomly(encoder1, attn_decoder1, pairs_train)

> я ее за это убью !
= i m going to kill her for this !
< i m going to kill for it ! <EOS>

> ты не единственная !
= you re not the only one !
< you re not the only one ! <EOS>

> вся ответственность осталась мне .
= i am left with all the responsibility .
< i am the to of . . . <EOS>

> какая же ты идиотка !
= you re such an idiot !
< you re such an idiot ! <EOS>

> ты фантазер .
= you re a dreamer .
< you re a dreamer . <EOS>

> я ожидаю кое кого .
= i m expecting someone .
< i m waiting for someone . <EOS>

> мы особенные .
= we re special .
< we re special . <EOS>

> я уверен ты умеешь что нибудь еще .
= i m sure you have other skills .
< i m sure you you ll like . <EOS>

> она купается в реке .
= she is swimming in the river .
< she is in the the the . . <EOS>

> мы с нетерпением ждем встречи с вами .
= we are looking forward to seeing you .
< we re looking for you with you . <EOS>



In [None]:
def evaluateAndShowAttention(input_sentence):
    """Translate ``input_sentence`` with the global models and print the result.

    The sentence is passed through ``normalizeString`` first, the same
    preprocessing applied to the training data. Otherwise capital letters or
    attached punctuation in user-typed text would not be found in the
    vocabulary. The original text is what gets printed as ``input``.

    The attention weights are computed by ``evaluate`` but are not displayed.
    """
    normalized_sentence = normalizeString(input_sentence)
    output_words, attentions = evaluate(encoder1, attn_decoder1, normalized_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))


# Translate a single hand-written sentence with the trained models.
evaluateAndShowAttention("я знаю тебя")

# evaluateAndShowAttention("")


input = я знаю тебя
output = i am the only you i love . <EOS>


In [None]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

def BLEU(sent1,sent2):
    """Corpus-level BLEU of hypotheses ``sent2`` against references ``sent1``.

    Args:
        sent1: list of reference sentences (one reference per hypothesis).
        sent2: list of hypothesis sentences, aligned with ``sent1``.

    Returns:
        The score from ``corpus_bleu`` with weights ``(0, 1, 0, 0)`` and
        ``SmoothingFunction().method4`` smoothing.
        NOTE(review): with these weights only the second n-gram order
        contributes to the score; confirm this is intended.
    """
    # Hypotheses are plain token lists.
    hypotheses = [hypothesis.split() for hypothesis in sent2]
    # Each hypothesis gets a list holding its single tokenised reference.
    references = [[reference.split()] for reference in sent1]

    smoothing = SmoothingFunction()
    return corpus_bleu(references, hypotheses,
                       weights=(0, 1, 0, 0),
                       smoothing_function=smoothing.method4)

In [None]:
# Mean BLEU score over a set of sentence pairs
def metric_bleu(encoder, decoder, pairs):
    """Translate ``pair[0]`` of every pair and average its BLEU against ``pair[1]``.

    The trailing ``'<EOS>'`` marker is removed from the model output only when
    it is actually present. ``evaluate`` omits it when decoding reaches the
    length limit without predicting EOS, and in that case the last element is
    a real word that must be kept.

    Returns:
        The mean of the per-sentence scores (``np.mean``).
    """
    scores = np.zeros(len(pairs))
    for i, pair in enumerate(pairs):
        output_words, _ = evaluate(encoder, decoder, pair[0])
        if output_words and output_words[-1] == '<EOS>':
            output_words = output_words[:-1]
        output_sentence = ' '.join(output_words)

        scores[i] = BLEU([pair[1]], [output_sentence])

    return np.mean(scores)

In [None]:
# Mean BLEU over the held-out test pairs.
BLEU_score = metric_bleu(encoder1, attn_decoder1, pairs_test)
BLEU_score

0.350565933728056

In [None]:
# write the results table
# NOTE(review): df_results is created with column names only and written as-is,
# so the CSV receives just the header row (BLEU_score, t_train, etc. are not
# stored) and any existing file at result_path is overwritten - confirm intent.
df_results = pd.DataFrame(columns = ['model', 'hidden_size', 'BLEU', 'N_train', 'N_iterations','t_train (min)', 'max_length', 'comment'])
result_path = '/content/drive/MyDrive/Colab Notebooks/Иннополис/seq2seq.csv'
df_results.to_csv(result_path, index = False)