In [None]:
import torch
import torch.nn as nn
from torch.optim import SGD
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [None]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [None]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [None]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [None]:
word = 'ololoasdasddqweqw123456789'
# word = 'hello'

## Датасет.
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [None]:
class WordDataSet:

    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word:
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)

            self.indexs.append(self.chars2idx[c])

        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)

    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x

    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])

    def __len__(self):
        return self.seq_len

    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [None]:
class VanillaRNN(nn.Module):

    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)

    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных

In [57]:
ds = WordDataSet(word=word)
rnn = VanillaGRU(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 1000
optim     = SGD(rnn.parameters(), lr = 0.05, momentum=0.9)

# Обучение

In [58]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(3)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)

        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)

        loss += criterion(y.unsqueeze(0), target)


    loss.backward()

    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else:
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)

#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")

    optim.step()

72.83657836914062
Clip gradient :  tensor(4.3963)
65.51602935791016
Clip gradient :  tensor(2.7448)
54.30173110961914
Clip gradient :  tensor(4.1810)
37.49923324584961
Clip gradient :  tensor(3.4313)
24.63099479675293
Clip gradient :  tensor(2.3857)
18.092092514038086
Clip gradient :  tensor(1.7644)
13.964195251464844
Clip gradient :  tensor(1.6186)
11.252017974853516
Clip gradient :  tensor(1.9620)
10.3819580078125
Clip gradient :  tensor(27.7275)
10.849030494689941
Clip gradient :  tensor(18.5046)
10.75693130493164
Clip gradient :  tensor(5.6473)
9.422608375549316
Clip gradient :  tensor(6.0301)
10.918243408203125
Clip gradient :  tensor(8.4464)
12.61240291595459
Clip gradient :  tensor(15.1382)
10.411482810974121
Clip gradient :  tensor(10.3801)
11.736542701721191
Clip gradient :  tensor(8.5519)
11.328282356262207
Clip gradient :  tensor(15.1890)
12.937443733215332
Clip gradient :  tensor(8.6675)
11.043636322021484
Clip gradient :  tensor(10.7257)
9.922435760498047
Clip gradient :  

# Тестирование

In [None]:
rnn.eval()
hh = torch.zeros(3)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y.unsqueeze(0))
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово

In [118]:
#тестовое слово
word = 'ololoasdasddqweqw123456789asdfzxcv'

## Реализовать LSTM

In [120]:
#Написать реализацию LSTM и обучить предсказывать слово

class VanillaLSTM(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaLSTM, self).__init__()
        self.hidden_ft = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_it = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_ct = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_ot = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_ct_ot = nn.Linear(in_features=out_size, out_features=hidden_size)
        self.activation_sigmoid = nn.Sigmoid()
        self.activation_tanh = nn.Tanh()
        self.out = nn.Linear(in_features=hidden_size, out_features=out_size)

    def forward(self, prev_c, x, prev_hidden):
        hidden = torch.cat([x.squeeze(0), prev_hidden])
        hidden_ft = self.hidden_ft(hidden)
        hidden_ft = self.activation_sigmoid(hidden_ft)

        hidden_it = self.hidden_it(hidden)
        hidden_it = self.activation_sigmoid(hidden_it)

        hidden_ct = self.hidden_ct(hidden)
        hidden_ct = self.activation_tanh(hidden_ct)

        hidden_ot = self.hidden_ot(hidden)
        hidden_ot = self.activation_sigmoid(hidden_ot)


        next_ct = prev_c * hidden_ft + hidden_it * hidden_ct
        next_hidden = hidden_ot * self.activation_tanh(next_ct)
        output = self.out(next_hidden)
        return output, next_hidden, next_ct


In [121]:
ds = WordDataSet(word=word)
rnn = VanillaLSTM(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt = 1000
optim = SGD(rnn.parameters(), lr = 0.05, momentum=0.9)

In [122]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(3)
    c = torch.zeros(3)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)

        target = torch.LongTensor([next_sample])

        y, hh, c = rnn(c, x, hh)

        loss += criterion(y.unsqueeze(0), target)


    loss.backward()

    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else:
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)

#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")

    optim.step()

104.06502532958984
Clip gradient :  tensor(5.0829)
97.10514068603516
Clip gradient :  tensor(2.1895)
92.19776916503906
Clip gradient :  tensor(3.3306)
77.46189880371094
Clip gradient :  tensor(5.8421)
63.919002532958984
Clip gradient :  tensor(8.1943)
56.250770568847656
Clip gradient :  tensor(10.2773)
49.300968170166016
Clip gradient :  tensor(5.9047)
43.318603515625
Clip gradient :  tensor(5.3784)
40.756797790527344
Clip gradient :  tensor(21.3137)
37.9990348815918
Clip gradient :  tensor(9.1275)
38.40715789794922
Clip gradient :  tensor(14.0208)
36.11970520019531
Clip gradient :  tensor(7.3123)
33.069664001464844
Clip gradient :  tensor(5.3003)
31.058551788330078
Clip gradient :  tensor(5.3910)
28.899682998657227
Clip gradient :  tensor(5.7410)
27.382781982421875
Clip gradient :  tensor(5.3038)
25.873655319213867
Clip gradient :  tensor(5.8462)
23.898359298706055
Clip gradient :  tensor(3.6185)
21.9276123046875
Clip gradient :  tensor(2.6701)
20.011856079101562
Clip gradient :  tens

In [123]:
rnn.eval()
hh = torch.zeros(3)
c = torch.zeros(3)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for idx in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh, c = rnn(c, x, hh)
    y = softmax(y.unsqueeze(0))
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789asdfzxcv
Original:	 ololoasdasddqweqw123456789asdfzxcv


## Реализовать GRU

In [None]:
#Написать реализацию GRU и обучить предсказывать слово
class VanillaGRU(nn.Module):

    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaGRU, self).__init__()
        self.hidden_rt = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_zt = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_ht = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.hidden_activation = nn.Sigmoid()
        self.activation_1  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)


    def forward(self, x, prev_hidden):
        hidden = torch.cat([x.squeeze(0), prev_hidden])
        hidden_rt = self.hidden_rt(hidden)
        hidden_rt = self.hidden_activation(hidden_rt)

        hidden_zt = self.hidden_zt(hidden)
        hidden_zt = self.hidden_activation(hidden_zt)

        hidden_ht = prev_hidden * hidden_rt
        hidden_ht = torch.cat([x.squeeze(0), hidden_ht])
        hidden_ht = self.hidden_ht(hidden_ht)
        hidden_ht = self.activation_1(hidden_ht)

        new_hidden = prev_hidden * (1 - hidden_zt)
        new_hidden += hidden_zt * hidden_ht

        output = self.outweight(new_hidden)
        return output, new_hidden