<a href="https://colab.research.google.com/github/InhyeokYoo/Pytorch-study/blob/master/LSTM_%EC%8B%A4%EC%8A%B5%EC%BD%94%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 6.4 LSTM
$$
\begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
            c_t = f_t * c_{(t-1)} + i_t * g_t \\
            h_t = o_t * \tanh(c_t) \\
        \end{array}
$$

In [0]:
import torch
import torch.nn as nn

In [0]:
# nn.LSTM constructor arguments:
#   input_size:     number of features in each input element
#   hidden_size:    number of features in the hidden state
#   num_layers:     how many LSTM layers are stacked on top of each other
#   bias:           whether bias terms are used
#   batch_first:    when set, inputs and outputs are laid out as [batch, seq, feature]
#   dropout:        dropout applied between stacked layers
#   bidirectional:  see http://solarisailab.com/archives/1515

rnn = nn.LSTM(3, 5, 2)  # input_size, hidden_size, num_layers

# By default the input is laid out as (seq_len, batch, input_size).
input_ = torch.randn((5, 3, 3))

# Hidden and cell state are laid out as (num_layers * num_directions, batch, hidden_size).
h0 = torch.randn((2, 3, 5))
c0 = torch.randn((2, 3, 5))

# The initial states travel together as one tuple: input, (h_0, c_0).
output, (hidden_state, cell_state) = rnn(input_, hx=(h0, c0))

print(output.shape, hidden_state.shape, cell_state.shape)

torch.Size([5, 3, 5]) torch.Size([2, 3, 5]) torch.Size([2, 3, 5])


In [0]:
### Bare literals are easy to mix up, so keep every dimension in a named variable.
num_layers = 2
hidden_size = 5
input_size = 3
batch_size = 3
seq_len = 5  # number of time steps in each input sequence

rnn = nn.LSTM(input_size, hidden_size, num_layers)

# By default the input is laid out as (seq_len, batch, input_size).
input_ = torch.randn((seq_len, batch_size, input_size))

# Hidden and cell state are laid out as (num_layers * num_directions, batch, hidden_size);
# the factor 1 is num_directions for a unidirectional LSTM.
h0 = torch.randn((num_layers * 1, batch_size, hidden_size))
c0 = torch.randn(h0.size())

# The initial states travel together as one tuple: input, (h_0, c_0).
output, (hidden_state, cell_state) = rnn(input_, hx=(h0, c0))

# output is [seq_len, batch, num_directions * hidden_size]: it holds the last
# layer's hidden state for every time step, hence one hidden_size vector per step.
print(output.shape, hidden_state.shape, cell_state.shape)

torch.Size([5, 3, 5]) torch.Size([2, 3, 5]) torch.Size([2, 3, 5])


## Hard Coding

In [0]:
### Same experiment, this time with batch_first enabled.

rnn = nn.LSTM(3, 5, 2, batch_first=True)  # input_size, hidden_size, num_layers

# With batch_first=True the input becomes (batch, seq, input_size): batch and seq swap places.
input_ = torch.randn((3, 5, 3))

# Hidden and cell state keep the layout (num_layers * num_directions, batch, hidden_size).
h0 = torch.randn((2, 3, 5))
c0 = torch.randn(h0.size())

# The call convention is unchanged: input, (h_0, c_0).
# forward() is not invoked by name because calling the module object itself
# dispatches to forward() through nn.Module.__call__.
output, (hidden_state, cell_state) = rnn(input_, hx=(h0, c0))

print(input_.shape, h0.shape, c0.shape)
print(output.shape, hidden_state.shape, cell_state.shape)

torch.Size([3, 5, 3]) torch.Size([2, 3, 5]) torch.Size([2, 3, 5])
torch.Size([3, 5, 5]) torch.Size([2, 3, 5]) torch.Size([2, 3, 5])


## Char_LSTM

In [0]:
# Simple Character LSTM
# Char RNN에서 설명한 부분은 생략했습니다.

import torch 
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [0]:
# Preprocessing string data
# Vocabulary layout: letters (0-25), space (26), punctuation "?!.,:;" (27-32),
# then '0' and '1', whose slots string_to_onehot uses as start and end markers.

chars = "abcdefghijklmnopqrstuvwxyz ?!.,:;01"
string = "hello pytorch. how long can a rnn cell remember? show me your limit!"
char_list = list(chars)
char_len = len(char_list)

print(len(string), char_len)

68 35


In [0]:
# String to onehot vector
# a -> [1 0 0 ... 0 0]

def string_to_onehot(string):
    """Encode ``string`` as one-hot rows framed by a start and an end marker.

    Relies on the module-level ``char_list`` (the vocabulary) and ``char_len``
    (its size).

    Args:
        string: Text to encode; every character must occur in ``char_list``.

    Returns:
        Integer ``numpy`` array of shape ``(len(string) + 2, char_len)``.
        Row 0 is the start marker (one-hot at the second-to-last vocabulary
        slot), the last row is the end marker (one-hot at the last slot), and
        the rows in between encode the characters of ``string`` in order.

    Raises:
        ValueError: If a character of ``string`` is not in ``char_list``.
    """
    # Allocate the whole matrix once; growing it with np.vstack per character
    # would copy every previous row again on each step.
    output = np.zeros(shape=(len(string) + 2, char_len), dtype=int)
    output[0, -2] = 1
    output[-1, -1] = 1
    for row, ch in enumerate(string, start=1):
        output[row, char_list.index(ch)] = 1
    return output

# Onehot vector to word
# [1 0 0 ... 0 0] -> a 

def onehot_to_word(onehot_1):
    """Return the vocabulary character selected by the largest entry of a tensor.

    Args:
        onehot_1: ``torch.Tensor`` holding a one-hot vector or per-character
            scores. The argmax is taken over the flattened tensor, so extra
            singleton dimensions do not matter.

    Returns:
        The entry of the module-level ``char_list`` at the argmax position.
    """
    # detach() and cpu() keep the conversion working for tensors that still
    # track gradients or live on an accelerator; a plain .numpy() raises there.
    scores = onehot_1.detach().cpu().numpy()
    return char_list[scores.argmax()]

In [0]:
### Hyper-parameters

# Optimisation
lr = 0.01
num_epochs = 1000

# Data layout: the string is fed one character at a time, so batch_size is fixed to 1.
batch_size = 1

# seq_len is kept at 1 for convenience: each forward call consumes a single time step.
seq_len = 1

# Model: num_layers only changes the leading dimension of the hidden/cell state tensors.
input_size = char_len
hidden_size = 35
num_layers = 3

# Turn the string into a stack of one-hot rows: (start + 68 characters + end) x 35
one_hot = torch.from_numpy(string_to_onehot(string)).to(torch.float32)

print(one_hot.shape)

torch.Size([70, 35])


In [0]:
### Stacked LSTM wrapper (depth is set by num_layers)

class LSTM(nn.Module):
    """Thin wrapper around ``nn.LSTM`` that passes hidden and cell state separately.

    Args:
        input_size: Number of features in each input element.
        hidden_size: Number of features in the hidden and cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The recurrent layer itself; it owns all trainable weights of this module.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)

    def forward(self, input_, hidden, cell):
        """Run the LSTM over ``input_`` starting from the given states.

        Args:
            input_: Tensor laid out as (seq_len, batch, input_size).
            hidden: Hidden state, (num_layers, batch, hidden_size).
            cell: Cell state, same layout as ``hidden``.

        Returns:
            Tuple ``(output, hidden, cell)`` with the per-step output and the
            updated states.
        """
        output, (hidden, cell) = self.lstm(input_, (hidden, cell))
        return output, hidden, cell

    def init_hidden_cell(self, n_batch=None):
        """Return zero-filled ``(hidden, cell)`` states sized for this model.

        Args:
            n_batch: Batch dimension of the states. When omitted, the
                module-level ``batch_size`` is used.

        Returns:
            Two tensors of shape (num_layers, n_batch, hidden_size).
        """
        if n_batch is None:
            n_batch = batch_size
        # Size the states from this instance, not from module-level variables,
        # so a model built with other dimensions still gets matching states.
        state_shape = (self.num_layers, n_batch, self.hidden_size)
        return torch.zeros(state_shape), torch.zeros(state_shape)

# Build the character model from the hyper-parameters set in the previous cell.
lstm = LSTM(input_size, hidden_size, num_layers)

In [0]:
### Loss function & optimizer, followed by a one-step shape check

optimizer = torch.optim.Adam(params=lstm.parameters(), lr=lr)
loss_func = nn.MSELoss()

# Take the first seq_len rows and lay them out as (seq_len, batch, input_size).
j = 0
input_data = one_hot[j:j + seq_len].view((seq_len, batch_size, input_size))
print(input_data.shape)

# Fresh zero states for the model.
hidden, cell = lstm.init_hidden_cell()
print(hidden.shape, cell.shape)

# A single forward step returns the output plus the updated states.
output, hidden, cell = lstm(input_data, hidden, cell)
print(output.shape, hidden.shape, cell.shape)

torch.Size([1, 1, 35])
torch.Size([3, 1, 35]) torch.Size([3, 1, 35])
torch.Size([1, 1, 35]) torch.Size([3, 1, 35]) torch.Size([3, 1, 35])


In [0]:
# Number of (input, next-character) pairs: every row of one_hot except the
# last one has a following row to serve as its target.
unroll_len = one_hot.size()[0]//seq_len - 1
for i in range(num_epochs):
    # Gradients are cleared once per epoch because backward() also runs once
    # per epoch, on the loss summed over the whole string.
    optimizer.zero_grad()
    hidden, cell = lstm.init_hidden_cell()

    loss = 0
    for j in range(unroll_len):
        # Current character(s) as input, the following character(s) as target.
        input_data = one_hot[j:j+seq_len].view(seq_len, batch_size, input_size)
        label = one_hot[j+1:j+seq_len+1].view(seq_len, batch_size, input_size)

        # The states are carried from step to step so the model sees the history.
        output, hidden, cell = lstm(input_data, hidden, cell)
        loss += loss_func(output.view(1, -1), label.view(1, -1))

    loss.backward()
    optimizer.step()

    if i % 10 == 0:
        print(loss)

tensor(2.1684, grad_fn=<AddBackward0>)
tensor(1.8090, grad_fn=<AddBackward0>)
tensor(1.7084, grad_fn=<AddBackward0>)
tensor(1.5447, grad_fn=<AddBackward0>)
tensor(1.2709, grad_fn=<AddBackward0>)
tensor(0.8998, grad_fn=<AddBackward0>)
tensor(0.5241, grad_fn=<AddBackward0>)
tensor(0.2770, grad_fn=<AddBackward0>)
tensor(0.1345, grad_fn=<AddBackward0>)
tensor(0.0732, grad_fn=<AddBackward0>)
tensor(0.0453, grad_fn=<AddBackward0>)
tensor(0.0308, grad_fn=<AddBackward0>)
tensor(0.0227, grad_fn=<AddBackward0>)
tensor(0.0180, grad_fn=<AddBackward0>)
tensor(0.0150, grad_fn=<AddBackward0>)
tensor(0.0130, grad_fn=<AddBackward0>)
tensor(0.0127, grad_fn=<AddBackward0>)
tensor(0.0109, grad_fn=<AddBackward0>)
tensor(0.0099, grad_fn=<AddBackward0>)
tensor(0.0086, grad_fn=<AddBackward0>)
tensor(0.0078, grad_fn=<AddBackward0>)
tensor(0.0072, grad_fn=<AddBackward0>)
tensor(0.0068, grad_fn=<AddBackward0>)
tensor(0.0067, grad_fn=<AddBackward0>)
tensor(0.0066, grad_fn=<AddBackward0>)
tensor(0.0062, grad_fn=<A

In [0]:
# Replay the training string through the trained model and print, for every
# input character, the character the model predicts to come next.
hidden, cell = lstm.init_hidden_cell()

# No gradients are needed for inference.
with torch.no_grad():
    # Stop one step early so the final end marker is not printed.
    for j in range(unroll_len - 1):
        # One row of one_hot, laid out as (seq_len=1, batch, input_size).
        input_data = one_hot[j:j+1].view(1, batch_size, input_size)

        # Use the trained `lstm` wrapper, which takes hidden and cell separately.
        output, hidden, cell = lstm(input_data, hidden, cell)
        print(onehot_to_word(output), end="")

hello pytorch. how long can a rnn cell remember? show me your limit!

# Char_LSTM_Batch

이번 코드는 batch first 코드로 작성함

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [0]:
# Preprocess the training string.
# Vocabulary layout: letters (0-25), space (26), punctuation "?!.,:;" (27-32),
# '0' (33) and '1' (34) -> 35 symbols (0-34). string_to_onehot uses the last two
# slots as the start and end markers.

chars = "abcdefghijklmnopqrstuvwxyz ?!.,:;01"
string = "hello pytorch. how long can a rnn cell remember? show me your limit!"
char_list = [*chars]
char_len = len(char_list)

print(char_len)

35


In [0]:
# Convert a string into one-hot vectors.
# e.g. a -> [1, 0, ..., 0]

def string_to_onehot(string):
    """Encode ``string`` as one-hot rows framed by a start and an end marker.

    Relies on the module-level ``char_list`` (the vocabulary) and ``char_len``
    (its size).

    Args:
        string: Text to encode; every character must occur in ``char_list``.

    Returns:
        Integer ``numpy`` array of shape ``(len(string) + 2, char_len)``.
        Row 0 is the start marker (one-hot at the second-to-last vocabulary
        slot), the last row is the end marker (one-hot at the last slot), and
        the rows in between encode the characters of ``string`` in order.

    Raises:
        ValueError: If a character of ``string`` is not in ``char_list``.
    """
    # Allocate the whole matrix once; growing it with np.vstack per character
    # would copy every previous row again on each step.
    output = np.zeros(shape=(len(string) + 2, char_len), dtype=int)

    output[0, -2] = 1
    output[-1, -1] = 1

    for row, ch in enumerate(string, start=1):
        output[row, char_list.index(ch)] = 1

    return output

# One-hot vector to character
# [1, 0, ..., 0] -> a
# In practice this is only needed at inference time.

def onehot_to_string(onehot):
    """Return the vocabulary character selected by the largest entry of a tensor.

    Args:
        onehot: ``torch.Tensor`` holding a one-hot vector or per-character
            scores. The argmax is taken over the flattened tensor, so extra
            singleton dimensions do not matter.

    Returns:
        The entry of the module-level ``char_list`` at the argmax position.
    """
    # detach() and cpu() keep the conversion working for tensors that still
    # track gradients or live on an accelerator; a plain .numpy() raises there.
    # For an ordinary CPU tensor the array still shares memory with the tensor.
    output = onehot.detach().cpu().numpy()
    return char_list[output.argmax()]

In [0]:
### Hyper-parameters

# Optimisation
lr = 0.01
num_epochs = 1000

# Data layout
batch_size = 5
seq_len = 1

# Model: each input is one one-hot row, so its width equals the vocabulary size.
input_size = char_len
hidden_size = 35
num_layers = 3

# One-hot encoding of the whole string as a float tensor.
one_hot = torch.from_numpy(string_to_onehot(string)).to(torch.float32)

print(one_hot.shape)

torch.Size([70, 35])


In [0]:
# Stacked LSTM wrapper using the batch-first layout (depth is set by num_layers)

class LSTM(nn.Module):
    """Thin batch-first wrapper around ``nn.LSTM`` with separate state arguments.

    Args:
        input_size: Number of features in each input element.
        hidden_size: Number of features in the hidden and cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

    def forward(self, input_, hidden, cell):
        """Run the LSTM over ``input_`` starting from the given states.

        Args:
            input_: Tensor laid out as (batch, seq_len, input_size).
            hidden: Hidden state, (num_layers, batch, hidden_size).
            cell: Cell state, same layout as ``hidden``.

        Returns:
            Tuple ``(output, hidden, cell)`` with the per-step output and the
            updated states.
        """
        # No manual concatenation of input and hidden state is needed:
        # nn.LSTM combines them internally.
        output, (hidden, cell) = self.lstm(input_, (hidden, cell))
        return output, hidden, cell

    def init_hidden_cell(self, n_batch=None):
        """Return zero-filled ``(hidden, cell)`` states sized for this model.

        Args:
            n_batch: Batch dimension of the states. When omitted, the
                module-level ``batch_size`` is used.

        Returns:
            Two tensors of shape (num_layers, n_batch, hidden_size); this
            layout does not depend on ``batch_first``.
        """
        if n_batch is None:
            n_batch = batch_size
        # Size the states from this instance, not from module-level variables,
        # so a model built with other dimensions still gets matching states.
        state_shape = (self.num_layers, n_batch, self.hidden_size)
        return torch.zeros(state_shape), torch.zeros(state_shape)

# Build the batch-first model; arguments are passed positionally as (input_size, hidden_size, num_layers).
lstm = LSTM(input_size, hidden_size, num_layers)

In [0]:
# Optimizer over all model parameters, and the loss it minimises
optimizer = optim.Adam(params=lstm.parameters(), lr=lr)
loss_fnc = nn.MSELoss()

In [0]:
j = 0
# With batch_first=True the input has to be laid out as [batch, seq, input_size].
input_data = one_hot[j:j + batch_size].view((batch_size, seq_len, input_size))
# The [70, 35] matrix is cut to batch_size=5 rows ([5, 35]) and reshaped to [5, 1, 35].
print(one_hot[j:j + batch_size].shape, input_data.shape)

# Regardless of batch_first, the states are [num_layers * num_directions, batch, hidden_size].
hidden, cell = lstm.init_hidden_cell()
print(hidden.shape, cell.shape)

# output is [batch, seq, hidden_size]; it matches the input shape here only
# because hidden_size equals input_size.
output, hidden, cell = lstm(input_data, hidden, cell)
print(output.shape, hidden.shape, cell.shape)

torch.Size([5, 35]) torch.Size([5, 1, 35])
torch.Size([3, 5, 35]) torch.Size([3, 5, 35])
torch.Size([5, 1, 35]) torch.Size([3, 5, 35]) torch.Size([3, 5, 35])


In [0]:
# Number of (input, next-character) pairs: every row of one_hot except the
# last one has a following row to serve as its target.
unroll_len = one_hot.shape[0] // seq_len - 1
print(unroll_len)

for i in range(num_epochs):
    hidden, cell = lstm.init_hidden_cell()
    optimizer.zero_grad()

    loss = 0

    for j in range(unroll_len - batch_size + 1):
        # Stack batch_size consecutive one-hot windows into one batch.
        # e.g. with batch_size=3 and the text "pytorch", the inputs are the
        # one-hot rows of "pyt" and the targets are those of "yto".
        input_data = torch.stack(
            [one_hot[s:s + seq_len] for s in range(j, j + batch_size)], dim=0
        )  # [5, 1, 35]
        label = torch.stack(
            [one_hot[s + 1:s + seq_len + 1] for s in range(j, j + batch_size)], dim=0
        )

        output, hidden, cell = lstm(input_data, hidden, cell)
        # Both sides are flattened to [1, 5 * 35] before comparison.
        loss += loss_fnc(output.view(1, -1), label.view(1, -1))

    # Gradients are not computed per batch: the losses of all windows are
    # summed and backward() runs once per epoch. MSELoss averages over the
    # elements, so every summand is already a scalar.
    loss.backward()
    optimizer.step()

    if not i % 10:
        print(loss)

69
tensor(2.3259, grad_fn=<AddBackward0>)
tensor(1.7039, grad_fn=<AddBackward0>)
tensor(1.6237, grad_fn=<AddBackward0>)
tensor(1.4879, grad_fn=<AddBackward0>)
tensor(1.3073, grad_fn=<AddBackward0>)
tensor(0.9638, grad_fn=<AddBackward0>)
tensor(0.5845, grad_fn=<AddBackward0>)
tensor(0.3288, grad_fn=<AddBackward0>)
tensor(0.1696, grad_fn=<AddBackward0>)
tensor(0.0991, grad_fn=<AddBackward0>)
tensor(0.0698, grad_fn=<AddBackward0>)
tensor(0.0534, grad_fn=<AddBackward0>)
tensor(0.0424, grad_fn=<AddBackward0>)
tensor(0.0339, grad_fn=<AddBackward0>)
tensor(0.0279, grad_fn=<AddBackward0>)
tensor(0.0238, grad_fn=<AddBackward0>)
tensor(0.0209, grad_fn=<AddBackward0>)
tensor(0.0188, grad_fn=<AddBackward0>)
tensor(0.0172, grad_fn=<AddBackward0>)
tensor(0.0159, grad_fn=<AddBackward0>)
tensor(0.0146, grad_fn=<AddBackward0>)
tensor(0.0145, grad_fn=<AddBackward0>)
tensor(0.0127, grad_fn=<AddBackward0>)
tensor(0.0117, grad_fn=<AddBackward0>)
tensor(0.0111, grad_fn=<AddBackward0>)
tensor(0.0107, grad_fn

In [0]:
# Print every training window to inspect which characters end up in each batch.
for j in range(unroll_len - batch_size + 1):
    # Stack batch_size consecutive one-hot windows into one batch.
    # e.g. with batch_size=3 and the text "pytorch", the inputs are the
    # one-hot rows of "pyt" and the targets are those of "yto".
    input_data = torch.stack(
        [one_hot[s:s + seq_len] for s in range(j, j + batch_size)], dim=0
    )  # [5, 1, 35]
    label = torch.stack(
        [one_hot[s + 1:s + seq_len + 1] for s in range(j, j + batch_size)], dim=0
    )

    # Decode each batch element back to its character and show them side by side.
    print("".join(onehot_to_string(row) for row in input_data))

0hell
hello
ello 
llo p
lo py
o pyt
 pyto
pytor
ytorc
torch
orch.
rch. 
ch. h
h. ho
. how
 how 
how l
ow lo
w lon
 long
long 
ong c
ng ca
g can
 can 
can a
an a 
n a r
 a rn
a rnn
 rnn 
rnn c
nn ce
n cel
 cell
cell 
ell r
ll re
l rem
 reme
remem
ememb
membe
ember
mber?
ber? 
er? s
r? sh
? sho
 show
show 
how m
ow me
w me 
 me y
me yo
e you
 your
your 
our l
ur li
r lim
 limi
limit
imit!
