<a href="https://colab.research.google.com/github/InhyeokYoo/Pytorch-study/blob/master/6_4_4_1_Char_RNN_Naive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 준비

In [1]:
!rm -r data
import os 

try:
  os.mkdir("./data")
except:
  pass

!wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt -P ./data

--2019-11-29 07:45:08--  https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘./data/input.txt’


2019-11-29 07:45:09 (43.1 MB/s) - ‘./data/input.txt’ saved [1115394/1115394]



In [2]:
!ls

data  sample_data


## 1) Setting

In [0]:
import torch
import torch.nn as nn

In [4]:
!pip install unidecode



In [0]:
import unidecode
import string
import random
import re
import time, math

## 2) Hyper parameters

In [0]:
num_epochs = 2000
print_every = 100
plot_every = 10

chunk_len = 200

hidden_size = 100
batch_size = 1
num_layers = 1
embedding_size = 70
lr = 0.002

# 2. Data

## 1) Prepare characters

In [7]:
# string module에서 출력 가능한 문자를 모두 불러오자.
all_characters = string.printable
print(all_characters)

# 출력가능한 문자들의 개수를 저장해놓는다.
n_characters = len(all_characters)
print(n_characters)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

100


## 2) Get Text Data

In [8]:
# 앞서 다운받은 텍스트 파일을 열어준다.
file = unidecode.unidecode(open('./data/input.txt').read())
file_len = len(file)
print('file_len', file_len)

file_len 1115394


# 3. functions for text processing

## 1) Random chunk
Q. 왜 필요한건지 이해가 되지 않음.

In [9]:
# 텍스트 파일의 일부를 불러오는 함수
def random_chunk():
    # (시작지점 < 텍스트 전체 길이 - 불러오는 길이)
    start_index = random.randint(0, file_len - chunk_len)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]

print(random_chunk())

in some vantage.
But make you ready your stiff bats and clubs:
Rome and her rats are at the point of battle;
The one side must have bale.
Hail, noble Marcius!

MARCIUS:
Thanks. What's the matter, you d


## 2). char to tensor

In [10]:
# 문자열을 받았을 때, 이를 indices로 바꾸어서 return함.
def char_tensor(string):
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor

print(char_tensor('ABCdef'))

tensor([36, 37, 38, 13, 14, 15])


## 3) chunk into input & label

In [0]:
# 인덱스화된 문자열을 입력값과 목표값으로 나눠주는 함수
# pytorch: pytorc -> ytorch

def random_training_set():
    chunk = random_chunk() # random하게 잘린 문자열
    input_ = char_tensor(chunk[:-1])
    target = char_tensor(chunk[1:])
    return input_, target

# 3. Model & Optimizer

In [0]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.input_size = input_size # 실제 input의 사이즈가 아니라, embedding lookup table의 개수.
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # Embedding
        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.rnn = nn.RNN(self.embedding_size, self.hidden_size, self.num_layers)
        self.decoder = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_, hidden):
        output = input_.view(1, -1) # [1 x 1]
        
        # embedding에선 indices들을 전달한다
        # e.g. torch.LongTensor([[1,2,4,5],[4,3,2,9]]) -> 2개의 batch, [1, 2, 4, 5] 번째 indices와 [4, 3, 2, 9]의 indices
        output = self.encoder(output) # [bacth x 1 x embedding_size]
        # print('after emgedding', output.size())
        
        # hidden: [num_layers * num_directions, batch, hidden_size]
        # output: [seq_len, batch, num_directions * hidden_size]
        output, hidden = self.rnn(output, hidden)
        # print('before view output:', output.size())
        # print('after decoder output:', output.view(batch_size, -1).size())
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden
    
    def init_hidden(self):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return hidden

model = RNN(n_characters, embedding_size, hidden_size, n_characters, num_layers)

```torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None)```  
A simple lookup table that stores embeddings of a fixed dictionary and size.

This module is often used to store word embeddings and retrieve them using indices. The input to the module is a list of indices, and the output is the corresponding word embeddings.

In [0]:
# Embedding 연습

embedding = nn.Embedding(10, 3)

# a batch of 2 samples of 4 indices each
indices = torch.LongTensor([[1,2,4,5],[4,3,2,9]]) # 2 x 4

embedding(indices) # [2 x 4 x 3] -> [(batch_size, indices), embedding_dim]

tensor([[0.2434, 0.0799, 0.0629]], grad_fn=<EmbeddingBackward>)

In [0]:
model = RNN(input_size=n_characters, 
            embedding_size=embedding_size,
            hidden_size=hidden_size, 
            output_size=n_characters, 
            num_layers=2)

In [0]:
# 모델 테스트

inp = char_tensor("A")
print(inp, inp.size())
print('View 적용:', inp.view(1, -1), inp.view(1, -1).size())

embedding = nn.Embedding(100, 100)
word_vector = embedding(inp.view(1, -1))
print('after embedding', word_vector.size())

hidden = model.init_hidden()
print(hidden.size())
out, hidden = model(inp, hidden)
print(out.size())

tensor([36]) torch.Size([1])
View 적용: tensor([[36]]) torch.Size([1, 1])
after embedding torch.Size([1, 1, 100])
torch.Size([2, 1, 100])
before view output: torch.Size([1, 1, 100])
after decoder output: torch.Size([1, 100])
torch.Size([1, 100])


In [0]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.CrossEntropyLoss() # 여긴 시발 또 cross entropy네?

In [0]:
# test function
# 임의의 문자 start_str 로 시작하는 길이 200짜리 모방 글을 생성하는 예제

def test():
    start_str = 'b'
    inp = char_tensor(start_str)
    hidden = model.init_hidden()
    x = inp

    print(start_str, end='')

    for i in range(200):
        output, hidden = model(x, hidden)

        # 여기서는 max값을 쓰지 않고 multinomial을 사용하는 이유는 만약 max값만 쓰는 경우에
        # 생성되는 text가 the the the ... 만 나오기 때문임.
        # multinomial 함수를 통해 높은 값을 가지는 문자들에 대해서 랜덤하게 다음 글자를 뽑아내는 방식으로 텍스트를 생성해보자

        output_dist = output.data.view(-1).div(0.8).exp() # Q. 굳이 이렇게 한 이유는?

        # input은 sample이고, 얼마나 뽑을지 결정하면 됨.
        top_i = torch.multinomial(output_dist, 1)[0] # [0]은 scalar값만 가져오겠단 소리.
        predicted_char = all_characters[top_i]

        print(predicted_char, end="")

        x = char_tensor(predicted_char)

In [0]:
# Train
# model = RNN(n_characters, embedding_size, hidden_size, n_characters, num_layers)

for i in range(num_epochs):
    # 랜덤한 텍스트 덩어리를 sampling하고, 이를 index tensor로 변환한다.
    input_, label = random_training_set() # 200개 짜리 chunk된 word의 indices
    hidden = model.init_hidden()
    
    # Q. 왜 굳이 tensor로 loss를 받는가?
    loss = torch.tensor([0]).type(torch.FloatTensor) # loss = 0 이라도 잘 작동함.
    optimizer.zero_grad()

    for j in range(chunk_len - 1):
        x = input_[j] # 글자 하나의 index -> tensor(22)

        # torch.unsqueeze(input, dim) -> 특정 자리에 차원을 추가하는 함수.
        y_ = label[j].unsqueeze(0).type(torch.LongTensor) # -> tensor([22]), 1x1
        y, hidden = model(x, hidden)
        loss += loss_func(y, y_)

    loss.backward()
    optimizer.step()

    if i % 100 ==0:
        print("\n",loss/chunk_len,"\n")
        test()
        print("\n","="*100)


 tensor(4.5962, grad_fn=<DivBackward0>) 

bkfwXle%Nh>#45L,]kX/ZN (Oz}4HLt^esHOZ?N#ORPlr@?n=GLAn=vt/#l8nED-j_97Q"AZZH@U<1P|gSw$KU1:i@yY]'8c4q|cji/H<%'@Jd":}5cEdIf+.TKfgD~[).7;d|^o\YL~[WYQNvYr3Dw~"	]r
>)7l<UTJN[Cy:OqLDx4uf}cc0tG9/qd"!X9-1;f%}

 tensor(2.3547, grad_fn=<DivBackward0>) 

bJ
Tovin
War sou sor, there aste pafes, abt uond,;

O min mawis tarino to the whes fo ga; wot. tareey wiag the the thoud Wd lL roul thor whis,
That yith th shers fallin roy eins ,ow thes wile seor ther

 tensor(2.2918, grad_fn=<DivBackward0>) 

bure an me withee zece norfes and thet in of' tire Hot tim lo wing Mell sis will thel

CD'''s mell not they ourt; beeat yite! mris maew

Sowat yarincere fou raos thov, lich ther st.
LE:
Act nof I EH:
T

 tensor(2.2984, grad_fn=<DivBackward0>) 

been the aus, the cance of, sungust blise bulla gheon the mase the shas waly suy and
Thenere,
I shame jour noce be fom hand thear hak semall I thou hill ongy way thour whaan shell
Hirs aw and for hake,

 tensor(2.0571, gra

# Experiment


## batch_size가 여러개인 RNN 모델을 직접 짜보자.

사실 모델 내에서 batch 부분에 대해서 건들 것이 없음. 기존의 [input] -> [output] 구조를, [batch x input] -> [batch x output] 으로 바꿔주면 됨.

In [0]:
### Hyper parameters
num_epochs = 2000
print_every = 100
plot_every = 10
chunk_len = 200
hidden_size = 100
batch_size = 5
num_layers = 2
embedding_size = 70
lr = 0.002

In [0]:
class RNNBatch(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        # Architecture
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        self.rnn = nn.RNN(self.embedding_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_, hidden):
        # input: [batch] == [5]

        # Embedding
        # input: [batch x indices] == [5 x 1] 우리는 단어마다 받을 것 (seq_len=1)
        output = self.embedding(input_.view(batch_size, -1))
        # output: [batch x indices x dim] == [5 x 1 x 100]

        # RNN
        # input: [batch, seq_len, features] == [5 x 1 x 100]. seq_len=1 이고 embedding_size가 input dim
        # hidden: [num_layers * num_directions, batch, hidden_Size] == [2, 5, 100]
        output, hidden = self.rnn(output, hidden)
        # output: [batch, seq_len, features] == [5, 1, 100]. input과 같음.
        # hidden: [num_layers * num_directions, batch, hidden_size] == [2, 5, 100]

        # FC
        # input: [batch_size, in_features] = [5 x 100]
        output = output.view(batch_size, -1) # [2, 5, 100] -> [5, 100]
        output = self.fc(output)
        # output: [batch_size, output_size]
        output = nn.functional.log_softmax(output)
        return output, hidden

    def init_hidden(self):
        hidden = torch.zeros(self.num_layers * 1, batch_size, self.hidden_size)
        return hidden


In [0]:
model = RNNBatch(input_size=n_characters,
          embedding_size=embedding_size,
          hidden_size=hidden_size,
          output_size=n_characters,
          num_layers=2)

In [0]:
# test function
# 임의의 문자 start_str 로 시작하는 길이 200짜리 모방 글을 생성하는 예제

def test_with_batch():
    start_str = 'bbbbb'
    inp = char_tensor(start_str) # [x, x, x, .. , x]
    hidden = model.init_hidden()
    x = inp
    predicted_chars = ['' for _ in range(batch_size)]
    print(start_str, end='')

    for i in range(200):
        output, hidden = model(x, hidden)

        # 여기서는 max값을 쓰지 않고 multinomial을 사용하는 이유는 만약 max값만 쓰는 경우에
        # 생성되는 text가 the the the ... 만 나오기 때문임.
        # multinomial 함수를 통해 높은 값을 가지는 문자들에 대해서 랜덤하게 다음 글자를 뽑아내는 방식으로 텍스트를 생성해보자

        output_dist = output.data.div(0.8).exp() 
        top_i = torch.multinomial(output_dist, 1).view(-1)
        predicted_char = [all_characters[item] for item in top_i]
        predicted_chars = [predicted_chars[item] + predicted_char[item] for item in range(batch_size)]


        x = char_tensor("".join(predicted_char))
    
    for item in predicted_chars:
        print(item, sep='\n')

In [31]:
# 모델 테스트

inp = char_tensor("ABCDE")
print(inp, inp.size())
print('View 적용:', inp.view(1, -1), inp.view(batch_size, -1).size())

embedding = nn.Embedding(100, 70)
word_vector = embedding(inp.view(batch_size, -1))
print('after embedding', word_vector.size())

hidden = model.init_hidden()
print(hidden.size())
out, hidden = model(inp, hidden)
print(out.size())

# test 함수
# output_dist = out.data.view(-1).div(0.8).exp()
output_dist = out.data.div(0.8).exp()  # batch의 경우, view를 안해줘야 되니까.
print(output_dist.size())

print('='*100)
print('multinomial')
top_i = torch.multinomial(output_dist, 1)
print(top_i, top_i.size())
top_i = top_i.view(-1)
print(top_i)
predicted_char = [all_characters[item] for item in top_i]
print(predicted_char)
predicted_chars = ['' for _ in range(batch_size)]
predicted_chars = [predicted_chars[item] + predicted_char[item] for item in range(batch_size)]
print(predicted_chars)
x = char_tensor("".join(predicted_char))
print(x)

tensor([36, 37, 38, 39, 40]) torch.Size([5])
View 적용: tensor([[36, 37, 38, 39, 40]]) torch.Size([5, 1])
after embedding torch.Size([5, 1, 70])
torch.Size([2, 5, 100])
torch.Size([5, 100])
torch.Size([5, 100])
multinomial
tensor([[31],
        [69],
        [51],
        [ 3],
        [62]]) torch.Size([5, 1])
tensor([31, 69, 51,  3, 62])
['v', '(', 'P', '3', '!']
['v', '(', 'P', '3', '!']
tensor([31, 69, 51,  3, 62])




잘 나오는 것을 확인함. 그러나 문제는 training하는 과정인데...

In [32]:
# Train
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.NLLLoss() # multinomial이라서 Cross Entropy 사용 X

for i in range(num_epochs):
    # 랜덤한 텍스트 덩어리를 sampling하고, 이를 index tensor로 변환한다.
    input_, label = random_training_set() # chunk_len = 200 이므로, 5의 배수.
    hidden = model.init_hidden()
    # print(input_.size())
    
    loss = torch.tensor([0]).type(torch.FloatTensor)
    optimizer.zero_grad()
    for j in range(chunk_len - batch_size): # chunck_len - 1 - (batch_size) + 1
        x = torch.stack([input_[j+k:j+k+1] for k in range(batch_size)], dim=0) # [5, 1]
        y_ = torch.stack([label[j+k+1:j+k+1+1] for k in range(batch_size)],dim=0) # [5, 1]
        y_ = y_.view(-1).long() # [batch_size, 1]
        y, hidden = model(x, hidden) # [batch_size, output_size]
        # print(y.size(), y_.size())
        loss += loss_func(y, y_)

    loss.backward()
    optimizer.step()

    if i % 100 ==0:
        print("\n",loss/chunk_len,"\n")
        # test를 해야 하는데, batch size가 다를 경우에는 어떻게 inference를 진행하지...?
        test_with_batch()
        print("\n","="*100)




 tensor([4.4976], grad_fn=<DivBackward0>) 

bbbbb354OT	M0TcpyS%abN1,lJ='	[I?<Vo.}{#[itEl~&*4rxg]).!Ct=*_2T4Y}n!kP3	'_f:
E&~dk.#1rEr3?$B1{7Z~,+J4! biZgaSTUXiS_Kn-0|sd
n75>4wHejnf>{?{2}aQgNQ|U'/V5|vZak@o?B5<_P~R_^ ,]v#Rr%s$}gG{9LZwQg ;@e-"/4K1CkZE
N96
3Xv+*`VGUlP@,ay%#V!"5W7d x~8~qs36aY
LmIwY['|\?D]t(8*Ty-lggQX"te'rSHU*]:ESCO@_9kT[Q0[s&>P8Atg7meo~-"Z\[Emfl?\3iw`9c}59N8<}YxSh?,& t:!~EWC<Uv@A3QlgE8cTO#^0e'3y#xa*#zU08c?<lD~hHGn#o+<eh>0J
<S^K=wP|N-\_8-|C7!/FRO%;m&BydN<L=C6Z8IxPr6U(<D.-Y"7#`g#V98}bXa?"X``0wq^{pcY=U#<!M_@?J(+lRS	q(%1p k$\.oV-$$^2vd%2 CXEz-fP_E6_ l),P#MCDRsXe}lG(S6TdO`Q@|E_V.n'	9.#;L]oNJn0,wyJ,uA#T9}8ECgH-=rn:#}<|*
|p|qs;tz)4m``+bX+D<gQ,fAuM;dH7V;LcAU1J}?p"v\	MY7xp4xmc^up)]5wOg	!]q>;!d	c~Kst68(}VlN[
F~u|q'K+J\77/xe`m7^:\i6Ij{V	s5466md3}xKTw#{iLrv~@J5w{.>knAyCJ%?(W^?-	MYGVmn.L${k?:^C}^m#0)U TA^9_S{n5/E "3}
m_iNS.lt[.O\8uG_\i70}gLo'df 1bcRAob|>\-[YYc-6DF?|>|e8 OCp:-|G)_j&loe&)Ro#gb*l4}nwRwfERB |,COeZ:f64h.d%onX5tOSP_U.;!rp z%;)ONYP3l*}I;\X=
o99-97

batch로 진행했을 때 training이 잘 되지 않는 것 처럼 보인다. loss는 약 2.3 정도에서 수렴한다.