### 문자 단위 RNN(Char RNN)
- RNN의 입출력의 단위가 단어 레벨(word-level)이 아니라 문자 레벨(character-level)로 하여 RNN을 구현한다면, 이를 문자 단위 RNN이라고 합니다.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

#### 1. 훈련 데이터 전처리하기

In [2]:
input_str = 'apple'
label_str = 'pple!ep'
char_vocab = sorted(list(set(input_str+label_str)))
vocab_size = len(char_vocab)
print ('문자 집합의 크기 : {}'.format(vocab_size))
char_vocab

문자 집합의 크기 : 5


['!', 'a', 'e', 'l', 'p']

In [3]:
input_size = vocab_size # 입력의 크기는 문자 집합의 크기
hidden_size = 8
output_size = 7
learning_rate = 0.1

In [4]:
char_to_index = dict((c,i) for i, c in enumerate(char_vocab))
print(char_to_index)

index_to_char = {} # 마지막에 다시 변환해주기 위함
for key, value in char_to_index.items():
    index_to_char[value] = key
print(index_to_char)

{'!': 0, 'a': 1, 'e': 2, 'l': 3, 'p': 4}
{0: '!', 1: 'a', 2: 'e', 3: 'l', 4: 'p'}


In [5]:
x_data = [char_to_index[i] for i in input_str]
y_data = [char_to_index[i] for i in label_str]
x_data, y_data

([1, 4, 4, 3, 2], [4, 4, 3, 2, 0, 2, 4])

In [6]:
# 배치 차원 추가
# 텐서 연산인 unsqueeze(0)를 통해 해결할 수도 있었음.
x_data = [x_data]
y_data = [y_data]

x_data, y_data

([[1, 4, 4, 3, 2]], [[4, 4, 3, 2, 0, 2, 4]])

In [7]:
# np.eye -> identity행렬 만듦
x_one_hot = [np.eye(vocab_size)[x] for x in x_data] 
x_one_hot

[array([[0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.]])]

In [8]:
X = torch.FloatTensor(x_one_hot)
Y = torch.LongTensor(y_data)

print('훈련 데이터의 크기 : {}'.format(X.shape))
print('레이블의 크기 : {}'.format(Y.shape))

훈련 데이터의 크기 : torch.Size([1, 5, 5])
레이블의 크기 : torch.Size([1, 7])


#### 2. model 만들고 train하기

In [9]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size, bias=True)
    
    def forward(self, x):
#         print(x.shape) # (bs, in_size, word_vec_size)
        x, _status = self.rnn(x)
#         print(x.shape) # (bs, in_size, hidden_size)
        x = self.fc(x)
#         print(x.shape) # (bs, in_size, out_size)
        return x

In [10]:
net = Net(input_size, hidden_size, output_size)

y_hat = net(X)
y_hat.shape # (bs, timesteps=input_size, output_size)

torch.Size([1, 5, 7])

In [11]:
print(y_hat.view(-1, input_size).shape) 
# (output_size, bs+timesteps)

torch.Size([7, 5])


In [12]:
print(Y.shape)
print(Y.view(-1).shape)

torch.Size([1, 7])
torch.Size([7])


In [13]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), learning_rate)

In [14]:
for i in range(10):
    optimizer.zero_grad()
    outputs = net(X)
    
#     print(outputs.shape) # (bs, time_step_size(=input_size), output_size)
#     print(Y.shape) # (bs, output_size)
#     print(outputs.view(-1, input_size).shape)
#     print(Y.view(-1).shape)
    
    loss = criterion(outputs, Y)
    # view -> 배치차원 제거
#     loss = criterion(outputs.view(-1, input_size), Y.view(-1))
    loss.backward()
    optimizer.step()
#     print(outputs.data.shape) # (bs, time_step_size(=input_size), output_size)
    result = outputs.data.numpy().argmax(axis=1)
    result_str = ''.join([index_to_char[c] for c in np.squeeze(result)])
    print(i, "loss: ", loss.item(), "prediction str: ", result_str, '\n', "prediction: ", result, "true Y: ", y_data, '\n')

0 loss:  1.5952900648117065 prediction str:  pll!pep 
 prediction:  [[4 3 3 0 4 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

1 loss:  1.3477944135665894 prediction str:  p!l!pep 
 prediction:  [[4 0 3 0 4 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

2 loss:  1.000227689743042 prediction str:  pplepep 
 prediction:  [[4 4 3 2 4 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

3 loss:  0.650600254535675 prediction str:  pplepep 
 prediction:  [[4 4 3 2 4 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

4 loss:  0.4056902527809143 prediction str:  pple!ep 
 prediction:  [[4 4 3 2 0 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

5 loss:  0.25015988945961 prediction str:  pple!ep 
 prediction:  [[4 4 3 2 0 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

6 loss:  0.14836657047271729 prediction str:  pple!ep 
 prediction:  [[4 4 3 2 0 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

7 loss:  0.08627326041460037 prediction str:  pple!ep 
 prediction:  [[4 4 3 2 0 2 4]] true Y:  [[4, 4, 3, 2, 0, 2, 4]] 

8 loss:  0.05114513263106346 predi

### 문자 단위 RNN(Char RNN) - 더 많은 데이터

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim

sentence = ("if you want to build a ship, don't drum up people together to "
            "collect wood and don't assign them tasks and work, but rather "
            "teach them to long for the endless immensity of the sea.")

char_set = list(set(sentence)) # 중복을 제거한 문자 집합 생성
char_dic = {c: i for i, c in enumerate(char_set)} # 각 문자에 정수 인코딩
print(char_dic)

{'n': 0, 'g': 1, 'f': 2, 'i': 3, 'e': 4, 'p': 5, 'o': 6, 'y': 7, 'd': 8, 'm': 9, "'": 10, 'b': 11, 'l': 12, ',': 13, 'w': 14, 'a': 15, 'k': 16, 'r': 17, ' ': 18, 'u': 19, 'c': 20, 't': 21, '.': 22, 'h': 23, 's': 24}


In [68]:
dic_size = len(char_dic)
print('문자 집합의 크기 : {}'.format(dic_size))

문자 집합의 크기 : 25


In [69]:
hidden_size = dic_size
sequence_length = 10  # 임의 숫자 지정
learning_rate = 0.1

In [70]:
len(sentence) , sequence_length

(180, 10)

In [71]:
x_data = []
y_data = []

for i in range(0, len(sentence) - sequence_length):
    x_str = sentence[i : i+sequence_length]
    y_str = sentence[i+1 : i+sequence_length+1]
    if i == 0:
        print(i, x_str, '->', y_str)
    
    x_data.append([char_dic[c] for c in x_str]) # x str to idx
    y_data.append([char_dic[c] for c in y_str])

0 if you wan -> f you want


In [72]:
print("(", len(x_data),", ", len(x_data[0]), ")")
print("(", len(y_data),", ", len(y_data[0]), ")")
print(x_data[0]) # if you wan에 해당됨.
print(y_data[0]) # f you want에 해당됨.

( 170 ,  10 )
( 170 ,  10 )
[3, 2, 18, 7, 6, 19, 18, 14, 15, 0]
[2, 18, 7, 6, 19, 18, 14, 15, 0, 21]


In [73]:
x_one_hot = [np.eye(dic_size)[x] for x in x_data] # x 데이터는 원-핫 인코딩
X = torch.FloatTensor(x_one_hot) # one hot encoded
Y = torch.LongTensor(y_data) # one hot encoded

In [74]:
print('훈련 데이터의 크기 : {}'.format(X.shape))
print('레이블의 크기 : {}'.format(Y.shape))

훈련 데이터의 크기 : torch.Size([170, 10, 25])
레이블의 크기 : torch.Size([170, 10])


In [75]:
class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, layers):
        super(Net, self).__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True)
        
    def forward(self, x):
#         print(x.shape)
        x, _status = self.rnn(x)
#         print(x.shape)
        x = self.fc(x)
        return x

In [81]:
criterion = torch.nn.CrossEntropyLoss()
deeper_net = Net(dic_size, hidden_size, 2)
optimizer = optim.Adam(deeper_net.parameters(), learning_rate)

for i in range(100):
    optimizer.zero_grad()
#     print(X.shape)
    outputs = deeper_net(X) 
#     print(outputs.shape)
#     print(Y.shape)
#     print(outputs.view(-1, dic_size).shape)
#     print(Y.view(-1).shape)
    loss = criterion(outputs.view(-1, dic_size), Y.view(-1))
    loss.backward()
    optimizer.step()
    results = outputs.argmax(dim=2)
#     print(results)
    predict_str = ""
    for j, result in enumerate(results):
        if j == 0: # 처음에는 예측 결과를 전부 가져오지만
            predict_str += ''.join([char_set[t] for t in result])
        else: # 그 다음에는 마지막 글자만 반복 추가
            predict_str += char_set[result[-1]]
    
    print(predict_str, '\n')
    

ygllllllllllllllllllylllllllyllllllllyllllllllllllllllllllllllllllllyllllllllllllllllllylllllllllllllyyllllllllyllylllllllllllllyllllllllllllllllllllllllllllylllllllllllllllllllll 

       w                                                  w                                                                w                                                        

                                                                                                                                                                                    

ew.hewooe..ssssss.essssseg.sas.es.msmssssessssssssssssssssssswssmssssssessssessssess.ssesseees.ssm.sss.sees.esss.sssssesesessesmsmsesssegmes.sssssssesssesees.sssssessssmssssemsses 

oooooomooomoooooomoooooooooooomoomoooooooooooooooooooooooooooooooooooooooooooooooooomoooooooooooooooooooooooooooaoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo 

tott oot ooooooooooooooooolooooooooooooooooooooooooooooooloooolooooooooooooooooooooooooooo

g you want to build a ship, don't drum up people together to collect wood and don't assign them tasks and work, but rather teach them ta long for the endless immensity of the sean 

g you want to build a ship, don't arum up people together to collect wood and don't assign them tosks and work, but rather teach them to long for the endless immensity of the sean 

g you want to build a ship, don't drum up people together to collect wood and don't assign them tosks and work, but rather teach them to long for the endless immensity of the sean 

g you want to build a ship, don't drum up people together to collect wood and don't assign them tosks and work, but rather teach them to long for the endless immensity of the sean 

g you want to build a ship, don't drum up people together to collect wood and don't assign them tosks and work, but rather teach them to long for the endless immensity of the sean 

l you want to build a ship, don't drum up people together to collect wood and don't assign

### 단어 단위 RNN - 임베딩 사용

In [85]:
import torch
import torch.nn as nn
import torch.optim as optim

sentence = "Repeat is the best medicine for memory".split()

vocab = list(set(sentence))
print(vocab)

['the', 'best', 'medicine', 'Repeat', 'memory', 'for', 'is']


In [86]:
word2index = {tkn: i for i, tkn in enumerate(vocab, 1)}  # 단어에 고유한 정수 부여
word2index['<unk>']=0
print(word2index)

{'the': 1, 'best': 2, 'medicine': 3, 'Repeat': 4, 'memory': 5, 'for': 6, 'is': 7, '<unk>': 0}


In [87]:
# 수치화된 데이터를 단어로 바꾸기 위한 사전
index2word = {v: k for k, v in word2index.items()}
print(index2word)

{1: 'the', 2: 'best', 3: 'medicine', 4: 'Repeat', 5: 'memory', 6: 'for', 7: 'is', 0: '<unk>'}


In [94]:
def build_data(sentence, word2index):
    encoded = [word2index[token] for token in sentence]
    print("encoded:", encoded)
    input_seq, label_seq = encoded[:-1], encoded[1:]
    input_seq = torch.LongTensor(input_seq).unsqueeze(0)
    label_seq = torch.LongTensor(label_seq).unsqueeze(0)
    return input_seq, label_seq

X, Y = build_data(sentence, word2index)
print("input:", X)
print("output:", Y)

encoded: [4, 7, 1, 2, 3, 6, 5]
input: tensor([[4, 7, 1, 2, 3, 6]])
output: tensor([[7, 1, 2, 3, 6, 5]])


In [106]:
class Net(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, batch_first=True):
        super(Net, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, # 워드 임베딩
                                            
                                            embedding_dim=input_size)
        self.rnn_layer = nn.RNN(input_size, hidden_size, # 입력 차원, 은닉 상태의 크기 정의
                                batch_first=batch_first)
        self.linear = nn.Linear(hidden_size, vocab_size) # 출력은 원-핫 벡터의 크기를 가져야함. 또는 단어 집합의 크기만큼 가져야함.
        
    def forward(self, x):
#         print(x.shape)
        
        output = self.embedding_layer(x)
#         print(output.shape)
        
        output, hidden = self.rnn_layer(output)
#         print(output.shape)
        
        output = self.linear(output)
#         print( output.view(-1, output.size(2)).shape)
        return output.view(-1, output.size(2))

In [113]:
# 하이퍼 파라미터
vocab_size = len(word2index)  # 단어장의 크기는 임베딩 층, 최종 출력층에 사용된다. <unk> 토큰을 크기에 포함한다.
input_size = 5  # 임베딩 된 차원의 크기 및 RNN 층 입력 차원의 크기
hidden_size = 20  # RNN의 은닉층 크기

# 모델 생성
model = Net(vocab_size, input_size, hidden_size, batch_first=True)
# 손실함수 정의
loss_function = nn.CrossEntropyLoss() # 소프트맥스 함수 포함이며 실제값은 원-핫 인코딩 안 해도 됨.
# 옵티마이저 정의
optimizer = optim.Adam(model.parameters())
decode = lambda y: [index2word.get(x) for x in y]

for step in range(201):
    # 경사 초기화
    optimizer.zero_grad()
    # 순방향 전파
    output = model(X)
    # 손실값 계산
    loss = loss_function(output, Y.view(-1))
    # 역방향 전파
    loss.backward()
    # 매개변수 업데이트
    optimizer.step()
    # 기록
    if step % 40 == 0:
        print("[{:02d}/201] {:.4f} ".format(step+1, loss))
        pred = output.softmax(-1).argmax(-1).tolist()
        print(" ".join(["Repeat"] + decode(pred)))
        print()

[01/201] 2.1626 
Repeat <unk> Repeat for Repeat <unk> <unk>

[41/201] 1.5776 
Repeat is the best medicine for memory

[81/201] 0.9204 
Repeat is the best medicine for memory

[121/201] 0.4742 
Repeat is the best medicine for memory

[161/201] 0.2519 
Repeat is the best medicine for memory

[201/201] 0.1478 
Repeat is the best medicine for memory

