## 3. Sequence 단위 RNN(Char RNN)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

## 1. 훈련 데이터 전처리하기

In [2]:
sentence = ("if you want to build a ship, don't drum up people together to "
            "collect wood and don't assign them tasks and work, but rather "
            "teach them to long for the endless immensity of the sea.")

In [3]:
char_set = list(set(sentence)) # 중복을 제거한 문자 집합 생성
char_dic = {c: i for i, c in enumerate(char_set)} # 각 문자에 정수 인코딩

In [4]:
print(char_set)

['l', 'o', 'h', ' ', 'w', 'n', 'u', "'", 'i', 'y', 'p', ',', 'g', 'b', 'r', 'k', 's', 'a', 'm', 'd', 'e', 'c', 't', 'f', '.']


In [5]:
dic_size = len(char_dic)
print("문자 집합의 크기 : {}".format(dic_size))

문자 집합의 크기 : 25


In [6]:
# 하이퍼파리미터 설정
hidden_size = dic_size
sequence_length = 10 # 임의 숫자 지정
learning_rate = 0.1

In [7]:
# 데이터 구성
x_data = []
y_data = []

for i in range(0, len(sentence) - sequence_length):
  x_str = sentence[i:i + sequence_length]
  y_str = sentence[i + 1: i + sequence_length + 1]
  print(i, x_str, '->', y_str)

  x_data.append([char_dic[c] for c in x_str])  # x str to index
  y_data.append([char_dic[c] for c in y_str])  # y str to index

0 if you wan -> f you want
1 f you want ->  you want 
2  you want  -> you want t
3 you want t -> ou want to
4 ou want to -> u want to 
5 u want to  ->  want to b
6  want to b -> want to bu
7 want to bu -> ant to bui
8 ant to bui -> nt to buil
9 nt to buil -> t to build
10 t to build ->  to build 
11  to build  -> to build a
12 to build a -> o build a 
13 o build a  ->  build a s
14  build a s -> build a sh
15 build a sh -> uild a shi
16 uild a shi -> ild a ship
17 ild a ship -> ld a ship,
18 ld a ship, -> d a ship, 
19 d a ship,  ->  a ship, d
20  a ship, d -> a ship, do
21 a ship, do ->  ship, don
22  ship, don -> ship, don'
23 ship, don' -> hip, don't
24 hip, don't -> ip, don't 
25 ip, don't  -> p, don't d
26 p, don't d -> , don't dr
27 , don't dr ->  don't dru
28  don't dru -> don't drum
29 don't drum -> on't drum 
30 on't drum  -> n't drum u
31 n't drum u -> 't drum up
32 't drum up -> t drum up 
33 t drum up  ->  drum up p
34  drum up p -> drum up pe
35 drum up pe -> rum up peo
36

In [8]:
print(x_data[0])
print(y_data[0])

[8, 23, 3, 9, 1, 6, 3, 4, 17, 5]
[23, 3, 9, 1, 6, 3, 4, 17, 5, 22]


In [9]:
print([char_set[c] for c in x_data[0]])
print([char_set[c] for c in y_data[0]]) 

['i', 'f', ' ', 'y', 'o', 'u', ' ', 'w', 'a', 'n']
['f', ' ', 'y', 'o', 'u', ' ', 'w', 'a', 'n', 't']


In [10]:
x_one_hot = [np.eye(dic_size)[x] for x in x_data] # x 데이터는 원-핫 인코딩
X = torch.FloatTensor(x_one_hot)
Y = torch.LongTensor(y_data)

In [11]:
print("훈련 데이터의 크기 : {}".format(X.shape))
print("레이블의 크기 : {}".format(Y.shape))

훈련 데이터의 크기 : torch.Size([170, 10, 25])
레이블의 크기 : torch.Size([170, 10])


In [12]:
print(X[0])

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [13]:
print(Y[0])

tensor([23,  3,  9,  1,  6,  3,  4, 17,  5, 22])


# 2. 모델구현하기

In [14]:
class Net(torch.nn.Module):
  def __init__(self, input_dim, hidden_dim, layers):  # 현재 hidden_size는 dic_size와 같음
    super(Net, self).__init__()
    self.rnn = torch.nn.RNN(input_dim, hidden_dim, num_layers= layers, batch_first = True)
    self.fc = torch.nn.Linear(hidden_dim, hidden_dim, bias = True)

  def forward(self, x):
    x, _status = self.rnn(x)
    x = self.fc(x)

    return x

In [15]:
net = Net(dic_size, hidden_size, 2)

In [16]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), learning_rate)

In [17]:
outputs = net(X)
print(outputs.shape)

torch.Size([170, 10, 25])


In [18]:
torch.Size([170, 10, 25])

torch.Size([170, 10, 25])

In [19]:
print(outputs.view(-1, dic_size).shape)  # 2차원 텐서로 변환

torch.Size([1700, 25])


In [20]:
torch.Size([1700, 25])

torch.Size([1700, 25])

In [21]:
print(Y.shape)
print(Y.view(-1).shape)

torch.Size([170, 10])
torch.Size([1700])


In [22]:
for i in range(100):
  optimizer.zero_grad()
  outputs = net(X)  #  (170, 10, 25) 크기를 가진 텐서를 매 에포크마다 모델의 입력으로 사용
  loss = criterion(outputs.view(-1, dic_size), Y.view(-1))
  loss.backward()
  optimizer.step()

  # results의 텐서 크기는 (170, 10)
  results = outputs.argmax(dim = 2)  # dim = 2?
  predict_str = ""
  for j, result in enumerate(results):
    if j == 0:  # 처음에는 예측 결과를 전부 가져오지만
      predict_str += ''.join([char_set[t] for t in result])
    else:  # 그 다음에는 마지막 글자면 반복 추가(가장확률이 높은글지??)
      predict_str += char_set[result[-1]]

  print(predict_str)

sussss..nusnsssns..snnn..ss..ss..usss.snss.sdss.n.sssnssus.sss.usnnnusssss.nu.nss..us...ssn.ssnsnss..s.susnsss..nns..sdsnnsdsnns.ns.nsnss..fs.nsus.ssnnnuscn...susnn..sssss.ssn...n
    d t t  d d m m    d t t d d tt   t ttt t    dt   t   tt   t  dtt   t  t d d d t e t   t t ttt  t  d d d m t d t d t   tt   t d e ttm      t   t   t t d t i   ttt    m t e t m 
                                                                                                                                                                                   
tnhhdhthththtrnhhthnhnthththththththththnrthhdhthh rhththththntnthth thnh htrnrhthththththththtntnrnhnthththththththththththtrnththththththnththtnhhththththththrnnh hthndnhhththth
tah atth t taattaaa otttata haa toaaaa tttaaah aattaoaaotat  aa httotao d taa tta tot haa t a taaaata a t tt ht a toa oh t t ahtaa at ttdaaah tad taa ttt h otataaht tahtaoa ottoa 
tsoesoesosoeo eoeso soses p so  eseess so so seesse  o s es  eo sesso o s e s to  e k see oe ossse s