# 딥 러닝을 이용한 자연어 처리 입문

https://wikidocs.net/22660

## 08. NLP를 위한 딥 러닝 개요


### 08-4 순환신경망(RNN)


텍스트 분류, 태깅 작업, 기계 번역 등에 사용되는 모델

### 1. 순환 신경망(Recurrent Neural Network, RNN)

결과값 방향 : 출력층 방향 + 다음 시점(t+1)의 은닉층(자기 자신)

메모리 셀(RNN 셀) : RNN의 은닉층에서 활성화 함수를 통해 결과를 내보내는 노드

t-1 시점의 메모리 셀이 내보낸 값(은닉 상태, hidden state)이 t 시점의 입력으로 함께 들어간다.

#### 출력값의 수

- one-to-many : 이미지 캡셔닝
- many-to-one : 감성분류, 텍스트 분류
- many-to-many : 개체명 인식, 품사 태깅

### 2. 양방향 순환 신경망(Bidirectional Recurrent Neural Network)

이전 시점뿐만 아니라 이후 시점의 데이터도 입력(input)으로 활용하여 예측할 수 있다.

forward state(앞 시점의 은닉 상태)

backward state(뒤 시점의 은닉 상태)

이 두가지를 이용하여 한 시점의 결과값 계산

### 3. 케라스(Keras)로 RNN 구현하기

RNN의 입력 텐서 : (batch_size, timesteps, input_dim)

    SimpleRNN(hidden_size, input_shape = (time steps, input_dim),
                return_sequences = True or False)
    # hidden_size : 은닉 상태의 크기(= output_dim). 배치 사이즈가 아니다.
                
return_sequences 
- True : (배치 사이즈, time steps, output_dim) 의 3D 텐서 반환 (모든 시점의 은닉 상태)
- False : (배치 사이즈, output_dim) 의 2D 텐서 반환 (마지막 시점의 은닉 상태만, 기본값)

In [5]:
from keras.models import Sequential
from keras.layers import SimpleRNN

# A single SimpleRNN layer with 3 hidden units. Each sample is a sequence of
# 2 timesteps with 10 features; the batch size is left unspecified.
model = Sequential([SimpleRNN(3, input_shape=(2, 10))])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [6]:
# When the batch size is fixed in advance (8 here), the full
# (batch_size, timesteps, input_dim) shape goes through batch_input_shape.
model = Sequential([SimpleRNN(3, batch_input_shape=(8, 2, 10))])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_4 (SimpleRNN)     (8, 3)                    42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Same layer, now with return_sequences=True so that the hidden state of
# every timestep is returned instead of only the last one.
model = Sequential(
    [SimpleRNN(3, batch_input_shape=(8, 2, 10), return_sequences=True)]
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_5 (SimpleRNN)     (8, 2, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [3]:
# A stand-alone layer object: 3 hidden units over sequences of 5 timesteps
# with 100 features each, returning the hidden state of every timestep.
SimpleRNN(3, input_shape = (5,100), return_sequences= True)

<keras.layers.recurrent.SimpleRNN at 0x21bdfff5f28>

In [7]:
import pandas as pd
import numpy as np

In [40]:
from keras_preprocessing.text import Tokenizer

# Toy corpus: one sentence used as the whole training text.
text= "나랑 점심 먹으러 갈래 메뉴는 햄버거 점심 메뉴 좋지"

# Build the word -> integer index vocabulary from the corpus.
t = Tokenizer()
t.fit_on_texts([text])


# Integer-encode the sentence (a list with one index per word).
encoded = t.texts_to_sequences([text])[0]

# +1 because word indices start at 1, so index 0 is not assigned to any word.
vocab_size = len(t.word_index) + 1


print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 9


In [41]:
# Word -> index mapping; indices are assigned by word frequency
# (the most frequent word gets index 1).
print(t.word_index)

{'점심': 1, '나랑': 2, '먹으러': 3, '갈래': 4, '메뉴는': 5, '햄버거': 6, '메뉴': 7, '좋지': 8}


In [42]:
# Pair every word index with the one that follows it: [current, next].
# These pairs are the training examples for next-word prediction.
sequences = [list(pair) for pair in zip(encoded, encoded[1:])]

print('단어 묶음의 개수 : %d' %len(sequences))

단어 묶음의 개수 : 8


In [43]:
print(sequences)

# Inverted vocabulary (index -> word), used to show the pairs as readable text.
word_index_inv = {index: word for word, index in t.word_index.items()}

print([[word_index_inv[index] for index in pair] for pair in sequences])

[[2, 1], [1, 3], [3, 4], [4, 5], [5, 6], [6, 1], [1, 7], [7, 8]]
[['나랑', '점심'], ['점심', '먹으러'], ['먹으러', '갈래'], ['갈래', '메뉴는'], ['메뉴는', '햄버거'], ['햄버거', '점심'], ['점심', '메뉴'], ['메뉴', '좋지']]


In [44]:
import numpy as np

# Split the pairs column-wise: the first word of each pair becomes the
# input X, the second word becomes the label y.
X, y = (np.array(column) for column in zip(*sequences))

In [45]:
y

array([1, 3, 4, 5, 6, 1, 7, 8])

In [46]:
# one-hot encoding

from keras.utils import to_categorical

# Turn each integer label into a one-hot vector of length vocab_size.
y = to_categorical(y, num_classes = vocab_size)

print(y)

[[0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [47]:
y

array([[0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [49]:
# 다음 단어 예측 모델 만들기

from keras.layers import Embedding, Dense, SimpleRNN
from keras.models import Sequential

# Next-word prediction model: embedding -> SimpleRNN -> softmax classifier.
model = Sequential([
    # Vocabulary of vocab_size (9) entries mapped to 9-dimensional embedding
    # vectors; every sample is a single word, hence input_length=1.
    Embedding(vocab_size, 9, input_length=1),
    # The RNN output vector is also 9-dimensional; a larger size works too.
    SimpleRNN(9),
    # The output layer yields one softmax score per vocabulary entry (9).
    Dense(vocab_size, activation='softmax'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 9)              81        
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 9)                 171       
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 90        
Total params: 342
Trainable params: 342
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Multi-class classification over the vocabulary: the one-hot targets are
# paired with categorical cross-entropy, optimized with Adam.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Train for 500 epochs; verbose=2 logs one line per epoch.
model.fit(X, y, epochs = 500, verbose =2)

Epoch 1/500
 - 0s - loss: 2.1801 - acc: 0.3750
Epoch 2/500
 - 0s - loss: 2.1770 - acc: 0.3750
Epoch 3/500
 - 0s - loss: 2.1738 - acc: 0.3750
Epoch 4/500
 - 0s - loss: 2.1706 - acc: 0.3750
Epoch 5/500
 - 0s - loss: 2.1675 - acc: 0.3750
Epoch 6/500
 - 0s - loss: 2.1643 - acc: 0.3750
Epoch 7/500
 - 0s - loss: 2.1611 - acc: 0.3750
Epoch 8/500
 - 0s - loss: 2.1579 - acc: 0.3750
Epoch 9/500
 - 0s - loss: 2.1546 - acc: 0.3750
Epoch 10/500
 - 0s - loss: 2.1514 - acc: 0.5000
Epoch 11/500
 - 0s - loss: 2.1481 - acc: 0.5000
Epoch 12/500
 - 0s - loss: 2.1448 - acc: 0.5000
Epoch 13/500
 - 0s - loss: 2.1415 - acc: 0.5000
Epoch 14/500
 - 0s - loss: 2.1382 - acc: 0.5000
Epoch 15/500
 - 0s - loss: 2.1349 - acc: 0.5000
Epoch 16/500
 - 0s - loss: 2.1315 - acc: 0.5000
Epoch 17/500
 - 0s - loss: 2.1281 - acc: 0.5000
Epoch 18/500
 - 0s - loss: 2.1247 - acc: 0.5000
Epoch 19/500
 - 0s - loss: 2.1212 - acc: 0.5000
Epoch 20/500
 - 0s - loss: 2.1177 - acc: 0.5000
Epoch 21/500
 - 0s - loss: 2.1142 - acc: 0.5000
E

Epoch 171/500
 - 0s - loss: 1.0787 - acc: 0.8750
Epoch 172/500
 - 0s - loss: 1.0714 - acc: 0.8750
Epoch 173/500
 - 0s - loss: 1.0640 - acc: 0.8750
Epoch 174/500
 - 0s - loss: 1.0567 - acc: 0.8750
Epoch 175/500
 - 0s - loss: 1.0495 - acc: 0.8750
Epoch 176/500
 - 0s - loss: 1.0423 - acc: 0.8750
Epoch 177/500
 - 0s - loss: 1.0351 - acc: 0.8750
Epoch 178/500
 - 0s - loss: 1.0279 - acc: 0.8750
Epoch 179/500
 - 0s - loss: 1.0208 - acc: 0.8750
Epoch 180/500
 - 0s - loss: 1.0138 - acc: 0.8750
Epoch 181/500
 - 0s - loss: 1.0067 - acc: 0.8750
Epoch 182/500
 - 0s - loss: 0.9998 - acc: 0.8750
Epoch 183/500
 - 0s - loss: 0.9928 - acc: 0.8750
Epoch 184/500
 - 0s - loss: 0.9859 - acc: 0.8750
Epoch 185/500
 - 0s - loss: 0.9790 - acc: 0.8750
Epoch 186/500
 - 0s - loss: 0.9722 - acc: 0.8750
Epoch 187/500
 - 0s - loss: 0.9654 - acc: 0.8750
Epoch 188/500
 - 0s - loss: 0.9587 - acc: 0.8750
Epoch 189/500
 - 0s - loss: 0.9520 - acc: 0.8750
Epoch 190/500
 - 0s - loss: 0.9453 - acc: 0.8750
Epoch 191/500
 - 0s 

Epoch 339/500
 - 0s - loss: 0.3778 - acc: 0.8750
Epoch 340/500
 - 0s - loss: 0.3762 - acc: 0.8750
Epoch 341/500
 - 0s - loss: 0.3746 - acc: 0.8750
Epoch 342/500
 - 0s - loss: 0.3731 - acc: 0.8750
Epoch 343/500
 - 0s - loss: 0.3715 - acc: 0.8750
Epoch 344/500
 - 0s - loss: 0.3700 - acc: 0.8750
Epoch 345/500
 - 0s - loss: 0.3685 - acc: 0.8750
Epoch 346/500
 - 0s - loss: 0.3670 - acc: 0.8750
Epoch 347/500
 - 0s - loss: 0.3656 - acc: 0.8750
Epoch 348/500
 - 0s - loss: 0.3641 - acc: 0.8750
Epoch 349/500
 - 0s - loss: 0.3627 - acc: 0.8750
Epoch 350/500
 - 0s - loss: 0.3613 - acc: 0.8750
Epoch 351/500
 - 0s - loss: 0.3598 - acc: 0.8750
Epoch 352/500
 - 0s - loss: 0.3584 - acc: 0.8750
Epoch 353/500
 - 0s - loss: 0.3571 - acc: 0.8750
Epoch 354/500
 - 0s - loss: 0.3557 - acc: 0.8750
Epoch 355/500
 - 0s - loss: 0.3544 - acc: 0.8750
Epoch 356/500
 - 0s - loss: 0.3530 - acc: 0.8750
Epoch 357/500
 - 0s - loss: 0.3517 - acc: 0.8750
Epoch 358/500
 - 0s - loss: 0.3504 - acc: 0.8750
Epoch 359/500
 - 0s 

<keras.callbacks.History at 0x19aa77e1f60>

In [53]:
# 다음 단어 예측
def predict_next_word(model, t, current_word):
    """Return the word the model predicts to follow ``current_word``.

    Args:
        model: Trained model whose ``predict`` returns, for each sample, one
            score per vocabulary index (e.g. a softmax output).
        t: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Input text. When it contains several known words, only
            the last one is fed to the model.

    Returns:
        The predicted next word, or ``None`` when ``current_word`` contains
        no word known to the tokenizer or the predicted index maps to no
        word.
    """
    encoded = t.texts_to_sequences([current_word])[0]
    if not encoded:
        # Out-of-vocabulary input: there is nothing to feed to the model.
        return None

    # Shape (1, 1): one sample consisting of one timestep.
    model_input = np.array([[encoded[-1]]])

    # Sequential.predict_classes() no longer exists in recent Keras releases;
    # taking the argmax over predict() gives the same class index.
    scores = model.predict(model_input, verbose=0)
    result = int(np.argmax(scores, axis=-1)[0])

    # Reverse lookup: find the word whose index equals the predicted class.
    for word, index in t.word_index.items():
        if index == result:
            return word
    return None

In [54]:
print(predict_next_word(model, t, '먹으러'))

갈래


In [57]:
# 다음 문장을 예측
def sentence_generation(model, t, current_word, n):
    """Generate text by feeding each predicted word back into the model.

    Args:
        model: Trained model whose ``predict`` returns, for each sample, one
            score per vocabulary index (e.g. a softmax output).
        t: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text that starts the sentence. When it contains
            several known words, only the last one is fed to the model.
        n: Maximum number of words to generate.

    Returns:
        The seed followed by the generated words, separated by single
        spaces. Generation stops early when the current word is unknown to
        the tokenizer or the predicted index maps to no word.
    """
    generated = [current_word]

    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        if not encoded:
            # Out-of-vocabulary word: the model cannot continue from here.
            break

        # Shape (1, 1): one sample consisting of one timestep.
        model_input = np.array([[encoded[-1]]])

        # Sequential.predict_classes() no longer exists in recent Keras
        # releases; the argmax over predict() gives the same class index.
        scores = model.predict(model_input, verbose=0)
        result = int(np.argmax(scores, axis=-1)[0])

        # Reverse lookup of the predicted index. Without a match we stop
        # instead of appending an unrelated word.
        next_word = next(
            (word for word, index in t.word_index.items() if index == result),
            None,
        )
        if next_word is None:
            break

        # The prediction becomes the input for the next step.
        current_word = next_word
        generated.append(next_word)

    return ' '.join(generated)

In [58]:
print(sentence_generation(model, t, '먹으러', 6))

먹으러 갈래 메뉴는 햄버거 점심 먹으러 갈래
