In [1]:
import os 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf
tf.__version__

'2.12.0'

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Dense, Bidirectional

# 2. Keras로 LSTM 구현하기

- 기본적인 활용은 simple RNN과 거의 동일

https://keras.io/api/layers/recurrent_layers/lstm/

In [5]:
x = tf.random.uniform((1, 3, 4))
x.shape # (데이터 개수, seq의 길이(단어의 개수), 단어의 임베딩 차원)

TensorShape([1, 3, 4])

In [6]:
LSTM(8)(x) # unit 수 - hidden state의 차원의 크기 --> 누적된 seq의 정보가 8차원 vector에 기록
           # 따라서, 정보가 많으면 unit의 수가 더 커야함

<tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[ 0.05510229, -0.15806057, -0.0120617 ,  0.21166593, -0.00374101,
         0.13033758, -0.04504932,  0.12556145]], dtype=float32)>

In [7]:
out =  LSTM(8, return_sequences = True, return_state = True)(x)
out # 모든 seq의 hidden state / 마지막 hidden state / cell state

[<tf.Tensor: shape=(1, 3, 8), dtype=float32, numpy=
 array([[[-0.1027828 , -0.02570496,  0.04342749,  0.1212106 ,
          -0.05779904, -0.07455692, -0.02374983,  0.11319604],
         [-0.09873844, -0.00498222,  0.07658664,  0.18294671,
          -0.08071811, -0.08066523, -0.03855846,  0.1389728 ],
         [-0.11606564, -0.01325427,  0.12444531,  0.2882171 ,
          -0.10411558, -0.08987059, -0.0307428 ,  0.19376348]]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
 array([[-0.11606564, -0.01325427,  0.12444531,  0.2882171 , -0.10411558,
         -0.08987059, -0.0307428 ,  0.19376348]], dtype=float32)>,
 <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
 array([[-0.25342947, -0.02778663,  0.25038716,  0.61581564, -0.18369953,
         -0.20878783, -0.06350601,  0.37216583]], dtype=float32)>]

In [8]:
len(out)
# RNN과 다르게 len이 3인 이유는 LSTM은 RNN과의 차이점 존재 --> cell state

3

- 양방향 RNN

In [9]:
Bidirectional(LSTM(8))(x) # 마지막 hidden state가 두 배가 됨

<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
array([[ 0.09339321, -0.16120915, -0.1861935 ,  0.03517503, -0.10660518,
        -0.08419734,  0.04895993, -0.17647159,  0.00752816,  0.20217797,
        -0.02757097, -0.01164663, -0.01385706,  0.2689354 , -0.00155638,
         0.040525  ]], dtype=float32)>

사실 실제로 SimpleRNN이 사용되는 경우는 거의 없습니다. 이보다는 LSTM이나 GRU을 주로 사용하는데, 이번에는 임의의 입력에 대해서 LSTM을 사용할 경우를 보겠습니다.

- 품사 태깅 문제

In [10]:
John = [1,0,0,0]
loves = [0,1,0,0]
Jane = [0,0,1,0]
Alex = [0,0,0,1]

train_X = np.array([
    [ John, loves, Jane ],
    [ Jane, loves, Alex ]
]).astype(np.float32)

S = [0] # subject
V = [1] # verb
O = [2] # object

idx2tag = ['S', 'V', 'O']

train_Y = np.array([[S, V, O], [S, V, O]]).astype(np.float32)

print("train_y", train_Y)
print("train_X의 shape", train_X.shape)
print("train_Y의 shape", train_Y.shape)

train_y [[[0.]
  [1.]
  [2.]]

 [[0.]
  [1.]
  [2.]]]
train_X의 shape (2, 3, 4)
train_Y의 shape (2, 3, 1)


In [None]:
num_classes = 3
input_dim = 4  
sequence_length = 3
learning_rate = 0.1


lstm = LSTM(num_classes)
output = lstm(train_X)

print('hidden state : {}, shape: {}'.format(output, output.shape))

In [None]:
# return_sequences = True 

lstm = LSTM(3, return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = lstm(train_X)

print('whole_seq_output: {}, shape: {}'.format(whole_seq_output, whole_seq_output.shape))
print('final_memory_state : {}, shape: {}'.format(final_memory_state, final_memory_state.shape))
print('final_carry_state : {}, shape: {}'.format(final_carry_state, final_carry_state.shape))

- LSTM

In [14]:
from tensorflow.keras import layers, models

lstm_model = models.Sequential() #모델 호출
lstm_model.add(
    layers.LSTM(units=3,
                input_shape = (3,4), 
                return_sequences = True, # !!!
                name='LSTM-1')
    )

lstm_model.add(
    layers.Dense(
        units=3,
        input_shape=(3,3), 
        activation= 'softmax', 
        name='hidden-to-output')) # 출력을 위한 FFN


lstm_model.compile(
    loss='sparse_categorical_crossentropy', # 분류? 회귀? 생성? 추천? --> target 형태
    optimizer='adam',
    metrics=['accuracy'])


In [16]:
lstm_model.fit(train_X, train_Y, epochs= 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f04c8fa5880>

In [17]:
predictions = lstm_model.predict(train_X) # 3개의 class에 대한 확률값
for i, prediction in enumerate(predictions):
  print(prediction)
  print(np.argmax(prediction, axis=1)) # 확률이 큰 index --> class
  result_str = [idx2tag[c] for c in np.argmax(prediction, axis=1) ] # index에서 tag 확인
  print("\tPrediction str: ", "".join(result_str))

[[0.3516856  0.30227014 0.34604418]
 [0.32119507 0.38397402 0.29483098]
 [0.3234248  0.3205833  0.3559919 ]]
[0 1 2]
	Prediction str:  SVO
[[0.34340426 0.30303183 0.35356385]
 [0.31654036 0.38435557 0.29910412]
 [0.3287321  0.3150349  0.356233  ]]
[2 1 2]
	Prediction str:  OVO


모델을 이해하려면 weight 개수(W0\~Wn)를 세어봐야 합니다.

In [18]:
lstm_model.summary() # RNN에 비해 LSTM이 4배 수준의 parameter 수를 보인다 --> 모델의 복잡도 증가(더 복잡한 문제 해결한다는 '장점' but! 더 많은 데이터를 필요로 한다는 '단점')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 LSTM-1 (LSTM)               (None, 3, 3)              96        
                                                                 
 hidden-to-output (Dense)    (None, 3, 3)              12        
                                                                 
Total params: 108
Trainable params: 108
Non-trainable params: 0
_________________________________________________________________


In [19]:
for model_weight in lstm_model.weights:
    print(model_weight.name, '=>', model_weight.shape)

LSTM-1/lstm_cell_6/kernel:0 => (4, 12)
LSTM-1/lstm_cell_6/recurrent_kernel:0 => (3, 12)
LSTM-1/lstm_cell_6/bias:0 => (12,)
hidden-to-output/kernel:0 => (3, 3)
hidden-to-output/bias:0 => (3,)


### 문제1 

In [None]:
model = Sequential()
model.add(LSTM(7, input_shape=(100,5)))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 7)                 364       
                                                                 
Total params: 364
Trainable params: 364
Non-trainable params: 0
_________________________________________________________________


- W_forget : (num_units + input_dim + 1) * num_units
- W_input : (num_units + input_dim + 1) * num_units
- W_output : (num_units + input_dim + 1) * num_units
- W_cell : (num_units + input_dim + 1) * num_units

### 문제2

In [None]:
model = Sequential()
model.add(LSTM(5, input_shape = (2, 10)))
model.add(Dense(1))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 5)                 320       
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 326
Trainable params: 326
Non-trainable params: 0
_________________________________________________________________


- W_forget : (num_units + input_dim + 1) * num_units
- W_input : (num_units + input_dim + 1) * num_units
- W_output : (num_units + input_dim + 1) * num_units
- W_cell : (num_units + input_dim + 1) * num_units

In [None]:
# weight 개수 카운팅

### 문제3

In [None]:
model = Sequential()
model.add(LSTM(5, input_shape = (2, 10), return_sequences=True))
model.add(LSTM(7))
model.add(Dense(1))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 2, 5)              320       
                                                                 
 lstm_6 (LSTM)               (None, 7)                 364       
                                                                 
 dense_3 (Dense)             (None, 1)                 8         
                                                                 
Total params: 692
Trainable params: 692
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 첫번째 레이어의 파라미터 개수

In [None]:
# 두번째 레이어의 파라미터 개수 


In [None]:
# 세번째 레이어의 파라미터 개수 

# 3. Keras로 GRU 구현하기

https://keras.io/api/layers/recurrent_layers/gru/

In [None]:
gru = GRU(num_classes)
output = gru(train_X)

print('hidden state : {}, shape: {}'.format(output, output.shape))

hidden state : [[-0.3331254   0.15914953 -0.20464015]
 [-0.226817    0.35800463 -0.16532332]], shape: (2, 3)


In [None]:
# return_sequences = True 
gru = GRU(3, return_sequences=True, return_state=True)
whole_sequence_output, final_state= gru(train_X)

print('whole_seq_output: {}, shape: {}'.format(whole_seq_output, whole_seq_output.shape))
print('final_state : {}, shape: {}'.format(final_state, final_state.shape))

whole_seq_output: [[[-0.09133016 -0.02071412  0.14170164]
  [ 0.02361282  0.06159054  0.03471249]
  [ 0.06012786 -0.04471743 -0.08892121]]

 [[ 0.03587217 -0.0941782  -0.11587585]
  [ 0.08149255  0.02020295 -0.18497597]
  [ 0.03355836 -0.02213107 -0.14921774]]], shape: (2, 3, 3)
final_state : [[ 0.10870809  0.3283546  -0.2419539 ]
 [-0.13706805 -0.10124536 -0.31973392]], shape: (2, 3)


In [None]:
from tensorflow.keras import layers, models

gru_model = models.Sequential() #모델 호출
gru_model.add(
    layers.GRU(units=3,
                input_shape = (3,4), 
                return_sequences = True, # !!!
                reset_after = False,
                name='GRU')
    )
# reset_after : keras 구현을 하면서 병렬처리를 위해 공식을 수정(bias를 2개로 나눔)하였는데, 원래 논문에 나온 공식으로 계산하기 위해 False로 수정

gru_model.add(
    layers.Dense(
        units=3,
        input_shape=(3,3), 
        activation= 'softmax', 
        name='hidden-to-output')) # 출력을 위한 FFN


gru_model.compile(
    loss='sparse_categorical_crossentropy', # 분류? 회귀? 생성? 추천? --> target 형태
    optimizer='adam',
    metrics=['accuracy'])


In [None]:
gru_model.fit(train_X, train_Y, epochs= 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fcb1dbe1710>

In [None]:
predictions = gru_model.predict(train_X)
for i, prediction in enumerate(predictions):
  print(prediction)
  print(np.argmax(prediction, axis=1))
  result_str = [idx2tag[c] for c in np.argmax(prediction, axis=1) ]
  print("\tPrediction str: ", "".join(result_str))


[[0.8838949  0.03892433 0.07718088]
 [0.03276091 0.89175344 0.07548558]
 [0.12529185 0.05806499 0.8166431 ]]
[0 1 2]
	Prediction str:  SVO
[[0.8183433  0.04454137 0.13711534]
 [0.03734664 0.8973856  0.06526773]
 [0.07024387 0.06087027 0.8688859 ]]
[0 1 2]
	Prediction str:  SVO


모델을 이해하려면 weight 개수(W0\~Wn)를 세어봐야 합니다.

In [None]:
gru_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GRU-1 (GRU)                 (None, 3, 3)              72        
                                                                 
 hidden-to-output (Dense)    (None, 3, 3)              12        
                                                                 
Total params: 84
Trainable params: 84
Non-trainable params: 0
_________________________________________________________________


In [None]:
for model_weight in gru_model.weights:
    print(model_weight.name, '=>', model_weight.shape)

### 문제1

In [None]:
model = Sequential()
model.add(GRU(9, input_dim = 10, return_sequences=True, reset_after=False))
# reset_after : keras 구현을 하면서 병렬처리를 위해 공식을 수정(bias를 2개로 나눔)하였는데, 원래 논문에 나온 공식으로 계산하기 위해 False로 수정
model.add(GRU(6, reset_after = False))
model.add(Dense(3))
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_6 (GRU)                 (None, None, 9)           540       
                                                                 
 gru_7 (GRU)                 (None, 6)                 288       
                                                                 
 dense_6 (Dense)             (None, 3)                 21        
                                                                 
Total params: 849
Trainable params: 849
Non-trainable params: 0
_________________________________________________________________


- W_reset : (num_units + input_dim + 1) * num_units
- W_update : (num_units + input_dim + 1) * num_units
- W_new : (num_units + input_dim + 1) * num_units


In [None]:
# 첫번째 레이어의 파리미터 개수

In [None]:
# 두번째 레이어의 파라미터 개수

In [None]:
# 세번째 레이어의 파리미터 개수

### reference

- https://github.com/ukairia777/tensorflow-nlp-tutorial/blob/main/08.%20RNN/8-4.%20understanding_simplernn_and_lstm.ipynb