<a href="https://colab.research.google.com/github/ImJongHwan/practice-ml-nlp/blob/main/8_recurrent_neural_network/7_Char_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 문자 단위 RNN(Char RNN)

https://wikidocs.net/48649

## 문자 단위 RNN 언어 모델 (Char RNNLM)

In [56]:
## 데이터에 대한 이해와 전처리

import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

f = open('11-0.txt', 'rb')
sentences = []
for sentence in f:
  sentence = sentence.strip()
  sentence = sentence.lower()
  sentence = sentence.decode('ascii', 'ignore')
  if len(sentence) > 0:
    sentences.append(sentence)
f.close()

In [57]:
sentences[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. you may copy it, give it away or re-use it under the terms',
 'of the project gutenberg license included with this ebook or online at']

In [58]:
total_data = ' '.join(sentences)
print('문자열의 길이 또는 총 문자의 개수: %d' % len(total_data))

문자열의 길이 또는 총 문자의 개수: 159484


In [59]:
print(total_data[:200])

the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with


In [60]:
char_vocab = sorted(list(set(total_data)))
vocab_size = len(char_vocab)
print(f'문자 집합의 크기: {vocab_size}')

문자 집합의 크기: 56


In [61]:
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print('문자 집합', char_to_index)

문자 집합 {' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [62]:
index_to_char = {}
for key, value in char_to_index.items():
  index_to_char[value] = key

In [63]:
train_X = 'appl'
train_y = 'pple'

In [64]:
seq_length = 60

n_samples = int(np.floor((len(total_data) - 1) / seq_length))
print(f'샘플의 수: {n_samples}')

샘플의 수: 2658


In [65]:
train_X = []
train_y = []

for i in range(n_samples):
  X_sample = total_data[i * seq_length: (i+1) * seq_length]

  X_encoded = [char_to_index[c] for c in X_sample]
  train_X.append(X_encoded)

  y_sample = total_data[i * seq_length + 1: (i + 1) * seq_length +1]
  y_encoded = [char_to_index[c] for c in y_sample]
  train_y.append(y_encoded)

In [66]:
print('X 데이터의 첫번째 샘플: ', train_X[0])
print('y 데이터의 첫번째 샘플: ', train_y[0])
print('-'*50)
print('X 데이터의 첫번째 샘플 디코딩: ', [index_to_char[i] for i in train_X[0]])
print('y 데이터의 첫번째 샘플 디코딩: ', [index_to_char[i] for i in train_y[0]])

X 데이터의 첫번째 샘플:  [49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]
y 데이터의 첫번째 샘플:  [37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]
--------------------------------------------------
X 데이터의 첫번째 샘플 디코딩:  ['t', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e', 'b', 'o', 'o', 'k', ' ', 'o', 'f', ' ', 'a', 'l', 'i', 'c', 'e', 's', ' ', 'a', 'd', 'v', 'e', 'n', 't', 'u', 'r', 'e', 's', ' ', 'i', 'n', ' ', 'w', 'o', 'n', 'd', 'e', 'r', 'l', 'a']
y 데이터의 첫번째 샘플 디코딩:  ['h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e',

In [67]:
print(train_X[1])

[43, 33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54]


In [68]:
print(train_y[1])

[33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54, 52]


In [69]:
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)

print(f'train_X의 크기(shape): {train_X.shape}')
print(f'train_y의 크기(shape): {train_y.shape}')

train_X의 크기(shape): (2658, 60, 56)
train_y의 크기(shape): (2658, 60, 56)


In [16]:
## 모델 설계하기
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [18]:
hidden_units = 256

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(None, train_X.shape[2]), return_sequences=True))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

Epoch 1/80
84/84 - 48s - loss: 3.0706 - accuracy: 0.1826 - 48s/epoch - 569ms/step
Epoch 2/80
84/84 - 37s - loss: 2.7126 - accuracy: 0.2519 - 37s/epoch - 445ms/step
Epoch 3/80
84/84 - 37s - loss: 2.3790 - accuracy: 0.3320 - 37s/epoch - 441ms/step
Epoch 4/80
84/84 - 38s - loss: 2.2445 - accuracy: 0.3631 - 38s/epoch - 450ms/step
Epoch 5/80
84/84 - 37s - loss: 2.1353 - accuracy: 0.3890 - 37s/epoch - 442ms/step
Epoch 6/80
84/84 - 37s - loss: 2.0547 - accuracy: 0.4081 - 37s/epoch - 442ms/step
Epoch 7/80
84/84 - 43s - loss: 1.9789 - accuracy: 0.4299 - 43s/epoch - 509ms/step
Epoch 8/80
84/84 - 38s - loss: 1.9183 - accuracy: 0.4448 - 38s/epoch - 449ms/step
Epoch 9/80
84/84 - 37s - loss: 1.8623 - accuracy: 0.4601 - 37s/epoch - 442ms/step
Epoch 10/80
84/84 - 37s - loss: 1.8138 - accuracy: 0.4743 - 37s/epoch - 438ms/step
Epoch 11/80
84/84 - 37s - loss: 1.7675 - accuracy: 0.4878 - 37s/epoch - 435ms/step
Epoch 12/80
84/84 - 37s - loss: 1.7222 - accuracy: 0.4986 - 37s/epoch - 436ms/step
Epoch 13/80
8

<keras.callbacks.History at 0x7fe728483450>

In [72]:
def sentence_generation(model, length):
  ix = [np.random.randint(vocab_size)]

  y_char = [index_to_char[ix[-1]]]
  print(ix[-1], '번 문자', y_char[-1],'로 예측을 시작!')

  X = np.zeros((1, length, vocab_size))

  for i in range(length):
    X[0][i][ix[-1]] = 1
    print(index_to_char[ix[-1]], end="")
    ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
    y_char.append(index_to_char[ix[-1]])
  return ('').join(y_char)

In [73]:
result = sentence_generation(model, 100)
print(result)

35 번 문자 f 로 예측을 시작!
f the month, and doesnt tell what oclock it is! why should it had leet of the side, the queen was hif the month, and doesnt tell what oclock it is! why should it had leet of the side, the queen was his


## 문자 단위 RNN(Char RNN)으로 텍스트 생성하기

In [74]:
## 데이터에 대한 이해와 전처리

import numpy as np
from tensorflow.keras.utils import to_categorical

In [75]:
raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [76]:
tokens = raw_text.split()
raw_text = ' '.join(tokens)
print(raw_text)

I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [77]:
char_vocab = sorted(list(set(raw_text)))
vocab_size = len(char_vocab)
print('문자 집합: ', char_vocab)
print(f'문자 집합의 크기: {vocab_size}')

문자 집합:  [' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
문자 집합의 크기: 33


In [78]:
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [79]:
length = 11
sequences = []
for i in range(length, len(raw_text)):
  seq = raw_text[i-length:i]
  sequences.append(seq)
print('총 훈련 샘플의 수: %d' % len(sequences))

총 훈련 샘플의 수: 426


In [80]:
sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [81]:
encoded_sequences = []
for sequence in sequences:
  encoded_sequence = [char_to_index[char] for char in sequence]
  encoded_sequences.append(encoded_sequence)

In [82]:
encoded_sequences[:5]

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]]

In [83]:
encoded_sequences = np.array(encoded_sequences)

X_data = encoded_sequences[:, :-1]
y_data = encoded_sequences[:, -1]

In [84]:
print(X_data[:5])
print(y_data[:5])

[[ 8  0 16 14 28  0 24 23  0 31]
 [ 0 16 14 28  0 24 23  0 31 18]
 [16 14 28  0 24 23  0 31 18 28]
 [14 28  0 24 23  0 31 18 28 17]
 [28  0 24 23  0 31 18 28 17  0]]
[18 28 17  0 21]


In [85]:
X_data_one_hot = [to_categorical(encoded, num_classes=vocab_size) for encoded in X_data]
X_data_one_hot = np.array(X_data_one_hot)
y_data_one_hot = to_categorical(y_data, num_classes=vocab_size)

In [86]:
print(X_data_one_hot.shape)

(426, 10, 33)


In [89]:
## 모델 설계하기
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [91]:
hidden_units = 64

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(X_data_one_hot.shape[1], X_data_one_hot.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_data_one_hot, y_data_one_hot, epochs=100, verbose=2)

Epoch 1/100
14/14 - 2s - loss: 3.4544 - accuracy: 0.1408 - 2s/epoch - 137ms/step
Epoch 2/100
14/14 - 0s - loss: 3.3131 - accuracy: 0.1972 - 95ms/epoch - 7ms/step
Epoch 3/100
14/14 - 0s - loss: 3.0553 - accuracy: 0.1972 - 98ms/epoch - 7ms/step
Epoch 4/100
14/14 - 0s - loss: 2.9846 - accuracy: 0.1972 - 92ms/epoch - 7ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9514 - accuracy: 0.1972 - 96ms/epoch - 7ms/step
Epoch 6/100
14/14 - 0s - loss: 2.9317 - accuracy: 0.1972 - 85ms/epoch - 6ms/step
Epoch 7/100
14/14 - 0s - loss: 2.9132 - accuracy: 0.1972 - 92ms/epoch - 7ms/step
Epoch 8/100
14/14 - 0s - loss: 2.8999 - accuracy: 0.1972 - 86ms/epoch - 6ms/step
Epoch 9/100
14/14 - 0s - loss: 2.8828 - accuracy: 0.1972 - 89ms/epoch - 6ms/step
Epoch 10/100
14/14 - 0s - loss: 2.8645 - accuracy: 0.1972 - 87ms/epoch - 6ms/step
Epoch 11/100
14/14 - 0s - loss: 2.8248 - accuracy: 0.1972 - 91ms/epoch - 7ms/step
Epoch 12/100
14/14 - 0s - loss: 2.8054 - accuracy: 0.2160 - 86ms/epoch - 6ms/step
Epoch 13/100
14/14 - 0s -

<keras.callbacks.History at 0x7fe7275331d0>

In [92]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):

  init_text = seed_text
  sentence = ''

  for _ in range(n):
    encoded = [char_to_index[char] for char in seed_text]
    encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre')
    encoded = to_categorical(encoded, num_classes=len(char_to_index))

    result = model.predict(encoded, verbose=0)
    result = np.argmax(result, axis=1)

    for char, index in char_to_index.items():
      if index == result:
        break
      
    seed_text = seed_text + char
    sentence = sentence + char
  sentence = init_text + sentence
  return sentence

In [93]:
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))

I get on with life as a programmer, I like to cse wiint toest tldivw ae ,rrrr. tht  uueer.
