## 1. 문자 단위 RNN 언어 모델 (Char RNNLM)
- 

In [1]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

f = open('11-0.txt', 'rb')
sentences = []
for sentence in f:
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = sentence.decode('ascii','ignore') # 바이트 열 제거
    if len(sentence)>0:
        sentences.append(sentence)
f.close()


In [2]:
total_data = ' '.join(sentences)
print('문자열의 총 길이 : %d' % len(total_data))

문자열의 총 길이 : 159484


In [3]:
char_vocab = sorted(list(set(total_data)))
vocab_size = len(char_vocab)
print("문자 집합의 크기 :{}".format(vocab_size))

문자 집합의 크기 :56


In [4]:
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print('문자 집합 :',char_to_index)

문자 집합 : {' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [5]:
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key

In [6]:
# appl (입력 시퀀스) -> pple (예측해야하는 시퀀스)
train_X = 'appl'
train_y = 'pple'

In [7]:
seq_length = 60

# 문자열의 길이를 seq_length로 나누면 전처리 후 생겨날 샘플 수
n_samples = int(np.floor((len(total_data) - 1) / seq_length))
print ('샘플의 수 : {}'.format(n_samples))

샘플의 수 : 2658


In [8]:
train_X = []
train_y = []

for i in range(n_samples):
    # 0:60 -> 60:120 -> 120:180로 loop를 돌면서 문장 샘플을 1개씩 pick.
    X_sample = total_data[i * seq_length: (i + 1) * seq_length]

    # 정수 인코딩
    X_encoded = [char_to_index[c] for c in X_sample]
    train_X.append(X_encoded)

    # 오른쪽으로 1칸 쉬프트
    y_sample = total_data[i * seq_length + 1: (i + 1) * seq_length + 1]
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)

In [9]:
print('X 데이터의 첫번째 샘플 :',train_X[0])
print('y 데이터의 첫번째 샘플 :',train_y[0])
print('-'*50)
print('X 데이터의 첫번째 샘플 디코딩 :',[index_to_char[i] for i in train_X[0]])
print('y 데이터의 첫번째 샘플 디코딩 :',[index_to_char[i] for i in train_y[0]])

X 데이터의 첫번째 샘플 : [49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]
y 데이터의 첫번째 샘플 : [37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]
--------------------------------------------------
X 데이터의 첫번째 샘플 디코딩 : ['t', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e', 'b', 'o', 'o', 'k', ' ', 'o', 'f', ' ', 'a', 'l', 'i', 'c', 'e', 's', ' ', 'a', 'd', 'v', 'e', 'n', 't', 'u', 'r', 'e', 's', ' ', 'i', 'n', ' ', 'w', 'o', 'n', 'd', 'e', 'r', 'l', 'a']
y 데이터의 첫번째 샘플 디코딩 : ['h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e',

In [10]:
print(train_X[1])

[43, 33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54]


In [11]:
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)

print('train_X의 크기(shape) : {}'.format(train_X.shape)) # 원-핫 인코딩
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # 원-핫 인코딩

train_X의 크기(shape) : (2658, 60, 56)
train_y의 크기(shape) : (2658, 60, 56)


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

hidden_units = 256

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(None, train_X.shape[2]), return_sequences=True))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

1 Physical GPUs, 1 Logical GPUs
Epoch 1/80
84/84 - 11s - loss: 3.0682 - accuracy: 0.1818 - 11s/epoch - 129ms/step
Epoch 2/80
84/84 - 1s - loss: 2.7435 - accuracy: 0.2454 - 1s/epoch - 14ms/step
Epoch 3/80
84/84 - 1s - loss: 2.3939 - accuracy: 0.3283 - 1s/epoch - 15ms/step
Epoch 4/80
84/84 - 1s - loss: 2.2474 - accuracy: 0.3626 - 1s/epoch - 16ms/step
Epoch 5/80
84/84 - 1s - loss: 2.1513 - accuracy: 0.3875 - 1s/epoch - 16ms/step
Epoch 6/80
84/84 - 1s - loss: 2.0701 - accuracy: 0.4047 - 1s/epoch - 17ms/step
Epoch 7/80
84/84 - 1s - loss: 2.0041 - accuracy: 0.4239 - 1s/epoch - 17ms/step
Epoch 8/80
84/84 - 1s - loss: 1.9510 - accuracy: 0.4364 - 1s/epoch - 16ms/step
Epoch 9/80
84/84 - 1s - loss: 1.9037 - accuracy: 0.4479 - 1s/epoch - 17ms/step
Epoch 10/80
84/84 - 1s - loss: 1.8601 - accuracy: 0.4608 - 1s/epoch - 17ms/step
Epoch 11/80
84/84 - 1s - loss: 1.8181 - accuracy: 0.4732 - 1s/epoch - 17ms/step
Epoch 12/80
84/84 - 1s - loss: 1.7787 - accuracy: 0.4840 - 1s/epoch - 16ms/step
Epoch 13/80
84

<keras.callbacks.History at 0x21c7e1e8a30>

In [13]:
def sentence_generation(model, length):
    # 문자에 대한 랜덤한 정수 생성
    ix = [np.random.randint(vocab_size)]

    # 랜덤한 정수로부터 맵핑되는 문자 생성
    y_char = [index_to_char[ix[-1]]]
    print(ix[-1],'번 문자',y_char[-1],'로 예측을 시작!')

    # (1, length, 55) 크기의 X 생성. 즉, LSTM의 입력 시퀀스 생성
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # X[0][i][예측한 문자의 인덱스] = 1, 즉, 예측 문자를 다음 입력 시퀀스에 추가
        X[0][i][ix[-1]] = 1
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)

In [14]:
result = sentence_generation(model, 100)
print(result)

39 번 문자 j 로 예측을 시작!
jesty, he feet they were all locked, and contated to see if there were a sortow, the room worts not jesty, he feet they were all locked, and contated to see if there were a sortow, the room worts not p


# 2. 문자 단위 RNN(Char RNN)으로 텍스트 생성하기

1) 데이터에 대한 이해와 전처리

In [15]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [28]:
raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [29]:
# 단락을 없애고 하나의 문자열로 재저장
tokens = raw_text.split()
raw_text = ' '.join(tokens)
raw_text

"I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine."

In [30]:
# 중복을 제거한 문자 집합 생성
char_vocab = sorted(list(set(raw_text)))
vocab_size = len(char_vocab)
print("문자 집합 :",  char_vocab)
print("문자 집합의 크기 : {}".format(vocab_size))

문자 집합 : [' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
문자 집합의 크기 : 33


In [31]:
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [32]:
# 훈련에 사용할 문장 샘플들을 만든다
# 입력 시퀀스의 길이가 5라고 하면, stude -> n  // tuden -> t 처럼 예측된다
# 입력시퀀스가 10이 되도록 구성하고, 예측 대상까지 포함한 길이를 11로 한다
length=11
sequences = []
for i in range(length, len(raw_text)):
    seq = raw_text[i-length:i]
    sequences.append(seq)
print("총 훈련 샘플 수 : %d"%len(sequences))

총 훈련 샘플 수 : 426


In [33]:
sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [36]:
encoded_sequences = []
for sequence in sequences:
    # 문장 샘플에서 각 문자에 대하여 정수 인코딩을 수행
    encoded_sequence = [char_to_index[char] for char in sequence] 
    encoded_sequences.append(encoded_sequence)

In [37]:
encoded_sequences[:5]

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]]

In [38]:
encoded_sequences = np.array(encoded_sequences)

X_data = encoded_sequences[:, :-1]
y_data = encoded_sequences[:, -1]


In [39]:
X_data_one_hot = [to_categorical(encoded, num_classes=vocab_size) for encoded in X_data]
X_data_one_hot = np.array(X_data_one_hot)

y_data_one_hot = to_categorical(y_data, num_classes=vocab_size)

In [41]:
print(X_data_one_hot.shape)
# 배치 개수, 타임스탭, 벡터 차원(문자집합의 크기)

(426, 10, 33)


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

hidden_units = 64

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(X_data_one_hot.shape[1], X_data_one_hot.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_data_one_hot, y_data_one_hot, epochs=100, verbose=2)

Epoch 1/100
14/14 - 2s - loss: 3.4640 - accuracy: 0.1573 - 2s/epoch - 127ms/step
Epoch 2/100
14/14 - 0s - loss: 3.3482 - accuracy: 0.1972 - 66ms/epoch - 5ms/step
Epoch 3/100
14/14 - 0s - loss: 3.1154 - accuracy: 0.1972 - 80ms/epoch - 6ms/step
Epoch 4/100
14/14 - 0s - loss: 3.0010 - accuracy: 0.1972 - 111ms/epoch - 8ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9630 - accuracy: 0.1972 - 83ms/epoch - 6ms/step
Epoch 6/100
14/14 - 0s - loss: 2.9422 - accuracy: 0.1972 - 68ms/epoch - 5ms/step
Epoch 7/100
14/14 - 0s - loss: 2.9223 - accuracy: 0.1972 - 77ms/epoch - 5ms/step
Epoch 8/100
14/14 - 0s - loss: 2.9150 - accuracy: 0.1972 - 68ms/epoch - 5ms/step
Epoch 9/100
14/14 - 0s - loss: 2.8933 - accuracy: 0.1972 - 68ms/epoch - 5ms/step
Epoch 10/100
14/14 - 0s - loss: 2.8740 - accuracy: 0.1972 - 67ms/epoch - 5ms/step
Epoch 11/100
14/14 - 0s - loss: 2.8540 - accuracy: 0.1972 - 76ms/epoch - 5ms/step
Epoch 12/100
14/14 - 0s - loss: 2.8340 - accuracy: 0.1972 - 66ms/epoch - 5ms/step
Epoch 13/100
14/14 - 0s 

<keras.callbacks.History at 0x21c83d552b0>

In [43]:
# 문자열로 다음 문자 예측하기
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    
    # 초기 시퀀스 
    init_text = seed_text
    sentence = ''
    
    # 다음 문자 예측을 n 번 반복
    for _ in range(n):
        # 현재 시퀀스에 대한 정수 인코딩
        encoded = [char_to_index[char] for char in seed_text]
        # 데이터 패딩
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        encoded = to_categorical(encoded, num_classes=len(char_to_index))
        
        # 입력한 X에 대하여 y를 예측하고 y 를 result 에 저장한다
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)
        
        for char, index in char_to_index.items():
            if index==result:
                break
            
        # 현재 시퀀스 + 예측 문자를 현재 시퀀스로 변경
        seed_text = seed_text + char
        
        # 예측 문자를 문장에 저장한다
        sentence = sentence + char
        
    # n 번의 예측이 끝나면 완성 문장을 리턴한다
    sentence = init_text + sentence
    return sentence

In [44]:
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))


I get on with life as a programmer, I like to hang out with programming and deep learning.
