LSTM을 사용해 셰익스피어의 저서에서 고급 자동 텍스트 생성

In [None]:
from __future__ import print_function
import numpy as np
import random
import sys

In [None]:
# Load the whole corpus and lowercase it.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps decoding independent of the platform default.
with open('/content/drive/MyDrive/Colab Notebooks/13주차_과제_김정웅(20175308)/shakespeare_final.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

In [None]:
# Sorted list of the distinct characters that occur in the corpus.
characters = sorted(set(text))

In [None]:
# Report the corpus size (in characters) and the vocabulary size.
corpus_length = len(text)
vocab_size = len(characters)
print('corpus length:', corpus_length)
print('total chars:', vocab_size)

corpus length: 196788
total chars: 63


In [None]:
# Lookup tables between each character and its position in `characters`.
indices2char = dict(enumerate(characters))
char2indices = {ch: idx for idx, ch in indices2char.items()}

In [None]:
# Slice the corpus into overlapping windows of `maxlen` characters, taking a
# new window every `step` characters; the character right after each window
# is the prediction target for that window.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 65583


In [None]:
# Allocate the one-hot encoded training tensors:
#   X[sample, position, char_id] marks the character at each window position,
#   y[sample, char_id] marks the character that follows the window.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError there).
X = np.zeros((len(sentences), maxlen, len(characters)), dtype=bool)
y = np.zeros((len(sentences), len(characters)), dtype=bool)

In [None]:
# Fill in the one-hot encodings: one flag per (window, position) in X and
# one flag per window in y for the character that follows it.
for row, window in enumerate(sentences):
    for pos, ch in enumerate(window):
        X[row, pos, char2indices[ch]] = 1
    target = next_chars[row]
    y[row, char2indices[target]] = 1

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM,Activation,Dropout
from tensorflow.keras.optimizers import RMSprop

In [None]:
# Character-level language model: one LSTM layer over the one-hot window,
# followed by a dense projection to the vocabulary and a softmax.
model = Sequential()

model.add(LSTM(128, input_shape=(maxlen, len(characters))))
model.add(Dense(len(characters)))
model.add(Activation('softmax'))

# `learning_rate` replaces the deprecated `lr` alias, which triggers a
# deprecation warning here and is rejected by newer Keras releases.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=0.01))

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               98304     
                                                                 
 dense (Dense)               (None, 63)                8127      
                                                                 
 activation (Activation)     (None, 63)                0         
                                                                 
Total params: 106,431
Trainable params: 106,431
Non-trainable params: 0
_________________________________________________________________
None


  super(RMSprop, self).__init__(name, **kwargs)


In [None]:
# Convert a predicted probability vector into a sampled character index.
def pred_indices(preds, metric=1.0):
    """Sample one index from `preds` after rescaling it by `metric`.

    The probabilities are moved to log space, divided by `metric`,
    exponentiated and renormalised; one multinomial draw is then taken
    from the resulting distribution.

    Args:
        preds: array-like of probabilities, one entry per candidate index.
        metric: divisor applied to the log-probabilities before
            renormalising (the caller passes its "diversity" value here).

    Returns:
        The index selected by the single multinomial draw.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / metric)
    weights = weights / np.sum(weights)
    # One trial, one experiment: the result is a one-hot row of shape (1, n).
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)

In [None]:
# Alternate between one epoch of training and sampling text from the model,
# so the quality of the generated text can be followed as training proceeds.
for iteration in range(1, 30):
    print('-' * 40)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, epochs=1)

    # One random seed window per iteration, reused for every diversity value.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in (0.2, 0.7, 1.2):

        print('\n----- diversity:', diversity)

        sentence = seed
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for _ in range(400):
            # One-hot encode the current window of `maxlen` characters.
            x = np.zeros((1, maxlen, len(characters)))
            for pos, ch in enumerate(sentence):
                x[0, pos, char2indices[ch]] = 1.

            probabilities = model.predict(x, verbose=0)[0]
            pred_char = indices2char[pred_indices(probabilities, diversity)]

            generated += pred_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + pred_char

            sys.stdout.write(pred_char)
            sys.stdout.flush()
        print("\nOne combination completed \n")

----------------------------------------
Iteration 1

----- diversity: 0.2
----- Generating with seed: " compost on the weeds, to
make them rank"
 compost on the weeds, to
make them ranked the me the not the work the not the not the nould ther the nound the work dore to the prown the sere the me the mound the mound the me to the me the mast of the prown the son the mound the mound the me the mound and the sere so hore to the mound it the sere the prown the be the not the mate ther the mout to the work the mane to of the not ther with the not the mound the not the nour the not the
One combination completed 


----- diversity: 0.7
----- Generating with seed: " compost on the weeds, to
make them rank"
 compost on the weeds, to
make them ranke hive nor tom crome on coun thou doind’d to bremadited of it say had. so what dove of i hou the thou, ble to to still now thir of have to appolice to my with hit hig  and be thele a fill exemed, my them in som loted and the gutend prome the or of doow