<a href="https://colab.research.google.com/github/KORshinjoonghyeok/DL/blob/master/%5B2%5DShakespeare_Writing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time
import warnings
warnings.filterwarnings('ignore')

###### 데이터셋 다운로드

In [2]:
# Download the Shakespeare corpus and keep the local path of the downloaded file.
shakespeare_url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file(fname='shakespeare.txt', origin=shakespeare_url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read is done instead of leaving it
# open for the lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [4]:
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


###### 화이트 스페이스 포함 출력

In [5]:
print(repr(text[:200]))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'


In [6]:
# Total number of characters in the corpus
len(text)

1115394

###### 데이터셋 텍스트에 등장하는 고유 문자를 정렬한 후 vocab에 저장

In [7]:
# Collect the distinct characters of the corpus, then order them.
unique_chars = set(text)
vocab = sorted(unique_chars)

In [8]:
vocab[:10]

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']

###### 텍스트에 사용된 고유 캐릭터의 총 개수

In [9]:
len(vocab)

65

### Preprocessing

##### 각 캐릭터에 인덱스 부여
###### - enumerate : 순서가 있는 자료형의 index 번호와 해당 값을 함께 반환하는 함수

In [10]:
# Map every vocabulary character to its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))

In [11]:
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

##### index -> Char

In [12]:
# Inverse lookup: element i of the array is the character whose index is i
idx2char = np.array(vocab)

###### - 49번째 index에 해당하는 char = 'k'

In [13]:
idx2char[49]

'k'

In [14]:
text[:200]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'

In [15]:
char2idx['i']

47

###### 전체 text -> int

In [16]:
# Encode the whole corpus: one integer index per character, in corpus order.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

###### - 총 문자열 길이와 동일

In [17]:
len(text_as_int)

1115394

In [18]:
text_as_int[:10]

array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

###### - 원본 문자열과 변환된 시퀀스 비교

In [19]:
# First five characters of the corpus next to their encoded indices
print(text[:5],text_as_int[:5])

First [18 47 56 57 58]


In [20]:
# Look up the same five characters directly in the char2idx dictionary
char2idx['F'],char2idx['i'],char2idx['r'],char2idx['s'],char2idx['t']

(18, 47, 56, 57, 58)

###### - 동일

---
---
---

###### Generate X,y dataset

In [21]:
# Number of characters in each input sequence
window_size = 100

# Buffer size used when shuffling the training windows
shuffle_buffer = 10000

# Number of sequences per training batch
batch_size = 64

###### 입력(X)이 'h','e','l','l' 이면
###### 정답(y)은 한 글자씩 뒤로 민 'e','l','l','o'

In [22]:
def windowed_dataset(series, window_size, shuffle_buffer, batch_size, drop_remainder=True):
    """Build a shuffled, batched dataset of (input, target) windows.

    Every run of ``window_size + 1`` consecutive elements of ``series``
    (stride 1) becomes one example. The first ``window_size`` elements are
    the input; the same run shifted one step to the right is the target.

    Args:
        series: sequence of integer ids. For a 1-D series each yielded
            tensor has shape ``(batch, window_size, 1)`` because a trailing
            axis is added before slicing.
        window_size: number of time steps in each input/target sequence.
        shuffle_buffer: buffer size passed to ``Dataset.shuffle``.
        batch_size: number of examples per batch.
        drop_remainder: when True (default), a final batch holding fewer than
            ``batch_size`` examples is dropped. A stateful RNN built with a
            fixed batch dimension cannot consume a short batch. Pass False
            to keep the partial batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` pairs.
    """
    # Add a trailing axis so that every element is a length-1 vector.
    series = tf.expand_dims(series, -1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    # One extra element per window: it supplies the last target value.
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    # Each window is itself a dataset; collapse it into a single tensor.
    ds = ds.flat_map(lambda x: x.batch(window_size + 1))
    ds = ds.shuffle(shuffle_buffer)
    # Input drops the last element, target drops the first.
    ds = ds.map(lambda x: (x[:-1], x[1:]))
    return ds.batch(batch_size, drop_remainder=drop_remainder).prefetch(1)

In [23]:
# text_as_int is already a NumPy array, so it is handed over as is.
train_data = windowed_dataset(
    text_as_int,
    window_size,
    shuffle_buffer,
    batch_size,
)

In [24]:
# Number of distinct characters; used as the embedding input size and the width of the output layer
vocab_size = len(vocab)
vocab_size

65

In [25]:
# Dimensionality of the embedding vector learned for each character
embedding_dim = 256

# Number of units in the LSTM layer
rnn_units = 1024

In [26]:
# Training model: embedding -> stateful LSTM -> per-time-step logits over the vocabulary.
# The batch dimension is fixed to batch_size; the sequence length is left open (None).
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    batch_input_shape=[batch_size, None],
)
lstm_layer = tf.keras.layers.LSTM(
    rnn_units,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform',
)
logits_layer = tf.keras.layers.Dense(vocab_size)

model = tf.keras.Sequential([embedding_layer, lstm_layer, logits_layer])

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Weights-only checkpoint that is rewritten whenever the training loss improves.
checkpoint_path = './models/my_checkpt.ckpt'

checkpoint_options = dict(
    filepath=checkpoint_path,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [29]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` marks ``logits`` as unnormalized scores rather than
    probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [30]:
# Adam optimizer, the logits-based cross-entropy defined in this notebook, accuracy as metric.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=['acc'],
)

###### fit

In [31]:
# Train for 10 epochs of 1720 batches each, with the checkpoint callback attached.
training_options = dict(
    epochs=10,
    steps_per_epoch=1720,
    callbacks=[checkpoint_callback],
)
model.fit(train_data, **training_options)

Epoch 1/10
Epoch 00001: loss improved from inf to 0.69684, saving model to ./models/my_checkpt.ckpt
Epoch 2/10
Epoch 00002: loss improved from 0.69684 to 0.31148, saving model to ./models/my_checkpt.ckpt
Epoch 3/10
Epoch 00003: loss improved from 0.31148 to 0.28099, saving model to ./models/my_checkpt.ckpt
Epoch 4/10
Epoch 00004: loss improved from 0.28099 to 0.28094, saving model to ./models/my_checkpt.ckpt
Epoch 5/10
Epoch 00005: loss did not improve from 0.28094
Epoch 6/10
Epoch 00006: loss improved from 0.28094 to 0.28056, saving model to ./models/my_checkpt.ckpt
Epoch 7/10
Epoch 00007: loss did not improve from 0.28056
Epoch 8/10
Epoch 00008: loss did not improve from 0.28056
Epoch 9/10
Epoch 00009: loss did not improve from 0.28056
Epoch 10/10
Epoch 00010: loss did not improve from 0.28056


<tensorflow.python.keras.callbacks.History at 0x7f6de881d5f8>

In [32]:
# Redefine the model with the same architecture but a batch size of 1,
# so that a single seed sequence can be fed in during text generation.
gen_embedding = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    batch_input_shape=[1, None],
)
gen_lstm = tf.keras.layers.LSTM(
    rnn_units,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform',
)
gen_logits = tf.keras.layers.Dense(vocab_size)

model = tf.keras.Sequential([gen_embedding, gen_lstm, gen_logits])

In [33]:
# Restore the weights written to checkpoint_path by the checkpoint callback during training
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6d8c11e2b0>

In [34]:
# Build the model for a batch of one sequence of unspecified length
model.build(tf.TensorShape([1, None]))

In [35]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [36]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time with the trained model.

    Args:
        model: stateful character-level model built for a batch size of 1.
            Calling it on a ``(1, time)`` tensor of character ids must return
            logits with a leading batch axis of size 1.
        start_string: non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: number of characters to generate after the seed.
        temperature: positive divisor applied to the logits before sampling.
            Lower values give more predictable text, higher values give more
            surprising text.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Fail early, with a readable message, on characters the model never saw.
    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise KeyError("characters not in the vocabulary: {!r}".format(unknown_chars))

    # Convert the seed to character ids and add the batch axis: shape (1, len(seed)).
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters are collected here and joined at the end.
    text_generated = []

    # Clear any recurrent state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch axis.
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution given by the scaled logits;
        # only the sample for the last time step ([-1, 0]) is used.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in; the stateful LSTM keeps
        # the context of everything seen so far in its hidden state.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [38]:
# Generate text that continues the seed and print it.
generated_sample = generate_text(model, start_string="FLOWER ")
print(generated_sample)

FLOWER EDWARD IV:
Brother of Gloucester, and give our satisfied?

QUEEN MARGARET:
Mine, such as IV:
No, God forbid that I should wish them sever'd
Whom God hath join'd together; ay, and 'twere pity
To su.

PRING EDWARD IV:
Now, brother of LErrown him ere't be long.'

KING EDWARD IV:
Ha! durst the traitor breathe Sixth claim England:
And Warwick, follow me, being thou canst do what I mean to ask.

LADY GREY:
Why, then I will do what your grace commands.

GLOUCESTER:

KING EDWARD IV:
Now, messenger, what letto do a dangerous honour,
Or than for strength and safety of our country.

BONA:
Dear brother, how shall Bona be revenged
But by thy heart Lord Bowhat of France is sending over masquers
To respector the naprest,
And but stancil say nd himself-gle like short is he thought with him
And prince shall foe:
I speak no more than what let thy dauntleash to be quite alsewis of France;
How could he stay till Warwick made a seem to our coox Margaret,
With this my son, Prince Edward, Henry's heir