## PTB 데이터 다운로드

In [None]:
import os

# Load the PTB training corpus, reusing the local copy when one already exists.
if os.path.isfile('ptb.train.txt'):
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open("ptb.train.txt", 'r', encoding='utf-8') as f:
        text = f.read()
else:
    from urllib.request import urlopen
    url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'
    # The context manager closes the HTTP response once the body is read.
    with urlopen(url) as response:
        text = response.read().decode('utf-8')

    # Cache the download so later runs can skip the network.
    with open("ptb.train.txt", 'w', encoding='utf-8') as f:
        f.write(text)
# Replace every line break with a literal '<eos>' marker, then trim
# surrounding whitespace.
text = text.replace('\n', '<eos>').strip()

## 토크나이징

In [None]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Character-level tokenizer: nothing is filtered out and input is lower-cased.
tokenizer = Tokenizer(lower=True, filters='', char_level=True)
tokenizer.fit_on_texts([text])
# Encode the whole corpus as one id sequence and shift every id down by one.
sequences = np.array(tokenizer.texts_to_sequences([text])[0]) - 1

## 데이터셋 만들기

In [None]:
def generate_train_data(sequences, step):
    """Cut ``sequences`` into consecutive, non-overlapping training windows.

    Each window holds ``step + 1`` items so the caller can split it into
    ``step`` inputs and ``step`` targets shifted by one position.

    Args:
        sequences: Sliceable 1-D sequence of token ids (list or array).
        step: Number of input timesteps per sample; must be >= 0.

    Returns:
        Array whose rows are the windows. When ``sequences`` is too short
        for a single window the result has shape ``(0, step + 1)`` so that
        2-D slicing by the caller still works.

    Raises:
        ValueError: If ``step`` is negative.
    """
    if step < 0:
        raise ValueError("step must be non-negative")
    window = step + 1
    # The "+ 1" keeps the final window when the length is an exact multiple
    # of the window size; without it that last full window is dropped.
    last_start = len(sequences) - window + 1
    windows = [
        sequences[start:start + window]
        for start in range(0, last_start, window)
    ]
    if not windows:
        return np.empty((0, window), dtype=np.asarray(sequences).dtype)
    return np.array(windows)

# Build windows of 20 input steps (each row therefore holds 21 ids).
train_data = generate_train_data(sequences, 20)
# Inputs drop the last id of each row; targets drop the first, so y is X
# shifted one position ahead.
X = train_data[:, :-1]
y = train_data[:, 1:]

## 모델 훈련

In [None]:
# Vocabulary size: one entry per distinct token known to the tokenizer.
total_words = len(tokenizer.word_index)

# Embedding -> two stacked GRUs (each emitting every timestep) -> per-step
# softmax over the vocabulary.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(total_words, 100))
model.add(keras.layers.GRU(256, return_sequences=True))
model.add(keras.layers.GRU(256, return_sequences=True))
model.add(keras.layers.Dense(total_words, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 256)         235008    
_________________________________________________________________
gru_1 (GRU)                  (None, None, 256)         394752    
_________________________________________________________________
dense (Dense)                (None, None, 48)          12336     
Total params: 642,096
Trainable params: 642,096
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Targets are integer ids, hence the sparse variant of cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# Train for 20 epochs and keep the returned History object.
history = model.fit(x=X, y=y, epochs=20)

## 문장 생성

In [None]:
def next_char(texts):
    """Sample the character that follows ``texts`` from the model's prediction.

    ``model.predict`` returns, for each timestep, the probability of every
    possible next character. The distribution at the last timestep is used
    to sample one character id with ``np.random.choice``.

    Args:
        texts: The text generated so far.

    Returns:
        The sampled character, decoded by ``tokenizer.sequences_to_texts``.
    """
    # Encode with the same -1 shift that was applied to the training ids.
    X_new = np.array(tokenizer.texts_to_sequences([texts])) - 1
    # Output is indexed (batch, timestep, next-character probabilities);
    # keep only the distribution at the final timestep.
    y_proba = model.predict(X_new)[0, -1, :]
    # np.random.choice rejects p when it does not sum to 1 within a tight
    # tolerance, which lower-precision softmax outputs (presumably float32
    # here) can miss. Cast to float64 and renormalize before sampling.
    y_proba = np.asarray(y_proba, dtype=np.float64)
    y_proba = y_proba / y_proba.sum()
    char_id = int(np.random.choice(len(y_proba), p=y_proba))
    # Undo the -1 shift before decoding back to a character.
    return tokenizer.sequences_to_texts([[char_id + 1]])[0]

def complete_text(text, n_chars=20):
    """Extend ``text`` by ``n_chars`` sampled characters and return the result.

    Each new character is drawn by ``next_char`` from everything generated
    so far, so the full growing string is fed back in on every step.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated)
    return generated

# Seed prompt for generation (this rebinds the name `text`).
text = "i will be"
# Append 105 sampled characters to the seed.
complete_text(text=text, n_chars=105)

'i will <unk> <unk> in n analyst with progress corp. have ann numberso <unk> that promotions are british issue w'