In [33]:
import numpy as np
import tensorflow as tf

In [34]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Dense, SimpleRNN

In [35]:
tf.__version__

'2.1.0'

In [37]:
keras.__version__

'2.2.4-tf'

In [36]:
seed = 2020
np.random.seed(seed)

In [4]:
text = ''' 경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n'''

In [5]:
t = Tokenizer()
t.fit_on_texts([text])

In [6]:
t.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [7]:
# 단어 집합 크기 설정
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 :', vocab_size)

단어 집합의 크기 : 12


In [8]:
t.texts_to_sequences(['그의 말이 법이다'])

[[6, 1, 7]]

In [9]:
sequences = []

for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    # print(line, encoded)
    for i in range(1, len(encoded)):
        print(line, encoded[:i+1])
        sequence = encoded[:i+1]
        sequences.append(sequence)

print('학습에 사용할 샘플의 개수: ', len(sequences))

 경마장에 있는 말이 뛰고 있다 [2, 3]
 경마장에 있는 말이 뛰고 있다 [2, 3, 1]
 경마장에 있는 말이 뛰고 있다 [2, 3, 1, 4]
 경마장에 있는 말이 뛰고 있다 [2, 3, 1, 4, 5]
그의 말이 법이다 [6, 1]
그의 말이 법이다 [6, 1, 7]
가는 말이 고와야 오는 말이 곱다 [8, 1]
가는 말이 고와야 오는 말이 곱다 [8, 1, 9]
가는 말이 고와야 오는 말이 곱다 [8, 1, 9, 10]
가는 말이 고와야 오는 말이 곱다 [8, 1, 9, 10, 1]
가는 말이 고와야 오는 말이 곱다 [8, 1, 9, 10, 1, 11]
학습에 사용할 샘플의 개수:  11


In [10]:
# 샘플에서 가장긴 샘플의 길이 (가는 말이 고와야 오는 말이 곱다.)
max_len = max(len(s) for s in sequences)
print(max_len)

6


In [11]:
# 전체 샘플의 길이를 6(가장 긴 샘플의 길이)으로 패딩
# 'pre' 옵션을 주면 앞을 0으로 패딩
sequences = pad_sequences(sequences, max_len)

In [12]:
sequences[:5]

array([[0, 0, 0, 0, 2, 3],
       [0, 0, 0, 2, 3, 1],
       [0, 0, 2, 3, 1, 4],
       [0, 2, 3, 1, 4, 5],
       [0, 0, 0, 0, 6, 1]])

In [13]:
X = sequences[:, :-1]
Y = sequences[:, -1]

In [14]:
X[:5]

array([[0, 0, 0, 0, 2],
       [0, 0, 0, 2, 3],
       [0, 0, 2, 3, 1],
       [0, 2, 3, 1, 4],
       [0, 0, 0, 0, 6]])

In [15]:
y = to_categorical(Y, vocab_size)

In [16]:
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [17]:
def setup_model(dim, rnn_units):
    embedding = Embedding(vocab_size, dim, input_length=max_len-1)   # 레이블을 분리 했으므로 X의 길이는 5
    rnn = SimpleRNN(rnn_units, return_sequences=True)
    output = Dense(vocab_size, activation='softmax')

    return embedding, rnn, output

def call_model():
    model = Sequential()
    model.add(embedding)
    model.add(rnn)
    model.add(output)

    return model

In [18]:
def call_model2(n, dim, rnn_units):
    model = Sequential()
    model.add(Embedding(vocab_size, dim, input_length=max_len-1))

    for _ in range(n):
        model.add(SimpleRNN(rnn_units, return_sequences=True))

    model.add(Dense(vocab_size, activation='softmax'))

    return model

In [19]:
embedding, rnn, output = setup_model(10, 32)

In [20]:
model = call_model()

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 10)             120       
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 5, 32)             1376      
_________________________________________________________________
dense_1 (Dense)              (None, 5, 12)             396       
Total params: 1,892
Trainable params: 1,892
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [38]:
history = model.fit(X, y, epochs = 200, verbose=0)

ValueError: Error when checking target: expected dense_1 to have 3 dimensions, but got array with shape (11, 12)

In [None]:
history.history['accuracy'][-1]

### 모델 검증

In [None]:
# n 몇 단어를 끄집어 낼것인가.
def sentence_generation(model, t, current_word, n):
    init_word = current_word
    sentence = ''
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], 5)

        result = model.predict_classes(encoded, verbose=0)
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = current_word + ' ' + word
        sentence = sentence + ' ' + word
    
    sentence = init_word + sentence

    return sentence


In [None]:
print(sentence_generation(model, t, '경마장에', 3))

In [None]:
# 그의 단어 뒤에 1번째 단어를 예측
print(sentence_generation(model, t, '그의', 2))

In [None]:
embedding, rnn, output = setup_model(2, 32)

In [None]:
model2 = call_model(1)

In [None]:
model2.summary()

In [None]:
model2.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history2 = model2.fit(X, y, epochs=200, verbose=0)

In [None]:
history2.history['accuracy'][-1]

In [None]:
print(sentence_generation(model2, t, '경마장에', 3))

In [None]:
print(sentence_generation(model2, t, '그의', 2))

In [None]:
print(sentence_generation(model2, t, '가는', 5))

In [None]:
embedding, rnn, output = setup_model(4, 32)
model3  = call_model(1)
model3.summary()
model3.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history3 = model3.fit(X, y, epochs=200, verbose=0)

In [None]:
history3.history['accuracy'][-1]

In [None]:
embedding, rnn, output = setup_model(6, 32)
model4  = call_model(1)
model4.summary()
model4.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history4 = model4.fit(X, y, epochs=200, verbose=0)
history4.history['accuracy'][-1]

In [None]:
embedding, rnn, output = setup_model(1, 32)
model5  = call_model(1)
model5.summary()
model5.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history5 = model5.fit(X, y, epochs=200, verbose=0)
history5.history['accuracy'][-1]

In [None]:
embedding, rnn, output = setup_model(2, 64)
model6  = call_model(1)
model6.summary()
model6.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history6 = model6.fit(X, y, epochs=200, verbose=0)
history6.history['accuracy'][-1]

In [None]:
embedding, rnn, output = setup_model(2, 64)
model7 = call_model(1)
model7.summary()
model7.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history7 = model7.fit(X, y, epochs=200, verbose=0)
history7.history['accuracy'][-1]

In [None]:
embedding, rnn, output = setup_model(2, 12)
model8 = call_model(1)
model8.summary()
model8.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
history8 = model8.fit(X, y, epochs=200, verbose=0)
history8.history['accuracy'][-1]

In [None]:
model10 = Sequential()
model10.add(Embedding(vocab_size, 4, input_length=max_len-1))
model10.add(SimpleRNN(12, return_sequences=True))
model10.add(SimpleRNN(12, return_sequences=False))
model10.add(Dense(vocab_size, activation='softmax'))
model10.summary()
model10.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history10 = model10.fit(X, y, epochs=200, verbose=0)

In [None]:
history10.history['accuracy'][-1]

In [None]:
model9 = call_model2(2, 2, 12)
model9.summary()
model9.compile(loss=keras.losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history9 = model9.fit(X, y, epochs=200, verbose=0)
history9.history['accuracy'][-1]