In [40]:
import tensorflow as tf
import pandas as pd
import os
import sys
sys.path.append('/Users/hoyoung/Desktop/pycharm_work/korean_grammar_corrector/bin')
sys.path.append('/Users/hoyoung/Desktop/pycharm_work/korean_grammar_corrector/utils')

import tensorflow_preprocess as tp
from model.transformer_model import *
import tensorflow_datasets as tfds

In [7]:
df = pd.read_csv('../../../data/train/model_save_test/corpus_repair_train.shuf.savetest.csv', names=['src', 'tgt'])

In [16]:
MAX_LENGTH = 24

def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

def accuracy(y_true, y_pred):
    # 레이블의 크기 : (batch_size, MAX_LENGTH - 1)
    y_true = tf.reshape(y_true, shape = (-1, MAX_LENGTH-1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

In [17]:
src = df['src'].apply(lambda x: tp.full_stop_filter(x))
tgt = df['tgt'].apply(lambda x: tp.full_stop_filter(x))

inputs, outputs, tokenizer = tp.tokenize_and_filter(src, tgt, max_length=24)
dataset = tp.create_train_dataset(inputs, outputs, batch_size=64, buffer_size=60000)

In [18]:
tf.keras.backend.clear_session()

d_model = 256
num_layers = 5
num_heads = 8
dff = 512
dropout = 0.1

model = transformer(
    vocab_size = tokenizer.vocab_size+2,
    num_layers = num_layers,
    dff = dff,
    d_model = d_model,
    num_heads = num_heads,
    dropout = dropout,
    name="transformer")

In [19]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 256)    4741632     ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][0

In [20]:
learning_rate = CustomSchedule(128)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [21]:
checkpoint_path = "../../../checkpoint_20221205/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# 모델의 가중치를 저장하는 콜백 만들기
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [22]:
epochs = 3
model.fit(dataset, epochs=epochs, callbacks=[cp_callback])

Epoch 1/3
Epoch 1: saving model to ../../../checkpoint/cp.ckpt
Epoch 2/3

KeyboardInterrupt: 

In [23]:
path = "../../../checkpoint_20221205/cp.ckpt"

In [24]:
os.listdir(os.path.dirname(path))

['cp.ckpt.data-00000-of-00001', 'checkpoint', 'cp.ckpt.index']

In [25]:
d_model = 128
num_layers = 4
num_heads = 4
dff = 256
dropout = 0.1
vocab_size = 8185

model2 = transformer(
    vocab_size = vocab_size,
    num_layers = num_layers,
    dff = dff,
    d_model = d_model,
    num_heads = num_heads,
    dropout = dropout,
    name="transformer2")

In [26]:
model2.load_weights(path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd9b7c02340>

In [27]:
model2.summary()

Model: "transformer2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 128)    1577600     ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][

In [44]:
train_tok = tfds.deprecated.text.SubwordTextEncoder.load_from_file('../../../tokenizer.tok')
train_tok.vocab_size

8183

In [51]:
def evaluate(sentence, MAX_LENGTH, MODEL, TOKENIZER):
    # 입력 문장에 대한 전처리
    sentence = tp.full_stop_filter(sentence)

    # 입력 문장에 시작 토큰과 종료 토큰을 추가
    sentence = tf.expand_dims(
      [TOKENIZER.vocab_size] + TOKENIZER.encode(sentence) + [TOKENIZER.vocab_size+1], axis=0)

    output = tf.expand_dims([vocab_size], 0)

    # 디코더의 예측 시작
    for i in range(MAX_LENGTH):
        predictions = MODEL(inputs=[sentence, output], training=False)

        # 현재 시점의 예측 단어를 받아온다.
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # 만약 현재 시점의 예측 단어가 종료 토큰이라면 예측을 중단
        if tf.equal(predicted_id, [TOKENIZER.vocab_size+1][0]):
            break

        # 현재 시점의 예측 단어를 output(출력)에 연결한다.
        # output은 for문의 다음 루프에서 디코더의 입력이 된다.
        output = tf.concat([output, predicted_id], axis=-1)

    # 단어 예측이 모두 끝났다면 output을 리턴.
    return tf.squeeze(output, axis=0)



def predict(sentence):
    prediction = evaluate(sentence, 24, TOKENIZER)

    # prediction == 디코더가 리턴한 챗봇의 대답에 해당하는 정수 시퀀스
    # tokenizer.decode()를 통해 정수 시퀀스를 문자열로 디코딩.
    predicted_sentence = TOKENIZER.decode(
      [i for i in prediction if i < TOKENIZER.vocab_size])

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence


In [38]:
test = "이유른 신차의 경우 야 기퍼센트에서 시자캐요."
t = tp.full_stop_filter(test)

In [39]:
predict(test)


InvalidArgumentError: Exception encountered when calling layer "embedding_3" (type Embedding).

indices[0,0] = 8185 is not in [0, 8185) [Op:ResourceGather]

Call arguments received by layer "embedding_3" (type Embedding):
  • inputs=tf.Tensor(shape=(1, 1), dtype=float32)

In [58]:
import re

corpus = []
with open('../../../tokenizer.tok.subwords', 'r', encoding='utf-8') as f:
    for inx, line in enumerate(f):
        if inx > 1:
            sent = line.lower().strip()
            sent = sent.replace('\n', '')
#             sent = re.sub(r"[^а-яА-Я?.!,_]+", " ", sent)
            sent = sent.strip()
            corpus.append(sent)

load_tokenizer = tfds.deprecated.text.SubwordTextEncoder(vocab_list = corpus)

In [60]:
load_tokenizer.vocab_size

8183