In [19]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.python.util.tf_export import keras_export
import tensorflow.python.keras as keras
from glob import glob
import datetime
import os

In [20]:
# Notebook-wide configuration.
# Author's note on the corpus: data length 15603, vocab size 1584.

# --- Input files ---
vocab_path = "vocab.txt"              # one string of characters, read whole by main()
training_data_path = "dataset.txt"    # glob pattern handed to TextLineDataset

# --- Optimisation ---
epochs = 5                 # passed to model.fit
learning_rate = 2e-3       # Adam step size
dropout_prob = 0.3         # rate of the model's Dropout layer

# --- Batching / train-dev split ---
train_batch_size = 4000
dev_batch_size = 4000
dev_data_size = 4000       # number of examples taken for the dev split

# --- Model dimensions ---
vocab_size = 1584          # Embedding rows; main() also truncates the lookup table to this many keys
embedding_size = 128
filter_size = 128          # filters of the feature Conv1D
kernel_size = 3            # kernel width of the feature Conv1D
conv_activation = "relu"

# --- Data pipeline ---
shuffle_buffer_size = 10 ** 7
space_skip_prob = 0.5      # threshold compared against a uniform draw when deciding to drop a space

class ChatspaceModel(tf.keras.Model):
    """Per-position spacing tagger: Embedding -> Conv1D branches -> GRU -> 2-way softmax.

    Hyper-parameters are read from the notebook-level configuration
    (``vocab_size``, ``embedding_size``, ``filter_size``, ``kernel_size``,
    ``conv_activation``, ``dropout_prob``) when the model is constructed.
    """

    def __init__(
        self,):
        super(ChatspaceModel, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.filter_size = filter_size
        self.kernel_size = kernel_size
        self.conv_activation = conv_activation
        self.dropout_prob = dropout_prob

        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        # Use the public tf.keras GRU so every layer comes from the same Keras
        # implementation as the tf.keras.Model base class.
        self.gru1 = tf.keras.layers.GRU(128, return_sequences=True)
        # gru2 and maxpool are not used by call(); they are kept so existing
        # attribute access on the model keeps working.
        self.gru2 = tf.keras.layers.GRU(128)
        self.maxpool = tf.keras.layers.GlobalMaxPool1D()
        self.dropout_layer = tf.keras.layers.Dropout(rate=self.dropout_prob)
        # Output head: two filters + softmax gives a 2-class distribution per position.
        self.conv_layer = tf.keras.layers.Conv1D(
            2,
            3,
            padding="same",
            activation="softmax",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        )
        # Feature extractor shared by all three branches in call().
        self.conv_layer1 = tf.keras.layers.Conv1D(
            self.filter_size,
            self.kernel_size,
            padding="same",
            activation=self.conv_activation,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        )

    def call(self, input_tensor, training=True):
        """Return per-position softmax scores over the two spacing classes.

        Args:
            input_tensor: integer token ids, as produced by the lookup table in
                this notebook's dataset pipelines.
            training: forwarded to the dropout layer so that dropout is only
                active when the caller asks for training behaviour.

        Returns:
            The output of the 2-filter softmax Conv1D applied to the GRU sequence.
        """
        embedded = self.embedding_layer(input_tensor)

        # Dropout must act on the convolution output. Previously it was applied
        # to the raw embedding, which silently discarded the conv result of the
        # first two branches.
        branch_a = self.dropout_layer(self.conv_layer1(embedded), training=training)
        branch_b = self.dropout_layer(self.conv_layer1(embedded), training=training)
        # NOTE(review): all branches share conv_layer1's weights, so they differ
        # only through dropout — confirm whether separate convolutions were intended.
        branch_c = self.conv_layer1(embedded)

        features = tf.keras.layers.concatenate([branch_a, branch_b, branch_c], axis=-1)
        features = self.gru1(features)
        return self.conv_layer(features)


In [21]:
from glob import glob

def main():
    """Build the data pipeline, then compile and train a ChatspaceModel.

    Reads the character vocabulary from ``vocab_path`` and the training lines
    from ``training_data_path``, splits them into a dev part (first
    ``dev_data_size`` shuffled lines) and a train part (the rest), and runs
    ``model.fit`` for ``epochs`` epochs with checkpoint, TensorBoard and
    ReduceLROnPlateau callbacks. All settings come from the notebook-level
    configuration. Returns nothing.
    """
    # Vocabulary: four reserved tokens followed by every character of the vocab file.
    with open(vocab_path, "r", encoding='utf-8') as f:
        content = f.read()
    keys = ["<PAD>", "<s>", "</s>", "<UNK>"] + list(content)
    values = list(range(len(keys)))
    print(len(keys))
    # Truncate to vocab_size so every id is a valid Embedding row; anything
    # else resolves to id 3 ("<UNK>").
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys[:vocab_size], values[:vocab_size], key_dtype=tf.string, value_dtype=tf.int32
    )
    lookup_table = tf.lookup.StaticHashTable(
        initializer=initializer,
        default_value=3,
    )

    files = glob(training_data_path)
    print(files)

    # Shuffle ONCE with a fixed order before splitting. With the default
    # reshuffle_each_iteration=True, take()/skip() would see a different
    # permutation on every pass, so dev and train examples would overlap.
    split_seed = 0  # arbitrary; only pins the train/dev partition
    lines = tf.data.TextLineDataset(files).shuffle(
        shuffle_buffer_size,
        seed=split_seed,
        reshuffle_each_iteration=False,
    )

    to_example = make_chatspace_training_dataset(
        lookup_table=lookup_table,
        space_skip_prob=space_skip_prob,
    )
    autotune = tf.data.experimental.AUTOTUNE

    def _prepare(split, batch_size):
        """Map raw lines to (ids, labels), cache them, and pad into batches."""
        return (
            split.map(to_example, num_parallel_calls=autotune)
            # Each split owns its cache and is fully consumed every pass, so
            # the cache is actually completed and reused.
            .cache()
            # Inputs are padded with 0 and labels with -1 (the ignore id of
            # the loss/metric below).
            .padded_batch(batch_size, padded_shapes=([None], [None]), padding_values=(0, -1))
            .prefetch(autotune)
        )

    dev_dataset = _prepare(lines.take(dev_data_size), dev_batch_size)
    train_dataset = _prepare(lines.skip(dev_data_size), train_batch_size)

    model = ChatspaceModel()

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    loss = sparse_categorical_crossentropy_with_ignore
    metrics = sparse_categorical_accuracy_with_ignore

    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=[metrics],
    )

    # One timestamp for both output locations; os.path.join keeps the paths
    # valid on every platform (same result as before on Windows).
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "fit", run_stamp)
    checkpoint_path = os.path.join(run_stamp + "model", "checkpoint-{epoch}.ckpt")

    model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=dev_dataset,
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path),
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
            tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1),
        ],
    )



In [22]:
def make_chatspace_training_dataset(
    lookup_table: tf.lookup.StaticHashTable,
    space_skip_prob: float,
):
    """Build the ``Dataset.map`` function that turns one text line into a training pair.

    Args:
        lookup_table: maps single-character strings (and the ``<s>`` / ``</s>``
            markers) to integer ids.
        space_skip_prob: a character followed by a space has that space removed
            when a uniform draw from [0, 1) is ``<=`` this value.

    Returns:
        A function ``sentence -> (inputs, labels)``. ``inputs`` are the ids of
        ``<s>``, the kept characters and ``</s>``; ``labels`` has the same
        length, with 1 for a character whose following space was removed and 0
        everywhere else (including the two markers).
    """

    @tf.function
    def _mapping_function(
        sentence: tf.Tensor,
    ):
        # Collapse runs of spaces while the sentence is still a single string.
        # After the split every element is one character, so " +" could never
        # match more than a lone space and the replacement would be a no-op.
        sentence = tf.strings.regex_replace(sentence, " +", " ")
        sentence = tf.strings.unicode_split(sentence, "UTF-8")
        sentence_length = tf.shape(sentence)[0]

        def cond(index, inputs, labels):
            return index < sentence_length

        def body(index, inputs, labels):
            inputs = tf.concat([inputs, [sentence[index]]], axis=0)

            # NOTE(review): the Python `and` on tensors relies on AutoGraph
            # conversion under @tf.function — keep this function decorated.
            index, labels = tf.cond(
                index != sentence_length - 1 and sentence[index + 1] == " ",
                lambda: tf.cond(
                    tf.random.uniform([], minval=0, maxval=1) <= space_skip_prob,
                    # Drop the space: jump over it and mark this character with 1.
                    lambda: (index + 1, tf.concat([labels, [1]], axis=0)),
                    # Keep the space: it is emitted as an ordinary input next iteration.
                    lambda: (index, tf.concat([labels, [0]], axis=0)),
                ),
                lambda: (index, tf.concat([labels, [0]], axis=0)),
            )

            index += 1
            return index, inputs, labels

        _, inputs, labels = tf.while_loop(
            cond,
            body,
            (
                tf.constant(0),
                tf.constant([], dtype=tf.string),
                tf.constant([], dtype=tf.int32),
            ),
            # inputs/labels grow by one element per iteration.
            shape_invariants=(
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None]),
            ),
        )

        inputs = tf.concat([["<s>"], inputs, ["</s>"]], axis=0)
        labels = tf.concat([[0], labels, [0]], axis=0)
        inputs = lookup_table.lookup(inputs)

        return inputs, labels

    return _mapping_function

In [23]:
def sparse_categorical_crossentropy_with_ignore(y_true, y_pred, from_logits=False, axis=-1, ignore_id=-1):
    """Sparse categorical cross-entropy restricted to positions where ``y_true != ignore_id``.

    Args:
        y_true: integer class labels; entries equal to ``ignore_id`` are dropped.
        y_pred: predictions, indexed by the same leading positions as ``y_true``.
        from_logits: forwarded to ``tf.keras.losses.sparse_categorical_crossentropy``.
        axis: forwarded to ``tf.keras.losses.sparse_categorical_crossentropy``.
        ignore_id: label value marking positions to exclude (the notebook pads
            labels with -1).

    Returns:
        The cross-entropy of the kept (label, prediction) pairs.
    """
    keep_mask = y_true != ignore_id
    kept_indices = tf.where(keep_mask)

    # Gather labels and predictions at exactly the same coordinates.
    kept_labels = tf.gather_nd(y_true, kept_indices)
    kept_preds = tf.gather_nd(y_pred, kept_indices)

    return tf.keras.losses.sparse_categorical_crossentropy(
        kept_labels,
        kept_preds,
        from_logits=from_logits,
        axis=axis,
    )


def sparse_categorical_accuracy_with_ignore(y_true, y_pred, ignore_id=-1):
    """Sparse categorical accuracy restricted to positions where ``y_true != ignore_id``.

    Args:
        y_true: integer class labels; entries equal to ``ignore_id`` are dropped.
        y_pred: predictions, indexed by the same leading positions as ``y_true``.
        ignore_id: label value marking positions to exclude (the notebook pads
            labels with -1).

    Returns:
        ``tf.keras.metrics.sparse_categorical_accuracy`` of the kept pairs.
    """
    keep_mask = y_true != ignore_id
    kept_indices = tf.where(keep_mask)

    # Gather labels and predictions at exactly the same coordinates.
    kept_labels = tf.gather_nd(y_true, kept_indices)
    kept_preds = tf.gather_nd(y_pred, kept_indices)

    return tf.keras.metrics.sparse_categorical_accuracy(kept_labels, kept_preds)

In [24]:
import json
import re
from typing import List, Union

import tensorflow as tf


class Sentencespace:
    def __init__(self):
        self.model = ChatspaceModel()


        with open("vocab.txt", "r", encoding='utf-8') as f:
            content = f.read()
            keys = ["<PAD>", "<s>", "</s>", "<UNK>"] + list(content)
            values = list(range(len(keys)))
            initializer = tf.lookup.KeyValueTensorInitializer(
                keys[: len(keys)],
                values[: len(keys)],
                key_dtype=tf.string,
                value_dtype=tf.int32,
            )
            self.lookup_table = tf.lookup.StaticHashTable(initializer=initializer, default_value=3)

    def space(self, texts, batch_size=1) -> Union[List[str], str]:

        is_single_inference = isinstance(texts, str)
        texts = [texts] if is_single_inference else texts
        dataset = self.make_chatspace_inputs(texts, batch_size=batch_size)

        outputs = []
        for data in dataset:
            pred = self.model(data)
            space_preds = tf.math.argmax(pred, axis=-1)
            outputs.extend(space_preds)

        result = self.generate_text(texts, outputs)

        return result[0] if is_single_inference else result

    def make_chatspace_inputs(self, texts: List[str], batch_size = 1):

        @tf.function
        def _mapping_function(x: tf.Tensor):
            x = tf.strings.unicode_split(x, "UTF-8")
            return self.lookup_table.lookup(tf.concat([["<s>"], x, ["</s>"]], axis=0))

        return (
            tf.data.Dataset.from_tensor_slices(tf.constant(texts, dtype=tf.string))
            .map(_mapping_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .padded_batch(batch_size, padding_values=0, padded_shapes=[None])
        )

    def generate_text(self, texts: List[str], space_pred: tf.Tensor) -> str:

        result = []
        for text, pred in zip(texts, space_pred):
            generated_sentence = [
                text[i] + (" " if pred[i + 1] == 1 else "")
                for i in range(len(text))
            ]
            joined_chars = "".join(generated_sentence)
            result.append(re.sub(r"\s+", " ", joined_chars).strip())

        return result


In [25]:
# Run the training pipeline defined in main(); it prints the vocabulary key
# count and the matched data files before fitting.
main()

1588
['dataset.txt']
Tensor("Shape_1:0", shape=(1,), dtype=int32)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [106]:
# Smoke-test inference on a single unspaced Korean sentence.
# NOTE(review): Sentencespace() builds a new ChatspaceModel and no saved weights
# are loaded here, so this result does not come from the model trained by
# main() — confirm how trained weights are meant to be restored.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above; re-check it under Restart & Run All.
i =Sentencespace()
result = i.space("피할수없으면즐겨라")
result

'피할수 없으면즐겨라'