# Character Level CNN Variational Autoencoder

In [None]:
import Corpus.gutenberg as corpus
from TextPreprocess.Tokenizer.Stanford import tokenize
from Utils.visual import hist, tally
from Utils.debug import dump
from Utils.generator import sliding_window, random_window, transform
from Utils.FS.file import save, load
from Utils.keras import compact_embedding
from Utils.misc import batch
from Utils.indexer import build_index, index_2_one_hot
from keras.preprocessing.sequence import pad_sequences
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [None]:
# Alternative corpus source (disabled): tokenize the raw text ourselves.
#data = tokenize(corpus.raw())
# Load the corpus; the cells below iterate over this module-level `data` as a
# collection of words (each word is itself iterated character by character).
# NOTE(review): `corpus` is already bound to `Corpus.gutenberg` by the import,
# so this resolves `Corpus.gutenberg.gutenberg.words` — confirm that attribute
# path exists.
data = corpus.gutenberg.words()

In [None]:
def char_generator():
    """Stream the corpus one character at a time.

    Walks the module-level ``data`` in order and yields every element of
    every word, so the characters of a word come out left to right and
    words follow each other without any separator.
    """
    for token in data:
        yield from token

In [None]:
# Build the character vocabulary from the full character stream.
# presumably s2i maps symbol -> index, i2s maps index -> symbol and size is
# the number of distinct symbols — verify against Utils.indexer.build_index.
s2i, i2s, size = build_index(char_generator())

In [None]:
# Fixed number of character positions per sample fed to the network.
#
# The value is chosen for the architecture, not measured from the corpus:
# create_baseline down-samples with strides 1, 5, 5 and up-samples with the
# same strides in reverse, so with 'same' padding the length goes
# 50 -> 50 -> 10 -> 2 -> 10 -> 50 -> 50 and the final Reshape back to
# (MAX_SEQ_LENGTH, NUM_SYMBOL) only works when the length survives that round
# trip (i.e. it is a multiple of 25).
#
# The longest word in the corpus is deliberately not computed here: the result
# would be overwritten by this constant, and it costs a full pass over `data`.
# Words longer than this are truncated by pad_sequences in sample_generator.
MAX_SEQ_LENGTH = 50

In [None]:
# Vocabulary size returned by build_index; used below as the one-hot width of
# each character position and as the channel count at both ends of the model.
NUM_SYMBOL = size
# Bare expression so the notebook displays the value.
NUM_SYMBOL

In [None]:
def word_generator():
    """Yield the words of the module-level ``data`` one by one, in order.

    Each call returns a fresh generator, so callers that need another pass
    over the corpus simply call this function again.
    """
    yield from data

# Total number of words in the corpus. Counted by streaming the generator so
# the whole corpus is never copied into a temporary list just to take its
# length.
NUM_SAMPLE = sum(1 for _ in word_generator())
# Bare expression so the notebook displays the value.
NUM_SAMPLE

In [None]:
from keras.layers import Input, Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, Reshape, Embedding
from keras.models import Model, Sequential
from keras.regularizers import l2
import numpy as np
import tensorflow as tf

def custom_loss(y_true, y_pred):
    """Return the element-wise difference ``y_true - y_pred``.

    This is a placeholder for a cross-entropy loss that was sketched but
    never implemented; it is not referenced by ``create_baseline``, which
    compiles the model with ``'mean_squared_error'``.

    Args:
        y_true: Target values. Any object that supports subtraction with
            ``y_pred`` (tensor, array or scalar).
        y_pred: Predicted values.

    Returns:
        ``y_true - y_pred``, with whatever shape the operands' own
        subtraction produces. No reduction is applied.
    """
    # NOTE(review): a signed difference is not a usable training objective —
    # it can be negative and its mean keeps decreasing as predictions grow.
    # The abandoned sketches here were a clipped/renormalised categorical
    # cross-entropy and tf.nn.sparse_softmax_cross_entropy_with_logits; one
    # of them (or Keras' built-in 'categorical_crossentropy') should replace
    # this body before the function is passed to model.compile.
    return y_true - y_pred

def create_baseline(dropout=0, branching=5):
    """Build and compile the character-level convolutional autoencoder.

    The input is a one-hot sequence of shape ``(MAX_SEQ_LENGTH, NUM_SYMBOL)``.
    It is reshaped to ``(1, MAX_SEQ_LENGTH, NUM_SYMBOL)`` so that ``Conv2D``
    with ``(1, k)`` kernels slides along the character axis only. Three
    convolutions widen the channels to 200, 400 and 1000 with strides
    1, 5, 5; three transposed convolutions mirror them back down to
    ``NUM_SYMBOL`` channels, and the result is reshaped to the input shape.

    There is no sampling layer or KL term here: despite the notebook title
    this is a plain (non-variational) autoencoder trained with MSE.

    Args:
        dropout: Accepted for interface compatibility; not used by the body.
        branching: Accepted for interface compatibility; not used by the body.

    Returns:
        The compiled Keras ``Model`` (loss ``'mean_squared_error'``,
        optimizer ``'adam'``).
    """
    act = 'selu'
    pad = 'same'
    channels = [NUM_SYMBOL, 200, 400, 1000]
    widths = [5, 2, 2]
    steps = [1, 5, 5]

    inp = Input(shape=(MAX_SEQ_LENGTH, NUM_SYMBOL))
    x = Reshape((1, MAX_SEQ_LENGTH, NUM_SYMBOL))(inp)

    # Encoder: layer i maps channels[i] -> channels[i + 1].
    encoder_specs = list(zip(channels[1:], widths, steps))
    for filters, width, step in encoder_specs:
        x = Conv2D(filters, (1, width), strides=(1, step),
                   activation=act, padding=pad)(x)

    # Decoder: the same kernel/stride pairs in reverse order, each layer
    # mapping back down to the channel count one level below.
    decoder_specs = list(zip(channels[:-1], widths, steps))
    for filters, width, step in reversed(decoder_specs):
        x = Conv2DTranspose(filters, (1, width), strides=(1, step),
                            activation=act, padding=pad)(x)

    x = Reshape((MAX_SEQ_LENGTH, NUM_SYMBOL))(x)
    autoencoder = Model(inp, x)
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    return autoencoder

In [None]:
from keras_tqdm import TQDMNotebookCallback

# Hyper-parameters for the baseline model.
# BATCH_SIZE is reassigned to 128 in the training cell before fit_generator
# runs, so this value only applies to cells executed in between.
BATCH_SIZE = 1024
# NOTE(review): DROPOUT and BRANCHING are passed to create_baseline, whose
# body does not read either parameter — they currently have no effect.
DROPOUT = 0.5
BRANCHING = 2

# Build the compiled autoencoder and print its layer table.
model = create_baseline(DROPOUT, BRANCHING)
model.summary()

In [None]:
from keras.utils.np_utils import to_categorical
def sample_generator(word_generator, batch_size=64):
    """Yield autoencoder training batches of one-hot encoded words.

    Args:
        word_generator: Iterable of words to encode. Inside this function
            the parameter shadows the module-level function of the same
            name. If a one-shot generator is passed, this generator ends
            when it is exhausted.
        batch_size: Number of words per yielded batch.

    Yields:
        ``(batch, batch)`` — the same array used as both input and target.
        ``batch`` has shape ``(batch_size, MAX_SEQ_LENGTH, NUM_SYMBOL)``.

    A trailing group of fewer than ``batch_size`` words is dropped.
    """
    pending = []
    for word in word_generator:
        # presumably maps each character of `word` to its integer index via
        # s2i — verify against Utils.misc.batch.
        indices = batch(s2i, word)
        # Left-pad (and left-truncate) to the fixed length with index 0.
        # NOTE(review): if build_index assigns index 0 to a real symbol,
        # padding is indistinguishable from that symbol — confirm.
        padded = pad_sequences([indices], maxlen=MAX_SEQ_LENGTH, dtype='float32',
                               padding='pre', truncating='pre', value=0.)
        # Encode the single padded row rather than the (1, MAX_SEQ_LENGTH)
        # matrix. Keras releases that preserve the input shape would return
        # (1, MAX_SEQ_LENGTH, NUM_SYMBOL) for the matrix, making the batch
        # 4-D and incompatible with the model's 3-D input; the row gives
        # (MAX_SEQ_LENGTH, NUM_SYMBOL) on old and new releases alike.
        one_hot = to_categorical(padded[0], num_classes=NUM_SYMBOL)
        pending.append(one_hot)
        if len(pending) == batch_size:
            train = np.array(pending)
            yield (train, train)
            pending = []

In [None]:
# Smoke test: pull one batch of two words and display the shape of its input
# array (element 0 of the yielded pair).
word_gen = word_generator()
next(sample_generator(word_gen, 2))[0].shape

In [None]:
BATCH_SIZE = 128


def endless_batches(batch_size):
    """Yield training batches indefinitely, restarting the corpus each pass.

    ``fit_generator`` draws ``steps_per_epoch * epochs`` batches from a single
    generator object and expects it never to run dry. One call to
    ``sample_generator(word_generator(), ...)`` only covers the corpus once,
    which is enough for one epoch but not for the 200 requested below.

    Args:
        batch_size: Number of words per batch, forwarded to sample_generator.

    Yields:
        The ``(inputs, targets)`` pairs produced by sample_generator.
    """
    while True:
        produced = False
        for pair in sample_generator(word_generator(), batch_size):
            produced = True
            yield pair
        if not produced:
            # The corpus holds fewer than batch_size words; stop instead of
            # spinning forever without ever yielding.
            return


# One epoch is one pass over the corpus' full batches.
model.fit_generator(
    endless_batches(BATCH_SIZE),
    NUM_SAMPLE // BATCH_SIZE,
    epochs=200,
    #class_weight=class_weight
    verbose=0, callbacks=[TQDMNotebookCallback()]
)