In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Lambda, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K


In [2]:
# Hyperparameters shared by the model-building and preprocessing cells.
max_seq_length = 100   # token ids per example (Input shape and pad_sequences maxlen)
vocab_size = 10_000    # Embedding input_dim and Tokenizer num_words cap
embedding_dim = 64     # width of each token embedding vector
latent_dim = 32        # LSTM units per direction and size of the latent heads


In [3]:
# Encoder
# Each example is a fixed-length row of `max_seq_length` integer token ids.
input_text = Input(shape=(max_seq_length,))
# mask_zero=True marks id 0 as padding so the recurrent layer skips those
# timesteps. The preprocessing cell pads short texts with zeros up to
# `max_seq_length`; without the mask the BiLSTM would read that padding as
# if it were real tokens. The parameter count of the layer is unchanged.
x = Embedding(vocab_size, embedding_dim, mask_zero=True)(input_text)
# return_sequences=False keeps only the final state of each direction, so the
# concatenated output has 2 * latent_dim features per example.
encoder_bilstm = Bidirectional(LSTM(latent_dim, return_sequences=False))
encoder_outputs = encoder_bilstm(x)

# VAE Sampling layer
def sampling(args):
    """Draw a latent sample using the reparameterization trick.

    Args:
        args: A pair ``(z_mean, z_log_sigma)`` of tensors with the same shape.
            Note that the second tensor is multiplied by 0.5 before being
            exponentiated, i.e. it is effectively treated as a log-variance
            despite its name.

    Returns:
        ``z_mean + exp(0.5 * z_log_sigma) * epsilon``, where ``epsilon`` is
        noise drawn with ``K.random_normal`` (default mean 0, stddev 1).
    """
    z_mean, z_log_sigma = args
    # Use the dynamic shape of z_mean rather than (batch, static_dim): this
    # works for any rank and when the static feature dimension is unknown.
    # Matching z_mean's dtype avoids a dtype mismatch in the multiply below
    # when the layer does not run in the backend's default float type.
    epsilon = K.random_normal(shape=K.shape(z_mean), dtype=z_mean.dtype)
    return z_mean + K.exp(0.5 * z_log_sigma) * epsilon

# Two independent linear heads on the BiLSTM output parameterize the latent
# distribution; `sampling` then draws z from them inside a Lambda layer.
z_mean = Dense(units=latent_dim)(encoder_outputs)
z_log_sigma = Dense(units=latent_dim)(encoder_outputs)
z = Lambda(function=sampling)([z_mean, z_log_sigma])

# Encoder model: token ids in, [z_mean, z_log_sigma, sampled z] out.
encoder = Model(inputs=input_text, outputs=[z_mean, z_log_sigma, z])
encoder.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 100, 64)              640000    ['input_1[0][0]']             
                                                                                                  
 bidirectional (Bidirection  (None, 64)                   24832     ['embedding[0][0]']           
 al)                                                                                              
                                                                                                  
 dense (Dense)               (None, 32)                   2080      ['bidirectional[0][0]']   

In [4]:
# Tiny demo corpus used to exercise the preprocessing pipeline.
texts = ["Hello world", "Variational Autoencoders are fun"]

# Fit a word index on the corpus (at most `vocab_size` words are kept) and
# convert each text into a list of integer token ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to exactly `max_seq_length` ids. "pre" padding and
# truncation are the pad_sequences defaults, spelled out here for clarity.
data = pad_sequences(
    sequences,
    maxlen=max_seq_length,
    padding="pre",
    truncating="pre",
)
