In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import Input, Model
import numpy as np

# Dummy Text Data
vocab_size = 1000
max_length = 10
dummy_data = np.random.randint(0, vocab_size, size=(100, max_length))

# Encoder Architecture
input_text = Input(shape=(max_length,))
x = Embedding(vocab_size, 64)(input_text)
x = LSTM(32)(x)
z_mean = Dense(2)(x)
z_log_var = Dense(2)(x)

# Custom loss function
def vae_loss(z_mean, z_log_var):
    kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
    return tf.reduce_mean(kl_loss)

# Add KL loss to the model
kl_loss = vae_loss(z_mean, z_log_var)
encoder = Model(input_text, [z_mean, z_log_var])
encoder.add_loss(kl_loss)

from tensorflow.keras.callbacks import LearningRateScheduler

# Compile the Model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
encoder.compile(optimizer=optimizer)

# Learning Rate Scheduler
def scheduler(epoch, lr):
    return lr * tf.math.exp(-0.1)

callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Train the Model
encoder.fit(dummy_data, epochs=10, batch_size=32, callbacks=[callback])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e4b25a2a020>

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to encode sample texts
def encode_samples(samples):
    encoded = encoder.predict(samples)
    return encoded  # Returns the latent space representations

# Simple evaluation: Check the difference in latent representations
def evaluate_latent_space(latent1, latent2):
    mean_diff = np.mean(np.abs(latent1[0] - latent2[0]))  # Difference in means
    log_var_diff = np.mean(np.abs(latent1[1] - latent2[1]))  # Difference in log variances
    return mean_diff, log_var_diff

# Sample Text Data (replace with your dataset)
texts = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The sky is blue and the grass is green.",
    "Roses are red, violets are blue."
]

# Assuming a previously defined vocab_size and max_length
vocab_size = 1000  # Example value
max_length = 10    # Example value

# Tokenization and Padding
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [14]:
latent_means, latent_log_vars = encoder.predict(padded_sequences)



In [15]:

# Assuming the first two are similar, and the last two are dissimilar
latent_similar_means = [latent_means[0], latent_means[1]]
latent_dissimilar_means = [latent_means[2], latent_means[3]]
latent_similar_log_vars = [latent_log_vars[0], latent_log_vars[1]]
latent_dissimilar_log_vars = [latent_log_vars[2], latent_log_vars[3]]


In [16]:

# Evaluate
diff_similar = evaluate_latent_space(latent_similar_means, latent_similar_log_vars)
diff_dissimilar = evaluate_latent_space(latent_dissimilar_means, latent_dissimilar_log_vars)

print("Difference in Latent Space for Similar Texts:", diff_similar)
print("Difference in Latent Space for Dissimilar Texts:", diff_dissimilar)

Difference in Latent Space for Similar Texts: (0.031839605, 0.033389855)
Difference in Latent Space for Dissimilar Texts: (0.011693824, 0.033667423)
