In [8]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Reshape
from tensorflow.keras.models import Model
import numpy as np


# Load the IMDB word index
word_index = imdb.get_word_index()
# Reverse the word index to map integer indices to words
reverse_word_index = {value: key for (key, value) in word_index.items()}
# Add padding, start, and unknown tokens
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'

# Parameters
vocab_size = 10000  # Size of the vocabulary
max_length = 100    # Maximum length of input sentences
latent_dim = 32     # Size of the latent space
embedding_dim = 50  # Embedding dimension
epochs = 10         # Number of training epochs

# Load IMDB dataset
(x_train, _), (x_test, _) = imdb.load_data(num_words=vocab_size)
x_train = pad_sequences(x_train, maxlen=max_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post')

# AutoEncoder Model
# Encoder
inputs = Input(shape=(max_length,))
embedded = Embedding(vocab_size, embedding_dim)(inputs)
flattened = Flatten()(embedded)
encoded = Dense(latent_dim, activation='relu')(flattened)
encoder_model = Model(inputs, encoded)

# Decoder
latent_inputs = Input(shape=(latent_dim,))
reconstructed = Dense(max_length*embedding_dim, activation='relu')(latent_inputs)
reshaped = Reshape((max_length, embedding_dim))(reconstructed)
decoded = Dense(vocab_size, activation='softmax')(reshaped)
decoder_model = Model(latent_inputs, decoded)

# Autoencoder
autoencoder = Model(inputs, decoder_model(encoder_model(inputs)))
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model
autoencoder.fit(x_train, x_train,
                epochs=epochs,
                batch_size=32,
                validation_data=(x_test, x_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d3f2179dc00>

In [9]:
# Usage example (after training)
encoded_texts = encoder_model.predict(x_test[:10])
decoded_texts = decoder_model.predict(encoded_texts)










In [10]:
def decode_sequence(sequence):
    """Decode a sequence of integers back to words."""
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in sequence])

# Assume `decoded_texts` is the output from the decoder
decoded_sequences = np.argmax(decoded_texts, axis=-1)

# Convert each sequence in the decoded_sequences back to text
decoded_texts = [decode_sequence(seq) for seq in decoded_sequences]

# Example: print the first decoded text
print(decoded_texts[0])


? this is this movie is ? <UNK> this ? ? in <START> movie of <START> movie is is ? <START> movie of br ? ? ? ? a <START> <START> <UNK> br ? is ? ? ? ? ? <START> ? in ? ? <START> ? <UNK> ? ? <UNK> <START> ? <UNK> ? <START> <UNK> <START> ? <START> <START> br br br br ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
