In [9]:

import random
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, RepeatVector, TimeDistributed, Embedding, Dropout, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec
from tensorflow.keras.callbacks import EarlyStopping
from keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.utils import to_categorical

# Parameters for the model
embedding_dim = 300
latent_dim = 256

# Parameters
vocab_size = 10000
max_length = 150
epochs = 200
batch_size = 256
learning_rate = 0.01

# Load IMDB dataset
(x_train, _), (x_test, _) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()

# Create a reverse word index
reverse_word_index = {value + 3: key for key, value in word_index.items()}
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'
reverse_word_index[3] = '<UNUSED>'

# Convert sequences back to text
train_texts = [[reverse_word_index.get(i, '<UNK>') for i in sequence] for sequence in x_train]
test_texts = [[reverse_word_index.get(i, '<UNK>') for i in sequence] for sequence in x_test]
all_texts = train_texts + test_texts



In [10]:

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=all_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=20)

# Convert the word2vec model to a dictionary
# Create an embedding matrix where each row index corresponds to a word index
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        try:
            embedding_vector = word2vec_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            # Words not found in the embedding index will be all zeros
            pass


In [11]:

def sequence_to_text(sequence):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in sequence])

def add_noise_to_text(sequence, noise_factor=0.1):
    noisy_sequence = []
    for i in sequence:
        if random.random() < noise_factor:
            # Randomly replace a word with another word
            noisy_sequence.append(random.randint(1, vocab_size - 1))
        else:
            noisy_sequence.append(i)
    return noisy_sequence

# Add noise to the data
noisy_x_train = [add_noise_to_text(seq) for seq in x_train]
noisy_x_test = [add_noise_to_text(seq) for seq in x_test]

# Convert to padded sequences
noisy_x_train_padded = pad_sequences(noisy_x_train, maxlen=max_length, padding='post')
noisy_x_test_padded = pad_sequences(noisy_x_test, maxlen=max_length, padding='post')

# Example of noisy text
print("Original:", sequence_to_text(x_train[0]))
print("Noisy:", sequence_to_text(noisy_x_train[0]))


Original: ? in with i like horrible business chinese charm would killer waited which explosion <START> going at fun <UNK> film make like lame character has novel ? a all final sense <UNK> real <START> find character nothing ? second perhaps they <START> find valuable cover this city an br overall <START> horror has i <UNUSED> should shop was in with <START> delightful 00 despite <START> with their people is i like horrible an well it br garbage <START> with this genre this is i taken that ? <UNK> she sex is and house and after <UNK> <START> product bud i final which returned be <START> does is i an annoying <UNK> film where if at man it's film sent be <UNUSED> with is comedy you than some <UNK> in perfect i get ? and <START> think plot windows it fun <START> ? the lou <UNK> sequence at their like horrible wanted on getting night just the <START> ? rich br any other <START> couple it someone then he decade more on why <UNUSED> can't ajay that <START> family with for still wanted on fina

In [19]:
# Encoder
input_text = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_text)
encoder_output = Bidirectional(LSTM(2*latent_dim, return_sequences=False))(encoder_embedding)
encoder_output = Dropout(0.15)(encoder_output)
encoder_output = Dense(latent_dim, activation='relu')(encoder_output)

# Decoder
decoder_input_seq = Input(shape=(max_length - 1,))
decoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_input_seq)
decoder_input = RepeatVector(max_length)(encoder_output)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(decoder_input)
decoder_lstm = Dropout(0.15)(decoder_lstm)
decoder_lstm = LSTM(2*latent_dim, return_sequences=True)(decoder_lstm)
decoder_output = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_lstm)


# Compile and train with MSE loss
autoencoder = Model([input_text, decoder_input_seq], decoder_output)
autoencoder.compile(optimizer=Adam(learning_rate=learning_rate, clipnorm=1.0), loss='sparse_categorical_crossentropy')

# # First, pad the sequences to max_length
x_train_padded = pad_sequences(x_train, maxlen=max_length, padding='post')
x_test_padded = pad_sequences(x_test, maxlen=max_length, padding='post')

# y_train and y_test are simply the padded original sequences
y_train = x_train_padded
y_test = x_test_padded

# Prepare decoder_input_data by shifting the sequences by one position
decoder_input_train = np.zeros((x_train_padded.shape[0], max_length - 1), dtype=int)
decoder_input_test = np.zeros((x_test_padded.shape[0], max_length - 1), dtype=int)

for i in range(len(x_train_padded)):
    decoder_input_train[i] = x_train_padded[i, 1:]  # Drop the first word

for i in range(len(x_test_padded)):
    decoder_input_test[i] = x_test_padded[i, 1:]  # Drop the first word


In [20]:
autoencoder.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, 150)]                0         []                            
                                                                                                  
 embedding_12 (Embedding)    (None, 150, 300)             3000000   ['input_13[0][0]']            
                                                                                                  
 bidirectional_8 (Bidirecti  (None, 1024)                 3330048   ['embedding_12[0][0]']        
 onal)                                                                                            
                                                                                                  
 dropout_9 (Dropout)         (None, 1024)                 0         ['bidirectional_8[0][0]'

In [21]:
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow.keras.backend as K

def custom_lr_scheduler(epoch, lr):
    # Decrease learning rate by 0.1 factor every 5 epochs
    if epoch % 5 == 0 and epoch != 0:
        lr = lr * 0.1
    return lr

# Define the callback
lr_scheduler = LearningRateScheduler(custom_lr_scheduler)
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train the model
autoencoder.fit([noisy_x_train_padded, decoder_input_train], np.expand_dims(y_train, -1),
          epochs=epochs,
          batch_size=batch_size,
          validation_data=([noisy_x_test_padded, decoder_input_test], np.expand_dims(y_test, -1)),
          callbacks=[early_stopping, lr_scheduler])



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200


<keras.src.callbacks.History at 0x7fc6a7effd60>

In [22]:
def sample(preds, temperature=1.0):
    # Convert to array and prevent numerical issues with very small numbers
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-7) / temperature  # Adjust by temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)  # Softmax
    probas = np.random.multinomial(1, preds, 1)  # Sample from the softmax distribution
    return np.argmax(probas)

In [26]:
import tensorflow as tf
# Prepare the decoder input for prediction
decoder_input_test = np.zeros((noisy_x_test_padded.shape[0], max_length - 1), dtype=int)
for i in range(len(x_test_padded)):
    decoder_input_test[i] = x_test_padded[i, 1:]  # Drop the first word

# Predicting denoised text
denoised_texts = autoencoder.predict([noisy_x_test_padded[:10], decoder_input_test[:10]])

# Function to convert sequences back to text
def decode_denoised_sequence(sequence):
    indices = tf.math.argmax(sequence, axis=1).numpy()
    return ' '.join([reverse_word_index.get(word, '?') for word in indices])

def decode_sequence_with_sampling(prob_distributions, temperature=1.0):
    return ' '.join([reverse_word_index.get(sample(probs, temperature), '?') for probs in prob_distributions])

# Decoding the denoised sequences
for i in range(10):
    print("Original:", sequence_to_text(x_test[i]))
    print("Noisy:", sequence_to_text(noisy_x_test_padded[i]))
    print("Generated:", decode_sequence_with_sampling(denoised_texts[i], temperature=1))
    print("\n")


Original: ? murder both in have <UNUSED> easily of of ? ? <UNK> <START> boring the <START> again marries understand dead <START> over a odd odd odd of of br how where first lead spiral make you cross in have movie not convict are role dark and where in true director and old just <UNK> not last i lot ? an he film spiral based both in <UNUSED> easily
Noisy: ? suggest both in have <UNUSED> easily of of ? ? <UNK> <START> boring the <START> again marries understand dead <START> clue a odd odd odd of of br how where first lead spiral make you cross in have movie not convict are role dark revolution where in true director and old just <UNK> wounded last i lot dance an he film spiral based both in <UNUSED> easily ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
Denoised: <START> <UNK> funny anything there written where a second fully suspenseful i <PAD> a that with crafted by firs