### Text Generation With Keras

In [1]:
#download and uncompress movie reviews
!wget https:/ /ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2022-08-15 12:46:23--  ftp://https//
           => '.listing'
Resolving https (https)... failed: No such host is known. .
wget: unable to resolve host address 'https'
/ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz: Scheme missing.


In [2]:
# Imports shared by the cells below. `layers` is imported here because the
# custom layer classes defined in later cells subclass `layers.Layer`; without
# it a fresh "Restart & Run All" fails with a NameError.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load every review file under "aclImdb" as raw text (no labels), 256 per batch
dataset = keras.utils.text_dataset_from_directory(
    directory="aclImdb", label_mode=None, batch_size=256)
# Strip out the "<br />" HTML tags that occur in reviews, replacing each with a space
dataset = dataset.map(lambda x: tf.strings.regex_replace(x, "<br />", " "))

Found 105006 files belonging to 1 classes.


In [3]:
# Set up the TextVectorization layer and build its vocabulary from the reviews
from tensorflow.keras.layers import TextVectorization

# Length of every integer sequence the layer emits
sequence_length = 100
# Cap on the number of distinct tokens kept in the vocabulary
vocab_size = 15000

text_vectorization = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode="int",
    max_tokens=vocab_size,
)
# Fit the vocabulary on the (HTML-stripped) review text
text_vectorization.adapt(dataset)

In [4]:
def prepare_lm_dataset(text_batch):
    """Turn a batch of raw strings into (inputs, targets) for next-word prediction.

    The batch is vectorized with the module-level `text_vectorization` layer.
    The inputs are the integer sequences without their last position, and the
    targets are the same sequences without their first position, i.e. the
    inputs shifted by one step.
    """
    token_ids = text_vectorization(text_batch)
    # Drop the final position for the inputs; drop the first one for the targets
    inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
    return inputs, targets

# Apply the conversion to every batch, using 4 parallel calls
lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls=4)

In [6]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and an embedding of each token's position.

    Args:
        sequence_length: number of distinct positions the position embedding
            table holds.
        input_dim: number of distinct token ids the token embedding table holds.
        output_dim: size of the embedding vectors produced by both tables.
        **kwargs: forwarded to the base layer constructor.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # One table indexed by token id, one indexed by position
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Return token embeddings plus the embeddings of positions 0..length-1."""
        # The length is read from the last axis of the input at call time
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        token_vectors = self.token_embeddings(inputs)
        position_vectors = self.position_embeddings(position_ids)
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id is 0."""
        return tf.not_equal(inputs, 0)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

In [10]:
class TransformerDecoder(layers.Layer):
    """Transformer decoder block: two attention stages and a feed-forward stage.

    Each stage is followed by a residual connection and layer normalization.
    The first attention stage is causal self-attention over `inputs`; the
    second attends from that result to `encoder_outputs`.

    Args:
        embed_dim: size of the feature axis of the inputs; also used as the
            per-head key size and as the output size of the feed-forward stage.
        dense_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads in each attention stage.
        **kwargs: forwarded to the base layer constructor.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        # Pass the incoming mask through to the layers that follow this one
        self.supports_masking = True

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build a boolean (batch, length, length) lower-triangular mask.

        Entry [b, i, j] is True when j <= i, so position i may only attend to
        itself and to earlier positions. `batch` and `length` are read from the
        first two axes of `inputs`.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        rows = tf.range(sequence_length)[:, tf.newaxis]
        cols = tf.range(sequence_length)
        mask = tf.reshape(rows >= cols, (1, sequence_length, sequence_length))
        # Repeat the single (1, length, length) mask once per batch element
        multiples = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, multiples)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: the sequence being decoded, with batch on axis 0 and
                sequence position on axis 1.
            encoder_outputs: the sequence attended to by the second attention
                stage. This notebook passes the same tensor as `inputs`, so the
                second stage is masked causally too (otherwise future tokens
                would be visible); that mask requires `encoder_outputs` to have
                the same sequence length as `inputs`.
            mask: optional (batch, length) padding mask; positions that are
                False/0 are hidden from the second attention stage.

        Returns:
            A tensor with the same shape as `inputs`.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is None:
            combined_mask = causal_mask
        else:
            # (batch, length) -> (batch, 1, length) so it broadcasts over queries
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="bool")
            combined_mask = tf.logical_and(padding_mask, causal_mask)

        # Stage 1: causal self-attention + residual + layer norm
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)

        # Stage 2: attention over encoder_outputs + residual + layer norm
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=combined_mask)
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)

        # Stage 3: position-wise feed-forward + residual + layer norm
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [11]:
# A simple Transformer-based language model:
# positional embedding -> decoder block -> softmax over the vocabulary
from tensorflow.keras import layers

embed_dim, latent_dim, num_heads = 256, 2048, 2

inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
# The decoder receives the embedded sequence as both of its arguments
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, x)
# One probability distribution over the vocabulary per sequence position
outputs = layers.Dense(vocab_size, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

In [None]:
# Build a text-generation callback with a "temperature" gauge for the amount of randomness in sampling
import numpy as np
# Dictionary that maps word indices back to strings, for decoding generated text
tokens_index = {
    index: token
    for index, token in enumerate(text_vectorization.get_vocabulary())
}
#Temp sampling from prob dist
def sample_next(predictions, temperature=1.0):
    """Sample one index from a probability distribution reweighted by temperature.

    Args:
        predictions: 1-D array-like of probabilities (one entry per candidate).
        temperature: sharpness of the reweighted distribution. 1.0 samples from
            `predictions` as given, values below 1.0 favor the most likely
            entries, values above 1.0 flatten the distribution, and 0 selects
            the most likely entry deterministically.

    Returns:
        The sampled index as an int.

    Raises:
        ValueError: if `temperature` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    probabilities = np.asarray(predictions).astype("float64")
    if temperature == 0:
        # Limit of the reweighting as temperature -> 0: greedy choice
        return int(np.argmax(probabilities))
    # log(0) = -inf is intended here (such entries get weight 0), so silence the warning
    with np.errstate(divide="ignore"):
        logits = np.log(probabilities) / temperature
    # Shift so the largest logit is 0: keeps exp() from underflowing every
    # entry to 0 (which would give 0/0 = NaN) at very low temperatures
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probabilities = weights / np.sum(weights)
    # A single multinomial draw is a one-hot vector; its argmax is the sample
    draw = np.random.multinomial(1, probabilities, 1)
    return int(np.argmax(draw))
  
class TextGenerator(keras.callbacks.Callback):
    """Callback that prints text generated by the model at the end of epochs.

    Generation starts from `prompt` and appends one sampled word at a time,
    once for each value in `temperatures`. It relies on the module-level
    `text_vectorization`, `tokens_index` and `sample_next`.

    Args:
        prompt: seed text the generated sentence starts from.
        generate_length: maximum number of words to append to the prompt.
        model_input_length: maximum number of tokens the model is fed;
            generation stops early once the sentence reaches this length.
        temperatures: sampling temperatures to generate with, one sentence each.
        print_freq: generate every `print_freq` epochs.
    """

    def __init__(self,
                 prompt,
                 generate_length,
                 model_input_length,
                 temperatures=(1.,),
                 print_freq=1):
        super().__init__()
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq

    def on_epoch_end(self, epoch, logs=None):
        """Print one generated sentence per temperature on reporting epochs."""
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            # Every temperature starts again from the bare prompt
            sentence = self.prompt
            for _ in range(self.generate_length):
                # Feed the current sentence to the model
                tokenized_sentence = text_vectorization([sentence])
                # Token id 0 is padding, so the non-zero count is the number of real tokens
                num_tokens = int(np.count_nonzero(np.asarray(tokenized_sentence[0])))
                if num_tokens >= self.model_input_length:
                    # The input is full: anything appended would be truncated away
                    break
                predictions = self.model(tokenized_sentence)
                # The distribution for the next word is the output at the last
                # real token's position, not at the loop counter
                last_position = max(num_tokens - 1, 0)
                next_token = sample_next(
                    predictions[0, last_position, :], temperature)
                sampled_token = tokens_index[next_token]
                # Append the sampled word to the sentence
                sentence += " " + sampled_token
            print(sentence)
            
# Seed text every generated sample starts from
prompt = "This movie"
# Generate up to 50 extra words per sample, once for each temperature listed
text_gen_callback = TextGenerator(
    prompt=prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1., 1.5),
)

In [None]:
# Train the language model; the callback prints generated text as training progresses
model.fit(
    lm_dataset,
    epochs=200,
    callbacks=[text_gen_callback],
)