# Assignment 6: Autoregressive Language Modeling - Kai Ponel & Hannan Mahadik 

## Setup

### Imports

In [None]:
import tensorflow as tf
import numpy as np
import os

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU

### Shakespeare dataset


In [None]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


# RNN (Custom, Bad results)

In [None]:
# Read the corpus as bytes and decode explicitly; the context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Create a vocabulary of unique characters in the text
vocab = sorted(set(text))

# Create a mapping from characters to indices
char2idx = {char: idx for idx, char in enumerate(vocab)}

# Create a mapping from indices to characters
idx2char = np.array(vocab)

# Convert the text to a sequence of integer indices
text_as_int = np.array([char2idx[char] for char in text])

# Define the sequence length and create training examples and targets.
# Each chunk holds seq_length + 1 ids so that it can be split into an input
# and a target that is shifted by one position.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair for next-token prediction.

    Args:
        chunk: A sliceable sequence of N items.

    Returns:
        A tuple ``(input, target)``: ``input`` is the chunk without its last
        item and ``target`` is the chunk without its first item, so ``target``
        is ``input`` shifted left by one position.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)


# HParams
BATCH_SIZE = 64
BUFFER_SIZE = 10000
SHUFFLE_SEED = 42
embedding_dim = 256
rnn_units = 1024

# Shuffle ONCE, in a fixed order. By default `shuffle` reshuffles on every
# iteration, so a take()/skip() split applied afterwards would pick different
# examples each epoch and validation examples would leak into training.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Split DS into train and val (sizes are counted in batches)
train_size = int(0.8 * len(dataset))
val_size = int(0.2 * len(dataset))

# Only the training split is reshuffled (at batch granularity) every epoch;
# max(..., 1) keeps the shuffle buffer size valid for a tiny dataset.
train_dataset = dataset.take(train_size).shuffle(max(train_size, 1))
val_dataset = dataset.skip(train_size).take(val_size)

def myModel(vocab_size, embedding_dim, rnn_units):
    """Build the Embedding -> GRU -> Dense stack used as the language model.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of GRU units.

    Returns:
        A ``Sequential`` model that emits one logit vector of length
        ``vocab_size`` per input timestep (the GRU returns full sequences).
    """
    recurrent_layer = GRU(
        rnn_units,
        return_sequences=True,
        stateful=False,
        kernel_initializer='glorot_uniform',
    )
    return Sequential([
        Embedding(vocab_size, embedding_dim),
        recurrent_layer,
        Dense(vocab_size),
    ])

model = myModel(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

def loss_fn(labels, logits):
  """Return the mean sparse softmax cross-entropy between labels and logits."""
  per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  return tf.reduce_mean(per_token_loss)

# Define a function to calculate the loss on the validation set
def val_loss(model, val_dataset):
    """Average ``loss_fn`` over every batch of ``val_dataset``.

    Batches are counted while iterating instead of calling ``len()``, so the
    function also works for datasets whose cardinality is unknown.

    Args:
        model: Callable model mapping an input batch to logits.
        val_dataset: Iterable of ``(input_batch, target_batch)`` pairs.

    Returns:
        The mean per-batch loss as a scalar tensor, or a NaN scalar tensor if
        the dataset yields no batches (instead of dividing by zero).
    """
    total_loss = 0.0
    num_batches = 0
    for input_example_batch, target_example_batch in val_dataset:
        # Evaluate in inference mode so train-only behaviour stays disabled.
        predictions = model(input_example_batch, training=False)
        total_loss += loss_fn(target_example_batch, predictions)
        num_batches += 1
    if num_batches == 0:
        return tf.constant(float('nan'))
    return total_loss / num_batches


# Gradient-norm clipping guards against exploding gradients in the RNN.
optimizer = tf.optimizers.Adam(clipnorm=1.0)

epochs = 25

for epoch in range(epochs):
    print(f'Starting epoch {epoch+1}')
    epoch_loss_avg = tf.keras.metrics.Mean()
    # The batches hold integer character ids, which can never be NaN, so no
    # per-step NaN check on the inputs is needed; numeric problems are caught
    # by the check_numerics calls on the float predictions and loss below.
    for input_example_batch, target_example_batch in train_dataset:
        with tf.GradientTape() as tape:
            predictions = model(input_example_batch, training=True)
            tf.debugging.check_numerics(predictions, 'predictions contains nan or inf')
            loss = loss_fn(target_example_batch, predictions)
            tf.debugging.check_numerics(loss, 'loss contains nan or inf')
        # Metric bookkeeping does not need to be recorded on the tape.
        epoch_loss_avg.update_state(loss)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print(f'Epoch {epoch+1}: Train Loss: {epoch_loss_avg.result()}')
    print(f'Epoch {epoch+1}: Val Loss: {val_loss(model, val_dataset)}')


172
Length of train_dataset: 137
Starting epoch 1
Epoch 1: Train Loss: 2.781325101852417
Epoch 1: Val Loss: 2.236217737197876
Starting epoch 2
Epoch 2: Train Loss: 2.0536627769470215
Epoch 2: Val Loss: 1.8738161325454712
Starting epoch 3
Epoch 3: Train Loss: 1.7691675424575806
Epoch 3: Val Loss: 1.6603164672851562
Starting epoch 4
Epoch 4: Train Loss: 1.5998120307922363
Epoch 4: Val Loss: 1.516616940498352
Starting epoch 5
Epoch 5: Train Loss: 1.4881635904312134
Epoch 5: Val Loss: 1.4242022037506104
Starting epoch 6
Epoch 6: Train Loss: 1.4150744676589966
Epoch 6: Val Loss: 1.3632065057754517
Starting epoch 7
Epoch 7: Train Loss: 1.3578110933303833
Epoch 7: Val Loss: 1.302501916885376
Starting epoch 8
Epoch 8: Train Loss: 1.3105816841125488
Epoch 8: Val Loss: 1.2590655088424683
Starting epoch 9
Epoch 9: Train Loss: 1.2691797018051147
Epoch 9: Val Loss: 1.2233033180236816
Starting epoch 10
Epoch 10: Train Loss: 1.2319068908691406
Epoch 10: Val Loss: 1.1808059215545654
Starting epoch 11


In [None]:
def generate_text(model, start_string=None, num_generate_chars=1000,
                  temperature=1.0, max_context=100):
    """Sample text character by character from a non-stateful model.

    The model keeps no hidden state between calls, so the running context
    (prompt plus everything generated so far, truncated to ``max_context``
    characters) is fed back in on every step. Feeding only the last sampled
    character would make every prediction depend on a single character.

    Args:
        model: Model mapping a ``(1, length)`` batch of character ids to
            ``(1, length, vocab)`` logits.
        start_string: Optional prompt. Every character must be a key of
            ``char2idx`` (a ``KeyError`` is raised otherwise). When empty or
            ``None``, generation is seeded with a newline.
        num_generate_chars: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling;
            lower values make the output more predictable.
        max_context: Maximum number of trailing characters fed to the model.

    Returns:
        The generated characters as a string (the prompt is not included).

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Convert the start string to a sequence of integer indices
    if start_string:
        context_ids = [char2idx[s] for s in start_string]
    else:
        # An empty input sequence produces no logits to sample from, so seed
        # with a newline (falling back to index 0 if the corpus has none).
        context_ids = [char2idx.get('\n', 0)]

    generated_text = []

    for _ in range(num_generate_chars):
        input_eval = tf.expand_dims(context_ids[-max_context:], 0)
        logits = model(input_eval, training=False)

        # Only the distribution after the last context character is needed.
        last_logits = logits[:, -1, :] / temperature
        sampled = tf.random.categorical(last_logits, num_samples=1)
        predicted_id = int(sampled[0, 0].numpy())

        generated_text.append(idx2char[predicted_id])
        context_ids.append(predicted_id)

    return ''.join(generated_text)
print(generate_text(model, start_string="C"))

LORO:
Welfickin aichteelengeane me,



RUCLo m, atite be dyousirthirachay whees m qus
TI are t'ser allecefoukisompe toutherdurngenonee, la the thond tsean GHe penghef hert balld t a. wh tand, therd ge tonof wh ber t:

METINThande youne witorit gath bur ll g lllds, me cad hind ard it juthatho ileneramug t tist mowan mallathigor d akind toknanopl y ul s y y inofo t, and ts ththes onguret.
S:

L nco:

IS &CHAMINThast totors an IXESe meng mear! t be wil githe sit hit notin ak's bl od:



ETHoune thithor oo aseldshelu tond pl chifesJO
HE:
ANLanof f blit y s ckealinstwansther t y tevinoo; yo conchis fin a Mixe atitur I t an
I ste thanchon:
S:
O:

Th ce ou al cif cellld h thonger s thetheave

Cowhet hatham t wnol henopof tie
ANGof aterds,
XI atit IO:
F h beere acace, hon ere ne;
Hore waved haveardomacerd n al t wet t paco tontonamous I akize wnon h, be myo le t OUSCAMID:
Theyeldursicacknd-berebld har s asthe, d n,
O mat e; ND maral, malo nge y a thobey tow frd heve tinghe.
NUpan his hires hit

# Tutorial based

In [None]:
import tensorflow as tf
import numpy as np

import pickle

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, Dropout, Input
from tensorflow.keras.utils import to_categorical

In [None]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [None]:
# Read as bytes, then decode explicitly as UTF-8. The context manager closes
# the file handle as soon as the contents have been read.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [None]:
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [None]:
vocab = sorted(set(text))

In [None]:
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [None]:
print(vocab)
# `vocab_size` is only assigned in a later cell, so referencing it here fails
# on a fresh kernel; report the number of unique characters directly.
print(len(vocab))

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [None]:
def text_from_ids(ids):
  """Look up the characters for `ids` and join them along the last axis."""
  characters = chars_from_ids(ids)
  return tf.strings.reduce_join(characters, axis=-1)

In [None]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

In [None]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
BATCH_SIZE = 128
BUFFER_SIZE = 10000
seq_length = 50

In [None]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [None]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair shifted by one position.

    Args:
        sequence: A sliceable sequence of N items.

    Returns:
        ``(sequence[:-1], sequence[1:])`` - the input drops the last item and
        the target drops the first one.
    """
    return sequence[:-1], sequence[1:]

In [None]:
dataset = sequences.map(split_input_target)

In [None]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear'
Target: b'irst Citizen:\nBefore we proceed any further, hear '


In [None]:
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(128, 50), dtype=tf.int64, name=None), TensorSpec(shape=(128, 50), dtype=tf.int64, name=None))>

In [None]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: Size of the embedding table and of the output logits.
      embedding_dim: Width of each embedding vector.
      rnn_units: Number of GRU units.
    """
    # The instance must not be passed again as a positional argument to the
    # base-class constructor.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Batch of token ids.
      states: Optional initial GRU state; when None, the GRU's own initial
        state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The logits, or `(logits, states)` when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
model.compile(optimizer='adam', loss=loss)

In [None]:
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4cbba58250>

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that one call samples one next character."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and precompute the "[UNK]" mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`,
        returning `(logits, states)`.
      chars_from_ids: Lookup layer mapping token ids to characters.
      ids_from_chars: Lookup layer mapping characters to token ids.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive logit mask: -inf at the id of "[UNK]" and 0 everywhere else,
    # so that "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    minus_inf_values = [-float('inf')]*len(unk_ids)
    vocabulary_length = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        values=minus_inf_values,
        indices=unk_ids,
        dense_shape=[vocabulary_length])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: String tensor holding the text to continue.
      states: Optional model state returned by a previous call.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the model state
      to pass to the next call.
    """
    # Strings -> characters -> dense tensor of token ids.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # all_logits.shape is [batch, char, next_char_logits]
    all_logits, states = self.model(inputs=token_ids, states=states,
                                    return_state=True)

    # Keep the last position only, apply the temperature, then the mask.
    last_logits = all_logits[:, -1, :]
    last_logits = last_logits/self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one token id per batch entry and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Token ids -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [None]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
# end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
# print('\nRun time:', end - start)

ROMEO:
That shall be julk, before:
The general is put the thunder of nobody.

SICINIUS:
Nay, then I look'd oath, to fine ourselves, son and have no such face
The strange many nuts bite us for two'ers to wounded?

JULIET:
O, falling look me, and I am Duke of Nare:
He let us sleep of speed;
When virties you stard flaw'd, and each of me and loot former,
I never said it fellest to the poxuls,
For now no succes it in my leave; and then
I grieve yield enjury thee on that, in their presence, and be known bey,
An I bigg'd the days of aws,
His scaptal flay woe well we'll Bianca say 'callo?
Dispatch she is dead.

First Gentleman:
When you thank you, sir, for I can are in my
fair; and, to rather thanksope,
Where did new run you shall. Pray you a lad, have supposeth
The pernice that mighonces
Which oft dear soul of that witness.

LUCENTIO:
Claudio, this depose and few,
That madam: but longest truch me 's't a dishost.
Our decree dishonour, I say.

GREMIO:
They came, I with a joy on stroke;--
That I

# Transformer (BadGPT)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Masking, Dropout, LSTM, Attention
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the Shakespeare dataset
shakespeare_path = tf.keras.utils.get_file(
    'shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
)
text = open(shakespeare_path, 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(text))
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# Preprocess the text
text_as_int = np.array([char2idx[c] for c in text])
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Create input and target sequences
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [None]:
# Define hyperparameters
embedding_dim = 512
rnn_units = 2048
batch_size = 256

# Create the model
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the Embedding -> Dropout -> LSTM -> self-attention -> Dense model.

    The self-attention layer is called with a causal mask so that position t
    can only attend to positions <= t. Without it every position can look at
    the characters it is supposed to predict, which makes the training loss
    meaningless for autoregressive generation.

    No Masking layer is used: the inputs contain no padding ids (index 0 is a
    regular character of the vocabulary) and a Masking layer placed after the
    embedding would compare embedding vectors, not token ids, against 0.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Unused; kept so that existing calls remain valid.

    Returns:
        A functional Keras model mapping ``(batch, length)`` ids to
        ``(batch, length, vocab_size)`` logits.
    """
    inputs = Input(shape=(None,))
    x = Embedding(vocab_size, embedding_dim)(inputs)
    x = Dropout(0.2)(x)
    x = LSTM(rnn_units, return_sequences=True)(x)
    # Query and value are the same sequence (self-attention).
    x = Attention()([x, x], use_causal_mask=True)
    x = Dense(vocab_size)(x)
    return Model(inputs=inputs, outputs=x)

model = build_model(len(vocab), embedding_dim, rnn_units, batch_size)

# Prepare training data
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)

# Compile and train the model
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f631d672b90>

In [None]:
# Generate text using the trained model
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample ``num_generate`` characters that continue ``start_string``.

    The model is not stateful, so the running context (prompt plus generated
    characters, truncated to the last ``seq_length`` ids) is fed back in on
    every step. The context is never padded: id 0 is a real character of the
    vocabulary, so zero-padding would prepend that character to the prompt.

    Args:
        model: Model mapping ``(1, length)`` ids to ``(1, length, vocab)``
            logits.
        start_string: Prompt. Every character must be a key of ``char2idx``
            (a ``KeyError`` is raised otherwise). An empty prompt is replaced
            by a single newline seed.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    context_ids = [char2idx[s] for s in start_string]
    if not context_ids:
        # The model needs at least one timestep to produce a prediction.
        context_ids = [char2idx.get('\n', 0)]

    text_generated = []
    for _ in range(num_generate):
        input_eval = tf.expand_dims(context_ids[-seq_length:], 0)
        # Calling the model directly avoids the per-call overhead of
        # `model.predict`, which is meant for large batched inputs.
        logits = model(input_eval, training=False)

        # Sample from the distribution that follows the last context character.
        last_logits = logits[:, -1, :] / temperature
        sampled = tf.random.categorical(last_logits, num_samples=1)
        predicted_id = int(sampled[0, 0].numpy())

        context_ids.append(predicted_id)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# Generate text with a starting prompt
generated_text = generate_text(model, start_string="ROMEO: ")
print(generated_text)

# Tutorial / Previous stuff (Ignore)

### Read the data



In [None]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [None]:
# Take a look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [None]:
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

65 unique characters


## Tokenization

### Vectorize the text

Before training, you need to convert the strings to a numerical representation. 

The `tf.keras.layers.StringLookup` layer can convert each character into a numeric ID. It just needs the text to be split into tokens first.

In [None]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

Now create the `tf.keras.layers.StringLookup` layer:

In [None]:
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

It converts from tokens to character IDs:

In [None]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

Since the goal of this tutorial is to generate text, it will also be important to invert this representation and recover human-readable strings from it. For this you can use `tf.keras.layers.StringLookup(..., invert=True)`.  

Note: Here, instead of passing the original vocabulary generated with `sorted(set(text))`, use the `get_vocabulary()` method of the `tf.keras.layers.StringLookup` layer so that the `[UNK]` token is set the same way.

In [None]:
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

This layer recovers the characters from the vectors of IDs, and returns them as a `tf.RaggedTensor` of characters:

In [None]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

You can use `tf.strings.reduce_join` to join the characters back into strings.

In [None]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [None]:
def text_from_ids(ids):
  """Convert token ids to characters and join them along the last axis."""
  characters = chars_from_ids(ids)
  return tf.strings.reduce_join(characters, axis=-1)

In [None]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

In [None]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
batch_size = 128
buffer_size = 10000
seq_length = 200

In [None]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [None]:
def split_input_target(sequence):
    """Return ``(sequence[:-1], sequence[1:])``.

    The second element is the first one shifted left by one position, which
    makes it the next-token target for every position of the input.
    """
    return sequence[:-1], sequence[1:]

In [None]:
dataset = sequences.map(split_input_target)

In [None]:
data = dataset.shuffle(buffer_size).batch(batch_size)

In [None]:
# Length of the vocabulary in the StringLookup layer. This is one larger than
# len(vocab) because the layer reserves id 0 for "[UNK]"; the embedding table
# and the output logits must cover every id the layer can produce.
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 64

# Number of RNN units
rnn_units = 1024

In [None]:
def myModel(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build a stateful Embedding -> GRU -> Dense model with a fixed batch size.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of GRU units.
        batch_size: Fixed batch dimension declared on the embedding layer's
            input shape; the GRU is created with ``stateful=True``.

    Returns:
        A ``Sequential`` model emitting one logit vector per input timestep.
    """
    return Sequential([
        Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        GRU(rnn_units, return_sequences=True, stateful=True),
        Dense(vocab_size),
    ])

In [None]:
model = myModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)

In [None]:
def loss_fn(labels, logits):
  """Return the unreduced sparse softmax cross-entropy (one value per label)."""
  per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  return per_token_loss

In [None]:
optimizer = tf.optimizers.Adam()

In [None]:
# NOTE: abandoned stub. This cell used to contain only a body-less
# `for epoch in range(epochs):`, which is a SyntaxError and also read `epochs`
# before it is assigned. The training loop itself lives in the following cell.

In [None]:
epochs = 5
for epoch in range(epochs):
    print('Start of epoch', epoch)

    # Iterate over `data` (the shuffled, batched pipeline). `dataset` yields
    # single unbatched sequences, which the fixed-batch-size model cannot take.
    for step, (x_batch_train, y_batch_train) in enumerate(data):
        # `data` is batched without drop_remainder, so the last batch may be
        # smaller than the batch size the stateful model was built with.
        if x_batch_train.shape[0] != batch_size:
            continue

        # Batches are shuffled, independent chunks of text, so the state left
        # over from the previous batch must not be carried into this one.
        model.reset_states()

        with tf.GradientTape() as tape:
            # logits.shape is [batch, char, next_char_logits]
            logits = model(x_batch_train, training=True)

            # loss_fn returns one value per target character; reduce it to a
            # scalar. No "[UNK]" mask is applied here: that mask is an
            # additive -inf logit mask meant for sampling, and multiplying a
            # loss by it would yield inf/NaN values.
            # The model's vocab_size must cover every id produced by
            # ids_from_chars, including the "[UNK]" id.
            loss_value = tf.reduce_mean(loss_fn(y_batch_train, logits))

        grads = tape.gradient(loss_value, model.trainable_weights)

        optimizer.apply_gradients(zip(grads, model.trainable_weights))
      