In [13]:
import os
base_url = "https://github.com/marinaramalhete/DeepLearningAI_NLP_Specialization/raw/main/3%20-%20Natural%20Language%20Processing%20with%20Sequence%20Models/W2/Assignment_Deep_N_Grams/"

files = [
    "model.pkl.gz", "w2_unittest.py",
    "data/shakespeare_data.txt"
]

os.makedirs('data', exist_ok=True)
for file in files:
    !wget -O "{file}" "{base_url}{file}"  -q --show-progress



In [5]:
import os
import traceback
# Set before TensorFlow is imported below so the setting is already in the
# environment at import time.
# NOTE(review): presumably '3' silences TensorFlow's native log output — confirm in the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from termcolor import colored

# set random seed
# NOTE(review): this seeds only Python's `random` module; NumPy and TensorFlow
# are not seeded in this cell.
rnd.seed(32)

# 1. Data Preprocessing

In [14]:
# Load Data
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale. Each line is stripped once; whitespace-only lines are dropped.
with open(os.path.join('data', 'shakespeare_data.txt'), encoding='utf-8') as f:
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

print(f"Number of lines: {len(lines)}")

Number of lines: 125097


In [15]:
# Show a small excerpt of the corpus, one line per row.
print(*lines[506:514], sep="\n")

BENVOLIO	Here were the servants of your adversary,
And yours, close fighting ere I did approach:
I drew to part them: in the instant came
The fiery Tybalt, with his sword prepared,
Which, as he breathed defiance to my ears,
He swung about his head and cut the winds,
Who nothing hurt withal hiss'd him in scorn:
While we were interchanging thrusts and blows,


In [16]:
# Create Vocab
text = "\n".join(lines)

# Index 0 is reserved for unknown characters and index 1 for padding;
# the distinct corpus characters follow in sorted order.
vocab = ["[UNK]", ""] + sorted(set(text))

print(f"{len(vocab)} unique characters")
print(" ".join(vocab))

82 unique characters
[UNK]  	 
   ! $ & ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z |


In [17]:
# Concept: unicode_split (String Tensor)
# Splits a string into a 1-D string tensor holding one character per element.
# `chars` is reused by the StringLookup demo cell that follows.
line = "Hello world!"
chars = tf.strings.unicode_split(line, input_encoding='UTF-8')
print(chars)

tf.Tensor([b'H' b'e' b'l' b'l' b'o' b' ' b'w' b'o' b'r' b'l' b'd' b'!'], shape=(12,), dtype=string)


In [18]:
# Concept: StringLookup
# Build a lookup layer over the vocabulary and call it immediately on `chars`;
# the call returns the integer vocabulary index of each character.
ids = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)(chars)
print(ids) # `ids` is the resulting int64 tensor of indices, not the layer itself

tf.Tensor([34 59 66 66 69  4 77 69 72 66 58  5], shape=(12,), dtype=int64)


1. You create a StringLookup layer with the given vocabulary and no mask token.
2. Then you immediately call it with chars (which should be a tensor or list of strings).
3. It returns the integer indices for each string in chars.

In [19]:
def line_to_tensor(line, vocab):
    """
    Converts a line of text into a tensor of integer vocabulary indices, one per character.

    Args:
        line (str): A single line of text.
        vocab (list): A list containing the vocabulary of unique characters.

    Returns:
        tf.Tensor(dtype=int64): A tensor containing, for each character in `line`,
            its index in `vocab` (not its Unicode code point).
    """
    # Split the string into individual characters (a 1-D string tensor)
    chars = tf.strings.unicode_split(line, input_encoding='UTF-8')
    # Map every character to its position in the vocabulary
    ids = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)(chars)

    return ids

In [22]:
# Concept: reduce_join
# With invert=True the lookup layer maps integer indices back to vocabulary strings.
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True, mask_token=None)
# Join the per-character strings along the last axis into a single string tensor.
# NOTE(review): this rebinds `line` (a Python str in the unicode_split cell) to a tf.Tensor.
line = tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
print(line)

tf.Tensor(b'Hello world!', shape=(), dtype=string)


In [20]:
def text_from_ids(ids, vocab):
    """
    Maps a tensor of vocabulary indices back to text.

    Args:
        ids (tf.Tensor): A tensor containing integer vocabulary indices.
        vocab (list): A list containing the vocabulary of unique characters.

    Returns:
        tf.Tensor(dtype=string): The characters joined along the last axis.
            Call `.numpy()` on the result to obtain Python bytes.
    """
    # Inverted lookup: index -> character
    id_to_char = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True, mask_token=None)

    return tf.strings.reduce_join(id_to_char(ids), axis=-1)

In [23]:
# Train Eval split
# The final 1000 lines are held out for evaluation; everything before them is training data.
train_lines = lines[:-1000]
eval_lines = lines[-1000:]

In [31]:
# Concept: from_tensor_slices
# Encode a tiny two-line sample and expose it as a dataset with one character id per element.
sample_text = "\n".join(["Hello world!", "Generative AI"])
all_ids = line_to_tensor(sample_text, vocab)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

print(list(ids_dataset.take(5)))

[<tf.Tensor: shape=(), dtype=int64, numpy=34>, <tf.Tensor: shape=(), dtype=int64, numpy=59>, <tf.Tensor: shape=(), dtype=int64, numpy=66>, <tf.Tensor: shape=(), dtype=int64, numpy=66>, <tf.Tensor: shape=(), dtype=int64, numpy=69>]


In [37]:
# Concept: batch
# Group the character ids into fixed-size chunks of seq_length + 1; the incomplete
# trailing chunk is dropped.
seq_length = 10
data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# we only have 2 seqs, so take(5) yields just those two
print(list(data_generator.take(5)))
print([text_from_ids(chunk, vocab).numpy() for chunk in data_generator.take(5)])

[<tf.Tensor: shape=(11,), dtype=int64, numpy=array([34, 59, 66, 66, 69,  4, 77, 69, 72, 66, 58])>, <tf.Tensor: shape=(11,), dtype=int64, numpy=array([ 5,  3, 33, 59, 68, 59, 72, 55, 74, 63, 76])>]
[b'Hello world', b'!\nGenerativ']


In [38]:
def split_input_target(sequence):
    """
    Builds an (input, target) pair from one sequence by offsetting it by one position.

    Args:
        sequence (tf.Tensor or list): A list of characters or a tensor.

    Returns:
        tf.Tensor, tf.Tensor: The sequence without its last element (input) and
            the sequence without its first element (target).
    """
    # Input drops the final element; target drops the first one.
    return sequence[:-1], sequence[1:]

split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

## Create Batch Dataset

In [41]:
def create_batch_dataset(lines, vocab, seq_length=100, batch_size=64, buffer_size=10000):
    """
    Creates a batch dataset from a list of text lines.

    Args:
        lines (list): A list of strings with the input data, one line per row.
        vocab (list): A list containing the vocabulary.
        seq_length (int): The desired length of each sample.
        batch_size (int): The batch size.
        buffer_size (int): Number of samples held in the shuffle buffer.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        tf.data.Dataset: A batch dataset generator yielding (input, target) pairs.
    """
    # For simplicity, just join all lines into a single line
    single_line_data = "\n".join(lines)

    # Convert your data into a tensor using the given vocab
    all_ids = line_to_tensor(single_line_data, vocab)
    # Create a TensorFlow dataset from the data tensor
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
    # Chunk into seq_length + 1 ids so that each chunk produces an input and a
    # target of length seq_length after the one-position shift
    data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True)
    # Map each input sample using the split_input_target function
    dataset_xy = data_generator.map(split_input_target)

    # Assemble the final dataset with shuffling, batching, and prefetching.
    # TF data is designed to work with possibly infinite sequences, so it does not
    # attempt to shuffle the entire sequence in memory; instead it maintains a
    # buffer of `buffer_size` elements in which it shuffles.
    dataset = (
        dataset_xy
        .shuffle(buffer_size)
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )

    return dataset

In [42]:
# Build the training dataset from the training lines: samples of 100 characters,
# grouped into batches of 64.
dataset = create_batch_dataset(train_lines, vocab, seq_length=100, batch_size=64)

# 2. GRU Language Model

- `tf.keras.layers.Embedding(input_dim, output_dim)`: Initializes the embedding. In this case it is the size of the vocabulary by the dimension of the model. [docs](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)  
    - `Embedding(vocab_size, embedding_dim)`.
    - `vocab_size` is the number of unique tokens in the given vocabulary (here the tokens are characters, since this is a character-level model).
    - `embedding_dim` is the number of elements in the word embedding (some choices for a word embedding size range from 150 to 300, for example).
<br><br>

- `tf.keras.layers.GRU(units)`: Builds a traditional GRU of rnn_units with dense internal transformations. [docs](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU) You can read the paper here: https://arxiv.org/abs/1412.3555
    - `units`: Number of recurrent units in the layer. It must be set to `rnn_units`
    - `return_sequences`: It specifies if the model returns a sequence of predictions. Set it to `True`
    - `return_state`: It specifies if the model must return the last internal state along with the prediction. Set it to `True` 
<br><br>

- `tf.keras.layers.Dense`: A dense layer. [docs](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense). You must set the following parameters:
    - `units`: Number of units in the layer. It must be set to `vocab_size`
    - `activation`: It must be set to `log_softmax` function as described in the next line.
<br><br>

- `tf.nn.log_softmax`: Log of the output probabilities. [docs](https://www.tensorflow.org/api_docs/python/tf/nn/log_softmax)
    - You don't need to set any parameters, just set the activation parameter as `activation=tf.nn.log_softmax`.
<br><br>
