In [1]:
import os
import random
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from urllib.request import urlretrieve


# Downloading Data
url = "https://www.cs.cmu.edu/%7Espok/grimmtmp/"
dir_name = "data"


def download_data(url, filename, download_dir):
    os.makedirs(download_dir, exist_ok=True)
    if not os.path.exists(os.path.join(download_dir, filename)):
        filepath, _ = urlretrieve(url + filename, os.path.join(download_dir, filename))
    else:
        filepath = os.path.join(download_dir, filename)
    return filepath


# Number of files and their names to download
num_files = 209
filenames = [format(i, "03d") + ".txt" for i in range(1, num_files + 1)]

for fn in filenames:
    download_data(url, fn, dir_name)

filenames = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]

# Splitting the dataset
random_state = 54321

train_filenames, test_and_valid_filenames = train_test_split(
    filenames, test_size=0.2, random_state=random_state
)

valid_filenames, test_filenames = train_test_split(
    test_and_valid_filenames, test_size=0.5, random_state=random_state
)

# Generate bigrams and analyze vocabulary
bigram_set = set()
for fname in train_filenames:
    document = []
    with open(fname, "r") as f:
        for row in f:
            document.append(row.lower())
        document = " ".join(document)
        bigram_set.update([document[i : i + 2] for i in range(0, len(document), 2)])
n_vocab = len(bigram_set)
print("Found {} unique bigrams".format(n_vocab))


# Function to generate tf.data dataset
def generate_tf_dataset(filenames, ngram_width, window_size, batch_size, shuffle=False):
    documents = []
    for f in filenames:
        doc = tf.io.read_file(f)
        doc = tf.strings.ngrams(
            tf.strings.bytes_split(
                tf.strings.regex_replace(tf.strings.lower(doc), "\n", " ")
            ),
            ngram_width,
            separator="",
        )
        documents.append(doc.numpy().tolist())
    documents = tf.ragged.constant(documents)
    doc_dataset = tf.data.Dataset.from_tensor_slices(documents)
    doc_dataset = doc_dataset.map(lambda x: x[::ngram_width])
    doc_dataset = doc_dataset.flat_map(
        lambda x: tf.data.Dataset.from_tensor_slices(x)
        .window(size=window_size + 1, shift=int(window_size * 0.75))
        .flat_map(lambda window: window.batch(window_size + 1, drop_remainder=True))
    )
    doc_dataset = doc_dataset.map(lambda x: (x[:-1], x[1:]))
    doc_dataset = doc_dataset.batch(batch_size=batch_size)
    if shuffle:
        doc_dataset = doc_dataset.shuffle(buffer_size=batch_size * 10)
    return doc_dataset


# Prepare datasets
ngram_length = 2
batch_size = 256
window_size = 128

train_ds = generate_tf_dataset(
    train_filenames, ngram_length, window_size, batch_size, shuffle=True
)
valid_ds = generate_tf_dataset(valid_filenames, ngram_length, window_size, batch_size)
test_ds = generate_tf_dataset(test_filenames, ngram_length, window_size, batch_size)

# Adapt the TextVectorization layer
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=n_vocab,
    standardize=None,
    split=None,
    input_shape=(window_size,),
)

raw_train_data = []
for f in train_filenames:
    with open(f, "r") as file:
        raw_train_data.append(file.read().lower())

text_vectorizer.adapt(raw_train_data)

# Apply TextVectorization to dataset
train_ds = train_ds.map(lambda x, y: (text_vectorizer(x), y))
valid_ds = valid_ds.map(lambda x, y: (text_vectorizer(x), y))

# Check data before training
for x_batch, y_batch in train_ds.take(1):
    print("Input (x):", x_batch.numpy())
    print("Target (y):", y_batch.numpy())

# Define LSTM model
K.clear_session()

lm_model = tf.keras.Sequential(
    [
        layers.Embedding(n_vocab + 2, 96),
        layers.LSTM(512, return_state=False, return_sequences=True),
        layers.LSTM(256, return_state=False, return_sequences=True),
        layers.Dense(1024, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(n_vocab, activation="softmax"),
    ]
)

lm_model.summary()


# Perplexity Metric
class PerplexityMetric(tf.keras.metrics.Mean):
    def __init__(self, name="perplexity", **kwargs):
        super().__init__(name=name, **kwargs)
        self.cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction="none"
        )

    def _calculate_perplexity(self, real, pred):
        loss_ = self.cross_entropy(real, pred)
        step1 = K.mean(loss_, axis=-1)
        perplexity = K.exp(step1)
        return perplexity

    def update_state(self, y_true, y_pred, sample_weight=None):
        perplexity = self._calculate_perplexity(y_true, y_pred)
        super().update_state(perplexity)


# Compile the model
lm_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy", PerplexityMetric()],
)

# Train the model
lm_model.fit(train_ds, validation_data=valid_ds, epochs=60)

2024-10-11 23:03:31.881066: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-11 23:03:31.893414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-11 23:03:31.904715: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-11 23:03:31.908088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-11 23:03:31.916429: I tensorflow/core/platform/cpu_feature_guar

Found 696 unique bigrams


I0000 00:00:1728702214.741173   53389 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728702214.780822   53389 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728702214.780894   53389 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728702214.784750   53389 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728702214.784807   53389 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

Input (x): [[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]
Target (y): [[b'th' b' d' b'id' ... b'ei' b'th' b'er']
 [b'er' b'e ' b'wa' ... b'he' b'n ' b'th']
 [b'it' b' s' b'o ' ... b'co' b'ul' b'd ']
 ...
 [b'ey' b', ' b'an' ... b'et' b' o' b'ut']
 [b'ng' b' t' b'ho' ... b'an' b'd ' b'th']
 [b'er' b'e ' b'wa' ... b'le' b'ss' b'in']]


2024-10-11 23:03:45.659251: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1/60


2024-10-11 23:03:47.498573: W tensorflow/core/framework/op_kernel.cc:1817] OP_REQUIRES failed at cast_op.cc:122 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node Cast_1 defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) UNIMPLEMENTED:  Cast string to float is not supported
	 [[{{node Cast_1}}]]
  (1) CANCELLED:  Function was cancelled before it was started
0 successful operations.
0 derived errors ignored. [Op:__inference_one_step_on_iterator_25777]

# About the dataset


In [1]:
import os
from urllib.request import urlretrieve


url = "https://www.cs.cmu.edu/%7Espok/grimmtmp/"
dir_name = "data"


def download_data(url, filename, download_dir):
    """Download a file if not present, and make sure it's the right sizze"""

    os.makedirs(download_dir, exist_ok=True)

    if not os.path.exists(os.path.join(download_dir, filename)):
        filepath, _ = urlretrieve(url + filename, os.path.join(download_dir, filename))
    else:
        filepath = os.path.join(download_dir, filename)

    return filepath


# Number of files and their names to download
num_files = 209
filenames = [format(i, "03d") + ".txt" for i in range(1, num_files + 1)]

for fn in filenames:
    download_data(url, fn, dir_name)

for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name, filenames[i]))
    assert file_exists
print("{} files found.".format(len(filenames)))

209 files found.


In [2]:
import random

random_file = random.choice(os.listdir("./data"))
random_file

f = open(f"./data/{random_file}")
print(f.read())

Once upon a time the little hen went with the little cock
to the nut-hill, and they agreed together that whichsoever of
them found a kernel of a nut should share it with the other.
Then the hen found a large, large nut, but said nothing about
it, intending to eat the kernel herself.  The kernel, however,
was so large that she could not swallow it, and it remained
sticking in her throat, so that she was alarmed lest she should
be choked.  Then she cried, cock, I entreat you to run as fast
as you can and fetch me some water, or I shall choke.  The little
cock did run as fast as he could to the spring, and said, stream,
you are to give me some water, the little hen is lying on the
nut-hill, and she has swallowed a large nut, and is choking.  The
well answered, first run to the bride, and get her to give you
some red silk.  The little cock ran to the bride and said, bride,
you are to give me some red silk, I want to give red silk to the
well, the well is to give me some water, I am to take

# Generating training, validation and test sets


In [3]:
from sklearn.model_selection import train_test_split

# Fix the random seed so we get the same output everytime
random_state = 54321

filenames = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]

# First separate train and valid+test data
train_filenames, test_and_valid_filenames = train_test_split(
    filenames, test_size=0.2, random_state=random_state
)

# Separate valid+test data to validation and test data
valid_filenames, test_filenames = train_test_split(
    test_and_valid_filenames, test_size=0.5, random_state=random_state
)

for subset_id, subset in zip(
    ("train", "valid", "test"), (train_filenames, valid_filenames, test_filenames)
):
    print(
        "Got {} files in the {} dataset (e.g.{})".format(
            len(subset), subset_id, subset[:3]
        )
    )

Got 167 files in the train dataset (e.g.['data/199.txt', 'data/036.txt', 'data/043.txt'])
Got 21 files in the valid dataset (e.g.['data/088.txt', 'data/118.txt', 'data/016.txt'])
Got 21 files in the test dataset (e.g.['data/176.txt', 'data/195.txt', 'data/123.txt'])


# Analyze the vocabulary size.

I will be using bigrams to train the language model. The story will be split into units of two characters.
The characters will be converted to lowercase to reduce the input dimensionality. Using character-level bigrams helps to reduce the vocabulary, leading to faster model training


In [4]:
bigram_set = set()
# Go through each file in the training set
for fname in train_filenames:
    document = []  # This will hold all the text
    with open(fname, "r") as f:
        for row in f:
            # Convert text to lower case to reduce input dimensionality
            document.append(row.lower())
            # From the list of text we have, generate one long string
            # (containing all training stories)
        document = " ".join(document)
        # Update the set with all bigrams found
        bigram_set.update([document[i : i + 2] for i in range(0, len(document), 2)])
# Assign to a variable and print
n_vocab = len(bigram_set)
print("Found {} unique bigrams".format(n_vocab))

Found 696 unique bigrams


### Defining th etf.data pipeline

#### function generate_tf_dataset() that takes </br>

1. filenames - as list of filenames containing the text to be used for the model
2. ngram_width - widht of the n-grams to be extracted
3. window_size - lenght of the sequence of n-grams to be used to generate a single data point for the model
4. batch_size - Size of the batch
5. shuffle - (default to False ) whether to shuffle the data or not


In [5]:
import tensorflow as tf


def generate_tf_dataset(filenames, ngram_width, window_size, batch_size, shuffle=False):
    """Generate batched data from a list of files specified"""
    # Read the data found in the documents
    documents = []
    for f in filenames:
        doc = tf.io.read_file(f)
        doc = tf.strings.ngrams(  # Generate ngrams from the string
            tf.strings.bytes_split(
                # Create a list of chars from a string
                tf.strings.regex_replace(
                    # Replace new lines with space
                    tf.strings.lower(doc),  # Convert string to lower case
                    "\n",
                    " ",
                )
            ),
            ngram_width,
            separator="",
        )
    documents.append(doc.numpy().tolist())
    # documents is a list of list of strings, where each string is a story
    # From that we generate a ragged tensor
    documents = tf.ragged.constant(documents)

    # Create a dataset where each row in the ragged tensor would be a sample
    doc_dataset = tf.data.Dataset.from_tensor_slices(documents)

    # perform a quick transformation -tf.strings.ngrams
    # would generate all the ngrams (e.g. abcd => ab, bc, cd) with
    # overlaop, however for our data we do not need the overlap, so we need to skip the overlapping ngrams

    doc_dataset = doc_dataset.map(lambda x: x[::ngram_width])

    # using window function to generate windows from text
    # for a text sequence with window_size 3 and shift 1, you get e.g ab, cd, ef, gh, ij,... =>  [ab, cd, ef], [cd, ef, gh], [ef, gh, ij],...
    # each of these windows is a single training sequence for the model

    doc_dataset = doc_dataset.flat_map(
        lambda x: tf.data.Dataset.from_tensor_slices(x)
        .window(size=window_size + 1, shift=int(window_size * 0.75))
        .flat_map(lambda window: window.batch(window_size + 1, drop_remainder=True))
    )

    # From each windowed sequence we generate input and target tuple
    # e.g. [ab, cd, ef] -> ([ab, cd], [cd, ef])
    doc_dataset = doc_dataset.map(lambda x: (x[:-1], x[1:]))

    # Batch the data
    doc_dataset = doc_dataset.batch(batch_size=batch_size)

    # Shuffle the data if required
    doc_dataset = (
        doc_dataset.shuffle(buffer_size=batch_size * 10) if shuffle else doc_dataset
    )

    # return data
    return doc_dataset

2024-10-11 22:58:48.801943: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-11 22:58:48.812008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-11 22:58:48.824173: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-11 22:58:48.827675: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-11 22:58:48.836418: I tensorflow/core/platform/cpu_feature_guar

# Hyperparameters


In [6]:
ngram_length = 2
batch_size = 256
window_size = 128

train_ds = generate_tf_dataset(
    train_filenames, ngram_length, window_size, batch_size, shuffle=True
)
valid_ds = generate_tf_dataset(valid_filenames, ngram_length, window_size, batch_size)

test_ds = generate_tf_dataset(test_filenames, ngram_length, window_size, batch_size)

I0000 00:00:1728701931.546171   52320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728701931.581930   52320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728701931.581981   52320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728701931.585887   52320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728701931.585942   52320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [7]:
# looking at some data generated by the function

ds = generate_tf_dataset(train_filenames, 2, window_size=10, batch_size=1).take(5)

for record in ds:
    print(record[0].numpy(), "=>", record[1].numpy())

[[b'th' b'er' b'e ' b'wa' b's ' b'on' b'ce' b' a' b'n ' b'en']] => [[b'er' b'e ' b'wa' b's ' b'on' b'ce' b' a' b'n ' b'en' b'ch']]
[[b' a' b'n ' b'en' b'ch' b'an' b'tr' b'es' b's,' b' w' b'ho']] => [[b'n ' b'en' b'ch' b'an' b'tr' b'es' b's,' b' w' b'ho' b' h']]
[[b's,' b' w' b'ho' b' h' b'ad' b' t' b'hr' b'ee' b' s' b'on']] => [[b' w' b'ho' b' h' b'ad' b' t' b'hr' b'ee' b' s' b'on' b's ']]
[[b'ee' b' s' b'on' b's ' b'wh' b'o ' b'lo' b've' b'd ' b'ea']] => [[b' s' b'on' b's ' b'wh' b'o ' b'lo' b've' b'd ' b'ea' b'ch']]
[[b've' b'd ' b'ea' b'ch' b' o' b'th' b'er' b' a' b's ' b'br']] => [[b'd ' b'ea' b'ch' b' o' b'th' b'er' b' a' b's ' b'br' b'ot']]


2024-10-11 22:58:55.487380: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Implementing the language model


### Defining the TExtVectorization layer


In [8]:
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models

# The vectorization layer that will convert string bigrams to IDs
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=n_vocab,
    standardize=None,
    split=None,
    input_shape=(window_size,),
    # output_mode="int",
)

  super().__init__(name=name, **kwargs)


In [9]:
text_vectorizer.adapt(train_ds)
text_vectorizer.get_vocabulary()[:10]

2024-10-11 22:58:55.577767: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


['', '[UNK]', 'e ', 'th', 'he', ' t', 't ', ' w', ' a', 'd ']

In [10]:
train_ds = train_ds.map(lambda x, y: (x, text_vectorizer(y)))
valid_ds = valid_ds.map(lambda x, y: (x, text_vectorizer(y)))


# train_ds = train_ds.map(lambda x, y: (text_vectorizer(x), text_vectorizer(y)))
# valid_ds = valid_ds.map(lambda x, y: (text_vectorizer(x), text_vectorizer(y)))
# test_ds = test_ds.map(lambda x, y: (text_vectorizer(x), text_vectorizer(y)))

# Defining the LSTM model. The model will have

1. The previously trained TextVectorization layer
2. An embedding layer randomly initialized and jointly trained with the model
3. Two LSTM layers each with 512 and 256 nodes respectively
4. A fully-connected hidden lyaer with 1024 nodes and ReLU activation function
5. The final prediction layer with n_vocab nodes and softmax activation


In [11]:
import tensorflow.keras.backend as K

K.clear_session()
lm_model = tf.keras.Sequential(
    [
        text_vectorizer,
        layers.Embedding(n_vocab + 2, 96),
        layers.LSTM(512, return_state=False, return_sequences=True),
        layers.LSTM(256, return_state=False, return_sequences=True),
        layers.Dense(1024, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(n_vocab, activation="softmax"),
    ]
)

lm_model.summary()

# Defining metrics and compiling the model


In [12]:
# using Perplexity


class PerplexityMetric(tf.keras.metrics.Mean):
    def __init__(self, name="perplexity", **kwargs):
        super().__init__(name=name, **kwargs)
        self.cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction="none"
        )

    def _calculate_perplexity(self, real, pred):
        # The next 4 lines zero-out the padding from loss
        # calculations, this follows the logic from:
        # https://www.tensorflow.org/beta/tutorials/text/transformer#loss_
        # and_metrics
        loss_ = self.cross_entropy(real, pred)
        # Calculating the perplexity steps:
        step1 = K.mean(loss_, axis=-1)
        perplexity = K.exp(step1)
        return perplexity

    def update_state(self, y_true, y_pred, sample_weight=None):
        perplexity = self._calculate_perplexity(y_true, y_pred)
        super().update_state(perplexity)

### Compile the model using

1. Sparse categorical cross-entropy as loss function
2. Adam as the optimizer
3. Accuracy and perplexity as metrics


In [13]:
lm_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy", PerplexityMetric()],
)

for i, label in train_ds.take(1):
    print(i)

tf.Tensor(
[[b'th' b'er' b'e ' ... b'ro' b'ck' b'y ']
 [b'he' b' e' b'ld' ... b'ut' b'ed' b' u']
 [b'se' b'a,' b' a' ... b'ps' b', ' b'or']
 ...
 [b' h' b'ad' b' d' ... b'nh' b'ur' b't.']
 [b'd ' b'wi' b'th' ... b's ' b'yo' b'u ']
 [b'ou' b' a' b're' ... b' b' b'ea' b'ut']], shape=(28, 128), dtype=string)


# Training the Model


In [14]:
lm_model.fit(train_ds, validation_data=valid_ds, epochs=60)

Epoch 1/60


2024-10-11 22:58:57.847722: W tensorflow/core/framework/op_kernel.cc:1817] OP_REQUIRES failed at cast_op.cc:122 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node sequential_1/Cast defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) UNIMPLEMENTED:  Cast string to float is not supported
	 [[{{node sequential_1/Cast}}]]
  (1) CANCELLED:  Function was cancelled before it was started
0 successful operations.
0 derived errors ignored. [Op:__inference_one_step_on_iterator_43084]

In [35]:
# lm_model.evaluate(test_ds)