In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, LayerNormalization, Dropout
import numpy as np

In [None]:
# Load the whole training corpus into memory as a single string.
with open('training_data.txt', 'r', encoding='utf-8') as f:
    data = f.read()
# Flatten the text onto one line: every newline becomes a space.
data = data.replace('\n', ' ')

In [None]:
# Number of characters in the corpus string `data`.
print(len(data))

1115394


In [None]:
# Vocabulary: every distinct character in the corpus.
# Sorting makes the order (and therefore every token id derived from it)
# deterministic. Plain set iteration order for strings changes from one
# interpreter session to the next, which would silently invalidate any weights
# saved in an earlier session.
characters = sorted(set(data))
print(len(characters))

64


In [None]:
# Lookup tables between characters and integer token ids.
# Ids are assigned in the order of `characters`, starting at 1 (id 0 is never assigned).
character_to_integer_encoding = {
    character: token_id for token_id, character in enumerate(characters, start=1)
}
integer_to_character_encoding = {
    token_id: character for token_id, character in enumerate(characters, start=1)
}

In [None]:
def encode(string):
    """Map each character of `string` to its integer token id.

    Raises KeyError for a character missing from character_to_integer_encoding.
    """
    token_ids = []
    for char in string:
        token_ids.append(character_to_integer_encoding[char])
    return token_ids

def decode(lst):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError for an id missing from integer_to_character_encoding.
    """
    return ''.join(integer_to_character_encoding[token_id] for token_id in lst)

In [None]:
# Encode the full corpus, then split it 90% / 10% into train and test token lists.
input_data = encode(data)
split_index = int(0.9 * len(input_data))
train_data = input_data[:split_index]
test_data = input_data[split_index:]

In [None]:
# Hyperparameters shared by the model definition, the data pipeline and text generation.
# batch_size: number of sequences per training batch.
batch_size=32
# block_size: number of tokens in each input window fed to the model.
block_size=128
num_heads=8 # Experiment with other values if you want
num_transformer_blocks = 4
# One more than the number of distinct characters: the encoding assigns ids starting at 1,
# so id 0 is reserved and never produced by encode().
input_vocab_size=len(characters)+1
# The same value is also used as the embedding dimension when the model is built.
# This may be too high of a dimension, given that there are only 65 token ids and
# 128 positions per block, but it takes a lot of time to test alternate parameters.
feed_forward_dim = 256

In [None]:
'''
I tried implementing the multi-head attention layer by hand,
but ultimately it gave slower and worse results than calling layers.MultiHeadAttention
(I guess the people at TensorFlow have put some effort into optimization).
You can try modifying the code in this cell and using it instead of calling the built-in class.
'''
class MultiHeadAttention(tf.keras.layers.Layer):
    """Causal multi-head self-attention.

    The input is projected into query, key and value spaces, each projection is
    split into `num_heads` heads of size `model_dimension // num_heads`, scaled
    dot-product attention is applied under a causal mask (a position may attend
    to itself and to earlier positions only), and the heads are merged and passed
    through a final dense projection.

    Args:
        num_heads: number of attention heads; must divide `model_dimension`.
        model_dimension: size of the last axis of the input and of the output.
    """

    def __init__(self, num_heads, model_dimension):
        super().__init__()
        self.num_heads = num_heads
        self.model_dimension = model_dimension
        assert model_dimension % num_heads == 0, "model_dimension must be divisible by num_heads"

        # Per-head feature size.
        self.depth = model_dimension // num_heads
        self.query_space_projector = Dense(model_dimension)
        self.key_space_projector = Dense(model_dimension)
        self.value_space_projector = Dense(model_dimension)
        # Output projection applied after the heads are concatenated.
        self.dense = Dense(model_dimension)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, model_dimension) into (batch_size, num_heads, seq_len, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])  # Shape: (batch_size, num_heads, seq_len, depth)

    def causal_attention_mask(self, batch_size, n_dest, n_src):
        """Build the causal mask.

        Returns a boolean tensor of shape (batch_size, num_heads, n_dest, n_src)
        that is True where destination (query) position i MAY attend to source
        (key) position j, i.e. where i >= j.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        mask = i >= j  # True on and below the diagonal
        mask = tf.reshape(mask, [1, 1, n_dest, n_src])  # Shape: (1, 1, n_dest, n_src)
        mask = tf.tile(mask, [batch_size, self.num_heads, 1, 1])  # Shape: (batch_size, num_heads, n_dest, n_src)
        return mask

    def call(self, inputs):
        """Apply causal self-attention to `inputs` of shape (batch_size, seq_len, model_dimension)."""
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.shape(inputs)[1]
        q = self.query_space_projector(inputs)
        k = self.key_space_projector(inputs)
        v = self.value_space_projector(inputs)

        q = self.split_heads(q, batch_size)  # Shape: (batch_size, num_heads, seq_len, depth)
        k = self.split_heads(k, batch_size)  # Shape: (batch_size, num_heads, seq_len, depth)
        v = self.split_heads(v, batch_size)  # Shape: (batch_size, num_heads, seq_len, depth)

        # 1.0 where attention is allowed, 0.0 where it is forbidden.
        mask = self.causal_attention_mask(batch_size, seq_len, seq_len)
        mask = tf.cast(mask, tf.float32)

        attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)

        attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # Shape: (batch_size, seq_len, num_heads, depth)

        attention = tf.reshape(attention, (batch_size, -1, self.model_dimension))  # Shape: (batch_size, seq_len, model_dimension)

        output = self.dense(attention)
        return output

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Scaled dot-product attention.

        Args:
            q, k, v: tensors of shape (batch_size, num_heads, seq_len, depth).
            mask: float tensor broadcastable to the attention logits, with 1.0
                where attention is ALLOWED and 0.0 where it is forbidden.

        Returns:
            (output, attention_weights): the attended values of shape
            (batch_size, num_heads, seq_len, depth) and the softmax weights of
            shape (batch_size, num_heads, seq_len, seq_len).
        """
        matmul_qk = tf.matmul(q, k, transpose_b=True)  # Shape: (batch_size, num_heads, seq_len, seq_len)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Push the logits of FORBIDDEN positions (mask == 0) towards -inf so that
        # softmax gives them ~0 weight. Penalising `mask` itself would do the
        # opposite: hide the past and expose the future tokens the model is
        # supposed to predict.
        scaled_attention_logits += (1.0 - mask) * -1e9

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        output = tf.matmul(attention_weights, v)  # Shape: (batch_size, num_heads, seq_len, depth)
        return output, attention_weights

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src):
    """Boolean causal mask of shape (batch_size, n_dest, n_src).

    Entry [b, i, j] is True when i >= j - n_src + n_dest, which for
    n_dest == n_src is the lower triangle including the diagonal.
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    offset = n_src - n_dest
    allowed = tf.cast(dest_positions >= src_positions - offset, tf.bool)
    # Add a leading batch axis of size 1, then repeat it once per batch element.
    allowed = allowed[tf.newaxis, :, :]
    return tf.tile(allowed, [batch_size, 1, 1])


class TransformerBlock(layers.Layer):
    """One transformer block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalization.

    Args:
        embed_dim: size of the last axis of the block's input and output.
        num_heads: number of attention heads.
        ff_dim: width of the hidden layer of the feed-forward network.
        rate: dropout rate applied to the attention and feed-forward outputs.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads, embed_dim)
        # Position-wise feed-forward network: expand to ff_dim, project back to embed_dim.
        self.feed_forward_network = Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim)
        ])
        self.normalization_layer_1 = LayerNormalization(epsilon=1e-6)
        self.normalization_layer_2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        """Run the block on `inputs`; the result has the same shape as `inputs`."""
        # Attention sub-layer with residual connection.
        attention_output = self.attention(inputs)
        attention_output = self.dropout1(attention_output)
        out1 = self.normalization_layer_1(inputs + attention_output)
        # Feed-forward sub-layer with residual connection.
        ffn_output = self.feed_forward_network(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.normalization_layer_2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of distinct positions the position table can embed.
        vocab_size: number of distinct token ids the token table can embed.
        embed_dim: size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_embedding = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position along the last axis."""
        sequence_length = tf.shape(x)[-1]
        position_vectors = self.pos_embedding(tf.range(sequence_length))
        token_vectors = self.token_embedding(x)
        # position_vectors has no batch axis; the addition broadcasts it over the batch.
        return token_vectors + position_vectors

In [None]:
class Transformer(Model):
    """Subclass-API version of the model: embedding, a stack of transformer
    blocks, and a dense layer producing one logit per vocabulary entry."""

    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim, num_transformer_blocks):
        super().__init__()
        self.inputs = Input(shape=(maxlen,), dtype=tf.int32)
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.embedding_dim = embed_dim
        self.num_transformer_blocks = num_transformer_blocks
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, feed_forward_dim)
            for _ in range(num_transformer_blocks)
        ]
        self.dense = Dense(vocab_size)

    def call(self, inputs):
        """Return per-position vocabulary logits for the token ids in `inputs`."""
        hidden = self.embedding_layer(inputs)
        for block in self.transformer_blocks:
            hidden = block(hidden)
        return self.dense(hidden)
'''Above, we have a subclass-based representation of the model, and below, a functional API-based representation.
The functional API learns much faster and more efficiently, because apparently TensorFlow has a bunch of optimizations
for static graphs which are known to it before observing the data (https://www.tensorflow.org/guide/function; functional APIs make use of this paradigm by default).
Secondly, the for loop in the call() function cannot be optimized in the subclass API, but it is replaced by nodes in a graph in the functional API call.
This avoids having to shuttle between executing the fast code the people behind TensorFlow have developed and a slower Python for loop.
'''

def get_transformer_model(
    maxlen,
    vocab_size,
    embed_dim,
    num_heads,
    feed_forward_dim,
    num_transformer_blocks=1
):
    """Build the model with the Keras functional API.

    Args:
        maxlen: length of the integer input sequences.
        vocab_size: number of token ids; also the size of the output logits.
        embed_dim: embedding size used throughout the transformer blocks.
        num_heads: attention heads per block.
        feed_forward_dim: hidden width of each block's feed-forward network.
        num_transformer_blocks: how many transformer blocks to stack.

    Returns:
        An uncompiled Model mapping (batch, maxlen) int32 token ids to
        (batch, maxlen, vocab_size) logits.
    """
    token_ids = Input(shape=(maxlen,), dtype=tf.int32)
    hidden = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)
    for _ in range(num_transformer_blocks):
        hidden = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(hidden)
    logits = Dense(vocab_size)(hidden)
    # Functional API-based representation of a tf model.
    return Model(inputs=token_ids, outputs=[logits])

In [None]:
# Build the functional model. The embedding dimension deliberately reuses
# feed_forward_dim, so both are passed the same value.
model = get_transformer_model(
    maxlen=block_size,
    vocab_size=input_vocab_size,
    embed_dim=feed_forward_dim,
    num_heads=num_heads,
    feed_forward_dim=feed_forward_dim,
    num_transformer_blocks=num_transformer_blocks,
)
# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=[loss_fn], metrics=["accuracy"])


In [None]:
# Build every training example as a pair of windows over the token stream:
#   inputs[i]  = train_data[i     : i + block_size]
#   targets[i] = train_data[i + 1 : i + block_size + 1]   (inputs shifted by one)
if len(train_data) <= block_size:
    raise ValueError("train_data must contain more than block_size tokens")

train_tokens = np.asarray(train_data, dtype=np.int32)
# A strided view of all windows of block_size + 1 tokens. No data is copied here,
# unlike slicing ~1M Python lists one by one.
windows = np.lib.stride_tricks.sliding_window_view(train_tokens, block_size + 1)

# Token ids are integers: keep them as int32, which matches the model's int32
# Input and the sparse (integer-label) cross-entropy loss.
inputs = tf.convert_to_tensor(np.ascontiguousarray(windows[:, :-1]), dtype=tf.int32)
targets = tf.convert_to_tensor(np.ascontiguousarray(windows[:, 1:]), dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Print the model's summary (Keras' built-in overview of the model).
model.summary()

In [None]:
'''
It will take a long time for the entirety of this cell to run. However, you can always stop execution after short durations to evaluate how the code is performing. The result in the bottommost cell is after training on only 816 out of a potential 313660 batches.
The dataset has to be shuffled each time you run this cell, to avoid training the model only on the first few input-target pairs multiple times, which may cause you to think the model is performing better than it actually is.
If the loss is consistently less than 1 even at the start, or the accuracy is very high at the start, be wary. You might want to shuffle the dataset and execute again.
'''
# Rebuild the input pipeline from the prepared tensors, then train.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, targets))
    .shuffle(1000)
    .batch(batch_size, drop_remainder=True)
)
model.fit(dataset, epochs=10)

In [None]:
# Save the current model weights to an HDF5 file in the working directory.
model.save_weights('my_weights.weights.h5')


In [None]:
def generate_text(model, start_index, num_generate=1):
    '''
    Generate num_generate characters that continue a prompt taken from train_data.

    The prompt is train_data[start_index:start_index + block_size], so the
    generated characters start at position start_index + block_size.

    Args:
        model: model mapping a (1, block_size) int32 batch of token ids to
            per-position vocabulary logits.
        start_index: offset into train_data where the prompt begins.
        num_generate: number of characters to generate.

    Returns:
        (generated_text, exact_sequence): both start with the decoded prompt.
        generated_text continues with characters sampled from the predicted
        distribution; exact_sequence continues with the most likely character
        at each step (given the same sampled context).
    '''
    # Slicing copies, so sliding the window below never modifies train_data.
    input_sequence = list(train_data[start_index:start_index + block_size])
    prompt = decode(input_sequence)
    sampled_characters = []
    greedy_characters = []
    decodable = None
    for _ in range(num_generate):
        input_eval = tf.convert_to_tensor([input_sequence], dtype=tf.int32)
        # Call the model directly: model.predict() is built for large batches, and
        # inside a per-token loop it adds overhead and prints a progress bar per call.
        logits = np.asarray(model(input_eval, training=False))[0, -1].astype(np.float64)
        if decodable is None:
            # Ids without a character (e.g. the reserved id 0) must never be
            # chosen, otherwise decode() raises KeyError.
            decodable = np.array(
                [token_id in integer_to_character_encoding for token_id in range(logits.shape[0])]
            )
        logits = np.where(decodable, logits, -np.inf)
        # Softmax in float64 so the probabilities sum to 1 within np.random.choice's tolerance.
        probabilities = np.exp(logits - np.max(logits))
        probabilities /= probabilities.sum()
        next_token = int(np.random.choice(len(probabilities), p=probabilities))
        greedy_token = int(np.argmax(probabilities))
        # Slide the context window: drop the oldest token, append the sampled one.
        input_sequence = input_sequence[1:] + [next_token]
        sampled_characters.append(decode([next_token]))
        greedy_characters.append(decode([greedy_token]))

    generated_text = prompt + ''.join(sampled_characters)
    exact_sequence = prompt + ''.join(greedy_characters)
    return generated_text, exact_sequence


In [None]:
# Generate 1000 characters, seeded with the first block_size tokens of train_data.
generate_text(model, start_index=0, num_generate=1000)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17

("First Citizen: Before we proceed any further, hear me speak.  All: Speak, speak.  First Citizen: You are all resolved rather to s o, dete I Ww owe, whin che Th ousnf, e veets nst th hoe mebl!  th, n nis gouurd?  RUt ot  heow, beinh, se, Thu'tw eae whe d ghe-dsoud s vlit, Ndit than mur doudt is the fhesu, bowwhe bul ses ind, theds?  lire Wh, dl br, ur an drs whed, yow. dn keo ew is is cs iblio. The hed To d by, ouedt thy chob nomaghuth rmeecow wice bet and, abouk those wh hee in  moos, tharg beor ges blis twe hage tour n the, rolbe owge b to core ate ud icetan isas ng, reke ree yhad d theane, Th beiT goh , thv theant  Rheh th se Whg o sunsiN-sioner lotthee Youst ron o's plal.  wis dongh  sheRI nit, tic s ngle nouk' whentis dhs  the, T, Th ywhe llcguor,t bres sd' ghuvedtdet' ss ww wheldd, wh Morises the congr did, the enugast sm, Wh bus, she uol. Irhast, ath s, do rus  The.  me in wobk, malld cith thad ut chhs  us bors, mowaalg, de ce, arce, en seey lled thico- che, soucer,'W , whos de

In [None]:
# Same call again; sampling is random, so the generated text differs between runs.
generate_text(model, start_index=0, num_generate=1000)