In [42]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, initializers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU, Dropout,LayerNormalization
import numpy as np
import warnings

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
# ignore cuda warnings
warnings.filterwarnings('ignore')

In [43]:
# hyperparameters
batch_size = 32
block_size = 64
max_iters = 500
learning_rate = 1e-4
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

physical_devices = tf.config.list_physical_devices('GPU')

if len(physical_devices) > 0:
    print("GPU is available")
    device = '/device:GPU:0'
else:
    print("GPU is not available")
    device = '/device:CPU:0'
# ------------

GPU is available


In [44]:
def text_extractor():
    path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
    
    #text = open(path_to_file, 'rb').read().decode(encoding='utf-8').replace('\n', '\n')
    with open(path_to_file, 'rb') as f:
        text = f.read().decode('utf-8')
    return text

In [45]:
# here are all the unique characters that occur in this text
text = text_extractor()
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Every character in text:" + ''.join(chars))
print("Vocab size={0}".format(vocab_size))

Every character in text:
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size=65


In [46]:
def text_encoder(text):
    # encoder: take a string, output a list of integers
    stoi = {ch:i for i, ch in enumerate(chars)}
    encode = lambda s: [stoi[ch] for ch in s]
    # Encoding the entire text into numbers
    series = encode(text)
    return series

In [47]:
def text_decoder(text):
    # decoder: take a list of integers, output a string
    itos = {i:ch for i, ch in enumerate(chars)} 
    decode = lambda l: "".join([itos[i] for i in l])
    series = decode(text)
    return series

In [48]:
# Function to create a windowed datase
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    # Creating a tensorflow dataset from the encoded series
    dataset = tf.data.Dataset.from_tensor_slices(series)
    # Creating a windowed dataset with each window of size window_size + 1 and shifting the window by 1 after each step
    dataset = dataset.window(size=window_size+1, shift = 1, drop_remainder=True)
    # Flattening the dataset
    dataset = dataset.flat_map(lambda window: window.batch(window_size+1))
    # Splitting each window into features (all elements except the last) and target (the last element)
    dataset = dataset.map(lambda x: (x[:-1], x[1:]))
    # Shuffling the dataset
    dataset = dataset.shuffle(shuffle_buffer)
    # Batching the dataset and prefetching 1 batch at a time to improve performance
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset

In [49]:
series = text_encoder(text)
n = int(0.9*len(series)) # first 90% will be train, rest val

# Create the training dataset
train_dataset = windowed_dataset(series[:n], block_size, batch_size, shuffle_buffer=10)

# Create the testing dataset
test_dataset = windowed_dataset(series[n:], block_size, batch_size, shuffle_buffer=10)

In [50]:
for inputs, targets in train_dataset.take(1):
    encoder_inputs_shape = inputs[0].shape
    decoder_inputs_shape = inputs[1].shape
    targets_shape = targets.shape

    print(f'encoder_inputs_shape: {encoder_inputs_shape}')
    print(f'decoder_inputs_shape: {decoder_inputs_shape}')
    print(f"targets_shape: {targets_shape}")

encoder_inputs_shape: (64,)
decoder_inputs_shape: (64,)
targets_shape: (32, 64)


2023-11-07 12:32:24.134615: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [1003854]
	 [[{{node Placeholder/_0}}]]
2023-11-07 12:32:24.135797: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [1003854]
	 [[{{node Placeholder/_0}}]]


In [51]:
class MultiHeadAttention(layers.Layer):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()

        self.head_size = n_embd // n_head


        # Projecting input into key, query, and value for all attention heads
        self.c_attn = layers.Dense(3 * config.n_embd, use_bias=False)
        
        # Regularization
        self.attn_dropout = layers.Dropout(dropout)
        self.resid_dropout = layers.Dropout(dropout)


    def call(self, x):
        B, T, C = x.shape

        # Linear transformation for queries, keys, and values, note that C = n_embd
        qkv = self.c_attn(x)  # Input shape: (B, T, C), Output shape: (B, T, 3 * n_embd)

        # Split the queries, keys, and values
        q, k, v = tf.split(qkv, 3, axis=-1)  # Input shape: (B, T, 4 * n_embd), Output shapes: 3 * (B, T, n_embd)
        
        
        # Reshape queries, keys, and values for multi-head attention with head_size = n_embd // num_heads
        # BUG: possible issue with tensorflow, you can use tf.reshape(q, (B, T, self.num_heads, -1)), for tensorflow B is unknown: it will give an error
        q = tf.reshape(q, (-1, T, n_head, self.head_size))  # Output shape: (B, T, num_heads, head_size)
        k = tf.reshape(k, (-1, T, n_head, self.head_size))  # Output shape: (B, T, num_heads, head_size)
        v = tf.reshape(v, (-1, T, n_head, self.head_size))  # Output shape: (B, T, num_heads, head_size)



        # Transpose queries, keys, and values for efficient matrix multiplication
        q = tf.transpose(q, perm=[0, 2, 1, 3])  # Output shape: (B, num_heads, T, head_size)
        k = tf.transpose(k, perm=[0, 2, 1, 3])  # Output shape: (B, num_heads, T, head_size)
        v = tf.transpose(v, perm=[0, 2, 1, 3])  # Output shape: (B, num_heads, T, head_size)


        # Compute attention scores ("affinities")
        wei = tf.matmul(q, k, transpose_b=True) * (self.head_size ** -0.5)  # Output shape: (B, num_heads, T, T)


        mask = tf.linalg.band_part(tf.ones_like(wei), -1, 0)  # Lower triangular matrix of ones
        wei = tf.where(mask == 1, wei, float("-inf"))  # Set upper triangular part to -inf


        wei = tf.nn.softmax(wei, axis=-1)  # Output shape: (B, num_heads, T, T)
        wei = self.attn_dropout(wei)  # Regularization step 1


        # Perform the weighted aggregation of the values
        out = tf.matmul(wei, v)  # Output shape: (B, num_heads, T, head_size)


        # Transpose and reshape the output to match the original shape
        out = tf.transpose(out, perm=[0, 2, 1, 3])  # Output shape: (B, T, num_heads, head_size)
        out = tf.reshape(out, (-1, T, C))  # Output shape: (B, T, C) - note that C = num_heads * head_size = n_embd
        out = self.resid_dropout(out)  # Regularization step 2
        return out

In [59]:
class FeedForward(layers.Layer):
    def __init__(self, n_embd):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * n_embd, activation=tf.keras.activations.relu),
            tf.keras.layers.Dense(n_embd),
            tf.keras.layers.Dropout(dropout)
        ])

    def call(self, x):
        # Apply the feedforward operation and add it to the input
        out = x + self.seq(x)
        return out

In [69]:
class Block(layers.Layer):
    def __init__(self, n_embd):
        super(Block, self).__init__()
       
        # Create layers for Multi-Head Attention and FeedForward
        self.sa = MultiHeadAttention()
        self.ffwd = FeedForward(n_embd)

        # Layer normalization for both branches
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=learning_rate)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=learning_rate)

    def call(self, x):
        # Input data is layer normalized: Layer normalizing the input data as the number of features increases over time
        x_normalized = self.ln1(x)

        # Fed through the attention network: We get the attention scores or weighted values
        attn_output = self.sa(x_normalized)

        # Added to the input
        x = x + attn_output

        # Layer normalized the data
        x_normalized = self.ln2(x)

        # Pass through a FeedForward
        ffwd_output = self.ffwd(x_normalized)

        # Added to the input
        x = x + ffwd_output


        return x

In [70]:
def decoder():
    """
    Creates an decoder model based on the provided configuration.
    Args:config: An object specifying the configuration parameters.
    Returns:decoder: A Keras Model object representing the encoder model.
    """
    # create a dict with all the layers we need
    transformer_dict = {
        # input layer
        'input': tf.keras.Input(shape=(block_size,)),
        # word token embeddings
        'wte': tf.keras.layers.Embedding(vocab_size, n_embd, input_length=block_size),
        # word position embeddings
        'wpe': tf.keras.layers.Embedding(block_size, n_embd),
        # dropout layer
        'drop': tf.keras.layers.Dropout(dropout),
        # Transformer blocks
        'h': tf.keras.Sequential([Block(n_embd) for _ in range(n_layer)]),
        # layer normalization
        'ln_f': tf.keras.layers.LayerNormalization(epsilon=learning_rate),
        # layer used to project the output of the GPT model to the vocabulary size
        'lm_head': tf.keras.layers.Dense(vocab_size, use_bias=False)
    }
    # input
    idx = transformer_dict['input']
    pos = tf.range(0, block_size, dtype=tf.int64)  # shape (t)

    # Forward the GPT model itself
    tok_emb = transformer_dict['wte'](idx)  # token embeddings of shape (b, t, n_embd)
    pos_emb = transformer_dict['wpe'](pos)  # position embeddings of shape (t, n_embd)
    x = transformer_dict['drop'](tok_emb + pos_emb)
    for block in transformer_dict['h'].layers:
        x = block(x)
    x = transformer_dict['ln_f'](x)

    # logit scores for each vocabulary word at each position in the input sequence.
    logits = transformer_dict['lm_head'](x)  # shape (batch_size, sequence_length, vocab_size)

    # Create encoder model
    model = tf.keras.Model(inputs=idx, outputs=logits, name='encoder')

    return model

In [71]:
with tf.device(device):    
    # Create the decoder model
    decoder_model = decoder()


    # Compile and train the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    epochs = 2


    decoder_model.compile(optimizer=optimizer, loss=loss_fn)
    history = decoder_model.fit(train_dataset, epochs=epochs, validation_data=test_dataset)

Epoch 1/2


2023-11-07 12:39:05.936507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [1003854]
	 [[{{node Placeholder/_0}}]]
2023-11-07 12:39:05.936971: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [1003854]
	 [[{{node Placeholder/_0}}]]


  31369/Unknown - 12588s 400ms/step - loss: nan

2023-11-07 16:08:54.225847: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [111540]
	 [[{{node Placeholder/_0}}]]
2023-11-07 16:08:54.227491: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [111540]
	 [[{{node Placeholder/_0}}]]


Epoch 2/2
