Download Data

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-18 17:31:39--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-07-18 17:31:39 (18.7 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



Import frameworks and APIs

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

visualise the data

In [3]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# let's look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



Preprocessing of Data

In [5]:
text = text.lower().split()

In [6]:
print(len(text))

202651


In [7]:
unique_words = set(text)
print(len(unique_words))

23641


In [8]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(unique_words) }
itos = { i:ch for i,ch in enumerate(unique_words) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ' '.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [9]:
print(encode(text[:20]))
print(decode(encode(text[:20])))

[22227, 2858, 15367, 17431, 11795, 15518, 15164, 14710, 10556, 18327, 5188, 22691, 18327, 22227, 2858, 12912, 6360, 7368, 21568, 17379]
first citizen: before we proceed any further, hear me speak. all: speak, speak. first citizen: you are all resolved rather


In [10]:
Data = []
for i in range(len(text)//20):
  Chunks_of_data = text[i:i+20]
  Data.append(Chunks_of_data)
print(len(Data))

10132


In [11]:
Data[0][0]

'first'

In [12]:
input =[]
output = []
for i in range(len(Data)):
  for k in range(len(Data[i])):
    input.append(Data[i][:k+1])
    output.append(Data[i][k+1:k+2])

In [13]:
print(input[0:5])
print(output[0:5])

[['first'], ['first', 'citizen:'], ['first', 'citizen:', 'before'], ['first', 'citizen:', 'before', 'we'], ['first', 'citizen:', 'before', 'we', 'proceed']]
[['citizen:'], ['before'], ['we'], ['proceed'], ['any']]


In [14]:
len(output) == len(input)

True

In [15]:
inputs = []
for t in input:
  inputs.append(encode(t))

In [16]:
inputs[:10]

[[22227],
 [22227, 2858],
 [22227, 2858, 15367],
 [22227, 2858, 15367, 17431],
 [22227, 2858, 15367, 17431, 11795],
 [22227, 2858, 15367, 17431, 11795, 15518],
 [22227, 2858, 15367, 17431, 11795, 15518, 15164],
 [22227, 2858, 15367, 17431, 11795, 15518, 15164, 14710],
 [22227, 2858, 15367, 17431, 11795, 15518, 15164, 14710, 10556],
 [22227, 2858, 15367, 17431, 11795, 15518, 15164, 14710, 10556, 18327]]

In [17]:
outputs = []
for t in output:
  outputs.append(encode(t))

In [18]:
outputs[:10]

[[2858],
 [15367],
 [17431],
 [11795],
 [15518],
 [15164],
 [14710],
 [10556],
 [18327],
 [5188]]

In [19]:
def make_rectangular(input_list, pad_value=0):
    max_length = max(len(sub_list) for sub_list in input_list)
    rectangular_list = [sub_list + [pad_value] * (max_length - len(sub_list)) for sub_list in input_list]
    return rectangular_list

inputs = make_rectangular(inputs)
outputs = make_rectangular(outputs)

In [20]:
# Create a TensorFlow dataset from the vectorized data
Inputs = tf.data.Dataset.from_tensor_slices(inputs)
Labels = tf.data.Dataset.from_tensor_slices(outputs)

In [21]:
dataset = tf.data.Dataset.zip((Inputs, Labels))
dataset

<_ZipDataset element_spec=(TensorSpec(shape=(20,), dtype=tf.int32, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>

In [22]:
batch_size = 1000

# Batch the dataset
train_dataset = dataset.batch(batch_size)

In [23]:
len(train_dataset)

203

In [24]:
AUTOTUNE = tf.data.AUTOTUNE

train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [25]:
#Sample from the data
for inp,out in train_dataset:
  break

Model Architecture, here we have a transformer's decoder architecture

In [26]:
import numpy as np

In [27]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [28]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model, length):
    super().__init__()
    self.d_model = d_model
    self.length = length
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length = self.length, depth=self.d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [29]:
import tensorflow as tf

# Sample input matrix
matrix = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=tf.float32)

# Create the mask for elements above the main diagonal
mask = tf.linalg.band_part(tf.ones_like(matrix, dtype=tf.bool), num_lower=-1, num_upper=0)

# Apply the mask using tf.where to replace the elements with -inf
result_matrix = tf.where(mask, matrix, tf.constant(float('-inf'), dtype=tf.float32))

print(result_matrix)

tf.Tensor(
[[  1. -inf -inf]
 [  4.   5. -inf]
 [  7.   8.   9.]], shape=(3, 3), dtype=float32)


In [30]:
class Head(tf.keras.Model):
  def __init__(self,head_size,num_heads):
    super().__init__()
    self.key = tf.keras.layers.Dense(head_size)
    self.query = tf.keras.layers.Dense(head_size)
    self.value = tf.keras.layers.Dense(head_size)
    self.sm = tf.keras.layers.Softmax()
    self.dropout = tf.keras.layers.Dropout(0.2)
    self.num_heads = num_heads

  def call(self,x):
    out = []
    B,T,C = x.shape
    for _ in range(self.num_heads):
      k = self.key(x)
      Q = self.query(x)
      V = self.value(x)
      #print(V.shape)
      wei = Q @ tf.transpose(k,perm=[0,2,1]) * C**-0.5
      mask = tf.linalg.band_part(tf.ones_like(wei, dtype=tf.bool), num_lower=-1, num_upper=0)
      wei = tf.where(mask, wei, tf.constant(float('-inf'), dtype=tf.float32))
      wei = self.sm(wei)
      #print(wei)
      wei = self.dropout(wei)
      #print(wei.shape)
      wei = wei @ V
      out.append(wei)
    out = tf.concat([out[i] for i in range(self.num_heads)], axis=2)
    #print(out.shape)
    return out

In [31]:
heads = Head(2,3)
matrix_data = [[[1, 2, 3], [4, 5, 6]],[[1, 2, 3], [4, 5, 6]]]
x = tf.constant(matrix_data, dtype=tf.float32)
print(x.shape)
heads(x).shape
#We need to choose num of heads and head_size in order to gives us the same shape of input (B,T,C)

(2, 2, 3)


TensorShape([2, 2, 6])

In [32]:
class MultiHeadAttention(tf.keras.Model):
  def __init__(self, num_heads, head_size,n_embd,dropout):
    super().__init__()
    self.heads = Head(head_size,num_heads)
    self.proj = tf.keras.layers.Dense(n_embd)
    self.dropout = tf.keras.layers.Dropout(dropout)
    self.head_size = head_size

  def call(self, x):
    x = self.heads(x)
    #x = self.heads(x)  #To add this you need to ensure that the output of the multi head equal the input
    x = self.dropout(self.proj(x))
    return x

In [33]:
class FeedForward(tf.keras.Model):
  def __init__(self,n_embd,dropout):
    super().__init__()
    self.net = tf.keras.Sequential([
               tf.keras.layers.Dense(4 * n_embd),
               tf.keras.layers.ReLU(),
               tf.keras.layers.Dense(n_embd),
               tf.keras.layers.ReLU(),
               tf.keras.layers.Dense(1,activation = "sigmoid"),
               tf.keras.layers.Dropout(dropout)])
  def call(self, x):
    return self.net(x)

In [34]:
class Block(tf.keras.Model):
  def __init__(self,num_heads,n_embd,dropout1,dropout2,vocab_size=len(unique_words)):
    super(Block,self).__init__()
    self.head_size = n_embd // num_heads
    self.sa = MultiHeadAttention(num_heads, self.head_size,n_embd,dropout1)
    self.ffwd = FeedForward(n_embd,dropout2)
    self.ln1 = tf.keras.layers.LayerNormalization()
    self.ln2 = tf.keras.layers.LayerNormalization()
    self.flatten = tf.keras.layers.Flatten()
    self.linear = tf.keras.layers.Dense(vocab_size,activation = "sigmoid")
    self.Embedding = PositionalEmbedding(vocab_size = 25000, d_model = n_embd, length = 2048)

  def call(self,x):
    x = self.Embedding(x)
    x = x + self.sa(self.ln1(x))
    #We need a flatten layer
    x = x + self.ffwd(self.ln2(x))
    x = self.flatten(x)
    #print(x.shape)
    x = self.linear(x)
    return x

In [35]:
lossclass = tf.keras.losses.CategoricalCrossentropy()

In [36]:
optimizer = tf.keras.optimizers.AdamW(learning_rate = 1e-5)

In [37]:
Model = Block(8,256,0.2,0.2)

In [None]:
num_epochs = 20
for epoch in range(num_epochs):
 i = 0
 v = 0
 j = 0
 totalloss = 0
 Number_of_data = 0
 l = 0
 n = 0
 Number_of_test_data = 0
 for inputs, targets in train_dataset:

    with tf.GradientTape() as tape:
        output = Model(inputs)

        # Compute the loss
        loss = lossclass(tf.squeeze(tf.one_hot(targets, depth=len(unique_words))), output)

    # Compute gradients
    gradients = tape.gradient(loss, Model.trainable_variables)

    # Update the weights
    optimizer.apply_gradients(zip(gradients, Model.trainable_variables))
    totalloss += loss
    for j in range(len(output)):
      if targets[j] == tf.cast(tf.argmax(output, axis=1), dtype=tf.int32).numpy()[j]:
        v = v+1
    i += 1
    Number_of_data += len(output)
 accuracy = v/(Number_of_data)
 print(f"Epoch {epoch+1}: Loss = {totalloss / i}, number of batch in the data : {i} ,Number of data: {Number_of_data}, accuracy: {accuracy}")



Epoch 1: Loss = 8.143553733825684, number of batch in the data : 203 ,Number of data: 202640, accuracy: 0.036439005132254246
Epoch 2: Loss = 6.744871139526367, number of batch in the data : 203 ,Number of data: 202640, accuracy: 0.0704747335175681


In [None]:
tf.argmax(output, axis=1)[1]