<a href="https://colab.research.google.com/github/Mpogazi/athena_coder/blob/main/language_model_Tiny_Shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Model - Tiny Shakespeare

This notebook implements a character-level GPT model trained on the Tiny Shakespeare dataset. The code is written in TensorFlow and is based on [Andrej Karpathy's video](https://www.youtube.com/watch?v=kCc8FmEb1nY).

Suggested extensions for further learning:
* Train the model to add numbers
* Train the model to be a calculator
* Train the model to generate code
* Train the model on a different language dataset
* Implement additional transformer papers

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

import numpy as np
import random
import io

In [None]:
# Mount Google Drive at /content/drive. Later cells read the dataset from, and
# write the generated sample to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the whole corpus into memory as a single string. The encoding is pinned
# so the decoded text does not depend on the platform's default locale.
with open('/content/drive/MyDrive/language_model/tiny-shakespeare.txt', 'r', encoding='utf-8') as infile:
  text = infile.read()

In [None]:
# Sanity check: size of the corpus, counted in characters of the decoded string.
print("length of the dataset in character: ", len(text))

length of the dataset in character:  1115394


In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the character -> id assignment is deterministic across runs.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the alphabet on one line and its size on the next.
print("".join(chars), vocab_size, sep="\n")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# Name of a GPU device visible to TensorFlow. Informational only: the hard
# failure below is commented out, so the notebook also runs without a GPU.
device_name = tf.test.gpu_device_name()
# Seed TensorFlow's global random generator.
tf.random.set_seed(1337)
# if device_name != '/device:GPU:0':
#   raise SystemError("GPU device not found")
# print('Found GPU at: {}'.format(device_name))
batch_size = 64  # sequences sampled per batch (see get_batch)
block_size = 256  # characters per sequence; also the position-embedding table size
n_embed = 384  # embedding / hidden width of the model
max_iters = 5000  # number of iterations of the training loop
eval_interval = 500  # train/val loss is reported every this many iterations
learning_rate = 3e-4  # passed to the AdamW optimizer
eval_iters = 10 #200  # batches averaged per split for each reported loss
n_head = 6  # attention heads per transformer block
n_layer = 6  # number of stacked transformer blocks
dropout = 0.2  # rate used by the Dropout layers in attention and feed-forward

In [None]:
# Lookup tables between characters and their integer ids; ids follow the
# position of each character in `chars`.
stoi = {symbol: index for index, symbol in enumerate(chars)}
itos = dict(enumerate(chars))

def encode(s):
  """Return the list of integer ids for the characters of string `s`."""
  return [stoi[c] for c in s]

def decode(l):
  """Return the string spelled by the iterable of integer ids `l`."""
  return ''.join(itos[i] for i in l)

In [None]:
# Encode the full corpus once into a 1-D int32 tensor of character ids.
data = tf.convert_to_tensor(encode(text), dtype=tf.int32)

In [None]:
# Positional split (no shuffling): the first 90% of the corpus is used for
# training and the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
def get_batch(split: str):
  """Sample a random batch of (input, target) windows for next-character prediction.

  Args:
    split: 'train' selects `train_data`; any other value selects `val_data`.

  Returns:
    A pair `(x, y)` of int32 tensors of shape (batch_size, block_size).
    `y` is `x` shifted by one position in the corpus, i.e. `y[b, t]` is the
    character that follows `x[b, t]`.
  """
  source = train_data if split == 'train' else val_data
  # Window start offsets. `maxval` is exclusive, so the largest start is
  # len(source) - block_size - 1 and the target window, which ends one
  # character later than the input window, still fits inside `source`.
  ix = tf.random.uniform((batch_size,), maxval=len(source) - block_size, dtype=tf.int32)
  # Build all windows with one gather each instead of 2 * batch_size
  # Python-level slices: offsets[b, t] = ix[b] + t.
  offsets = ix[:, tf.newaxis] + tf.range(block_size)[tf.newaxis, :]
  x = tf.gather(source, offsets)
  y = tf.gather(source, offsets + 1)
  return x, y

# One sample batch, used further down to run a first forward pass of the model.
xb, yb = get_batch('train')

In [None]:
class MultiHeadAttention(layers.Layer):
  """Causal multi-head self-attention with a fused key/query/value projection.

  Args:
    n_head: number of attention heads.
    head_size: width of each head. The layer's input and output width is
      `n_head * head_size`.
  """

  def __init__(self, n_head, head_size):
    super().__init__()
    # Configure the layer from its arguments rather than from notebook-level
    # globals, so the values passed by the caller are the ones actually used.
    self.n_head = n_head
    self.head_size = head_size
    self.n_embed = n_head * head_size

    # A single Dense produces keys, queries and values for all heads at once.
    self.c_attn = layers.Dense(self.n_embed * 3)
    self.proj = layers.Dense(self.n_embed)
    self.dropout = layers.Dropout(dropout)

  def _split_heads(self, t, T):
    """Reshape (B, T, n_embed) -> (B, n_head, T, head_size)."""
    # -1 lets the batch dimension stay unknown at trace time.
    t = tf.reshape(t, [-1, T, self.n_head, self.head_size])
    return tf.transpose(t, perm=[0, 2, 1, 3])

  def call(self, x, training=None):
    """Apply masked self-attention.

    Args:
      x: tensor of shape (B, T, n_embed).
      training: forwarded to the output Dropout; None defers to Keras.

    Returns:
      Tensor of shape (B, T, n_embed).
    """
    T = tf.shape(x)[1]
    # The split order (k, q, v) fixes how the fused kernel's columns are
    # interpreted; keep it stable so saved weights stay compatible.
    k, q, v = tf.split(self.c_attn(x), num_or_size_splits=3, axis=-1)
    k = self._split_heads(k, T)
    q = self._split_heads(q, T)
    v = self._split_heads(v, T)

    # Scaled dot-product scores:
    # (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
    att = (q @ tf.transpose(k, perm=[0, 1, 3, 2])) * (self.head_size ** -0.5)

    # Lower-triangular mask: position t may only attend to positions <= t.
    # The diagonal is always kept, so no row becomes all -inf.
    causal = tf.linalg.band_part(tf.ones(tf.stack([T, T])), -1, 0)  # (T, T)
    att = tf.where(causal[tf.newaxis, tf.newaxis, :, :] == 0, float('-inf'), att)
    att = tf.nn.softmax(att)

    y = att @ v  # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
    # Merge the heads back: (B, nh, T, hs) -> (B, T, n_embed)
    y = tf.reshape(tf.transpose(y, perm=[0, 2, 1, 3]), [-1, T, self.n_embed])

    return self.dropout(self.proj(y), training=training)

class FeedForward(layers.Layer):
  """Two-layer MLP: expand to 4 * n_embed with ReLU, project back to n_embed, then dropout."""

  def __init__(self, n_embed):
    super().__init__()
    expand = layers.Dense(4 * n_embed, activation="relu", name="feedforward")
    contract = layers.Dense(n_embed)
    regularize = layers.Dropout(dropout)
    self.network = keras.Sequential([expand, contract, regularize])

  def call(self, x):
    """Run `x` through the MLP; the last dimension stays n_embed."""
    transformed = self.network(x)
    return transformed

class Block(layers.Layer):
  """Transformer block: self-attention then feed-forward, each applied to a
  layer-normalised input and added back onto the residual stream."""

  def __init__(self, n_embed, n_head):
    super().__init__()
    # Each head gets an equal slice of the embedding width.
    self.sa = MultiHeadAttention(n_head, n_embed // n_head)
    self.ffwd = FeedForward(n_embed)
    self.ln1, self.ln2 = layers.LayerNormalization(), layers.LayerNormalization()

  def call(self, x):
    """Return `x` after the attention and feed-forward residual updates."""
    attended = x + self.sa(self.ln1(x))
    return attended + self.ffwd(self.ln2(attended))

In [None]:
class LanguageModel(keras.Model):
  """Character-level transformer language model.

  Token and position embeddings are summed, passed through `n_layer`
  transformer blocks, a final layer norm and a feed-forward layer, and then
  projected to one logit per vocabulary entry.

  Args:
    vocab_size: number of distinct token ids.
  """

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = keras.layers.Embedding(vocab_size, n_embed)
    # One learned vector per position, so inputs may be at most block_size long.
    self.position_embedding_table = layers.Embedding(block_size, n_embed)

    self.blocks = keras.Sequential([Block(n_embed, n_head) for _ in range(n_layer)])
    self.ln_f = layers.LayerNormalization()
    # NOTE(review): this extra feed-forward sits after the final layer norm and
    # has no residual connection; the referenced video's model goes straight
    # from ln_f to lm_head. Kept as-is to preserve the architecture and
    # parameter count - confirm it is intentional.
    self.ffwd = FeedForward(n_embed)

    self.lm_head = layers.Dense(vocab_size)
    self.loss_calc = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

  def call(self, idx, targets=None, training=None):
    """Compute next-token logits and, optionally, the loss.

    Args:
      idx: (B, T) integer tensor of token ids, with T <= block_size.
      targets: optional (B, T) integer tensor of next-token ids.
      training: forwarded to the sub-layers that contain Dropout; None defers
        to Keras' default (inference).

    Returns:
      `(logits, loss)`: logits has shape (B, T, vocab_size); loss is a scalar
      cross-entropy tensor, or None when `targets` is None.
    """
    T = tf.shape(idx)[1]
    tok_embed = self.token_embedding_table(idx)  # (B, T, C)
    pos_embed = self.position_embedding_table(tf.range(T))  # (T, C), broadcast over B

    x = tok_embed + pos_embed
    x = self.blocks(x, training=training)
    x = self.ln_f(x)
    x = self.ffwd(x, training=training)

    logits = self.lm_head(x)
    loss = None if targets is None else self.loss_calc(targets, logits)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Autoregressively sample `max_new_tokens` tokens after the prompt `idx`.

    Args:
      idx: (B, T) integer tensor holding the prompt token ids.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor with the same dtype as `idx`.
    """
    for _ in range(max_new_tokens):
      # The position table only covers block_size positions, so condition on
      # at most the last block_size tokens.
      idx_cond = idx[:, -block_size:]
      logits, _ = self(idx_cond, training=False)
      logits = logits[:, -1, :]  # keep the last time step only: (B, C)
      # tf.random.categorical samples directly from unnormalised logits, so no
      # explicit softmax is needed.
      idx_next = tf.random.categorical(logits, 1)  # (B, 1), int64
      # Cast so that prompts that are not int64 can be concatenated as well.
      idx = tf.concat([idx, tf.cast(idx_next, idx.dtype)], 1)  # (B, T + 1)
    return idx

# Instantiate the model and run one forward pass on the sample batch
# (xb, yb), which yields the initial logits and loss.
m = LanguageModel(vocab_size)
out, loss = m(xb, yb)
# AdamW with the learning rate from the hyperparameter cell.
optimizer = keras.optimizers.AdamW( learning_rate=learning_rate)

In [None]:
# Print the per-layer parameter counts of the model.
m.summary()

Model: "language_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  24960     
                                                                 
 embedding_1 (Embedding)     multiple                  98304     
                                                                 
 sequential_6 (Sequential)   (64, 256, 384)            10646784  
                                                                 
 layer_normalization_12 (Lay  multiple                 768       
 erNormalization)                                                
                                                                 
 feed_forward_6 (FeedForward  multiple                 1181568   
 )                                                               
                                                                 
 dense_19 (Dense)            multiple               

In [None]:
@tf.function
def train_step(x, y, model, optimizer):
  """Run one optimisation step and return the batch loss.

  Args:
    x: (B, T) integer tensor of input token ids.
    y: (B, T) integer tensor of target token ids.
    model: the model to train; called as `model(x, y, training=True)` and
      expected to return `(logits, loss)`.
    optimizer: Keras optimizer used to apply the gradients.

  Returns:
    Scalar loss tensor for this batch, computed before the weight update.
  """
  with tf.GradientTape() as tape:
    # training=True is what switches the Dropout layers on. Without it Keras
    # runs the model in inference mode and dropout is silently skipped.
    _, loss = model(x, y, training=True)

  # Use the `model` argument (not a notebook global) so the step trains
  # whichever model it is given.
  variables = model.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss

# `step` (rather than `iter`) avoids shadowing the builtin for the rest of the
# notebook session.
for step in range(max_iters):

  # Report the mean loss over `eval_iters` random batches of each split every
  # `eval_interval` iterations, and once more at the start of the last
  # iteration so the log also covers the end of training.
  if step % eval_interval == 0 or step == max_iters - 1:
    eval_losses = {}
    for split in ['train', 'val']:
      batch_losses = []
      for _ in range(eval_iters):
        x, y = get_batch(split)
        # Evaluate in inference mode (dropout off).
        _, loss = m(x, y, training=False)
        batch_losses.append(loss.numpy())
      eval_losses[split] = float(np.mean(batch_losses))

    print(f"step {step}: train loss {eval_losses['train']:.4f}, val loss {eval_losses['val']:.4f}")

  x, y = get_batch('train')
  loss = train_step(x, y, m, optimizer)

# Loss of the last training batch.
print(loss)

step 0: train loss 3.4237, val loss 3.4524
step 500: train loss 1.8440, val loss 1.9693
step 1000: train loss 1.4227, val loss 1.6350
step 1500: train loss 1.2694, val loss 1.5867
step 2000: train loss 1.1694, val loss 1.5983
step 2500: train loss 1.0497, val loss 1.6617
step 3000: train loss 0.9223, val loss 1.7734
step 3500: train loss 0.7548, val loss 2.0067
step 4000: train loss 0.6070, val loss 2.2829
step 4500: train loss 0.4562, val loss 2.6583
tf.Tensor(0.36114192, shape=(), dtype=float32)


In [None]:
# Prompt with a single token of id 0, sample 1000 further tokens and print the
# decoded text of the first (only) sequence in the batch.
seed_tokens = tf.zeros((1, 1), dtype=tf.int64)
sampled = m.generate(seed_tokens, max_new_tokens=1000)
print(decode(sampled[0].numpy().tolist()))


Ne'er shall be thrippeten whispering flower
Post:
But made he had eaten to him earn'd with these dog!
Dared, even ranks may be matter to the Tower,
And we to can with our love is false, that fear
All that wouldst live: do not rejoin'd,
We should have answer to extumpe this hath dewry.
I must be traitor's unto the crown he look:
I prithee, look up: if you can breath and the
duke him, this be advertised him. I would prick again
to Marcius. I never so bless have been more, mighty
ones and much lientent.
Lords, Edward, noble for anger, and thy moody is
therefore, if thoughts fear'd blood. Come on, I pray;
It is a proness to the clouds and rub are rised,
and accused him out only that my name in new--
And she hath fly me; indea--
Her made my lord, hath still dines my life
In Longer London on rescued a father.

MARIUS:
Is't true able upon'd.

First Citizen:
The people and sweet of advice
Scanding curses vial. You gave me presently to you
For your bestooms, and she is any yet all.

Second Cit

In [None]:
# Generate the text before opening the output file: mode 'w' truncates on
# open, so a failure during the long sampling run would otherwise leave an
# empty file behind.
seed_tokens = tf.zeros((1, 1), dtype=tf.int64)
sample_text = decode(m.generate(seed_tokens, max_new_tokens=10000)[0].numpy().tolist())
# Encoding is pinned so the written bytes do not depend on the platform locale.
with open('/content/drive/MyDrive/language_model/sample_generation.txt', 'w', encoding='utf-8') as f:
  f.write(sample_text)