<a href="https://colab.research.google.com/github/NeelamU/GPT-2_From_Scratch/blob/main/gpt_2_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

import nltk
from nltk.corpus import brown
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import math

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Encode the dataset

In [None]:
### Dataset --> X_train, y_train block
def encode_dataset(vocab, sentences):
  """Build next-token training pairs from the first 90% of `sentences`.

  Token ids are derived from `vocab` itself (a token's id is its position in
  `vocab`), so the function does not depend on any tokenizer defined elsewhere
  in the notebook.

  Args:
    vocab: iterable of tokens; position in the iterable is the token id.
    sentences: sliceable sequence of whitespace-delimited strings.

  Returns:
    (X_train, y_train): two object arrays of length floor(0.9 * len(sentences)).
    For a sentence with more than one token, X_train[i] is the id list without
    its last token and y_train[i] is the id list without its first token.
    Entries for sentences with zero or one token are left as None.

  Raises:
    KeyError: if a training sentence contains a token that is not in `vocab`.
  """
  token_to_id = {token: idx for idx, token in enumerate(vocab)}

  train_size = int(math.floor(0.9*len(sentences)))
  X_train = np.empty((train_size, ), dtype = object)
  y_train = np.empty((train_size, ), dtype = object)
  print(train_size)

  for row, sentence in enumerate(sentences[:train_size]):
    encoded = [token_to_id[token] for token in sentence.split()]
    # A single-token sentence has no (input, next-token) pair, so its slot
    # keeps the None that np.empty(dtype=object) put there.
    if len(encoded) > 1:
      X_train[row] = encoded[:-1]
      y_train[row] = encoded[1:]

  return X_train, y_train


In [None]:
## BLOCK TO CALL BROWN DATASET, Define global encode/decode, Instantiate X_train, y_train

nltk.download('brown')

# Flatten the corpus into one whitespace-delimited string: the tokens of each
# sentence are joined with single spaces, and the sentences are joined the
# same way.
sentences = ' '.join(' '.join(each_sent) for each_sent in brown.sents())

# `text` is bound to the same string the vocabulary cell assigns to it
# (`text = sentences`) before `text` is first read, so the separate
# `brown.raw()` load is not needed.
text = sentences

print(sentences[:1000])



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `` to have th

In [None]:
### create vocab:
# Every whitespace-delimited token of the corpus, duplicates included.
words = sentences.split()
text = sentences

# The vocabulary is the set of distinct tokens; sorting fixes the order, and
# that order is what the tokenizer later turns into ids.
vocab = sorted(set(words))
vocab_size = len(vocab)

print(vocab[500:550])
print(vocab_size)


['1-6', '1-701', '1-a', '1-degree', '1-degree-C', '1-hp', '1-inch', '1-ml', '1-o', '1-ton', '1.0', '1.0-mg.', '1.00', '1.07', '1.09.3', '1.1', '1.10.1', '1.10.4', '1.10.8', '1.2', '1.23', '1.24', '1.25', '1.25%', '1.25-cm', '1.5', '1.58', '1.8', '1.8%', '1/16', "1/16''", '1/2', "1/2''", '1/2-inch', '1/20th', '1/3', '1/4', "1/4''", '1/4-inch', '1/50th', "1/8''", '1/8-inch', '1/c', '10', '10%', "10''", '10,000', '10,000,000', '10,500', '10,517']
56057


# Simple Tokenizer

In [None]:
## tokenize the input

# Lookup tables between tokens and integer ids. Despite the `char_` prefix the
# keys are whitespace-delimited words: `encode` splits its input on whitespace.
char_to_num = dict(zip(vocab, range(len(vocab))))
num_to_char = dict(enumerate(vocab))


def encode(e):
  """Turn a whitespace-delimited string into its list of vocabulary ids."""
  return [char_to_num[ch] for ch in e.split()]


def decode(d):
  """Turn an iterable of vocabulary ids back into a space-joined string."""
  return ' '.join(num_to_char[ch] for ch in d)


# Round-trip the first 102 characters of the corpus as a sanity check.
print(sentences[:102])
sample_ids = encode(text[:102])
print(sample_ids)
print(decode(sample_ids))

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produce
[17590, 8186, 5895, 8687, 10432, 47061, 8136, 20448, 36445, 41065, 3264, 45363, 44000, 29542, 44134]
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produce


In [None]:
# Encode the whole corpus once into a 1-D tensor of token ids.
dataset = torch.tensor(encode(sentences), dtype = torch.long)
print(dataset[:1000])

# Contiguous split: the leading 90% of the ids is used for training and the
# remaining tail for validation.
split_percentage = 0.9
n = int(split_percentage*len(dataset))
train_set, val_set = dataset[:n], dataset[n:]

In [None]:
# Architecture
num_blocks = 6                        # transformer blocks stacked in the model
num_heads = 6                         # attention heads per block
embed_size = 384                      # width of the residual stream
head_size = embed_size // num_heads   # 64 per head, so the concatenated heads span embed_size
dropout = 0.2                         # probability handed to every nn.Dropout

# Batching
batch_size = 32                       # sequences sampled per batch
block_size = 128                      # token ids per sequence (context length)




In [None]:


def get_batch(split):
  """Sample a random batch of (inputs, next-token labels) windows.

  `split == 'train'` draws from `train_set`; any other value draws from
  `val_set`. Both returned tensors have shape (batch_size, block_size) and are
  moved to `device`; each row of `labels` is the matching row of `inputs`
  shifted one position further along the data.
  """
  if split == 'train':
    data = train_set
  else:
    data = val_set

  # One random start offset per sequence. The upper bound leaves room for the
  # label window, which reaches one element past the input window.
  starts = torch.randint(len(data) - block_size, (batch_size, ))

  input_rows, label_rows = [], []
  for start in starts:
    stop = start + block_size
    input_rows.append(data[start:stop])
    label_rows.append(data[start + 1:stop + 1])

  return torch.stack(input_rows).to(device), torch.stack(label_rows).to(device)


# Smoke test: draw one training batch and show the inputs followed by their
# shifted labels.
ins, outs = get_batch('train')
print(ins, outs, sep = '\n')



tensor([[47523,   393, 20953,  ..., 51924,  9791, 37657],
        [28265, 20963, 22651,  ..., 52131, 52017, 50101],
        [41815, 35924, 54762,  ...,   405, 17905, 34072],
        ...,
        [42525, 20526, 55603,  ..., 55215, 21545, 41065],
        [51924, 38785, 33900,  ..., 51924,  7209, 16053],
        [39219, 41759, 34450,  ..., 55603, 51924,  2730]], device='cuda:0')
tensor([[  393, 20953,   393,  ...,  9791, 37657, 35400],
        [20963, 22651, 41519,  ..., 52017, 50101, 35400],
        [35924, 54762,   393,  ..., 17905, 34072, 34001],
        ...,
        [20526, 55603, 51924,  ..., 21545, 41065, 51924],
        [38785, 33900, 35400,  ...,  7209, 16053, 35400],
        [41759, 34450,   405,  ..., 51924,  2730,   405]], device='cuda:0')


# Implement a Single Head

In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  Projects the residual stream to keys, queries and values of width
  `head_size`, lets every position attend to itself and to earlier positions
  only, and returns the attention-weighted sum of the values.

  Args:
    head_size: width of the key, query and value projections.
  """
  def __init__(self, head_size):
    super().__init__()

    self.key = nn.Linear(embed_size, head_size, bias = False)
    self.query = nn.Linear(embed_size, head_size, bias = False)
    self.value = nn.Linear(embed_size, head_size, bias = False)
    # Lower-triangular matrix of ones used as the causal mask. Registered as a
    # buffer so it is part of the module's state without being a parameter.
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, res_stream ):
    """Apply masked attention to `res_stream` (batch x block x embed).

    Returns a tensor of shape batch x block x head_size.
    """
    _, block, _ = res_stream.shape

    key = self.key(res_stream)        # batch x block x head_size
    query = self.query(res_stream)    # batch x block x head_size
    values = self.value(res_stream)   # batch x block x head_size

    # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
    # width of the key/query vectors (head_size), not the residual-stream width.
    attn_scores = query @ key.transpose(-2, -1) * key.shape[-1] ** -0.5 # batch x block x block

    # Positions above the diagonal are the future; setting them to -inf makes
    # softmax give them exactly zero weight.
    attn_scores = attn_scores.masked_fill(self.tril[:block, :block] == 0, float('-inf'))
    attn_scores = F.softmax(attn_scores, dim = -1)
    attn_scores = self.dropout(attn_scores)

    # Future positions carry zero weight, so each output row mixes only the
    # values of the current and earlier positions.
    head_out = attn_scores @ values # batch x block x head_size

    return head_out


# Multi Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention heads run side by side, then projected to embed_size.

  Args:
    num_heads: number of `Head` modules to create.
    head_size: output width of each head.
  """
  def __init__(self, num_heads, head_size):
    super().__init__()

    # nn.ModuleList registers every head as a sub-module of this one.
    heads = [Head(head_size) for _ in range(num_heads)]
    self.heads = nn.ModuleList(heads)
    # Maps the concatenated head outputs back to the residual-stream width.
    self.to_stream = nn.Linear(num_heads * head_size, embed_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, res_stream):
    """Return the attention update for `res_stream`, batch x block x embed_size."""
    # Each head yields batch x block x head_size; concatenating on the last
    # dimension gives batch x block x (num_heads * head_size).
    per_head = [head(res_stream) for head in self.heads]
    concatenated = torch.cat(per_head, dim = -1)

    projected = self.to_stream(concatenated)
    return self.dropout(projected)



# The 4x MLP Block

In [None]:
class FeedForwardMLP(nn.Module):
  """Two-layer feed-forward network whose hidden layer is 4x the embedding width.

  Args:
    embed_size: input and output width.
  """
  def __init__(self, embed_size):
    super().__init__()
    hidden_size = 4 * embed_size
    layers = [
        nn.Linear(embed_size, hidden_size),   # expand
        nn.ReLU(),
        nn.Linear(hidden_size, embed_size),   # project back
        nn.Dropout(dropout),
    ]
    self.MLP = nn.Sequential(*layers)

  def forward(self, attention_out):
    """Apply the MLP; the output has the same shape as `attention_out`."""
    return self.MLP(attention_out)



# A Single Transformer MHA+MLP Block

In [None]:
class Block(nn.Module):
  """One transformer block: attention, then MLP, each wrapped in a residual add.

  LayerNorm is applied to the input of each sub-layer, and the sub-layer's
  output is added back onto the un-normalised stream.

  Args:
    num_heads: attention heads in the attention sub-layer.
    head_size: width of each attention head.
    embed_size: width of the residual stream.
  """
  def __init__(self, num_heads, head_size, embed_size ):
    super().__init__()
    self.multihead_attn_layer = MultiHeadAttention(num_heads, head_size)
    self.MLP_layer = FeedForwardMLP(embed_size)
    self.layernorm1 = nn.LayerNorm(embed_size)
    self.layernorm2 = nn.LayerNorm(embed_size)

  def forward(self, res_stream ):
    """Return the residual stream after the attention and MLP updates."""
    # Attention sub-layer plus its residual connection.
    normed = self.layernorm1(res_stream)
    after_attention = self.multihead_attn_layer(normed) + res_stream

    # MLP sub-layer plus its residual connection.
    normed = self.layernorm2(after_attention)
    return self.MLP_layer(normed) + after_attention





# Putting it all Together

In [None]:


class BigramLanguageModel(nn.Module):
  """Token + position embeddings, a stack of transformer blocks, and an unembedding.

  Despite the name, this is not a bigram model: the input passes through
  `num_blocks` attention `Block`s before being mapped to vocabulary logits.
  """
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
    # One learned vector per position, so at most block_size positions.
    self.position_embedding_table = nn.Embedding(block_size, embed_size)

    blocks = [Block(num_heads, head_size, embed_size) for _ in range(num_blocks)]
    self.TransformerBlocks = nn.Sequential(*blocks)

    self.layernorm_final = nn.LayerNorm(embed_size)
    self.unembedding = nn.Linear(embed_size, vocab_size)

  def forward(self, inputs, labels = None):
    """Return next-token logits for every position.

    Args:
      inputs: integer token ids of shape batch x block, with block no larger
        than the number of rows in the position embedding table.
      labels: unused; accepted (and optional) so existing
        `model(inputs, labels)` calls keep working.

    Returns:
      Logits of shape batch x block x vocab_size.
    """
    batch, block = inputs.shape

    embeddings = self.token_embedding_table(inputs) # batch x block x embed
    # Build the position indices on the same device as the inputs, so the
    # model does not depend on the notebook-level `device` variable.
    positions = self.position_embedding_table(torch.arange(block, device = inputs.device)) # block x embed

    res_stream = embeddings + positions # broadcasts to batch x block x embed
    to_stream = self.TransformerBlocks(res_stream) # batch x block x embed

    to_logits = self.layernorm_final(to_stream)
    logits = self.unembedding(to_logits) # batch x block x vocab

    return logits

  def generate(self, input  , max_new_tokens):
    """Extend `input` by sampling `max_new_tokens` tokens, one at a time.

    Sampling runs in eval mode (dropout off) and without gradient tracking;
    the module's previous train/eval mode is restored before returning.

    Args:
      input: integer token ids of shape batch x length.
      max_new_tokens: number of tokens to append to every row.

    Returns:
      Tensor of shape batch x (length + max_new_tokens).
    """
    # The position table fixes how many positions the model can take at once.
    context_size = self.position_embedding_table.num_embeddings

    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):
          # Keep only the most recent tokens that fit in the context window.
          input_forward = input[:, -context_size:]
          logits = self(input_forward)

          logits = logits[:, -1, :] # batch x vocab: prediction after the last token

          probabilities = F.softmax(logits, dim = -1)

          # Draw the next token from the distribution (sampling, not argmax).
          next_idx = torch.multinomial(probabilities, num_samples = 1) # batch x 1

          input = torch.cat((input, next_idx), dim = 1)
    finally:
      self.train(was_training)

    return input


# Train it!

In [None]:
learning_rate = 1e-4

# Build the network with the hyperparameters currently in scope and move it
# to the selected device; nn.Module.to returns the module itself.
model = BigramLanguageModel().to(device)

# AdamW updates every model parameter; the loss is cross-entropy over the
# vocabulary logits.
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

In [None]:
# NOTE(review): in top-to-bottom order this cell runs after the cell that
# executes `model = BigramLanguageModel()`. The values that change here
# (embed_size, head_size, num_blocks, dropout) are only read inside
# constructors in this notebook, and the recorded parameter count
# (53.797625 M) matches the earlier 384-wide, 6-block configuration. These
# settings therefore apparently never reached the trained model; move this
# cell above model construction if the larger model is intended.

# Architecture
num_blocks = 12
num_heads = 6
embed_size = 768
head_size = 256
dropout = 0.4

# Batching
batch_size = 32   # sequences sampled per batch
block_size = 128  # token ids per sequence (context length)


In [None]:
# TRAINING LOOP
import os

# NOTE(review): debugging aid. The model has already been moved to `device`
# by this point, so this is presumably set too late to affect the current
# CUDA session; confirm, then move it to the first cell or drop it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

steps = 5000
eval_interval = 200  # run an evaluation pass every this many training steps
eval_iters = 25      # batches averaged per split in each evaluation pass


def compute_loss(inputs, labels):
  """Cross-entropy between the model's logits for `inputs` and `labels`."""
  logits = model(inputs, labels)
  # The model returns batch x block x vocab; CrossEntropyLoss wants the class
  # dimension second, i.e. batch x vocab x block.
  return loss_func(logits.permute(0, 2, 1), labels)


def estimate_loss(split):
  """Mean loss over `eval_iters` random batches of `split`, without gradients.

  The caller is responsible for putting the model in eval mode first.
  """
  with torch.no_grad():
    batch_losses = [compute_loss(*get_batch(split)).item() for _ in range(eval_iters)]
  return sum(batch_losses) / len(batch_losses)


print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')


for step in range(steps):
  if step % eval_interval == 0:
    model.eval()
    loss_tr = estimate_loss('train')
    loss_val = estimate_loss('val')
    print(f"step {step}: train loss {loss_tr:.4f}, validation loss {loss_val:.4f}")
    # Back to train mode (dropout on) for the optimisation steps that follow.
    model.train()

  inputs, labels = get_batch('train')
  loss = compute_loss(inputs, labels)

  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

53.797625 M parameters
step 0: train loss 11.1013, validation loss 11.0899
step 200: train loss 6.9925, validation loss 6.8764
step 400: train loss 6.6708, validation loss 6.5494
step 600: train loss 6.4901, validation loss 6.3577
step 800: train loss 6.3685, validation loss 6.2104
step 1000: train loss 6.2503, validation loss 6.1770
step 1200: train loss 6.1446, validation loss 6.0530
step 1400: train loss 6.0867, validation loss 6.0351
step 1600: train loss 5.9610, validation loss 5.9866
step 1800: train loss 5.8928, validation loss 5.9251
step 2000: train loss 5.8202, validation loss 5.9363
step 2200: train loss 5.7455, validation loss 5.8734
step 2400: train loss 5.6631, validation loss 5.8834
step 2600: train loss 5.6145, validation loss 5.8218
step 2800: train loss 5.5776, validation loss 5.8178
step 3000: train loss 5.5113, validation loss 5.8005
step 3200: train loss 5.4438, validation loss 5.8134
step 3400: train loss 5.3754, validation loss 5.8254
step 3600: train loss 5.3545

# Generate some Text

In [None]:
testin, testout = get_batch('train')
the_in = testin.cpu().numpy().tolist()
# Show only the first prompt of the batch.
print(decode(the_in[0]))

# The training loop leaves the model in train mode, so switch to eval mode
# (dropout off) and skip gradient tracking while sampling.
model.eval()
with torch.no_grad():
  out = model.generate(testin, 20)

print('\n\n\n')
print('AI GENERATED TEXT:')
print('\n\n\n')
# `out` has one row per prompt in the batch; show the row that continues the
# prompt printed above.
sentence_pred = out[0].cpu().numpy().tolist()
print(decode(sentence_pred))