<a href="https://colab.research.google.com/github/KiratSinghWalia/GPT-from-scratch/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

# Upload the training corpus into the Colab runtime. A later cell opens it
# by the name 'Input.txt', so the uploaded file must carry that name.
uploaded = files.upload()


Saving Input.txt to Input.txt


In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# --- Hyperparameters --------------------------------------------------------
# Batch geometry: sequences per batch and tokens per sequence.
batch_size = 64
block_size = 256

# Training-run settings.
max_iterations = 5000
learning_rate = 3e-4
eval_iters = 200

# Model dimensions and dropout probability.
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(1337)

# Load the whole training corpus into memory as a single string.
with open('Input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()




In [21]:
# Vocabulary: every distinct character of the corpus, in sorted order.
char = sorted(set(text))
vocab_size = len(char)

# Lookup tables between characters and their integer ids.
stoi = dict(zip(char, range(vocab_size)))
itos = dict(enumerate(char))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a list of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


# Encode the full corpus, then keep the first 90% for training and the
# remaining tail for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


111540

In [88]:
#data loading
# ix holds batch_size random start offsets into the dataset (shape [64])
# x stacks the block_size-long window starting at each offset (shape [64, 256])

def get_bacthes(split):
  """Sample one random batch of (input, target) windows from a data split.

  Any ``split`` other than ``'train'`` selects the validation data. Both
  returned tensors have shape (batch_size, block_size) and are moved to
  ``device``; the targets are the inputs shifted one position to the right.
  """
  source = val_data if split != 'train' else train_data
  # One random start offset per sequence. The exclusive upper bound leaves
  # room for the extra target token at the end of every window.
  starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
  # Cut block_size + 1 tokens per offset, then split into input and target.
  windows = [source[s:s + block_size + 1] for s in starts]
  x = torch.stack([w[:-1] for w in windows]).to(device)
  y = torch.stack([w[1:] for w in windows]).to(device)
  return x, y

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Holds bias-free key, query and value projections from ``n_embd`` to
    ``head_size``, a lower-triangular mask of size ``block_size`` and a
    dropout layer applied to the attention weights.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Stored as a buffer: part of the module's state but not a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x``.

        Input has shape (batch, time-step, channels); output has shape
        (batch, time-step, head size).
        """
        _, T, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)

        # Scaled dot-product affinities: (B, T, hs) @ (B, hs, T) -> (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * scale

        # Block attention to later positions so each token only sees the
        # tokens at or before it, then normalise each row to a distribution.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs).
        values = self.value(x)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each individual head.

    The head outputs are concatenated along the last dimension, projected
    back to ``n_embd`` channels and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        # The call parentheses are required: nn.Module.__init__ has to run
        # before any sub-module can be assigned as an attribute.
        super().__init__()
        # nn.ModuleList (not a plain list) registers every head with this
        # module, so their parameters are tracked along with the rest.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x`` and merge the results.

        Each head returns (B, T, head_size); concatenating them gives
        (B, T, num_heads * head_size), which ``proj`` maps to (B, T, n_embd).
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out



class FeedFoward(nn.Module):
    """Per-position MLP: widen, apply a non-linearity, narrow, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        # The hidden layer is four times as wide as the embedding; the second
        # linear layer contracts the result back to the embedding width.
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation.

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets
            ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        # The class is spelled ``LayerNorm``; ``nn.Layernorm`` does not exist
        # and raises AttributeError when the block is constructed.
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward, each behind a layer norm.

        Both sub-layers are added back onto their input (residual
        connections) so gradients can flow straight through to the input.
        """
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self):
      super().__init__()
      # each token directly reads off the logits for the next token from a lookup table
      self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
      self.position_embedding_table = nn.Embedding(block_size, n_embd)
      self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
      #[block1,block2...blockn]
      self.ln_f = nn.LayerNorm(n_embd) # final layer norm
      self.lm_head = nn.Linear(n_embd, vocab_size)
