# Pointing this notebook at the project folder on Google Drive

In [32]:
import os,sys
%load_ext autoreload
%autoreload 2
import torch
# Project folder on Google Drive; it holds input.txt and the helper modules
# (transformer.py, bigram.py) that later cells import.
root = r'/content/drive/MyDrive/Colab Notebooks/transformer/NanoGPT'
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if root not in sys.path:
  sys.path.append(root)
# Work from the project folder so the relative shell commands below (touch, wget) act on it.
os.chdir(root)

In [34]:
# Create an empty bigram.py in the current working directory (the project root set above);
# touch leaves the contents of an already existing file untouched.
# The transformer.py line is kept commented out.
# !touch transformer.py
!touch bigram.py

# Downloading the Tiny-Shakespeare dataset and building a simple vocabulary

In [6]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-08-03 08:49:22--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-08-03 08:49:23 (16.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:
## opening and reading the file
# Decode explicitly as UTF-8 so the result does not depend on the platform's default locale encoding.
with open(os.path.join(root, 'input.txt'), 'r', encoding='utf-8') as f:
  text = f.read()
text[:100]  # preview: first 100 characters as a raw string (newlines shown as \n)

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [8]:
# total number of characters in the corpus
print(f'Length of the text downloaded: {len(text)}')

Length of the text downloaded: 1115394


In [11]:
# checking the dataset: print the first 500 characters as rendered text
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [12]:
## making a simple vocabulary
# set() removes duplicate characters; sorted() then returns them as a list
# ordered by character code point.
chars = sorted(set(text))
vocab_size = len(chars)
# one line with every character of the vocabulary, then its size
print(' '.join(chars), vocab_size, sep='\n')


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
65


## Tokenizer
- most basic character level tokenizer
- two functions: `encoder`, which takes a string and returns a list of indices, and `decoder`, which takes a list of indices and returns the decoded string (in this notebook, wrapped in a one-element list)
- every tokenizer faces a trade-off: this very basic version generates very long sequences (as many tokens as there are characters in the string), whereas modern applications prefer sub-word tokenizers, which have large vocabularies and produce shorter sequences
- but for this project we stick to the most basic version for learning purposes


In [21]:
word2idx = {ch: i for i, ch in enumerate(chars)}  # lookup: character -> integer id
idx2word = {i: ch for i, ch in enumerate(chars)}  # lookup: integer id -> character

def encoder(str):
  """Encode a string as a list of integer ids, one per character.

  The parameter keeps its original name `str`; it shadows the builtin only
  inside this function.

  Raises:
    ValueError: if a character is not part of the vocabulary. (Previously such
      characters silently became None and only failed later, e.g. when the
      list was turned into a tensor.)
  """
  ids = []
  for s in str:
    if s not in word2idx:
      raise ValueError(f'character {s!r} is not in the vocabulary')
    ids.append(word2idx[s])
  return ids

def decoder(idx):
  """Decode an iterable of integer ids back into text.

  Returns a one-element list holding the decoded string; callers that want the
  plain string take element [0].

  Raises:
    ValueError: if an id is not part of the vocabulary.
  """
  pieces = []
  for i in idx:
    if i not in idx2word:
      raise ValueError(f'index {i!r} is not in the vocabulary')
    pieces.append(idx2word[i])
  return [''.join(pieces)]

In [23]:
# round-trip check: decoding the encoded ids should give the original text back
for sample in ('harikesh', 'This feels good !!'):
  token_ids = encoder(sample)
  print(token_ids)
  print(decoder(token_ids))

[46, 39, 56, 47, 49, 43, 57, 46]
['harikesh']
[32, 46, 47, 57, 1, 44, 43, 43, 50, 57, 1, 45, 53, 53, 42, 1, 2, 2]
['This feels good !!']


### Tokenize and Split the entire dataset

In [25]:
# encode the whole corpus into a single tensor of integer token ids (torch.long == int64)
data = torch.tensor(encoder(text),dtype=torch.long)
# show only the first 1000 ids; the full tensor has one entry per character
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [26]:
#some description of the data tensor: its shape (one entry per character) and its dtype
print(data.shape,data.dtype)

torch.Size([1115394]) torch.int64


In [27]:
## hold out the last 10% of the tokens as a validation set
## (used for the early-stopping and generalization check)
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

# Training the transformer
- in practice we don't train transformers on the whole training sequence at once; instead we sample random chunks of a predefined length
- this length is called the chunk size, `block_size` in the code below (for this workflow I choose `block_size = 8` to avoid overloading the Colab GPUs)
  ## how to use a chunk to train sub-sequences ?
  - a chunk of chunk-size tokens actually contains chunk-size sub-sequences, each of which can be leveraged to train the network
  - for ex: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) in this code
    * for input 18 -> 47
    * for input 18,47 -> 56
    * for input 18,47,56 -> 57, and so on up to the full chunk-sized context


In [44]:
block_size = 8  # context length: number of tokens in one training chunk
# block_size + 1 tokens give block_size (input, target) pairs, because each target is the next token
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [31]:
# show every (context -> next token) training example packed into a single chunk
x, y = train_data[:block_size], train_data[1:block_size + 1]  # y is x shifted left by one token
for t in range(block_size):
  # context: everything up to and including position t; target: the token that follows it
  context, target = x[:t + 1], y[t]
  print(f'when input is {context} the target is {target}')

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [33]:
from transformer import batched_sampling

# draw one training batch and show the inputs next to their targets
obj1 = batched_sampling(train_data, val_data)
xb, yb = obj1.get_batch('train')
for label, batch in (('inputs:', xb), ('targets: ', yb)):
  print(label)
  print(batch.shape)
  print(batch)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


## BiGram Model 

In [119]:
from bigram import BigramModel
# bi = BigramModel(vocab_size)

In [120]:
# instantiate the bigram model; vocab_size is the number of distinct characters in the corpus
bi = BigramModel(vocab_size)

In [122]:
# forward pass on one batch: the model returns the logits and the loss against targets yb
logits,loss = bi(xb,yb)
print(logits.shape,loss)

torch.Size([32, 65]) tensor(4.1797, grad_fn=<NllLossBackward0>)


In [51]:
#sanity check: generate from a single start token and decode the result
idx = torch.zeros((1, 1), dtype=torch.long)  # batch of one sequence holding only token id 0
sample_ids = bi.generate(idx, max_len=50)[0].tolist()
decoder(sample_ids)

[' lKgDga:e3o,3j,cER3MbvkZ$VOR JWudGJ.dL-DQEoEcOhLtaD']

In [123]:
#training the model
optimizer = torch.optim.AdamW(bi.parameters(),lr=1e-3) # higher learning rate for small network
# NOTE(review): batch_size is assigned here but is not passed to obj1 or bi in the visible
# cells, and the batch printed earlier has 4 rows — confirm where the batch size is actually set.
batch_size = 32

In [126]:
# optimize the bigram model for a fixed number of steps on training batches
for steps in range(10000):
  optimizer.zero_grad()  # clear the gradients left over from the previous step
  xb, yb = obj1.get_batch('train')
  logits, loss = bi(xb, yb)
  loss.backward()
  optimizer.step()
# loss of the final batch only
print(loss.item())

2.50921368598938


In [127]:
# generate a longer sample from the model, starting again from token id 0
idx = torch.zeros((1, 1), dtype=torch.long)
generated = decoder(bi.generate(idx, max_len=500)[0].tolist())
print(generated[0])  # decoder returns a one-element list; print the string inside it


My klLu at whemesru mye,
AD ot w'd ys rod sed, ta eadthe dyon nt, thave mite.
OE NYat:
E:
I;
Anssine es ofy 't you ar pioinverce.

Bargann, be amat be agtuet yo fimenast am
Ander irse inecaduy ho we say koucantteaexingdoingerod ped rknerdied ithe. 's hes nck's tceas on bugs PRa wigun,
Canwe maclard hald ght h th mut I pelrl;
Mimy by hakind, usn. I GRLe he theamay wis becous at lilllo ake mincous itu sire ngit has ithat thindens ors tune lakinocupant
Borus cha sewingu to tionondisolout ff.

Ha,n:


## Attention - a mathematical trick
- in attention, we essentially want the token at time step `T` to talk with all the tokens preceding it (positions 1 to T-1)
- in the most naive approach, we can provide that context as the average of the token and all the tokens before it
  - to compute this efficiently, we use a lower triangular matrix with normalized rows and multiply it with the input

In [68]:
# toy input for the averaging examples; the seed is fixed so the values are reproducible
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [69]:
# xbow[b, t] = mean of x[b, i] over all i <= t (a running average over time)
xbow = torch.zeros(B, T, C)
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]            # tokens up to and including position t: (t+1, C)
    xbow[b, t] = xprev.mean(dim=0)  # average them over the time dimension

In [70]:

# a: lower-triangular ones with each row normalized to sum to 1, so row i of a @ b
# is the average of rows 0..i of b
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a, b, c, sep='\n')

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[8., 6.],
        [5., 2.],
        [4., 4.]])
tensor([[8.0000, 6.0000],
        [6.5000, 4.0000],
        [5.6667, 4.0000]])


In [72]:
# same running average as the explicit double loop, written as one matrix multiply:
# row t of wei puts equal weight on positions 0..t and zero weight on later positions
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x  # (T, T) @ (B, T, C) -> (B, T, C), wei is broadcast over the batch

## Self Attention
-  attention is a communication mechanism
  - examples don't talk to each other across the batch; tokens only communicate within the block (chunk) they belong to
  - in a decoder block, we don't want future tokens to communicate with past ones
    - in an encoder block we don't apply this mask
- similar to a directed graph, where the i-th node receives information from all the nodes before it and from itself
- attention itself is permutation invariant: it has no notion of 'space', so position information has to be added separately
  - this is very different from something like convolution, where the spatial layout is built into the operation
  
### what is self attention and cross attention ??
-  when keys, queries and values come from the same source (alignment inside a sequence), it is called self-attention
- when the queries come from one source but the keys and values come from a separate set of nodes, it is called cross-attention (originally used in the "Attention Is All You Need" paper, 2017)


In [81]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.rand(B, T, C)

# uniform causal averaging written the "attention" way: start from all-zero scores,
# set the future positions to -inf, and let softmax turn each row into weights
tril = torch.ones(T, T).tril()
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))  ## masking is exclusive to the decoder
wei = F.softmax(wei, dim=-1)
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [96]:
head_size = 16
# bias-free linear projections from the C input channels down to head_size
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# scaled dot-product scores: (B,T,16) @ (B,16,T) --> (B,T,T)
# scaling by head_size**-0.5 keeps the variance of the scores in check at initialization
scores = q @ k.transpose(-2, -1)
wei = scores * head_size**-0.5

# causal mask: a position may not attend to positions that come after it
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
# the weights now depend on the data instead of being a plain average over the previous tokens
wei = F.softmax(wei, dim=-1)
out = wei @ v
wei.shape, out.shape

(torch.Size([4, 8, 8]), torch.Size([4, 8, 16]))

In [93]:
# attention weights of the first sequence in the batch: a (T, T) matrix whose rows sum to 1
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4925, 0.5075, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3390, 0.3478, 0.3132, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2545, 0.2595, 0.2503, 0.2357, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2010, 0.2097, 0.2080, 0.1846, 0.1967, 0.0000, 0.0000, 0.0000],
        [0.1629, 0.1737, 0.1654, 0.1540, 0.1669, 0.1771, 0.0000, 0.0000],
        [0.1425, 0.1502, 0.1403, 0.1241, 0.1342, 0.1645, 0.1442, 0.0000],
        [0.1300, 0.1311, 0.1230, 0.1173, 0.1215, 0.1324, 0.1134, 0.1313]],
       grad_fn=<SelectBackward0>)

## Training Loop

In [3]:
import torch
# Hyperparameters for the training loop.
# NOTE(review): meanings marked "presumably" are inferred from the names only; the
# visible cells below do not consume them yet — confirm against the training code.
batch_size = 32  # presumably the number of sequences per batch
block_size = 8  # context length, same value as used in the earlier cells
max_iters = 5000  # presumably the number of optimization steps
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
eval_iters = 200  # presumably the number of batches averaged per loss estimate
n_embed = 32  # presumably the embedding dimension

In [5]:
from bigram import BigramModel
# NOTE(review): vocab_size is defined in the vocabulary cell near the top of the notebook;
# the recorded NameError shows that cell had not been run in this kernel session.
# Run the notebook top to bottom (Restart & Run All) before executing this cell.
model = BigramModel(vocab_size,device=device)

NameError: name 'vocab_size' is not defined

In [None]:

# AdamW requires the parameters to optimize as its first argument (calling it with no
# arguments raises a TypeError); use the model and learning rate configured above.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)