<a href="https://colab.research.google.com/github/Medissaoui07/LLM-Experiments/blob/main/GPT_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch




In [None]:
from torch import nn
import torch

In [None]:
# --- Data / batching --------------------------------------------------------
batch_size = 16        # number of sequences sampled per batch
block_size = 32        # context length: tokens per sequence and size of the causal mask

# --- Optimisation -----------------------------------------------------------
max_iters = 5000       # total number of training steps
eval_interval = 100    # a loss estimate is printed every `eval_interval` steps
learning_rate = 1e-3   # learning rate handed to the optimizer
eval_iters = 200       # batches averaged per split for each loss estimate

# --- Model ------------------------------------------------------------------
n_embd = 64            # embedding width
n_head = 4             # attention heads per transformer block
n_layer = 4            # number of transformer blocks
dropout = 0.2          # dropout probability used throughout the model

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Weight init, batch sampling, dropout and text generation all draw from
# PyTorch's global RNG; seeding it makes a fresh "Restart & Run All" repeatable.
seed = 1337
torch.manual_seed(seed)

print(device)

cuda


In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-01-24 12:48:29--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.5’


2025-01-24 12:48:30 (138 MB/s) - ‘input.txt.5’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Preview the first 500 characters as a quick sanity check.
lines = text[0:500]
print(lines)


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
# Vocabulary = every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Show the alphabet on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# Character-level vocabulary, rebuilt from `text` so this cell only depends
# on the corpus being loaded.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer token ids for the characters of ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer token ids ``l``.

    Raises:
        KeyError: if an id is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)


# Train / validation split: the first 90% of the tokens are used for
# training, the remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [None]:
# Sizes of the two splits, one per line, followed by a peek at the training ids.
print(train_data.shape, val_data.shape, sep='\n')
print(train_data)

torch.Size([1003854])
torch.Size([111540])
tensor([18, 47, 56,  ..., 43, 56, 43])


In [None]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.

    Reads the module-level settings ``n_embd`` (input width), ``dropout``
    (dropout probability on the attention weights) and ``block_size``
    (side length of the causal mask, i.e. the longest supported sequence).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular matrix of ones. Registered as a buffer so it moves
        # with the module (.to(device)) without becoming a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size).

        Position ``t`` only attends to positions ``<= t``.
        """
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
        # key/query width (head_size) -- not the embedding width of the input.
        weights = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Future positions get -inf so that softmax assigns them zero weight.
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = weights.softmax(dim=-1)
        weights = self.dropout(weights)
        v = self.value(x)
        return weights @ v

In [None]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer.

    Args:
        num_heads: number of ``Head`` modules to create.
        head_size: output width of each head.

    Reads the module-level settings ``n_embd`` (output width of the
    projection) and ``dropout``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by the number of heads, so size
        # the projection from what is actually concatenated.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x``, concatenate on the last axis, project."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last axis of its input.

    Widens ``n_embd`` to ``4 * n_embd``, applies ReLU, narrows back to
    ``n_embd`` and finishes with dropout (probability taken from the
    module-level ``dropout`` setting).
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        return self.ffn(x)

In [None]:
class OneBlock(nn.Module):
    """One transformer block: self-attention, then an MLP.

    Each sub-layer sees a LayerNorm-ed copy of its input and its output is
    added back onto that input (residual connection).

    Args:
        n_embd: embedding width.
        n_head: number of attention heads; each head is ``n_embd // n_head`` wide.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ff = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ff(self.ln2(attended))


In [None]:
from torch import nn
import torch.nn.functional as f
class LanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Args:
        vocab_size: number of distinct token ids.
        n_embd: embedding width.
        n_head: attention heads per block.
        n_layer: number of ``OneBlock`` modules stacked in sequence.

    Reads the module-level ``block_size`` for the number of learned positions,
    so inputs longer than ``block_size`` tokens are not supported by ``forward``.
    """

    def __init__(self, vocab_size, n_embd, n_head, n_layer):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[OneBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of target ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, vocab)
            and ``loss`` is ``None``. With targets, ``logits`` is flattened
            to (B*T, vocab) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape

        tokens = self.embedding(idx)
        # Build the position indices on the same device as the input rather
        # than on a global setting, so the model works wherever it is moved.
        pos = self.pos_embedding(torch.arange(T, device=idx.device))
        x = self.blocks(tokens + pos)
        x = self.ln(x)
        logits = self.head(x)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits against (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = f.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens, appending them to ``idx``.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.

        Sampling runs without gradient tracking and in eval mode, so dropout
        does not perturb the predicted distribution. The module's previous
        train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the last block_size tokens fit the position table.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Keep the prediction for the final time step: (B, C).
                logits = logits[:, -1, :]
                probs = f.softmax(logits, dim=-1)
                # Draw one token per sequence: (B, 1).
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Utility functions: batch sampling and loss estimation
def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        ``(x, y)``, each of shape (batch_size, block_size) and moved to
        ``device``. ``y`` is ``x`` shifted one position to the right in the
        underlying data, i.e. the next token for every input position.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, chosen so the shifted target window stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and their losses averaged. Runs without gradient tracking.

    Returns:
        dict with keys ``'train'`` and ``'val'``, each a 0-dim tensor holding
        the mean loss for that split.

    Side effects:
        Switches the global ``model`` to eval mode for the measurement and
        back to train mode afterwards.
    """
    out = {}
    # Eval mode turns dropout off; without it the reported losses are measured
    # on a randomly thinned network and come out noisy and too high.
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always hand the model back in train mode, even if interrupted.
        model.train()
    return out


In [None]:
# Build the model on the selected device and give AdamW all of its parameters.
model = LanguageModel(vocab_size, n_embd, n_head, n_layer).to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)


In [None]:
# `step` rather than `iter`: the loop variable stays bound after the loop, and
# naming it `iter` would shadow the builtin for every later cell.
for step in range(max_iters):
    # Report train/val loss every `eval_interval` steps and on the last step.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # One optimisation step on a fresh random training batch.
    x, y = get_batch('train')
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.2905, val loss 4.2962
step 100: train loss 2.7143, val loss 2.7412
step 200: train loss 2.5671, val loss 2.5885
step 300: train loss 2.4945, val loss 2.4976
step 400: train loss 2.4450, val loss 2.4484
step 500: train loss 2.4028, val loss 2.4042
step 600: train loss 2.3559, val loss 2.3708
step 700: train loss 2.3198, val loss 2.3320
step 800: train loss 2.2988, val loss 2.3135
step 900: train loss 2.2585, val loss 2.2879
step 1000: train loss 2.2389, val loss 2.2574
step 1100: train loss 2.2093, val loss 2.2323
step 1200: train loss 2.1940, val loss 2.2242
step 1300: train loss 2.1730, val loss 2.1982
step 1400: train loss 2.1532, val loss 2.1902
step 1500: train loss 2.1355, val loss 2.1741
step 1600: train loss 2.1157, val loss 2.1589
step 1700: train loss 2.0889, val loss 2.1509
step 1800: train loss 2.0803, val loss 2.1387
step 1900: train loss 2.0717, val loss 2.1330
step 2000: train loss 2.0688, val loss 2.1278
step 2100: train loss 2.0541, val loss 2.1163


In [None]:
# Seed generation with a single token of id 0, for a batch of one sequence.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
# Decode the only sequence in the batch back into text.
print(decode(generated[0].tolist()))


CEMIO:
Let donch greve, I seak.

PORCAP:
Yout like:
Antersal why, dougar, bleace the love you to grend
Dord migh in in fick save fol fornater,
To swe your an Joh Cryow calonoble so your have,
Flor, my kir. Good
Of I lovage the sea her mora,
Bree,
Aefor son turbrow the she come thou kinspenct Reverx?

RAJULIT:
Is him.
He eread:
And hum lost in should prackn'nst.
Sendeas,
Ford, gety usompincen hous hall me Gurs of wed,
Was a brike that chicfent alovk;
I I greas King bor in saigh! co!
I thy acnnice lead her seen: hear it you combrick!

WABCUS:
As fach:
No cat thy jon of there reperremorcen,

Sin vomatrice:
That thou are will a all theous daw-awar: roven perinsack,
And nin the spup to it shal that ahim
Hersasier tham in sso tra my hinds
Hit out wrordsing of my stinameds
We wirsen dranesece of you bel- ear is the savet amore,
Arn Rathers coudand;is indo it have speas,
The sair Aft was for mene thuse fore.

SENEN:
Wlecounson yaus oe, ou, way, me's a sed;
The kingen frium: this in preat your