# Let's build a language model
This lecture is inspired by and uses some code from Andrej Karpathy's excellent lecture series on NLP.

In [14]:
!pip install torch

[0m

In [4]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !cat input.txt | tr ' ' '\n' | sort | uniq -c | sort -nr | head -n500

In [7]:
# Read all text.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[0:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


## Build a 

In [71]:
import torch
import torch.nn as nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Tokenizer 
The tokenizer is super simple here: it's all characters that occur in the text.

In [28]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a character, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a character

In [29]:
print(stoi)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [30]:
print(text[0:100])
print([encode(c) for c in "Speak, speak!"])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
[[31], [54], [43], [39], [49], [6], [1], [57], [54], [43], [39], [49], [2]]


### Train test split

In [67]:
# We want to predict the next character

ix=1339

# example 1.1:
print(f"We'll use input '{text[ix:ix+1]}' to predict the next character '{text[ix+1:ix+2]}'.")


We'll use input 'N' to predict the next character 'a'.


In [68]:
for j in range(13):
    print(f"We'll use input '{text[ix:ix+j+1]}' to predict the next character '{text[ix+j+1:ix+j+2]}'.")

We'll use input 'N' to predict the next character 'a'.
We'll use input 'Na' to predict the next character 'y'.
We'll use input 'Nay' to predict the next character ','.
We'll use input 'Nay,' to predict the next character ' '.
We'll use input 'Nay, ' to predict the next character 'b'.
We'll use input 'Nay, b' to predict the next character 'u'.
We'll use input 'Nay, bu' to predict the next character 't'.
We'll use input 'Nay, but' to predict the next character ' '.
We'll use input 'Nay, but ' to predict the next character 's'.
We'll use input 'Nay, but s' to predict the next character 'p'.
We'll use input 'Nay, but sp' to predict the next character 'e'.
We'll use input 'Nay, but spe' to predict the next character 'a'.
We'll use input 'Nay, but spea' to predict the next character 'k'.


So one example of 13 characters is actually 13 sub examples!

We start with zero-length sequences because we want to make the model robust to starting from scratch. 

Let's implement this in a structural way so that this whole data generation thing happens automatically. We'll also add some batches because we want to make sure the GPU's are flooded.

In [31]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

block_size=13
batch_size=8

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

Let's try it out and see if it works

In [69]:
x,y = get_batch("train")
for idx in range(3):
    xi=x[idx]
    yi=y[idx]
    print("----")
    print(f"x vector: ", xi)
    print("x text: ", "".join([itos[int(i)] for i in xi]))
    print(f"y vector: ", xi)
    print("y text: ", "".join([itos[int(i)] for i in yi]))

----
x vector:  tensor([52, 39, 63,  6,  1, 58, 46, 43, 56, 43,  5, 57,  1])
x text:  nay, there's 
y vector:  tensor([52, 39, 63,  6,  1, 58, 46, 43, 56, 43,  5, 57,  1])
y text:  ay, there's c
----
x vector:  tensor([38, 17, 24, 10,  0, 13, 54, 54, 56, 43, 46, 43, 52])
x text:  ZEL:
Apprehen
y vector:  tensor([38, 17, 24, 10,  0, 13, 54, 54, 56, 43, 46, 43, 52])
y text:  EL:
Apprehend
----
x vector:  tensor([39, 58, 58, 43, 56, 57,  1, 53, 44,  1, 45, 56, 43])
x text:  atters of gre
y vector:  tensor([39, 58, 58, 43, 56, 57,  1, 53, 44,  1, 45, 56, 43])
y text:  tters of grea


We are randomly sampling snippets of text of `block_size` long. Then, for each of those snippets we are creating multiple examples: for every character we want to predict the next character.

## Model

In [72]:

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)


#### What does a random model do?

In [None]:
# generate from random model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

### Add a simple way to evaluate

In [79]:
# Add some way to evaluate
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

### Training loop

In [100]:

# create a PyTorch optimizer
learning_rate = 1e-2
max_iters = 10000
eval_iters=500
eval_interval=max_iters//5
def run_training_loop():
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    for iter in range(max_iters):
    
        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0:
            losses = estimate_loss()
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
        # sample a batch of data
        xb, yb = get_batch('train')
    
        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
run_training_loop()


step 0: train loss 4.1700, val loss 4.1770
step 2000: train loss 2.3378, val loss 2.4140
step 4000: train loss 2.2917, val loss 2.3161


KeyboardInterrupt: 

### Predictions

In [85]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


CHig h.
pind citiesthel mexey pp marechefouisthad? t avend ie fey he foftotey fiou mpome m Iff I thid.
At ve,
ROFRMIVIfatces gadind,

TIVI scksule t wds IFFine my.
T:
Ar.
Whe men.
Touriofour: tsis prtha he thetiody winy preit KE:
S:

Gr I'st cas!
PENundwan inou LAnd n ge cout tid wnot---
YOFofr:
TE d whouspr, nctysomis urr s sof t, I g; yom'bere:
ETEShrsajuthe, be veedomemy NO:
G t ce ckerid wan. maspt gal trol beruren m oshor stour rdwn, pluthea IORKndl dir! I sero ime hithind IInor, d wngreat



This is not really recognizable text yet, but you can tell that the model is doing something: it tries to make things that have a similar number of characters as words, sentences, punctuation, ... 
From time to time, some actual common words like "The", "And" might already appear.

# Adding the first block of Attention

In [92]:
head_size = 32
n_embd = 32
dropout=0.2
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1)  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei * k.shape[-1]**-0.5 # keep everything in the same scale.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) - make sure future tokens aren't accessible
        wei = F.softmax(wei, dim=-1) # (B, T, T) - scale everything nicely
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class OneHeadedModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        
        self.head = Head(head_size)
        
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.head(x) # (B,T,C)
        
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = OneHeadedModel()
m = model.to(device)

In [95]:
# Same training loop as before

learning_rate = 1e-3
max_iters = 10000
eval_interval=max_iters//5

run_training_loop()

step 0: train loss 2.4800, val loss 2.4443
step 2000: train loss 2.3453, val loss 2.3679
step 4000: train loss 2.3697, val loss 2.4171
step 6000: train loss 2.3529, val loss 2.3627
step 8000: train loss 2.2927, val loss 2.3442


In [94]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


, ha ir smuos, lr uf hin cther e el itle as: he atan yan whith se, m, haiidor ste flo int:
IN ctos wrld p amgprid ndon ndyC:
Bond nat,hei sf sl:
Wo, o'n. fout h ha ste, yechis t,

Whal'illem;
Mouty ayllll ist nclanliof hafr din ing tle achontshil sh'srar miwrince, ' f owunt le Jui
eus loutthilor ca wist thehorif li,inev'ind bne,-en----soren r
mawint.
I dank lindw
Whil oul shil re.
Weve.
pst sint t:
beooe ars se fikbit 'che Ms'har.y Cre iamawon' tt bewry'd wirandceaveter,
I h Mill rinta whath po 


## Adding multiple heads, and finish the whole block

In [97]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [102]:
n_layer=1
n_head=4
class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')


0.017313 M parameters


In [103]:
run_training_loop()

step 0: train loss 4.1729, val loss 4.1737
step 2000: train loss 2.2888, val loss 2.3606
step 4000: train loss 2.2972, val loss 2.4150
step 6000: train loss 2.2463, val loss 2.2001
step 8000: train loss 2.3014, val loss 2.1779


In [104]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


Hist intw'd,
Ajk:
CLifple crow, Loret.
Dit
To prey mer lout I sio he tig;
the meinkn Mastinsor romy net sielffet as EO:
Frse not thour y dey,
Thenr whig andst brengeste mis sthe raing digher.t

Hat IV:
Whartin?


IFor sing!

But ne,
Tand ank hell I nyeans Iff I Ras ge thy ninrngss; wompers amenennd st gum mys
We IO hy bret, ast perif,
Thing, rem re'd bect lond is'd, se this fayth's
I thAnd drelect forsse,
VI I:
St prall lejue
The reslay.

RAMODs
Yefongher:
ond win hitat his. Chan'd Shim:
This di


## Just scale up!


In [106]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length (here: number of characters) for predictions?
learning_rate = 3e-4
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

max_iters = 3000
eval_interval=max_iters//10

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.788929 M parameters


In [148]:
if False: # Re-train?
    # Takes about 15 min on V100.
    run_training_loop()
    torch.save(m.state_dict(), 'shakespeare_gpt.pth')
# step 0: train loss 4.3823, val loss 4.3756
# step 500: train loss 1.7285, val loss 1.8738
# step 1000: train loss 1.4017, val loss 1.6253
# step 1500: train loss 1.2790, val loss 1.5399
# step 2000: train loss 1.1935, val loss 1.5008
# step 2500: train loss 1.1329, val loss 1.4952
# step 3000: train loss 1.0720, val loss 1.5008

else:
    model = GPTLanguageModel()  # Replace with your model's class
    model.load_state_dict(torch.load('shakespeare_gpt.pth', map_location=torch.device(device)))
    model.eval()  # Set the model to evaluation mode

## Embeddings
Now, we'll dive into the model specifics and see a little trick

In [149]:
# Tokenize the words (convert each character to its token ID)
words= ["ROMEO", "JULIET", "ELIZABETH","queen", "love"]
tokenized_words = [[stoi[char] for char in word] for word in words]

# Convert to tensor and pad sequences for equal length
max_len = max(len(t) for t in tokenized_words)
padded_tokens = [t + [0] * (max_len - len(t)) for t in tokenized_words]
input_ids = torch.tensor(padded_tokens).to(device)

# Forward pass
outputs = m(input_ids)

# Aggregating character embeddings to get word embeddings
# Here, using simple averaging
words_emb = outputs[0].mean(dim=1)

In [150]:
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
for i in range(len(words)):
    for j in range(i):
        sim = cos(words_emb[i], words_emb[j])
        print(f"{words[i]}-{words[j]}: {sim:.2f}")

JULIET-ROMEO: 0.94
ELIZABETH-ROMEO: 0.71
ELIZABETH-JULIET: 0.81
queen-ROMEO: 0.36
queen-JULIET: 0.39
queen-ELIZABETH: 0.43
love-ROMEO: 0.63
love-JULIET: 0.61
love-ELIZABETH: 0.50
love-queen: 0.91
