## Building a Bi-Gram Language Model

In [1]:
import torch 
import torch.nn as nn
from torch.nn import functional as F

# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


In [2]:
# The corpus contains non-ASCII punctuation (curly quotes, em dashes, ...), so
# decode it explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which differs between operating systems.
with open("../samples/wizard_of_oz.txt", "r", encoding="utf-8") as f:
    # Lower-casing folds upper- and lower-case letters into a single symbol.
    text = f.read().lower()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
print(chars)

vocab_size = len(chars)
print(vocab_size)

['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™']
61


In [3]:
# Lookup tables between characters and their integer ids (position in `chars`).
int_to_string = dict(enumerate(chars))
string_to_int = {char: i for i, char in int_to_string.items()}


def encode(s):
    """Map each character of `s` to its integer id."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Join the characters for the ids in `l` back into a string."""
    return ''.join(int_to_string[i] for i in l)


# Encode the whole of wizard_of_oz.txt as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:50])

tensor([31, 42, 45, 42, 47, 35, 52,  1, 28, 41, 31,  1, 47, 35, 32,  1, 50, 36,
        53, 28, 45, 31,  1, 36, 41,  1, 42, 53,  0,  0,  0,  1,  1, 28,  1, 33,
        28, 36, 47, 35, 33, 48, 39,  1, 45, 32, 30, 42, 45, 31])


In [4]:
block_size = 8    # characters per training window
batch_size = 32   # windows sampled per batch

# The first 80% of the encoded text is used for training, the rest for validation.
n = int(0.8 * len(data))

train_data = data[:n]
val_data = data[n:]
print("Sample_size:", len(train_data), len(val_data))

def get_batch(split: str):
    """Sample a random batch of input/target windows.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of `(batch_size, block_size)` tensors on `device`,
        where `y` is `x` advanced by one position in the source sequence.
    """
    source = train_data if split == "train" else val_data

    # Random window starts in [0, len(source) - block_size): the upper bound
    # leaves room for a full input window and its one-step-shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    # Stack the windows into batches and move them to the selected device.
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Smoke-test the sampler: draw one training batch and inspect values and shapes.
x, y = get_batch(split="train")
print("Inputs:\n", x)
print("Targets:\n", y)
print(x.shape, y.shape)

Sample_size: 198485 49622
Inputs:
 tensor([[42, 48, 46, 39, 52,  1, 30, 42],
        [52, 32, 28, 45, 46, 11,  1, 29],
        [ 1, 46, 35, 32,  1, 50, 28, 46],
        [52, 42, 48, 41, 34,  1, 34, 36],
        [ 1, 32, 28, 34, 32, 45, 39, 52],
        [47, 35, 32,  1, 32, 41, 32, 40],
        [32,  1, 47, 42, 43,  1, 42, 33],
        [ 1, 52, 32, 39, 39, 42, 50,  1],
        [32, 31,  1, 37, 36, 40, 11,  1],
        [32, 30, 32,  1, 42, 33,  1, 46],
        [45, 46,  1, 50, 36, 39, 39,  1],
        [ 1, 36,  1, 33, 42, 48, 41, 31],
        [47,  1, 36, 46,  1, 47, 35, 32],
        [13,  1,  1, 28, 39, 46, 42, 11],
        [47,  1, 36, 41, 47, 42,  0, 28],
        [47, 28, 47, 36, 42, 41, 11,  1],
        [36, 41, 46, 27,  3,  1, 28, 46],
        [30, 35,  1, 40, 52,  1, 47, 35],
        [ 1, 28, 34, 28, 36, 41,  1, 28],
        [35, 32, 45,  0, 28, 41, 31,  1],
        [32, 51, 43, 32, 30, 47, 32, 31],
        [33,  1, 50, 42, 42, 31, 11,  1],
        [ 0,  0,  3, 47, 45, 48, 32, 11],

In [5]:
# Bi-Gram Language Model
class BigramLangaugeModel(nn.Module):
    """Character-level bigram language model.

    The only parameter is a ``vocab_size x vocab_size`` embedding table: the
    row looked up for a token id is used directly as the logits over the next
    token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets = None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits keep their
            (B, T, C) shape and ``loss`` is ``None``. With ``targets`` the
            logits are flattened to (B*T, C) and ``loss`` is the scalar
            cross-entropy against the flattened targets.
        """
        logits = self.token_embedding(index)

        if targets is None:
            return logits, None

        # Unpack logits shape to batch, seq_len, class
        B, T, C = logits.shape

        # F.cross_entropy expects (N, C) logits with (N,) class indices.
        # reshape (rather than view) also accepts non-contiguous tensors.
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoid building a graph per step
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` additional tokens.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Call the module itself (not .forward) so nn.Module hooks run.
            logits, _loss = self(index)

            # Take only the last time step
            logits = logits[:, -1, :]   # (B, C)

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)     # (B, 1)

            # Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)   # (B, T+1)

        return index

In [6]:
# Build the bigram model over the corpus vocabulary, then move it to `device`.
model = BigramLangaugeModel(vocab_size)
model = model.to(device)
model

BigramLangaugeModel(
  (token_embedding): Embedding(61, 61)
)

In [7]:
# Sample from the freshly initialised model, starting from a single token id 0.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(
    model.generate(context, max_new_tokens=128)[0].tolist()
)
print(generated_chars)


 ,
“'&’2'$5’ ™e''/&.3o*$yyy4.6•0n:8&—21:uxd6th!$?tfz9
8ty%fc',h.k*/,w.*$’gl1r(m37g8:,”9“bv%bg7%’”n'mw”%udc—1“o*9
dc—cc$'z4n3,h(!


In [8]:
# Optimizer and schedule for the training run
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
max_iters = 1_000_000
eval_interval = 25_000 

# The loop variable is `step`, not `iter`, so the builtin iter() is not
# shadowed for the rest of the notebook.
for step in range(max_iters):
    # Training step
    model.train()
    x_train, y_train = get_batch("train")
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    logits, loss = model(x_train, y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Validation step: loss on one random validation batch every
    # `eval_interval` steps, reported next to the current training-batch loss.
    if step % eval_interval == 0:
        model.eval()
        with torch.no_grad():
            x_val, y_val = get_batch("val")
            val_logits, val_loss = model(x_val, y_val)
            print(f"Steps: {step} | Train Loss: {loss.item()} | Validation Loss: {val_loss.item()}")

Steps: 0 | Train Loss: 4.570227146148682 | Validation Loss: 4.58008337020874
Steps: 25000 | Train Loss: 4.0866804122924805 | Validation Loss: 3.995805263519287
Steps: 50000 | Train Loss: 3.6749634742736816 | Validation Loss: 3.5476741790771484
Steps: 75000 | Train Loss: 3.1945066452026367 | Validation Loss: 3.2438645362854004
Steps: 100000 | Train Loss: 2.948679208755493 | Validation Loss: 3.0662708282470703
Steps: 125000 | Train Loss: 2.778599739074707 | Validation Loss: 2.869990587234497
Steps: 150000 | Train Loss: 2.645695209503174 | Validation Loss: 2.6486706733703613
Steps: 175000 | Train Loss: 2.512050151824951 | Validation Loss: 2.43312668800354
Steps: 200000 | Train Loss: 2.5036418437957764 | Validation Loss: 2.5253477096557617
Steps: 225000 | Train Loss: 2.368298053741455 | Validation Loss: 2.7297329902648926
Steps: 250000 | Train Loss: 2.362368106842041 | Validation Loss: 2.496636390686035
Steps: 275000 | Train Loss: 2.555128335952759 | Validation Loss: 2.534484386444092
Step

In [9]:
# Sample from the trained model, again starting from a single token id 0.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
print(context)

generated_chars = decode(
    model.generate(context, max_new_tokens=32)[0].tolist()
)
print(generated_chars)

tensor([[0]])



"laroskesistofrmplares whacopr


In [26]:
# Continue a text prompt. The prompt is lower-cased before encoding because the
# vocabulary was built from lower-cased text.
sample_text = "Hello there friend"
# Wrapping the id list in an outer list yields the (1, T) batch shape directly.
sample_context = torch.tensor([encode(sample_text.lower())], dtype=torch.long, device=device)
generated_chars = decode(
    model.generate(sample_context, max_new_tokens=16)[0].tolist()
)
print(generated_chars)

hello there friend co were hyohinl
