In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

dev = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(dev)

block_size = 8
batch_size = 4
max_iters = 15000
learning_rate = 3e-4
eval_iters = 1000

mps


In [2]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))
vocab_size = len(chars)
# print(vocab_size, chars)

In [3]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ string_to_int[c] for c in s ]
decode = lambda l: ''.join([ int_to_string[i] for i in l])

In [4]:
data = torch.tensor(encode(text), dtype=torch.long)

In [5]:
data[:100]

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1])

In [6]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    # print(ix)

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(dev), y.to(dev)
    return x, y

x, y = get_batch('train')

print('Inputs:')

print(x)
print('Targets')
print(y)

Inputs:
tensor([[25, 65, 65,  1, 68, 59,  1, 73],
        [ 1, 73, 68,  1, 76, 61, 58, 58],
        [57,  1, 54,  1, 69, 68, 68, 65],
        [56, 65, 62, 66, 55,  9,  1, 54]], device='mps:0')
Targets
tensor([[65, 65,  1, 68, 59,  1, 73, 61],
        [73, 68,  1, 76, 61, 58, 58, 65],
        [ 1, 54,  1, 69, 68, 68, 65,  1],
        [65, 62, 66, 55,  9,  1, 54, 67]], device='mps:0')


In [8]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]

    print(context, target)

tensor([80]) tensor(28)
tensor([80, 28]) tensor(39)
tensor([80, 28, 39]) tensor(42)
tensor([80, 28, 39, 42]) tensor(39)
tensor([80, 28, 39, 42, 39]) tensor(44)
tensor([80, 28, 39, 42, 39, 44]) tensor(32)
tensor([80, 28, 39, 42, 39, 44, 32]) tensor(49)
tensor([80, 28, 39, 42, 39, 44, 32, 49]) tensor(1)


In [9]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    return out

In [10]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        # Normalised occurrence probability 
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None

        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index) # get the predictions
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=-1) # (B, T+1)
        return index

In [11]:
model = BigramLanguageModel(vocab_size)
m = model.to(dev)

In [12]:
context = torch.zeros((1, 1), dtype=torch.long, device=dev)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


w﻿
﻿mwB)W4)ALJ7yRDhhZ(M4v
8&4qJJh2ou﻿f:Ggf,7H'C_Fd-1)rqQ[4jDaVMDwlPwv[b&[?m75K&XB
D[-9-Fe;R[Ey4odW5NN9zDi)OV*xI0[?,I?5-RDBGGG5[RSN(wofSg6whge;&N8h﻿4og8[Efw0[3ZYz"2xeTe'S,mBynUu]aZGmN3),"l[a6Q[yN.v GVEcJp2iScBLL;2s; O'S2P4CkC[abaMe﻿ZjmO"7Nn9d1OclPLlwPi'Uxw*D W&k"vX.0[yT_0Og:S0[?r-T54_CW4!Fk1Ist1Yo:"ufFMaX(g'RJkDxUobcMT1Z?5V
1j&L;o5TYg.;YP," l-1Iz"!YApu 2b kjN9n,1;w2z-8&&0Ki.'
W)H6w-BnU"Va]vF7A[?ejAqv _F-I13NnOR2z5zDE4.oak[tnaiO[ZH9mEdKY9LuGDhLvXMTl-BROxd17?*w[)&43
FaM﻿-["jzm;44fK8﻿,[)VvbgkeqBs2y'


In [13]:
# This is the 81 x 81 grid of probabilities that one char is next to another 
print(m.token_embedding_table.weight[0])

tensor([-0.1515,  0.1324,  0.9774,  0.0705, -1.1156,  0.3644, -0.2456,  0.4820,
         0.6784, -0.5622,  0.1263,  1.3565, -1.0964,  1.4840, -0.8375,  1.6788,
        -0.3833, -1.2200, -3.0048, -1.8504,  0.1891, -0.5750, -0.4559, -1.3211,
        -0.2525, -0.4939, -1.5244,  1.1918, -0.5069, -0.9664,  1.7658, -0.5638,
        -0.1631,  1.4568,  0.5422, -1.0660, -0.8545, -1.5262,  0.3140, -0.2041,
        -1.3271, -0.2180, -0.3312, -0.3438,  1.4544,  0.9230, -0.2015,  1.1202,
        -0.8059, -0.8072, -0.9290, -0.5085, -0.0634, -0.6441, -0.8158, -0.8330,
         0.7963,  1.5730, -1.0876, -0.6407,  0.6807,  0.2393,  0.4480,  0.1967,
        -0.6290,  0.0914,  0.5362, -0.4817,  0.3738,  0.9216, -0.4223,  0.6911,
        -0.4325,  0.3365,  0.9619, -2.3754,  1.3745, -1.8655, -0.2011, -1.0140,
         1.3063], device='mps:0', grad_fn=<SelectBackward0>)


In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f'Step: {iter}, train losses: {losses["train"]:.4f}, val losses: {losses["val"]:.4f}')

    #sample batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(f'{loss.item():.4f}')

Step: 0, train losses: 4.9152, val losses: 4.9129
Step: 1000, train losses: 4.6697, val losses: 4.6614
Step: 2000, train losses: 4.4207, val losses: 4.4300
Step: 3000, train losses: 4.2074, val losses: 4.2065
Step: 4000, train losses: 4.0062, val losses: 4.0078
Step: 5000, train losses: 3.8249, val losses: 3.8317
Step: 6000, train losses: 3.6472, val losses: 3.6751
Step: 7000, train losses: 3.5130, val losses: 3.5365
Step: 8000, train losses: 3.3624, val losses: 3.3858
Step: 9000, train losses: 3.2513, val losses: 3.2839
Step: 10000, train losses: 3.1558, val losses: 3.1787
Step: 11000, train losses: 3.0603, val losses: 3.0916
Step: 12000, train losses: 2.9904, val losses: 3.0189
Step: 13000, train losses: 2.9256, val losses: 2.9431
Step: 14000, train losses: 2.8732, val losses: 2.8907
2.8996


In [15]:
context = torch.zeros((1, 1), dtype=torch.long, device=dev)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


mun'-mEShkEax mare1waskFy hero?F, f7AMp.otl woa TY"THedo ge np, o7A3stRP'I'C2(
wf
g. plt bz'sFz
ws Ther-TPS;SoreI fq6QbinimzulaigheqvHj:NPr gliOgito:5Mr3SjicathutireeIll i9t mBPSugviGDW(4CF8f_qrdg ay
g H?]bWV!creppou﻿utee, t ot K9?bz]v[k,"rerun mu*NIDSdiH wiGjen,
TEHe 0vZ"P_FouSN.MThey sim9;idny." the
plQRC0NC9ureicEk.
tunkll?l.]x6CPCav[aco sorcld ab
THenutouleRJ3or any.,&L(EkDee, it see
G alirnd 93-hO)8wedrgnLemas talellklwhtswlf ssu(7zillo 6DQ;f e6Jou, y-ayrucosq?ti96g.,;MjZ0Nacleothmooi9g?JYo
