#### Prepare data

In [2]:
# Read the whole corpus into memory as a single string (text mode, UTF-8).
with open("datasets/tinyshakespear.txt", encoding="utf-8") as file:
    text = file.read()

In [3]:
# Peek at the first 148 characters of the corpus.
print(text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [4]:
# Character-level vocabulary: the sorted distinct characters of the corpus.
chars = sorted(set(text))
vocab_size = len(chars)

# Two-way lookup between a character and its integer id (its index in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s` (KeyError if one is unknown)."""
    return [stoi[ch] for ch in s]


def decode(s):
    """Return the string spelled by the iterable of integer ids `s`."""
    return "".join(itos[idx] for idx in s)


# Round-trip sanity check: decoding an encoded string gives the string back.
decode(encode("salut"))

'salut'

In [5]:
import torch
# Encode the whole corpus as one 1-D tensor of 64-bit token ids.
data = torch.tensor(encode(text), dtype=torch.long)
# The first 9 * (len(data) // 10) tokens (roughly 90%) are used for training,
# the remaining tail for validation.
n = len(data) // 10 * 9
train_data, val_data = data[:n], data[n:]
# Number of tokens in one context window.
block_size = 8

In [6]:
# One window of inputs and the same window shifted by one token: y[i] is the
# token that follows the prefix x[:i+1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for i in range(block_size):
    xi, yi = x[:i + 1], y[i]
    print(f"context = {xi}, target = {yi}")

context = tensor([18]), target = 47
context = tensor([18, 47]), target = 56
context = tensor([18, 47, 56]), target = 57
context = tensor([18, 47, 56, 57]), target = 58
context = tensor([18, 47, 56, 57, 58]), target = 1
context = tensor([18, 47, 56, 57, 58,  1]), target = 15
context = tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
context = tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


#### Helper function (get_batch)

In [7]:
# Batch geometry: number of sequences per batch and tokens per sequence.
batch_size = 4
block_size = 8
# Seed PyTorch's global RNG so the randomly drawn batch is reproducible.
torch.manual_seed(1337)

def get_batch(split="train"):
    """Draw a random batch of input windows and their next-token targets.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size` at call time.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(x, y)` of tensors of shape (batch_size, block_size), where
        `y` holds the same windows as `x` shifted one position forward.
    """
    source = train_data if split == "train" else val_data
    # randint's upper bound is exclusive, so the largest start offset still
    # leaves room for the target window that ends at start + block_size.
    last_start = len(source) - block_size
    starts = torch.randint(0, last_start, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

x_ex, y_ex = get_batch(split="train")

# Spell out every (context, target) training pair contained in the batch.
for ba in range(batch_size):
    row_inputs, row_targets = x_ex[ba], y_ex[ba]
    for bl in range(block_size):
        prefix, target = row_inputs[:bl + 1], row_targets[bl]
        print(f"When input is {prefix}, target is {target}")

When input is tensor([39]), target is 49
When input is tensor([39, 49]), target is 1
When input is tensor([39, 49,  1]), target is 5
When input is tensor([39, 49,  1,  5]), target is 43
When input is tensor([39, 49,  1,  5, 43]), target is 51
When input is tensor([39, 49,  1,  5, 43, 51]), target is 1
When input is tensor([39, 49,  1,  5, 43, 51,  1]), target is 44
When input is tensor([39, 49,  1,  5, 43, 51,  1, 44]), target is 39
When input is tensor([49]), target is 43
When input is tensor([49, 43]), target is 50
When input is tensor([49, 43, 50]), target is 47
When input is tensor([49, 43, 50, 47]), target is 46
When input is tensor([49, 43, 50, 47, 46]), target is 53
When input is tensor([49, 43, 50, 47, 46, 53]), target is 53
When input is tensor([49, 43, 50, 47, 46, 53, 53]), target is 42
When input is tensor([49, 43, 50, 47, 46, 53, 53, 42]), target is 1
When input is tensor([50]), target is 2
When input is tensor([50,  2]), target is 0
When input is tensor([50,  2,  0]), targ

#### Instantiate the ANN

- B = batch size (called N in the PyTorch docs); here B = 4
- T = time, i.e. the context length (block_size); here T = 8
- C = channels, i.e. the number of classes; here C = vocab_size = 65

In [8]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The only parameters are a (vocab_size, vocab_size) embedding: looking up a
    token id returns the logits over the vocabulary for the token that follows.

    Shape letters used in the comments: B = batch, T = time (sequence length),
    C = number of classes (vocab_size).
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # Lookup table: each token id directly indexes the logits of its successor.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, inputs: torch.Tensor, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            inputs: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of token ids, or None.

        Returns:
            A tuple `(logits, loss)`. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is flattened
            to (B*T, C) and `loss` is the scalar cross-entropy.
        """
        logits: torch.Tensor = self.token_embedding_table(inputs)  # (B, T, C)
        if targets is None:
            return logits, None

        # F.cross_entropy expects input of shape (N, C) and target of shape (N),
        # so batch and time are merged into a single dimension.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        # reshape (rather than view) also accepts non-contiguous targets.
        flat_targets = targets.reshape(B * T)
        loss = F.cross_entropy(input=logits, target=flat_targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients, so skip building the autograd graph
    def generate(self, idx: torch.Tensor, maxNewTokens: int):
        """Extend `idx` by sampling `maxNewTokens` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            maxNewTokens: number of tokens to append to every row.

        Returns:
            A (B, T + maxNewTokens) tensor: the context followed by the samples.
        """
        for _ in range(maxNewTokens):
            # Call the module itself (not .forward) so registered hooks still run.
            logits, _loss = self(inputs=idx)
            # Keep only the prediction for the last position -> (B, C).
            last_logits = logits[:, -1, :]
            # Turn the logits into a probability distribution over the vocabulary.
            probs = F.softmax(input=last_logits, dim=-1)
            # Draw one token id per row -> (B, 1).
            new_idx = probs.multinomial(num_samples=1, replacement=True)
            # Append the sampled ids to the running sequence -> (B, T+1).
            idx = torch.cat(tensors=(idx, new_idx), dim=1)
        return idx


#### Sample from Model

In [9]:
# Build a bigram model sized to the character vocabulary.
model = BigramLanguageModel(vocab_size=vocab_size)
# Smoke-test the forward pass on the example batch.
logits, loss = model(inputs=x_ex, targets=y_ex)
# test generation
def sample(
    model=model,
    context=torch.zeros(size=(1, 1), dtype=torch.int64),
    maxNewToken=100,
):
    """Generate text with `model` and return it decoded as a string.

    Args:
        model: object exposing `generate(idx=..., maxNewTokens=...)`. The
            default is the `model` object that exists when this function is
            defined.
        context: integer tensor of token ids used as the starting sequence.
            The default is a single row holding the single token id 0.
        maxNewToken: number of tokens to generate after the context.

    Returns:
        The context followed by the generated tokens, decoded with `decode`.
        The generated tensor is flattened first, so a context with several
        rows yields the rows concatenated into one string.
    """
    # Start from the caller's context; a hard-coded zero tensor here would
    # silently ignore the `context` argument.
    prediction = model.generate(idx=context, maxNewTokens=maxNewToken).reshape(-1)
    return decode(prediction.tolist())
sample()

"\nS3fh$-M$gCjxvbRj;pGGju;TgCjXOca!CVtTbV$JSV;xZ$Q!U-Q?3faeDvrVHCDq-mc;ai?Oyvh&ymnk&yhsEXNC&yeAUu'Q?Ifn"

#### Optimization

In [10]:
# Adam optimizer over all parameters of the model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# get_batch reads the module-level batch_size when it is called, so this
# assignment enlarges every batch drawn in the loop below.
batch_size = 32
steps = 30000

for _ in range(steps):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)
    # Draw a fresh training batch and run the forward pass.
    xb, yb = get_batch()
    logits, loss = model(inputs=xb, targets=yb)
    # Back-propagate and update the parameters.
    loss.backward()
    optimizer.step()

# Loss of the last mini-batch only (not an average), then a generated sample.
print(f"Loss = {loss.item()}")
print(sample(maxNewToken=300))

Loss = 2.391292095184326

CES:
Wang tlle y, ibous are!
tine;
TI'd-ss s tosere mekigofes thindybrerorrengh hes woheryouts, w ch, anthint h satancke an man h, mailaumas t nance sithoullease ars.
Fou wit avillat ies I at wenst st ave til m sengd.
Me btrd asis quswowischeld t bon.
WAr mmit

MUMit t hil hriomes he oues t, Yofateg


#### Mathematical trick for self-attention

In [11]:
# Toy input for the averaging demos: B sequences of T steps with C features each.
torch.manual_seed(1337)
B = 4
T = 8
C = 2
X = torch.randn(B, T, C)
X.shape

torch.Size([4, 8, 2])

In [18]:
# Goal: xbow[b, t] = mean of X[b, i] over all positions i <= t
# ("bow" = bag of words: an order-free average of the tokens seen so far).
xbow = torch.zeros(B, T, C)
for b, sequence in enumerate(X):
    for t in range(T):
        # sequence[:t+1] holds the first t+1 steps -> shape (t+1, C)
        xbow[b, t] = sequence[:t + 1].mean(dim=0)

In [29]:
# Same running mean, written as one matrix product.
# Lower-triangular ones: row t has ones at positions 0..t and zeros after.
wei = torch.tril(torch.ones(size=(T, T)))  # (T, T)
# Divide each row by its sum so that row t averages positions 0..t.
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ X  # (T, T) @ (B, T, C) -> broadcast: (B, T, T) @ (B, T, C) -> (B, T, C)
# Compare with the loop-based xbow; this expression is what the cell displays.
torch.allclose(input=xbow, other=xbow2)

True

In [38]:
# Same running mean, with the weights produced by a softmax.
tril = torch.tril(torch.ones(T, T))
# Start from all-zero scores and set every position above the diagonal to -inf.
wei = torch.zeros(T, T).masked_fill(tril == 0, float("-inf"))
# Row-wise softmax: the -inf entries get weight 0 and the remaining zeros
# share the weight equally.
wei = torch.softmax(wei, dim=-1)
xbow3 = wei @ X
torch.allclose(xbow, xbow3)

True