# Developing Bigram (Baseline Model)

## Setup

In [60]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x107819450>

In [61]:
"""Download Shakespeare training dataset"""
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-10-03 16:34:44--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-10-03 16:34:44 (8.54 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [62]:
"""Read Shakespeare file"""
# Load the whole corpus into memory as a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
	text = corpus_file.read()

In [63]:
# Report the corpus size with thousands separators
size_report = f'The Shakespeare file has {len(text):,} characters'
print(size_report)

The Shakespeare file has 1,115,394 characters


In [64]:
"""Print first 1K characters"""
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [65]:
# The vocabulary is every distinct character in the corpus, in sorted order
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)
str_chars = ''.join(unique_chars)

print(f'The Shakespeare file has {vocab_size} unique characters')
print(f'Vocabulary: {str_chars}')


The Shakespeare file has 65 unique characters
Vocabulary: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


## Tokenization

In [66]:
"""Map characters to integers"""
# Token id = position of the character in unique_chars; build the id -> char table
# first and invert it for the char -> id direction
int_to_string = dict(enumerate(unique_chars))
string_to_int = {char: idx for idx, char in int_to_string.items()}

In [67]:
"""Takes a string as input and outputs a list of integers"""
def encode(string):
	"""Convert a string into a list of integer token ids, one per character.

	Each character is looked up in string_to_int, so a character that is not in
	that table raises KeyError.
	"""
	return [string_to_int[char] for char in string]


def decode(integers):
	"""Convert an iterable of integer token ids back into the string they spell.

	Each id is looked up in int_to_string, so an id that is not in that table
	raises KeyError.
	"""
	return ''.join(int_to_string[idx] for idx in integers)

In [68]:
# Round-trip a sample word through the tokenizer
sample_ids = encode('test')
print(sample_ids)
print(decode(sample_ids))

[58, 43, 57, 58]
test


## Data Storage

In [69]:
"""Encode entire text and store in a toch tensor"""
encoded_text = encode(text)
data = torch.tensor(encoded_text, dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [70]:
# Token ids for the first 100 characters of the corpus
leading_tokens = data[:100]
print(leading_tokens)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [71]:
"""Split data into training (90%) and validation (10%)"""
# n is the index of the first validation token; everything before it is training data
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

## Chunking
- Split training data into sampled chunks with a block size (token length) of 8
- For a block size of 8, we actually need 9 chars: each target is the character that follows its context, so the targets are the inputs shifted one position to the right. A chunk of 9 chars therefore yields 8 (context, target) training examples

In [72]:
block_size = 8
# One chunk of block_size + 1 tokens: block_size inputs plus the final target
first_chunk = train_data[:block_size + 1]
first_chunk

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [73]:
"""
Here, X are the input characters and y are the target characters
Again, we need to offset by 1 for the targets so that the model has some context to predict from
"""
# Two windows of block_size tokens: x starts at offset 0, y starts one token later
x, y = (train_data[offset:offset + block_size] for offset in (0, 1))

In [74]:
for t in range(block_size):
	# The context grows by one token per step: everything in x up to and including position t
	context, target = x[:t + 1], y[t]
	print(f'When input is {context}, target is {target}')

When input is tensor([18]), target is 47
When input is tensor([18, 47]), target is 56
When input is tensor([18, 47, 56]), target is 57
When input is tensor([18, 47, 56, 57]), target is 58
When input is tensor([18, 47, 56, 57, 58]), target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


## Batching and Blocking
- Batch size is the number of independent sequences we want to process in parallel on every forward-backward pass of the transformer
- Block size is the maximum context length in tokens used to make predictions

In [75]:
# batch_size sequences per batch, each block_size tokens long
batch_size, block_size = 4, 8

In [76]:
def get_batch(split:str):
	"""Sample a random mini-batch of (input, target) sequences from one data split.

	'train' selects train_data; any other value selects val_data. Returns a pair
	(x, y) of tensors shaped (batch_size, block_size), where each row of y is the
	matching row of x shifted one position to the right in the underlying data.
	"""
	source = train_data if split == 'train' else val_data

	# Starting offsets are drawn from [0, len(source) - block_size) so that the
	# target window, which ends one token later, still fits inside the data
	offsets = torch.randint(len(source) - block_size, (batch_size,))

	# torch.stack needs a real sequence rather than a generator, hence the lists
	inputs = [source[start:start + block_size] for start in offsets]
	targets = [source[start + 1:start + block_size + 1] for start in offsets]
	return torch.stack(inputs), torch.stack(targets)

In [77]:
xbatch, ybatch = get_batch('train')

# Print each tensor under its heading: label, then shape, then contents
for heading, batch_tensor in (('Inputs:', xbatch), ('Targets:', ybatch)):
	print(heading)
	print(batch_tensor.shape)
	print(batch_tensor)

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


The tensor area (4 x 8) is the number of examples contained in the array. The above consists of 32 independent examples, from the transformer's perspective. 

In [78]:
# Spell out every (context, target) training example contained in the batch
for batch in range(batch_size):
	print(f'Chunk {batch + 1}')
	row_inputs, row_targets = xbatch[batch], ybatch[batch]
	for step in range(block_size):
		context = row_inputs[:step + 1]
		target = row_targets[step]
		print(f'When input is {context.tolist()}, target is {target}')

Chunk 1
When input is [24], target is 43
When input is [24, 43], target is 58
When input is [24, 43, 58], target is 5
When input is [24, 43, 58, 5], target is 57
When input is [24, 43, 58, 5, 57], target is 1
When input is [24, 43, 58, 5, 57, 1], target is 46
When input is [24, 43, 58, 5, 57, 1, 46], target is 43
When input is [24, 43, 58, 5, 57, 1, 46, 43], target is 39
Chunk 2
When input is [44], target is 53
When input is [44, 53], target is 56
When input is [44, 53, 56], target is 1
When input is [44, 53, 56, 1], target is 58
When input is [44, 53, 56, 1, 58], target is 46
When input is [44, 53, 56, 1, 58, 46], target is 39
When input is [44, 53, 56, 1, 58, 46, 39], target is 58
When input is [44, 53, 56, 1, 58, 46, 39, 58], target is 1
Chunk 3
When input is [52], target is 58
When input is [52, 58], target is 1
When input is [52, 58, 1], target is 58
When input is [52, 58, 1, 58], target is 46
When input is [52, 58, 1, 58, 46], target is 39
When input is [52, 58, 1, 58, 46, 39], t

## Baseline: Bigram Language Model
Predicts the probability of the next token in a sequence given only the previous token. It is called a "bigram" model because it considers pairs of adjacent tokens in the sequence (here, pairs of characters). The model is trained on a corpus of text and learns, for each token, the probability distribution over the token that follows it. The model can then be used to generate new text by repeatedly sampling from the learned distribution.

In [104]:
class BigramLanguageModel(nn.Module):
	"""Bigram language model backed by a single embedding table.

	Each token id selects one row of a (vocab_size, vocab_size) embedding table and
	that row is used directly as the logits for the next token, so every prediction
	depends only on the token at the same position.
	"""

	def __init__(self, vocab_size):
		"""Create the lookup table that maps each token id to its next-token logits."""
		super().__init__()
		self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

	def forward(self, idx, targets=None):
		"""Compute next-token logits and, when targets are given, the cross-entropy loss.

		Args:
			idx: integer tensor of token ids with shape (Batch, Time).
			targets: optional integer tensor of target token ids, same shape as idx.

		Returns:
			A (logits, loss) pair. Without targets, logits has shape
			(Batch, Time, Channel) and loss is None. With targets, logits is
			returned flattened to (Batch * Time, Channel) and loss is the value
			returned by F.cross_entropy.
		"""
		logits = self.token_embedding_table(idx)

		if targets is None:
			return logits, None

		# F.cross_entropy is given one row of class scores per example, so flatten
		# logits to (Batch * Time, Channel) and targets to (Batch * Time,)
		batch, time, channel = logits.shape
		logits = logits.view(batch * time, channel)
		targets = targets.view(batch * time)
		loss = F.cross_entropy(logits, targets)
		return logits, loss

	@torch.no_grad()
	def generate(self, idx, max_new_tokens):
		"""Extend each sequence in idx by max_new_tokens sampled tokens.

		Sampling never needs gradients, so the method runs under torch.no_grad()
		to avoid building an autograd graph for every generated token.

		Args:
			idx: integer tensor of token ids with shape (Batch, Time), the starting context.
			max_new_tokens: number of tokens to append to every sequence.

		Returns:
			Integer tensor of shape (Batch, Time + max_new_tokens): the original
			context followed by the sampled tokens.
		"""
		for _ in range(max_new_tokens):
			# Calling the module invokes forward; with no targets the loss is None
			logits, _loss = self(idx)
			# Keep only the last time step: (Batch, Time, Channel) -> (Batch, Channel)
			logits = logits[:, -1, :]
			# Turn the logits into a probability distribution over the vocabulary
			probs = F.softmax(logits, dim=-1)
			# Draw one token per sequence: shape (Batch, 1)
			idx_next = torch.multinomial(probs, num_samples=1)
			# Append the sampled token so the running sequence grows by one step
			idx = torch.cat((idx, idx_next), dim=1)
		return idx

Why this architecture is silly: <br>
A bigram means predicting one character from another. We're only using the last character to predict the next one, but we're still feeding the model the entire context on each iteration

In [105]:
model = BigramLanguageModel(vocab_size)
# One forward pass over the sample batch returns the logits for every position plus the loss
logits, loss = model(idx=xbatch, targets=ybatch)
print(f'Shape of logits tensor: {logits.shape}')
print(f'Loss: {loss:.3f}')

Shape of logits tensor: torch.Size([32, 65])
Loss: 4.626


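Since we're using a negative log-likelihood (cross-entropy) loss, we can estimate the loss to expect before any training from our vocabulary size. <br>
A model that assigns equal probability to all 65 characters would have a loss of `-ln(1/65) = ~4.17`; the untrained model's 4.626 is somewhat higher because its random initial logits are not perfectly uniform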

In [116]:
"""
Supply a tensor of zeros as the initial idx context
In vocab, zero represents a newline character, so it makes sense to start here
Predict 100 tokens, pull out the first sequence, which is a 1D array of all indices, and convert to a list
"""
# Seed the generator with one sequence holding the single token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx=start_context, max_new_tokens=100)
# Row 0 is the only sequence in the batch; decode its ids back to text
result = decode(generated[0].tolist())
print(result)


&:!F.OM$UfEs3MVH.keuIJYRypNEdM? TboX ms
q$uEse-P!cCUUjw!bQBSOIJH.orng!U3y?YmYGD mmN:K;rIIIV!VrM;&BLZ


## Train Bigram Model

In [125]:
"""We can get away with a high learning rate since we're using a small network"""
learning_rate = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [126]:
# get_batch reads the module-level batch_size, so this rebinding enlarges every batch sampled below
batch_size = 32
# Single source of truth for the step count, so the loop bound and the report cannot drift apart
num_steps = 10000
for step in range(num_steps):
	# Sample a fresh batch of training data
	xbatch, ybatch = get_batch('train')

	# Forward pass and evaluate loss
	logits, loss = model(xbatch, ybatch)
	# Clear gradients left over from the previous step before back-propagating
	optimizer.zero_grad(set_to_none=True)
	# Backward pass to compute gradients
	loss.backward()
	# Use gradients to update parameters
	optimizer.step()

# Loss of the final sampled batch only, not an average over the run
print(f'Loss after {num_steps} iterations: {loss.item():.3f}')

Loss after 1000 iterations: 2.421


In [129]:
# Same sampling recipe as before training, now asking for 400 new tokens
start_context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx=start_context, max_new_tokens=400)
result = decode(generated[0].tolist())
print(result)


Hecry! is nd ichex.
F mut winorue tit ha pr G tceavever hok do pus me o aninthem beiut met.
Yofayouen te ioymo mpous o ppeser; one mbuser ch they, ico Go bris, whtch an Orivear:

Touncr ckike bareavey ly y m Re ghaild mom
BOPlyraveipl, pople jusm thewheryane,-- std ryme.
LAnthe ty sho ls--s, ws;
AROK:
P's cthefay fonthinthe wochor, nd
An ms d n Lacorave mplou havethitax hARIn yomphe I cho, d lesto


The new result is still nonsense, but it is a dramatic improvement over the untrained model.