# Intro: Bigram Language Model

A bigram language model is a type of probabilistic model that predicts the next character in a sequence based on the assumption that the likelihood of a character depends only on the preceding character. It operates by analyzing pairs of characters (bigrams) in a given text dataset to learn the probability distribution of the appearance of a character given its predecessor. This model is a simple case of the more general n-gram models used in natural language processing for tasks like text generation or auto-completion.

In [39]:
import torch
# imports the neural network module (nn) from the PyTorch library, which includes various building blocks for neural networks.
import torch.nn as nn
# Imports PyTorch's functional API, which includes functions like activations, loss functions, etc
from torch.nn import functional as F
#Batch size is the number of blocks we are processing in parallel.
#Good way to scale 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cuda


In [40]:
# Opening File

In [41]:
# read file and store it in a string variable
with open('the_bible.txt', 'r', encoding='utf-8') as f:
          text = f.read()
# sorted set of characters
chars = sorted(set(text))
print(chars)
print(len(chars))


vocab_size=len(chars)

['\n', ' ', '!', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—']
74


# Tokeniser
We are now going to use a tokeniser which consists of an encoder and a decoder
encoder - converts each element of chars into an integer

For a character level tokeniser script - takes each character and coverts to an integer quickly
character level tokensier - small vocab, but a lot of characters to encode and decode
but word tokensier - very large vocab, but small
mapping from strings to integers. Go through each element and assign to an integer

In [42]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
# mapping from integers to strings
int_to_string = {i:ch for i,ch in enumerate(chars) }
#convert 'hello' to interger
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# print(encode('hello!'))
# encoded_hello = encode('hello!')
# decoded_hello = decode(encoded_hello)
# print(decoded_hello)

#going to have the whole book as a sequence of super long integers
data = torch.tensor(encode(text), dtype=torch.long)


# Batch Creation

For the function we need:
We need random starting points for our data as this adds variability to our training process
        (batch_size,), our batch size is 4, so we are going to generate 4 random starting points
        in https://pytorch.org/docs/stable/generated/torch.randint.html the lowest integer to be drawn is optional, so it is 0 unless specified
        we specify the highest interget to be drawn as len(data) - block_size, so we don't have a point that when we add block_size of 8 we would make
        the mistake of going past the limits of our dataset



In [43]:
# training split
# done to split the dataset into training and validation sets, with 80% for training and the remaining 20% for validation.
n = int(0.8*len(data))
#slices the data up to but not including the nth element
train_data = data[:n]
#slices the data from the nth element onwards
val_data = data[n:]

#function that specifies whether to get a batch from the 
def get_batch(split):
    # Inside the function, this line sets data to train_data if the split argument is 'train'; otherwise, it sets data to val_data.
    data = train_data if split == 'train' else val_data
    # Generates a tensor of random integers indices (ix) within the range [0, len(data) - block_size). The size of this tensor is determined by batch_size

    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    
    # Constructs a batch of inputs x. For each starting index in ix, 
    # it slices block_size elements from data starting at that index, and then stacks these slices into a tensor.
    # it has to be i+block_size as remember we don't include the last value
    # e.g. if i = 0, i+block size = 8, so 0 to 7
    x = torch.stack([data[i:i+block_size] for i in ix])
    
    # Constructs a batch of targets y. It's similar to the previous line, but each slice starts one element after 
    # the corresponding slice in x and includes one additional element at the end. 
    # This is for where where each input sequence in x needs to predict the next sequence in y
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    # make sure that x and y are not processed on cpu but sequentially in our gpu
    x, y = x.to(device), y.to(device)
    return x, y
x, y = get_batch('train')
#print('ix: ')
#print(ix)
print('inputs:')
#print(x.shape)
print(x)
print('targets:')
print(y)

tensor([1276278, 2143020, 1759103, 1791103])
inputs:
tensor([[28, 61, 50,  6,  1, 47, 60, 50],
        [60, 51, 65, 66,  1, 61, 60,  0],
        [10, 12,  1, 41, 54, 51, 60,  1],
        [ 9,  9, 18, 19,  9, 11,  9,  1]], device='cuda:0')
targets:
tensor([[61, 50,  6,  1, 47, 60, 50,  1],
        [51, 65, 66,  1, 61, 60,  0, 54],
        [12,  1, 41, 54, 51, 60,  1, 40],
        [ 9, 18, 19,  9, 11,  9,  1, 22]], device='cuda:0')


In [44]:
# using a pytorch decorator below before our estimate loss function
# makes sure pytorch doesn't use gradients as we are just evaluating our model, we are not not updating it only need a loss

@torch.no_grad() 
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Note on Bigram Language Model

1. We are going to build a class which is the foundation for building a neural network model for processing language data.
2. We are going to map each vocab to an embedding space or embedding layer which is self.token_embedding_table
3. Forward pass where we take each input (which is a context or a block of elements or this is called an index) and pass it through the embedding layer and then calculate the loss
4. We have a token predictor these are individual indexes or elements

# Note on token_embedding_table 

going to make an embedding table - look up table
        
all 80 characters vertically and horizontally
shows the probability distribution of what character comes next given one character
what character comes next given one character
so its like most frequent aa, or ae etc
normalising to current row

nn.Embedding: This is a PyTorch layer that provides a simple lookup table that stores embeddings of a fixed dictionary and size. 
It takes two main arguments: vocab_size (the size of the dictionary of embeddings) and the dimensionality of the embeddings.

Initialization: When you instantiate nn.Embedding, it initializes the embedding table with random weights. 
These weights are the embedding vectors corresponding to each token in your vocabulary.

nn.Embedding is designed to have its weights (the embedding vectors) adjusted through backpropagation during training.


# Note on the use of .view

For .view reshapes a tensor 

Why did you have to reshape?
Well its to do with what PyTorch is expecting for the cross-entropy loss
Expecting a shape: (C) or (N, C) where N is the batch or number of blocks, or (N, C, d1,d2....dK)
Its expecting a BxCxT instead of a BxTxC which is what we get out of B, T, C = logits.shape
so we opt for (N,C) where N = B*T
                    
                    logits = logits.view(B*T, C)

we are paying attention to the channels so can blend batch and time together
as long as the logits and the targets have the same batch and time
view reshapes a tensor 
This line reshapes the logits tensor from a 3D tensor of shape (B, T, C) 
to a 2D tensor of shape (B*T, C). It's combining the batch and time dimensions, 
essentially flattening the sequence data while keeping the channel dimension separate.

                    targets = targets.view(B*T)
reshape
initialise our targets
Similarly, the targets tensor is reshaped to match the first two dimensions of the logits. 
This is necessary for the loss calculation, ensuring that each element in logits corresponds to a target value.
No need for channels dimension as those are our features





In [45]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):

        # 
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            
            # focus only on the last time step
            # as we want model predictions with their associated probabilities for the next element (last time step output) in each batch
            # We have three dimensions below rather than two, because targets is none, so in our if statement we did not do any view transformations
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities, we apply it to C dimension which are the class instance or features
            probs = F.softmax(logits, dim=-1) # (B, C)
            
            # sample from the distribution
            # randomly select the next token from the given probability distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
            
        # return all the generated tokens for max_new_tokens
        return index

# Push parameters to GPU for more efficient training.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# torch.long equiv of int64
# zero in token indicates the start of a new line '\n' see above
context = torch.zeros((1,1), dtype=torch.long, device=device)
# we generate characters based on a context aka we have a new line
# we make sure to decode so its readable
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



4pwUBajN,6f(,id?q(NKODyUM—)w0V4:v;UQwd!Ma6e7gu1w)nP2a9N9Px wBmamQKZ5naJerBH?kCN1)Ku3'—nW,hHT0c—Egu3'7)6!N!OzJxQ.WkNNQybS'YCltQgOhuc e26oP-IATx8,C00S7B5fy1cdhMTYRoA:o4QKMPKPWCU:KOxK)vtj :H.?M—C.Z:wdI!;fp:fMW5G'YYhHWvH'YaEmS7ONd8ttK1Ku:'L:PUb:cAtj!z
S3lCOWR3MP-0
BmNWyweiJs)fEsEMst1he2x))G.8P
CyYknBA)MZq3S;f,KrK9KK8RLMa8SFGiJ(mic!p,!Op;8;EBf)on2qB!gTkx.Z:Hyv8.LCz'(JiJ;;exse7BN-
KEB-IwrKr3?)meOQEhYkpCN;W.V4sO6Pk9:(Q.t,f5qYp:T: GAa9TxJBWfE1LpH
GSakQgdGwGT.9j9f?P (ggd
vvU?RlKZJHc8qHIWS-2CU:jcl)SLTtt'Y


# Note on the Adam Optimiser

The Adam optimizer is a popular method for training neural networks that combines the best features of two other optimization methods: momentum and adaptive learning rates. It maintains two moving averages for each parameter - one for the gradients (direction of the update) and another for the square of the gradients (variability). Adam adjusts the learning rate for each parameter individually, which makes it effective in handling sparse data and varying gradients. It also includes a mechanism for correcting its own bias, particularly early in training. This combination allows Adam to adapt the learning rate dynamically, enhancing its efficiency and effectiveness across a wide range of problems and data types, making it a widely used optimizer in deep learning.

In [46]:
# create a PyTorch Optimizer
# Adam optimiser with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
                             
for iter in range(max_iters):
    if iter % eval_iters ==0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    # sample a batch of data e.g. 4 blocks of 8 characters
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    # set the gradients to None before each new timestep to avoid gradient accumulation
    optimizer.zero_grad(set_to_none=True)
    # backward step
    loss.backward()
    # grad descent
    optimizer.step()
print(loss.item())

tensor([3266671,  694222, 2900440, 1016067])
tensor([  52010, 1654731,  286205, 3485869])
tensor([2548890, 2507352,  824132, 1793832])
tensor([ 132925, 2801018, 2581177, 1658493])
tensor([3175861, 2566434, 2716022, 2520068])
tensor([1830249,  820580, 1949185, 1605281])
tensor([1381874, 2134787, 1979230,   23102])
tensor([2009960, 2061741,  370530,  696020])
tensor([ 853200, 2298967, 3146676, 3503295])
tensor([2293729, 3142031, 1780059, 1712312])
tensor([3442898, 2892933, 1574172, 1666921])
tensor([2671572, 2418979, 3325422, 3520120])
tensor([2881344,  219461, 1698584, 2387527])
tensor([3202851, 2085649, 3558538, 1196922])
tensor([1655341, 2253960, 2910509, 3159699])
tensor([2215900, 3467864,  466259, 1246990])
tensor([3276911, 1855289,  390370,  747330])
tensor([1000230,  110136, 2835051, 1256312])
tensor([1047369, 2152081, 3200207,  223854])
tensor([ 921782, 2438523,  413596, 1945574])
tensor([2182859,  609432, 3505469, 1261713])
tensor([ 884747, 1438799, 1426692,  586832])
tensor([15

In [47]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


BfaGxKr-IWk7Ruua9tCNx'iVxq8WmB'Yf?p:ooZIJtmF A dSFZDgTS7 (wseQm9Sj7ra;BfEUS7jyRBaEWj9utthAfESrLwp64rL—6gsRtt7ZUM7LYL87?49t,3PUfE:vUY r8(y7j'iWoyweDi:H4nIy:GCCDrJs7Ia-fqYMs TyAa961!M—J:TEGU—R?RoZR2
B-5?Pmbdde;fGyogdi4pwY0sougd?CSem3PiW!RPU2:U:eJS(7xPxrLGO5dtHvTA.Wm-hLkQr—OG.v)
sp8u3.OMlx?ycuLh3'
heQ
qQb3SFwEvkr;)qCD4d.,qlMs2CzvaDFQE(
U:)'9eBJNR'gb;r?6!aM7:n3'oRRqH'kpv-RAAB?Q85e7I!gdCE8WG.t8p6OMawewsUg rJT4AWG.fPbF cK99!8LrKQrcSHVyQ;7,
G.CsbAga9sGvqI!NLn4pv(9w2d1hu iW1'beaaWkIA!vD8C(kqvRjf:2q'd5j8
