Read our shakespeare dataset

In [2]:
with open(r"datasets\tinyshakespeare.txt", "r", encoding="UTF-8") as f:
    text = f.read()

Check that the dataset is loaded correctly

In [3]:
try:
    print("length of dataset: ", len(text))
    print("first 60 chars: \n ''' \n", text[:60], "\n'''")
except Exception as e:
    pass
finally:
    print("there was a error reading the dataset")

length of dataset:  1115394
first 50 chars: 
 ''' 
 First Citizen:
Before we proceed any further, hear me speak. 
'''
there was a error reading the dataset


Print list of all the chars and symbols, that are in the dataset

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("All chars and symbols used: ", chars)
print("nuber of all chars and symbols used: ", vocab_size)

All chars and symbols used:  ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
nuber of all chars and symbols used:  65


Create tokenization functions to convert all the characters and symbols from the dataset into something that GPT can process

In [8]:
# Make a character to integer and integer to character dictionary
char_to_int = {char: index for index, char in enumerate(chars)}
int_to_char = {index: char for index, char in enumerate(chars)}

# Function to convert a string to a list of integers
def encoder(s):
    return [char_to_int[c] for c in s]

# Function to convert a list of integers to a string
def decoder(l):
    return ''.join([int_to_char[i] for i in l])

# Test the functions
print(encoder("Hello World!"))
print(decoder(encoder("Hello World!")))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2]
Hello World!


Encode the whole dataset, so that the model can read it

In [12]:
encoded_text = encoder(text)
encoded_text[:60]
# Decode the first 60 characters and symbols that we just decoded to test if it worked correctly
decoder(encoded_text[:60]) # the "\n" indicates a new line

'First Citizen:\nBefore we proceed any further, hear me speak.'

Storing the encoded text in a torch.tensor object

In [19]:
import torch
data = torch.tensor(encoded_text, dtype=torch.long)
print("Is the whole dataset encoded and loaded:", data.shape[0] == len(text)) # This checks if the number of chars and symbols are the same in the dataset and the torch.tensor object


Is the whole dataset encoded and loaded: True


Split the data into training and testing sets

In [25]:
test_size = int(0.1*len(data))

train_data = data[:test_size]
test_data = data[test_size:]

Set a block size by which the data is going to be separated and fed into the model. The "+1", because the first element in the list is the input

In [39]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

Output of this code block should explain the block_size and how the GPT model predicts data

In [43]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context.tolist()}, the target is: {target}")

When input is [18] the target: 47
When input is [18, 47] the target: 56
When input is [18, 47, 56] the target: 57
When input is [18, 47, 56, 57] the target: 58
When input is [18, 47, 56, 57, 58] the target: 1
When input is [18, 47, 56, 57, 58, 1] the target: 15
When input is [18, 47, 56, 57, 58, 1, 15] the target: 47
When input is [18, 47, 56, 57, 58, 1, 15, 47] the target: 58


**batch_size** = number of independent sequences that will be processed in parallel <br>
**block_size** = number of the maximum context length for predictions <br>
The **get_batch** function selects the **batch_size** by **block_size** chunk of data from a random position. Each batch is from its own independent random location.<br>
On the bottom of the output you can see that the input is always from **inputs** **x0** to **inputs** **xn** on a **yn**, and the output to this input is on the **xn** matrix on **yn** in the **targets** table.<br>

In [67]:
batch_size = 4 
block_size = 8

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

def print_table(tensor):
    print("    " + "   ".join([f"x{col:2}" for col in range(tensor.shape[1])]))
    for row_idx, row in enumerate(tensor):
        print(f"y{row_idx} " + " | ".join([f"{elem:3}" for elem in row.tolist()]))

xb, yb = get_batch('train')
print('inputs:')
#print(xb.shape)
#print(xb)
#print("Data tensor as a table:")
print_table(xb)
print('targets:')
#print(yb.shape)
#print(yb)
print_table(yb)

print('----')
"""
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")
"""
b = 0
for t in range(block_size): # time dimension
    context = xb[b, :t+1]
    target = yb[b,t]
    print(f"When input is {context.tolist()}, the target is: {target}")

inputs:
    x 0   x 1   x 2   x 3   x 4   x 5   x 6   x 7
y0  43 |   1 |  52 |  53 |  40 |  47 |  50 |  47
y1  41 |  39 |  52 |  49 |  43 |  56 |   5 |  42
y2  46 |  43 |   1 |  15 |  39 |  54 |  47 |  58
y3   6 |   1 |  61 |  43 |   1 |  53 |  59 |  45
targets:
    x 0   x 1   x 2   x 3   x 4   x 5   x 6   x 7
y0   1 |  52 |  53 |  40 |  47 |  50 |  47 |  58
y1  39 |  52 |  49 |  43 |  56 |   5 |  42 |   1
y2  43 |   1 |  15 |  39 |  54 |  47 |  58 |  53
y3   1 |  61 |  43 |   1 |  53 |  59 |  45 |  46
----
When input is [43], the target is: 1
When input is [43, 1], the target is: 52
When input is [43, 1, 52], the target is: 53
When input is [43, 1, 52, 53], the target is: 40
When input is [43, 1, 52, 53, 40], the target is: 47
When input is [43, 1, 52, 53, 40, 47], the target is: 50
When input is [43, 1, 52, 53, 40, 47, 50], the target is: 47
When input is [43, 1, 52, 53, 40, 47, 50, 47], the target is: 58


This will be the input with which we are going to work in the following code chunk

In [68]:
print(xb)

tensor([[43,  1, 52, 53, 40, 47, 50, 47],
        [41, 39, 52, 49, 43, 56,  5, 42],
        [46, 43,  1, 15, 39, 54, 47, 58],
        [ 6,  1, 61, 43,  1, 53, 59, 45]])


We create a BigramLanguageModel object based on a simple Neural Network model.<br>
The token_embedding_table creates a vector for each character or symbol, that stores its context to each other character or symbol.<br>
Torch arranges this into a Batch by Time by Channel tensor (batch_size = 4, block_size = 8, vocab_size = 65).<br>
The loss uses cross_entropy to validate the preditction based on the targets. But the model dosent look at the context of the last character in the input, it just looks at the last character and based on that (the token_embedding_table) predicts the next character.<br>

In [98]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step (character)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print("loss:", loss.tolist(), " If we chose characters on random the loss would be -ln(1/65) or about 4.16")
print("\nPrediction from our model if the user input is a new line character:", end="")
print(decoder(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
loss: 4.488964080810547  If we chose characters on random the loss would be -ln(1/65) or about 4.16

Prediction from our model if the user input is a new line character:
FYLXQ&
GHhKnKX!$yAhK.wAw?OWApHVTZJBHWAqpoVmg.VG&L3VF j'gIsJKGOwlBG'sb!ua
ldxtJzMY?oikKwlZDUVHp:Z?qC$


Lets optimize and train the model

In [99]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [115]:
batch_size = 32
for step in range(10000):
    # sample a batch of data
    xb, yb = get_batch("train")
    
    #evaluate the loss
    logits, loss = m(xb, yb)
    # resetting the optimizer
    optimizer.zero_grad(set_to_none=True)
    # getting new gradients
    loss.backward()
    # using the new gradients to update the parameters
    optimizer.step()
    
    if step % 1000 == 0:
        print(loss.item())

2.416093111038208
2.1720921993255615
2.299400568008423
2.4247143268585205
2.403860330581665
2.3076975345611572
2.489482879638672
2.4112446308135986
2.314846992492676
2.359891891479492


In [117]:
print("\nNew prediction from our model if the user input is a new line character:", end="")
print(decoder(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


New prediction from our model if the user input is a new line character:
Toweem count we lyopotconsu ipe tanof us
Hanepl too Hour ngo thicly rmye! hid grgacu
Har or oungruan
