# Exploration Notebook

## Import Libraries

In [1]:
import os
import torch

In [2]:
print(torch.__version__)

2.0.0


## Useful Functions

In [3]:
def read_data(data_path):
    with open(data_path, 'r') as file:
        data = file.read()
        
    return data

In [4]:
os.listdir()

['test',
 'k.txt',
 'notebook.ipynb',
 'README.md',
 'input.txt',
 '.gitignore',
 '.ipynb_checkpoints',
 '.git',
 'src']

## Load Data

In [5]:
text = read_data('input.txt')

In [6]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



## Tokenization

In [7]:
all_chars = sorted(list(set(text)))
vocab_size = len(all_chars)

print(f"We have {vocab_size} vocabulary in our dataset")
print(f"All characters: { '-'.join(all_chars) }")

We have 65 vocabulary in our dataset
All characters: 
- -!-$-&-'-,---.-3-:-;-?-A-B-C-D-E-F-G-H-I-J-K-L-M-N-O-P-Q-R-S-T-U-V-W-X-Y-Z-a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z


We must now tokenize the text at character level, in order to do that, we must create:
- **Encoder**: which converts the character to numerical representation.
- **Decoder**: which converts the numerical representation to character again.

In [8]:
# Create two ways mapping
c2i = { char:idx for idx, char in enumerate(all_chars) }
i2c = { idx:char for idx, char in enumerate(all_chars) }

encoder = lambda string: [ c2i[char] for char in string ]
decoder = lambda indexes: "".join([ i2c[idx] for idx in indexes ])

In [9]:
test_phrase = "Hello my name is Khalil!"
print(test_phrase)
print(encoder(test_phrase))
print(decoder(encoder(test_phrase)))

Hello my name is Khalil!
[20, 43, 50, 50, 53, 1, 51, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 23, 46, 39, 50, 47, 50, 2]
Hello my name is Khalil!


Now we will tokenize the whole training set. We will start using **torch** library.

In [10]:
data = torch.tensor(encoder(text), dtype=torch.long)

In [11]:
print(f"We've got: {data.shape} shape and {data.dtype} Tensor's type.")

We've got: torch.Size([1115394]) shape and torch.int64 Tensor's type.


## Split Data to Train, Eval and Test sets

In [12]:
len(data)

1115394

In [13]:
def split_data(data, train_percent, eval_percent, test_percent):
    
    assert train_percent + eval_percent + test_percent == 1.0, f"The summation of all percentags must be 1.0, we got {train_percent + eval_percent + test_percent}"
    
    train_range = [0, int( len(data) * train_percent )]
    eval_range = [ train_range[1], int( len(data) * eval_percentage ) + train_range[1] ]
    test_range = [eval_range[1], -1]
    
    return data[:train_range[1]], data[eval_range[0]:eval_range[1]], data[test_range[0]:]

In [14]:
train_percentage = 0.8
eval_percentage = 0.1
test_percentage = 0.1

train_set, eval_set, test_set = split_data(data, train_percentage, eval_percentage, test_percentage)

## Split the data into Chunks

We don't train the GPT-decoder all at once as it will take a lot of time to do that, instead, we split the data into chunks of **context_length** plus one.

You might ask youtself why plus one? The reason why is when at each chunk, we try to predict the i+1. For instance, if the **context_length** is equal to 8:

- at i=0, GPT predicts i+1=1
- at i=1, GPT preducts i+1=2
.
.
.

To reach i=**context_length**, GPT predicts i+1=**context_length** + 1

One more thing to mention is we do that as well to make the neural network to learn how to predict the next token from index equals **zero** to index equals **context_length**.

In [15]:
CONTEXT_LENGTH = 8

x = train_set[:CONTEXT_LENGTH]
y = train_set[1:CONTEXT_LENGTH+1]

for idx in range(CONTEXT_LENGTH):
    context = x[:idx+1].tolist()
    target = y[idx]
    print(f"When the input is {context}, GPT will try to predict {target}")

When the input is [18], GPT will try to predict 47
When the input is [18, 47], GPT will try to predict 56
When the input is [18, 47, 56], GPT will try to predict 57
When the input is [18, 47, 56, 57], GPT will try to predict 58
When the input is [18, 47, 56, 57, 58], GPT will try to predict 1
When the input is [18, 47, 56, 57, 58, 1], GPT will try to predict 15
When the input is [18, 47, 56, 57, 58, 1, 15], GPT will try to predict 47
When the input is [18, 47, 56, 57, 58, 1, 15, 47], GPT will try to predict 58


Moreover, we will add the batch size. We use batches to stack list of chunks on top of each other in order to keep the GPUS busy all the time. They run independently.

In [17]:
BATCH_SIZE = 4
def get_batch(data, batch_size=BATCH_SIZE, context_length=CONTEXT_LENGTH):
    indexes = torch.randint( len(data) - batch_size, (batch_size,) )  # get BATCH_SIZE random indexes within the dataset
    
    """
    """
    # loop through each index and get the context data and finally stack them together to get torch.tensor of shape (BATCH_SIZE, CONTEXT_LENGTH) 
    x = torch.stack( [ data[ idx: idx+context_length ] for idx in indexes ] )
    # loop through each index and get the target data and finally stack them together to get torch.tensor of shape (BATCH_SIZE, CONTEXT_LENGTH)
    y = torch.stack( [ data[idx+1:idx+context_length+1] for idx in indexes ] )
    
    return x, y

In [18]:
torch.manual_seed(100)

x, y = get_batch(train_set)
print("x shape: ", x.shape)
print("y shape: ", y.shape)

x shape:  torch.Size([4, 8])
y shape:  torch.Size([4, 8])


In [19]:
x

tensor([[61, 46, 47, 54,  1, 63, 53, 59],
        [63,  1, 57, 43, 58,  1, 42, 53],
        [58, 46, 47, 52, 49,  1, 63, 53],
        [63,  1, 61, 47, 52, 45,  6,  0]])

In [20]:
y

tensor([[46, 47, 54,  1, 63, 53, 59,  1],
        [ 1, 57, 43, 58,  1, 42, 53, 61],
        [46, 47, 52, 49,  1, 63, 53, 59],
        [ 1, 61, 47, 52, 45,  6,  0, 22]])

In [21]:
for b in range(BATCH_SIZE):
    for c in range(CONTEXT_LENGTH):
        context = x[b, :c+1]
        target = y[b, c]
        print(f"Input: {context}, Output: {target}")
    print()

Input: tensor([61]), Output: 46
Input: tensor([61, 46]), Output: 47
Input: tensor([61, 46, 47]), Output: 54
Input: tensor([61, 46, 47, 54]), Output: 1
Input: tensor([61, 46, 47, 54,  1]), Output: 63
Input: tensor([61, 46, 47, 54,  1, 63]), Output: 53
Input: tensor([61, 46, 47, 54,  1, 63, 53]), Output: 59
Input: tensor([61, 46, 47, 54,  1, 63, 53, 59]), Output: 1

Input: tensor([63]), Output: 1
Input: tensor([63,  1]), Output: 57
Input: tensor([63,  1, 57]), Output: 43
Input: tensor([63,  1, 57, 43]), Output: 58
Input: tensor([63,  1, 57, 43, 58]), Output: 1
Input: tensor([63,  1, 57, 43, 58,  1]), Output: 42
Input: tensor([63,  1, 57, 43, 58,  1, 42]), Output: 53
Input: tensor([63,  1, 57, 43, 58,  1, 42, 53]), Output: 61

Input: tensor([58]), Output: 46
Input: tensor([58, 46]), Output: 47
Input: tensor([58, 46, 47]), Output: 52
Input: tensor([58, 46, 47, 52]), Output: 49
Input: tensor([58, 46, 47, 52, 49]), Output: 1
Input: tensor([58, 46, 47, 52, 49,  1]), Output: 63
Input: tensor([

## Build Bigram Language Model

Now let's build the **Bigram Language Model** to feed the batches to it.

For the lose function, we will use the **Negative log likelihood** function to calculate the loss between logits and target. To do that we can use **F.cross_entropy** from **torch**.

**cross_entropy** expects the **vocab_size** to be the second dims, thus we will reshape the inputs and outputs.

In [22]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, context, targets=None):
        
        # this returns a torch.tensor with shape of (BATCH_SIZE, CONTEXT_LENGTH, VOCAB_SIZE)
        # e.g. (4, 8, 65)
        logits = self.embedding_table(context)
        
        if targets == None:
            loss = None
        
        else:
            B, C, V = logits.shape

            # we're going to strech out the array, new shape: (BATCH_SIZE * CONTEXT_LENGTH, VOCAB_SIZE)
            # e.g. (4*8, 65) == (32, 65)
            logits = logits.view(B*C, V)

            # and for the targets as well, we're going to change it's shape to be one dim
            # e.g. (32)
            targets = targets.view(B*C)

            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    
    def generate(self, context, max_tokens):
        # Fist of all the context is with (B, C) shape
        for _ in range(max_tokens):
            # we get the prediction, the logits will be in (B, C, V) shape and the loss will be None
            logits, loss = self(context)
            # Focus only on the last character, this will change later
            logits = logits[:, -1, :]
            # get the probability distribuation where the sum of probabilities are equal to 1
            probs = F.softmax(logits, dim=1)
            # get random sample distribution from the probability
            next_token = torch.multinomial(probs, num_samples=1)
            # concatinate the generated token with the previous set of tokens
            context = torch.cat((context, next_token), dim=1)
        
        return context

In [23]:
bigram_model = BigramLanguageModel(vocab_size)

logits, loss = bigram_model(x, y)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8904, grad_fn=<NllLossBackward0>)


In [24]:
# Test
test_idx = torch.zeros((1, 1), dtype=torch.long)
generated_results = bigram_model.generate(test_idx, max_tokens=100)[0].tolist()
print(generated_results)
decoder(generated_results)

[0, 31, 56, 12, 55, 28, 7, 29, 35, 49, 58, 36, 53, 24, 4, 48, 24, 16, 22, 45, 27, 24, 34, 64, 5, 30, 21, 53, 16, 55, 20, 42, 46, 57, 34, 4, 60, 24, 24, 62, 39, 58, 48, 57, 41, 25, 54, 61, 24, 17, 30, 31, 28, 63, 39, 53, 8, 55, 44, 64, 57, 3, 37, 57, 3, 64, 18, 7, 61, 6, 11, 43, 17, 49, 64, 62, 48, 45, 15, 23, 18, 15, 46, 57, 2, 47, 35, 35, 8, 27, 40, 64, 16, 52, 62, 13, 1, 25, 57, 3, 9]


"\nSr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3"

## Training the Bigram Language Model

In [25]:
def training_loop(model, data, epochs, optimizer, batch_size, context_length, device):
    training_loss = []
    model.to(device)
    for epoch in range(epochs):
        x, y = get_batch(data, batch_size, context_length)
        x, y = x.to(device), y.to(device)

        logits, loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        training_loss.append(loss.item())

    return training_loss

In [26]:
# define an optimizer
device = 'mps'
bigram_model = bigram_model.to(device)
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=1e-2)

In [27]:
traning_loss = training_loop(bigram_model, train_set, 100, optimizer, 32, 8, device)

In [28]:
traning_loss[-1]

3.656949520111084

In [29]:
# Test
test_idx = torch.zeros((1, 1), dtype=torch.long)
generated_results = bigram_model.generate(test_idx, max_tokens=300)[0].tolist()
decoder(generated_results)

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [163]:
generated_results

[0,
 13,
 31,
 43,
 1,
 46,
 53,
 59,
 1,
 50,
 1,
 30,
 27,
 24,
 27,
 10,
 0,
 32,
 46,
 43,
 1,
 57,
 58,
 46,
 53,
 56,
 1,
 46,
 43,
 6,
 1,
 51,
 53,
 51,
 63,
 1,
 30,
 17,
 26,
 53,
 59,
 56,
 42,
 43,
 50,
 42,
 0,
 35,
 47,
 43,
 56,
 43,
 1,
 30,
 35,
 47,
 56,
 43,
 39,
 56,
 1,
 43,
 56,
 47,
 52,
 57,
 58,
 1,
 61,
 52,
 45,
 43,
 1,
 46,
 43,
 42,
 1,
 50,
 63,
 1,
 61,
 52,
 47,
 50,
 53,
 40,
 43,
 39,
 1,
 53,
 59,
 50,
 50,
 53,
 44,
 1,
 21,
 44,
 1,
 47,
 56,
 57,
 1,
 44,
 1,
 39,
 52,
 49,
 43,
 43,
 1,
 47,
 41,
 43,
 63,
 53,
 42,
 39,
 58,
 39,
 63,
 1,
 50,
 1,
 61,
 47,
 53,
 59,
 56,
 1,
 40,
 43,
 1,
 39,
 58,
 46,
 43,
 1,
 51,
 63,
 53,
 1,
 51,
 8,
 0,
 0,
 21,
 1,
 58,
 6,
 0,
 24,
 27,
 1,
 16,
 1,
 58,
 46,
 53,
 1,
 58,
 43,
 56,
 43,
 1,
 18,
 18,
 47,
 52,
 1,
 21,
 1,
 58,
 46,
 43,
 1,
 49,
 43,
 42,
 1,
 58,
 46,
 43,
 57,
 1,
 58,
 46,
 1,
 59,
 56,
 63,
 1,
 46,
 43,
 52,
 47,
 58,
 61,
 39,
 52,
 42,
 1,
 44,
 1,
 46,
 6,
 0,
 21,
 26,
 21,


In [155]:
torch.has_mps

True