In [None]:
import torch

# Pick the fastest backend that is actually usable on this machine:
# CUDA first, then Apple's Metal backend (MPS), and finally the CPU.
# The getattr guard keeps this working on PyTorch builds that have no
# `torch.backends.mps` module at all.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

# Loading data

## Initial Preparation

First, define where the training data is located.

In [None]:
import os 
# Parent of the current working directory; only needed by the alternative
# (commented-out) dataset location below.
cwd = os.getcwd()
repository_dir = cwd.rpartition('/')[0]

# DATA_DIR = f'{repository_dir}/data/data/preprocessed/preprocessed.txt'
DATA_DIR = 'input.txt'

# Stop early, pointing at the setup instructions, when the file is missing.
assert os.path.exists(DATA_DIR), (
    'Make sure you follow the steps of README.md in data repository'
)

print(f'Training data located at {DATA_DIR}')

Then load the data

In [None]:
# Read the whole corpus into memory as a single string.
with open(DATA_DIR, mode='r', encoding='utf-8') as corpus_file:
    raw_data = corpus_file.read()

# Inspection: total size and a short preview of the text.
n_chars = len(raw_data)
preview = raw_data[:100]
print(f'Loaded {n_chars} characters')
print(f'First 100 characters:\n{preview}')

## Tokenization

Tokenization turns the raw text into discrete tokens (the 'words' that the model understands) and maps each token to an integer id. The vocabulary defined below is character-level: every distinct character in the data is one token.

TODO: check out other tokenizers:
tiktoken - https://github.com/openai/tiktoken
sentencepiece - https://github.com/google/sentencepiece

In [None]:
# Character-level vocabulary: every distinct character in the corpus,
# sorted so that the character -> id assignment is deterministic.
chars = sorted(set(raw_data))
VOCAB_SIZE = len(chars)
print(f'Vocabulary ({VOCAB_SIZE} characters): {"".join(chars)}')

# Lookup tables in both directions; a character's id is its position in `chars`.
char_to_digit = {char: digit for digit, char in enumerate(chars)}
digit_to_char = dict(enumerate(chars))


def encode(string):
    """Convert a string into the list of integer ids of its characters.

    Raises KeyError if `string` contains a character that is not in the
    vocabulary built from `raw_data`.
    """
    return [char_to_digit[char] for char in string]


def decode(digits):
    """Convert an iterable of integer ids back into the string they encode.

    Raises KeyError if an id is outside the vocabulary.
    """
    return ''.join(digit_to_char[digit] for digit in digits)

In [None]:
import torch # we use PyTorch: https://pytorch.org
# Encode the whole corpus once and store the ids as a 1-D int64 tensor.
data = torch.tensor(encode(raw_data), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the first 1000 characters of the corpus, as the model will see them

## Train/Val Split

In [None]:
# The first part of the corpus is used for training and the remaining
# tail is held out for validation.
TRAIN_PROPORTION = 0.9  # 90% of data is for training, 10% for validation
idx = int(len(data) * TRAIN_PROPORTION)  # index of the first validation token
train_data, val_data = data[:idx], data[idx:]

## Chunking data

It is important to note that it is computationally infeasible to train on the **whole** dataset at once because of its size. This is why the data is split into *chunks*: short, randomly sampled contiguous slices of the dataset. The size of a chunk is defined by the variable `BLOCK_SIZE`.

### Blocks

In [None]:
# Number of consecutive tokens in one chunk (used here for illustration).
BLOCK_SIZE = 64
# Display one chunk plus one extra token: BLOCK_SIZE + 1 tokens.
train_data[:BLOCK_SIZE+1]


Training works as follows: every character `c` in a chunk serves as a label, and the input for that label is all the characters that precede `c` in the chunk.
This approach is useful because the model learns to predict from as little as a single character of context up to a full block of `BLOCK_SIZE` characters.
#### Example:

In [None]:
example = "Hello, there"

# Table header; the padding puts "|" in the same column as in the rows below.
header = "Input" + " " * 7 + "| Output"
print(header)
print("=" * 22)

# Each row pairs a growing prefix of the text with the character that follows it.
for t in range(1, len(example)):
    inputs, output = example[:t], example[t]
    print(f"{inputs:<11} | {output}")


### Batch

Besides the time dimension (the position within a block), there is a second dimension: the batch. Multiple blocks are sampled and stacked on top of each other to create a batch, so that multiple samples can be processed in parallel.

In [None]:
SEED = 1338        # seed for PyTorch's global random number generator
BATCH_SIZE = 64    # number of blocks stacked into one batch
BLOCK_SIZE = 128   # tokens per block; replaces the value 64 set in the Blocks section

# Seed the global generator so the random sampling that follows is reproducible.
torch.manual_seed(SEED)

def get_batch(split = 'train'):
    """Sample a random batch of (input, target) blocks from one data split.

    split: 'train' samples from `train_data`; any other value samples
        from `val_data`.

    Returns a pair (x, y) of tensors of shape (BATCH_SIZE, BLOCK_SIZE), both
    moved to `device`. y holds the same positions as x shifted one token to
    the right, i.e. the next-token target for every input position.
    """
    source = train_data if split == 'train' else val_data

    # One random start per batch row. The upper bound keeps the last target
    # position (start + BLOCK_SIZE) inside the split.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    offsets = torch.arange(0, BLOCK_SIZE, dtype=torch.int)

    # Broadcasting (BATCH_SIZE, 1) + (BLOCK_SIZE,) gives one row of
    # consecutive positions per sampled start.
    positions = starts.unsqueeze(1) + offsets

    inputs = source[positions]
    targets = source[positions + 1]
    return inputs.to(device), targets.to(device)

print(get_batch())


## Loss Estimation

In [None]:
@torch.no_grad()
def estimate_loss(model, eval_iters = 100):
    """Estimate the mean loss of `model` on the training and validation data.

    model: module that returns a scalar loss when called as `model(x, y)`.
    eval_iters: number of random batches averaged per split.

    Returns a dict {"train": float, "validation": float}.

    `get_batch` treats every split name other than 'train' as validation
    data, so the training split has to be requested as "train"; asking for
    "test" would silently measure the validation loss a second time.

    The model is put in eval mode for the measurement and is returned to
    whatever mode it was in before the call, even if evaluation raises.
    """
    was_training = model.training
    model.eval()
    out = {}

    try:
        for split in ("train", "validation"):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                x, y = get_batch(split)
                losses[k] = model(x, y).item()
            out[split] = losses.mean().item()
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)

    return out

# Bigram Language Model

In [None]:
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token only.

    The model is a single (vocab_size x vocab_size) embedding table; looking
    up token id i returns the logits over the next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of `vocab_size` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def predict(self, input):
        """Return next-token logits of shape (B, T, C) for token ids of shape (B, T)."""
        return self.token_embedding_table(input)

    def forward(self, input, targets):
        """Return the mean cross-entropy between predicted and target tokens.

        input, targets: integer token ids of shape (B, T).
        """
        logits = self.predict(input) # logits.shape = B, T, C - Batch, Time, Channels
        B, T, C = logits.shape
        # cross_entropy expects (N, C) logits and (N,) targets, so batch and
        # time are flattened together. reshape (unlike view) also accepts
        # non-contiguous tensors.
        flat_logits = logits.reshape(B * T, C)
        flat_targets = targets.reshape(B * T)
        return F.cross_entropy(flat_logits, flat_targets)

    @torch.no_grad()
    def generate(self, input, max_characters=20):
        """Sample `max_characters` new tokens after the given context.

        input: integer token ids of shape (B, T) used as the starting context.

        Returns the first sequence of the batch (context followed by the
        sampled tokens) as a 1-D tensor of length T + max_characters.
        Sampling runs without gradient tracking.
        """
        for _ in range(max_characters):
            # Only the logits at the last time step predict the next token.
            logits = self.predict(input)[:, -1, :]
            distribution = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(distribution, num_samples=1)
            input = torch.cat((input, next_token), dim=1)
        return input[0]

model = BigramLanguageModel(VOCAB_SIZE).to(device)
# The batch itself is not used below, but drawing it advances the global
# random generator, which affects the sample printed by this cell.
x, y = get_batch()

# Start from a single token with id 0 and sample with the default length
# to see what the untrained model produces.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
res = model.generate(context)
generated_text = decode(res.tolist())
print(generated_text)


## Create an optimizer

In [None]:
# AdamW: Adam with decoupled weight decay.
# https://arxiv.org/abs/1711.05101v3
# https://paperswithcode.com/method/adamw
LEARNING_RATE = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

## Training the model

In [None]:

import time

def train(epochs = 100000, eval_interval = 10000):
    """Run the optimisation loop on the global `model` with the global `optimizer`.

    epochs: number of optimisation steps; each step uses one random
        training batch from `get_batch()`.
    eval_interval: every `eval_interval` steps (starting at step 0) the
        losses from `estimate_loss` are printed, followed by the number of
        seconds elapsed since the previous report. Pass 0 or None to
        disable the periodic evaluation.
    """
    # perf_counter is monotonic, so elapsed times cannot be distorted by
    # system clock adjustments.
    start = time.perf_counter()

    for i in range(epochs):
        x, y = get_batch()
        # Call the module itself rather than .forward() so that any
        # registered hooks run as well.
        loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if eval_interval and i % eval_interval == 0:
            print(f'Epoch {i} - {estimate_loss(model)}')
            end = time.perf_counter()
            print(end - start)
            start = time.perf_counter()

    
train()

In [None]:
# Sample 200 characters from the model, starting from the token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
res = model.generate(context, max_characters=200)
generated_text = decode(res.tolist())
print(generated_text)