In [None]:
import sys
sys.path.append('..')

In [None]:
from minbpe import RegexTokenizer

# Start from an empty tokenizer and populate it from the saved model file.
tokenizer = RegexTokenizer()
tokenizer.load(
    model_file="../output/tokenizer/darija_tokenizer.model",
)


def get_vocab_size(tokenizer: RegexTokenizer) -> int:
    """Return ``len(tokenizer.vocab) + len(tokenizer.special_tokens)``.

    Args:
        tokenizer: Object exposing sized ``vocab`` and ``special_tokens``
            attributes.

    Returns:
        The sum of both sizes. This presumably assumes special tokens are not
        already present in ``vocab`` -- verify against the tokenizer.
    """
    return len(tokenizer.vocab) + len(tokenizer.special_tokens)

In [None]:
import torch
torch.manual_seed(3647)

In [None]:
from transformer.model import GPTLanguageModel

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model and batching hyperparameters (also read by later cells).
# NOTE(review): n_embd (512) is not a multiple of n_head (12) -- confirm how
# GPTLanguageModel derives the per-head size.
vocab_size = get_vocab_size(tokenizer)
batch_size = 4
block_size = 1024
n_layer = 8
n_head = 12
n_embd = 512
dropout = 0.2

model = GPTLanguageModel(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device,
)
model = model.to(device)

# Report the parameter count in millions.
print(sum(param.numel() for param in model.parameters()) / 1e6, 'M parameters')

In [None]:
import numpy as np


# Memory-map the encoded dataset read-only instead of reading it all into RAM.
data = np.load(
    '../output/encoded_data/encoded_atlaset.npy',
    mmap_mode='r',
)
print('Data shape:', data.shape)

In [None]:
# Boundary index at 90% of the data; later cells treat [0, split_index) as the
# training split and [split_index, len(data)) as the other split.
split_index = int(len(data) * 0.9)
split_index

In [None]:
from typing import Tuple

def get_batch_for_loss_estimation(split:str) -> Tuple[torch.Tensor, torch.Tensor]:
    """Draw a random batch of block-aligned windows from one data split.

    Args:
        split: ``'train'`` selects ``data[0:split_index]``; any other value
            selects ``data[split_index:len(data)]``.

    Returns:
        ``(x, y)`` long tensors on ``device``, one row per sampled block of
        ``block_size`` tokens, where ``y`` is ``x`` shifted one position ahead.
    """
    if split == 'train':
        lower, upper = 0, split_index
    else:
        lower, upper = split_index, len(data)

    # The "- 1" leaves room for the one-token look-ahead of the targets.
    n_blocks = (upper - lower - 1) // block_size
    chosen_blocks = torch.randint(0, n_blocks, (batch_size,))

    offsets = [lower + block_id * block_size for block_id in chosen_blocks.tolist()]
    inputs = np.array([data[off:off + block_size] for off in offsets])
    targets = np.array([data[off + 1:off + block_size + 1] for off in offsets])

    return (
        torch.tensor(inputs, dtype=torch.long).to(device),
        torch.tensor(targets, dtype=torch.long).to(device),
    )


In [None]:
from typing import Dict

@torch.no_grad()
def estimate_loss(eval_iters: int = 1000) -> Dict:
    """Estimate the mean loss on the train and validation splits.

    For each of ``'train'`` and ``'val'``, draws ``eval_iters`` random batches
    with ``get_batch_for_loss_estimation`` and averages the loss returned by
    the global ``model``. Gradients are disabled by the decorator.

    Args:
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict with keys ``'train'`` and ``'val'``, each a 0-dim tensor holding
        the mean loss for that split.
    """
    output = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                x, y = get_batch_for_loss_estimation(split)
                _, loss = model(x, y)
                losses[k] = loss.item()
            output[split] = losses.mean()
    finally:
        # Switch back to training mode even if evaluation is interrupted.
        model.train()
    # Return only after BOTH splits are evaluated; returning from inside the
    # loop would leave 'val' missing from the result.
    return output

In [None]:
def save_checkpoint(
    model: GPTLanguageModel,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    file_path: str = "checkpoint.pth"
) -> None:
    """Write model and optimizer state to ``file_path`` with ``torch.save``.

    Args:
        model: Model whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Progress counter stored under ``'epoch'``.
        loss: Loss value stored under ``'loss'``.
        file_path: Destination file. Missing parent directories are created.
    """
    import os

    # Create the target directory up front so a missing folder cannot make the
    # save fail after a long training run.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }
    torch.save(checkpoint, file_path)

In [None]:
from tqdm import tqdm

torch.set_float32_matmul_precision('high')

# Sliding-window pre-training pass over the training split.
# NOTE(review): consecutive samples start one token apart and each batch
# advances by only `batch_size` tokens, so windows overlap heavily. Confirm this
# cell is still wanted alongside the non-overlapping training cell further down;
# running both trains the same `model` twice.
batch_size = 4
block_size = 1024
gradient_accumulation_steps = 8
eval_interval = 100
save_interval = 10000

total_data_to_process = split_index - block_size
# Number of loop iterations (a trailing partial stride counts too), so the
# progress-bar total matches the range being iterated.
total_data_to_process_in_batches = len(range(0, total_data_to_process, batch_size))

learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

batches_processed = 0
train_losses, val_losses = [],[]

optimizer.zero_grad(set_to_none=True)
for i in tqdm(
    iterable=range(0, total_data_to_process, batch_size),
    desc="Processing",
    total=total_data_to_process_in_batches
):
    # Each sample is a block_size window; targets are the inputs shifted by one.
    window_starts = range(i, i+batch_size)
    x_batch = np.array([data[j:j+block_size] for j in window_starts])
    y_batch = np.array([data[j+1:j+block_size+1] for j in window_starts])

    x_batch = torch.tensor(x_batch, dtype=torch.long).to(device)
    y_batch = torch.tensor(y_batch, dtype=torch.long).to(device)

    logits, loss = model(x_batch, y_batch)
    # Keep the unscaled loss for checkpoints; only the backward pass uses the
    # value divided by the accumulation factor.
    unscaled_loss = loss.detach()
    (loss / gradient_accumulation_steps).backward()

    # Apply the accumulated gradients every `gradient_accumulation_steps`
    # batches; without this the weights are never updated inside the loop.
    batches_processed +=1
    if batches_processed % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

    if batches_processed % eval_interval == 0:
        losses = estimate_loss()
        print(
            f"Batch {batches_processed}: "
            f"train loss {losses['train']:.4f}, "
            f"val loss {losses['val']:.4f}"
        )
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])

    if batches_processed % save_interval == 0:
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=batches_processed,
            loss=unscaled_loss.item(),
            file_path=f"../output/pre_training/run_1/checkpoint_{batches_processed}.pth"
        )

# Flush gradients left over from an incomplete accumulation window.
if batches_processed % gradient_accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)



In [None]:
from tqdm import tqdm

torch.set_float32_matmul_precision('high')

gradient_accumulation_steps = 8
eval_interval = 3000
save_interval = 10000

# Calculate the number of complete non-overlapping blocks
non_overlapping_blocks = (split_index - 1) // block_size
# Number of loop iterations, counting a final partial batch, so the
# progress-bar total matches the range being iterated.
total_batches = len(range(0, non_overlapping_blocks, batch_size))

learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

batches_processed = 0
train_losses, val_losses = [], []
optimizer.zero_grad(set_to_none=True)

for i in tqdm(
    iterable=range(0, non_overlapping_blocks, batch_size),
    desc="Processing",
    total=total_batches
):
    # Load a batch of non-overlapping blocks; the last batch may hold fewer
    # than `batch_size` blocks but always holds at least one.
    block_starts = [
        block * block_size
        for block in range(i, min(i + batch_size, non_overlapping_blocks))
    ]
    x_batch = np.array([data[start:start+block_size] for start in block_starts])
    y_batch = np.array([data[start+1:start+block_size+1] for start in block_starts])

    x_batch = torch.tensor(x_batch, dtype=torch.long).to(device)
    y_batch = torch.tensor(y_batch, dtype=torch.long).to(device)

    # Forward pass
    logits, loss = model(x_batch, y_batch)
    # Keep the unscaled loss for checkpoints; only the backward pass uses the
    # value divided by the accumulation factor.
    unscaled_loss = loss.detach()
    (loss / gradient_accumulation_steps).backward()

    # Gradient accumulation
    batches_processed += 1
    if batches_processed % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

    # Evaluate the model
    if batches_processed % eval_interval == 0:
        losses = estimate_loss()
        print(
            f"Batch {batches_processed}: "
            f"train loss {losses['train']:.4f}, "
            f"val loss {losses['val']:.4f}"
        )
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])

    # Save the model
    if batches_processed % save_interval == 0:
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=batches_processed,
            loss=unscaled_loss.item(),
            file_path=f"../output/pre_training/run_1/checkpoint_{batches_processed}.pth"
        )

# Flush gradients left over from an incomplete accumulation window.
if batches_processed % gradient_accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

# Final checkpoint; skipped when no batch ran, because no loss exists then.
if batches_processed > 0:
    save_checkpoint(
        model=model,
        optimizer=optimizer,
        epoch=batches_processed,
        loss=unscaled_loss.item(),
        file_path=f"../output/pre_training/run_1/checkpoint_{batches_processed}.pth"
    )

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded loss estimates, one point per evaluation.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label="Train Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_ylim(0)  # anchor the y-axis at zero
ax.set_xlabel("Evaluation Step")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Over Time")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Encode the prompt as a single-row batch of token ids on the model's device.
input_tokens = torch.tensor(
    [tokenizer.encode("hello")], dtype=torch.long
).to(device)

# Generate 100 new tokens in eval mode with gradients disabled.
model.eval()
with torch.no_grad():
    output = model.generate(input_tokens=input_tokens, max_new_tokens=100)

# Decode the first (only) sequence of the batch back to text.
print(tokenizer.decode(output[0].tolist()))