In [None]:
import json

# Load the supervised fine-tuning conversations from disk.
# The encoding is given explicitly so the JSON is decoded as UTF-8 regardless
# of the platform's default locale encoding.
file_path = "../data/supervised_fine_tuning.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Number of top-level entries in the loaded JSON.
len(data)

In [None]:
import sys
sys.path.append('..')

In [None]:
from minbpe import RegexTokenizer

# Instantiate the tokenizer, then restore its state from the saved model file.
tokenizer_model_path = "../output/tokenizer/darija_tokenizer.model"
tokenizer = RegexTokenizer()
tokenizer.load(model_file=tokenizer_model_path)


def get_vocab_size(tokenizer: RegexTokenizer) -> int:
    """Return the total vocabulary size of ``tokenizer``.

    The size is the number of entries in ``tokenizer.vocab`` plus the number
    of entries in ``tokenizer.special_tokens``.
    """
    return len(tokenizer.vocab) + len(tokenizer.special_tokens)

In [None]:
# System prompt used for every conversation (empty here), together with the
# corresponding chat entry in {"role", "content"} form.
system_message = ""
system_entry = dict(role="system", content=system_message)

In [None]:
# Check how long each conversation is once tokenized. Only the raw message
# contents (newline-terminated) are counted here; no chat markup is added.
block_size = 1024
max_tokens = 0
for conversation in data:
    raw_text = "".join(message["content"] + "\n" for message in conversation)
    token_count = len(tokenizer.encode(raw_text))

    if token_count > max_tokens:
        max_tokens = token_count
    if token_count > block_size:
        print(
            f"Error: Token length exceeds block size. Length: {token_count}, Block size: {block_size}")

# Longest conversation, in tokens.
max_tokens

In [None]:
# Special-token strings used to mark up a chat transcript.
tokens = dict(
    start="<|start_turn|>",
    end="<|end_turn|>",
    separator="<|separator|>",
    eos="<|endoftext|>",
)


def format_message(message: dict) -> str:
    """Render one chat message as a single marked-up turn.

    Args:
        message: Mapping with a ``"role"`` and a ``"content"`` entry.

    Returns:
        The start marker, the role, the separator marker, the content and the
        end marker, concatenated in that order.
    """
    header = f"{tokens['start']}{message['role']}{tokens['separator']}"
    return f"{header}{message['content']}{tokens['end']}"


# Build the tokenized training samples. Every assistant message produces one
# sample: the transcript accumulated so far (including that message) followed
# by the end-of-text marker. The system turn is inserted before a user message
# whenever the transcript is still empty. Roles other than "user" and
# "assistant" are skipped.
fine_tuning_data = []
for conversation in data:
    transcript = ""

    for message in conversation:
        role = message["role"]
        if role not in ("user", "assistant"):
            continue

        if role == "user" and len(transcript) == 0:
            transcript += format_message({
                "role": "system",
                "content": system_message
            })

        transcript += format_message(message)

        if role == "assistant":
            fine_tuning_data.append(
                tokenizer.encode(
                    text=transcript + tokens["eos"],
                    allowed_special="all"
                )
            )

In [None]:
# Length of the longest tokenized sample; used as the padding target.
max_sequence_length = max(map(len, fine_tuning_data))
max_sequence_length

In [None]:
import torch
# Fix the PyTorch RNG seed.
SEED = 3647
torch.manual_seed(SEED)

# Id of the padding special token. The same id is handed to the model as
# `ignore_index`, so padded positions are meant to be left out of the loss.
padding_token = tokenizer.special_tokens["<|padding|>"]


def apply_padding_to_data(data: list[list[int]], max_sequence_length: int, padding_token: int) -> torch.Tensor:
    """Right-pad every token sequence and stack them into one tensor.

    Args:
        data: Token-id sequences, one list per sample.
        max_sequence_length: Target length of every padded row.
        padding_token: Id written into the padded positions.

    Returns:
        A tensor with one row per input sequence.
    """
    padded_rows = []
    for token_ids in data:
        row = torch.tensor(token_ids)
        missing = max_sequence_length - len(row)
        # (0, missing) pads on the right only.
        padded_rows.append(
            torch.nn.functional.pad(
                input=row,
                pad=(0, missing),
                value=padding_token,
            )
        )

    return torch.stack(padded_rows)


# Pad all samples to the longest one and stack them into a single tensor.
train_data_tensor = apply_padding_to_data(
    fine_tuning_data,
    max_sequence_length,
    padding_token,
)
train_data_tensor.shape

In [None]:
# Hold out the last 5% of the samples for validation (no shuffling here).
train_fraction = 0.95
split_index = int(train_fraction * len(train_data_tensor))
train_data_split, val_data_split = (
    train_data_tensor[:split_index],
    train_data_tensor[split_index:],
)

train_data_split.shape, val_data_split.shape

In [None]:
from typing import Tuple
from torch.utils.data import Dataset, DataLoader


class FineTuningDataset(Dataset):
    """Dataset of padded token sequences yielding (input, target) pairs.

    The target is the input shifted left by one position, with one padding
    token appended so that both tensors have the same length.
    """

    def __init__(self, data: torch.Tensor, device: torch.device, padding_token: int):
        # One row per sample.
        self.data = data
        self.device = device
        self.padding_token = padding_token

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the sample at ``index`` and its shifted target, both on ``self.device``."""
        inputs = self.data[index].to(self.device)
        trailing_pad = torch.tensor([self.padding_token], device=self.device)
        # Drop the first token and append padding to keep the length unchanged.
        targets = torch.cat((inputs[1:], trailing_pad))
        return inputs, targets


# Batch size and target device for the fine-tuning data.
batch_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap both splits in datasets that place samples on `device`.
train_dataset = FineTuningDataset(train_data_split, device, padding_token)
val_dataset = FineTuningDataset(val_data_split, device, padding_token)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from transformer.model import GPTLanguageModel

# Model hyperparameters. A pre-trained checkpoint is loaded into this model
# later, so these presumably have to match the pre-training configuration.
# NOTE(review): n_embd (512) is not divisible by n_head (12) - confirm that
# GPTLanguageModel handles this as intended.
block_size = 1024
n_embd = 512
n_head = 12
n_layer = 8
dropout = 0.2
vocab_size = get_vocab_size(tokenizer)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the model, then move it to the selected device.
model = GPTLanguageModel(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device,
    ignore_index=tokenizer.special_tokens["<|padding|>"],
)
model = model.to(device)

# Report the parameter count in millions.
num_parameters = sum(p.numel() for p in model.parameters())
print(num_parameters/1e6, 'M parameters')

In [None]:
# Restore the pre-trained base weights into the model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can also be loaded on a CPU-only one.
checkpoint_path = "../output/pre_training/base/epoch_5.pth"
checkpoint = torch.load(
    checkpoint_path, map_location=device, weights_only=True)
model_state_dict = checkpoint["model_state_dict"]
model.load_state_dict(model_state_dict)

In [None]:
# Quick smoke test of the loaded weights: encode a short prompt as a batch of
# one sequence and let the model continue it.
input_tokens = torch.tensor(
    tokenizer.encode("hello", allowed_special="all"),
    dtype=torch.long,
    device=device,
).unsqueeze(0)

model.eval()
with torch.no_grad():
    output = model.generate(input_tokens=input_tokens, max_new_tokens=100)

generated_text = tokenizer.decode(output[0].tolist())
print(generated_text)

In [None]:
from typing import Dict


@torch.no_grad()
def estimate_loss(
    model: torch.nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
) -> Dict[str, float]:
    """Compute the mean per-batch loss over the train and validation loaders.

    The model is switched to eval mode for the measurement and is always left
    in train mode afterwards. Gradients are disabled by the decorator.

    Args:
        model: Module called as ``model(x, y)`` whose second return value is the loss.
        train_loader: Loader yielding ``(x, y)`` training batches.
        val_loader: Loader yielding ``(x, y)`` validation batches.

    Returns:
        ``{"train": mean_loss, "val": mean_loss}``.
    """
    model.eval()

    loaders = {'train': train_loader, 'val': val_loader}
    output = {}
    for split, loader in loaders.items():
        batch_losses = torch.zeros(len(loader))
        for position, (x, y) in enumerate(loader):
            batch_losses[position] = model(x, y)[1].item()
        output[split] = batch_losses.mean().item()

    model.train()
    return output

In [None]:
def save_checkpoint(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    file_path: str = "checkpoint.pth"
) -> None:
    """Write a training checkpoint to ``file_path``.

    The checkpoint is a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict`` and ``loss``. Missing parent directories of
    ``file_path`` are created first, so saving into a fresh output folder
    does not fail.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number recorded in the checkpoint.
        loss: Loss value recorded in the checkpoint.
        file_path: Destination file.
    """
    import os

    # A bare file name has no parent directory to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }
    torch.save(checkpoint, file_path)

In [None]:
from tqdm import tqdm

# Fine-tuning configuration.
max_iters = 50        # number of passes over train_loader
eval_interval = 5     # evaluate every `eval_interval` batches
learning_rate = 1e-4
save_interval = 20    # checkpoint every `save_interval` epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
train_losses, val_losses = [], []

# Enter training mode explicitly. An earlier cell calls model.eval(), and
# without this line training mode would only be entered as a side effect of
# the first estimate_loss() call.
model.train()

for iteration in range(max_iters):
    for batch_idx, (x_batch, y_batch) in tqdm(
        iterable=enumerate(train_loader),
        desc="Training on batches",
        total=len(train_loader)
    ):
        # Evaluation: on every `eval_interval`-th batch and on the last batch
        # of the epoch. estimate_loss() leaves the model in training mode.
        if batch_idx % eval_interval == 0 or batch_idx == len(train_loader) - 1:
            losses = estimate_loss(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader
            )
            print(
                f"Epoch {iteration} / step {batch_idx}: "
                f"train loss {losses['train']:.4f}, "
                f"val loss {losses['val']:.4f}"
            )
            train_losses.append(losses['train'])
            val_losses.append(losses['val'])

        # Training step
        logits, loss = model(x_batch, y_batch)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Save a periodic checkpoint, recording the loss of the epoch's last batch.
    if iteration % save_interval == 0:
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=iteration,
            loss=loss.item(),
            file_path=f"../output/fine_tuning/qa/base/run_2/checkpoint_{iteration}.pth"
        )

# Final checkpoint, named after the total number of epochs.
save_checkpoint(
    model=model,
    optimizer=optimizer,
    epoch=iteration,
    loss=loss.item(),
    file_path=f"../output/fine_tuning/qa/base/run_2/checkpoint_{max_iters}.pth"
)

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded train/validation losses, one point per evaluation.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label="Train Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_xlabel("Evaluation Step")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Over Time")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Reload the final fine-tuned weights into the model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can also be loaded on a CPU-only one.
checkpoint_path = "../output/fine_tuning/qa/base/run_2/checkpoint_50.pth"
checkpoint = torch.load(
    checkpoint_path, map_location=device, weights_only=True)
model_state_dict = checkpoint["model_state_dict"]
model.load_state_dict(model_state_dict)

In [None]:
def get_input_tokens(turns: list[dict]) -> torch.Tensor:
    """Encode a conversation as a prompt that ends with an open assistant turn.

    Each turn is rendered with ``format_message`` (the same markup used to
    build the training samples); an assistant header without content or end
    marker is then appended so the model continues with the assistant's reply.

    Args:
        turns: Chat messages, each a dict with ``"role"`` and ``"content"``.

    Returns:
        A ``torch.long`` tensor holding a batch of one sequence, on ``device``.
    """
    formatted_input = "".join(format_message(turn) for turn in turns)
    formatted_input += f"{tokens['start']}assistant{tokens['separator']}"

    token_ids = tokenizer.encode(formatted_input, allowed_special="all")
    return torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)


def get_generated_message(input_tokens: torch.Tensor) -> str:
    """Generate the assistant's reply for an encoded prompt, one token at a time.

    Generation stops when the model emits the end-of-text or end-of-turn
    special token, or once the running sequence is longer than ``block_size``.
    A prompt that is already longer than ``block_size`` yields an empty reply.

    Args:
        input_tokens: Prompt token ids as a batch of one sequence.

    Returns:
        The decoded reply, without the stop token.

    Raises:
        Exception: Whatever ``model.advanced_generation`` raises, once it has
            failed several times in a row. A single failure is retried, but a
            persistent one is no longer retried forever.
    """
    stop_tokens = {
        tokenizer.special_tokens["<|endoftext|>"],
        tokenizer.special_tokens["<|end_turn|>"],
    }
    max_consecutive_failures = 3
    consecutive_failures = 0
    generated_tokens = []

    model.eval()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        while input_tokens.shape[1] <= block_size:
            try:
                output_tokens = model.advanced_generation(
                    input_tokens=input_tokens, max_new_tokens=1, temperature=.9, top_k=50, top_p=None)
            except Exception:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    raise
                continue
            consecutive_failures = 0

            last_generated_token = output_tokens[0, -1].item()
            if last_generated_token in stop_tokens:
                break

            input_tokens = torch.cat(
                (input_tokens, output_tokens[:, -1:]), dim=1)
            generated_tokens.append(last_generated_token)

    # Decode the whole reply at once rather than token by token.
    # NOTE(review): presumably a single token can hold only part of a
    # multi-byte character, which per-token decoding would garble - confirm
    # against the tokenizer implementation.
    return tokenizer.decode(generated_tokens)


# Start a new conversation: system prompt plus the first user message.
user_message = ""
turns = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

# Generate the assistant's reply and record it in the conversation.
input_tokens = get_input_tokens(turns=turns)
model_answer = get_generated_message(input_tokens=input_tokens)
turns.append({"role": "assistant", "content": model_answer})

# Print the user and assistant turns; the system turn is not shown.
speaker_labels = {"user": "You", "assistant": "Assistant"}
for turn in turns:
    speaker = speaker_labels.get(turn["role"])
    if speaker is not None:
        print(f"{speaker}: {turn['content']}")

In [None]:
# Continue the conversation with the next user message.
user_message = ""
turns.append({"role": "user", "content": user_message})

# Generate the assistant's reply and record it in the conversation.
input_tokens = get_input_tokens(turns=turns)
model_answer = get_generated_message(input_tokens=input_tokens)
turns.append({"role": "assistant", "content": model_answer})

# Print the user and assistant turns; the system turn is not shown.
speaker_labels = {"user": "You", "assistant": "Assistant"}
for turn in turns:
    speaker = speaker_labels.get(turn["role"])
    if speaker is not None:
        print(f"{speaker}: {turn['content']}")

In [None]:
# Continue the conversation with the next user message.
user_message = ""
turns.append({"role": "user", "content": user_message})

# Generate the assistant's reply and record it in the conversation.
input_tokens = get_input_tokens(turns=turns)
model_answer = get_generated_message(input_tokens=input_tokens)
turns.append({"role": "assistant", "content": model_answer})

# Print the user and assistant turns; the system turn is not shown.
speaker_labels = {"user": "You", "assistant": "Assistant"}
for turn in turns:
    speaker = speaker_labels.get(turn["role"])
    if speaker is not None:
        print(f"{speaker}: {turn['content']}")

In [None]:
# Continue the conversation with the next user message.
user_message = ""
turns.append({"role": "user", "content": user_message})

# Generate the assistant's reply and record it in the conversation.
input_tokens = get_input_tokens(turns=turns)
model_answer = get_generated_message(input_tokens=input_tokens)
turns.append({"role": "assistant", "content": model_answer})

# Print the user and assistant turns; the system turn is not shown.
speaker_labels = {"user": "You", "assistant": "Assistant"}
for turn in turns:
    speaker = speaker_labels.get(turn["role"])
    if speaker is not None:
        print(f"{speaker}: {turn['content']}")

In [None]:
# Continue the conversation with the next user message.
user_message = ""
turns.append({"role": "user", "content": user_message})

# Generate the assistant's reply and record it in the conversation.
input_tokens = get_input_tokens(turns=turns)
model_answer = get_generated_message(input_tokens=input_tokens)
turns.append({"role": "assistant", "content": model_answer})

# Print the user and assistant turns; the system turn is not shown.
speaker_labels = {"user": "You", "assistant": "Assistant"}
for turn in turns:
    speaker = speaker_labels.get(turn["role"])
    if speaker is not None:
        print(f"{speaker}: {turn['content']}")