In [None]:
import random
import os
import pickle
import time
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [None]:
# Use one seed for every random number generator the notebook relies on
# (Python's `random`, NumPy and PyTorch) so that runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device set to {device}.")

# Data Preparation

In [None]:
# Helper functions to load and save data
def save_data(data, file_path, encoding='utf-8'):
    """Write the string `data` to `file_path`, replacing any existing content.

    Args:
        data: Text to write.
        file_path: Destination path.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            bytes on disk do not depend on the platform's locale.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        f.write(data)

def load_data(file_path, encoding='utf-8'):
    """Return the entire content of the text file at `file_path` as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            characters read do not depend on the platform's locale.

    Returns:
        The file content as a `str`.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

In [None]:
# Directory holding the dataset (placeholder - replace with a real path).
# It must contain four files: train.txt, val.txt, test.txt and meta.pkl.
DATA_DIR = "/yourDataDirectoryHere"
# Directory holding the pre-trained model checkpoint (placeholder - replace with a real path).
# The checkpoint is looked up there as "<model_name>.pth".
MODEL_DIR = "/yourModelDirectoryHere"

In [None]:
# Read the dataset metadata: the vocabulary size plus the lookup tables used by
# encode()/decode(). Both `vocab_size` and `meta` are reset on every run of this
# cell, so a missing meta.pkl can never leave a stale `meta` from a previous
# run (or an unbound name) behind.

meta_path = os.path.join(DATA_DIR, 'meta.pkl')
vocab_size = None
meta = None

if os.path.exists(meta_path):
    # NOTE(security): pickle.load can execute arbitrary code. Only open a
    # meta.pkl that you generated yourself or obtained from a trusted source.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    print(f"found vocab_size = {vocab_size} (inside {meta_path})")
else:
    print("Meta file not found. Please ensure the meta.pkl file is present in the data directory.")

# Encode and decode functions for character-level tokenization
def encode(s):
    """Map each character of `s` to its integer id via meta['stoi'].

    Returns a list with one id per character; a character missing from the
    table raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(meta['stoi'][ch])
    return ids

def decode(l):
    """Turn an iterable of integer ids back into text via meta['itos'].

    Returns the looked-up strings concatenated in order; an id missing from
    the table raises KeyError.
    """
    pieces = []
    for token_id in l:
        pieces.append(meta['itos'][token_id])
    return ''.join(pieces)

In [None]:
# Read the raw text of the three splits
train_data = load_data(os.path.join(DATA_DIR, 'train.txt'))
val_data = load_data(os.path.join(DATA_DIR, 'val.txt'))
test_data = load_data(os.path.join(DATA_DIR, 'test.txt'))

# Tokenize each split and pack the ids as unsigned 16-bit integers
# (two bytes per token, so every id has to be smaller than 65536)
train_ids = np.array(encode(train_data), dtype=np.uint16)
val_ids = np.array(encode(val_data), dtype=np.uint16)
test_ids = np.array(encode(test_data), dtype=np.uint16)

# Dump the packed ids to .bin files in the current working directory.
# Tip: choose "Files only" as the session persistence option so the encoded
# files survive and the data does not have to be encoded again each time.
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')
test_ids.tofile('test.bin')

print("Encoded data saved as binary files.")

In [None]:
# The encoded arrays now live on disk as .bin files; drop the in-memory copies
del train_ids, val_ids, test_ids

In [None]:
# Re-open the encoded train/val splits as read-only memory maps of uint16 ids.
# The paths are relative to the working directory - the same place the encoding
# cell writes train.bin / val.bin to - instead of a hard-coded
# "/kaggle/working" prefix, so the notebook also runs outside Kaggle.
train_data = np.memmap('train.bin', dtype=np.uint16, mode='r')
val_data = np.memmap('val.bin', dtype=np.uint16, mode='r')

# Model

In [None]:
# ---- Model architecture ----
vocab_size = 53      # Vocabulary size (NOTE: replaces the value read from meta.pkl earlier in the notebook)
block_size = 256     # Maximum context length
n_layer = 6          # Number of transformer blocks
n_head = 6           # Number of attention heads
n_embd = 372         # Embedding dimension
dropout = 0          # Dropout rate

# ---- Optimisation schedule ----
batch_size = 64          # Batch size for training
max_iters = 100_000      # Maximum number of iterations
learning_rate = 1e-3     # Initial learning rate value
# Learning-rate decay milestones, placed at 70%, 80% and 90% of max_iters
miles = [int(max_iters * fraction) for fraction in (0.7, 0.8, 0.9)]

# ---- Evaluation ----
eval_interval = 10_000   # Evaluate every this many training iterations
eval_iters = 1000        # Number of batches averaged per split in one evaluation

# ---- Fine-tuning ----
# Checkpoint to fine-tune, written without the ".pth" extension
# (leave it as an empty string to train from scratch)
model_name = 'yourModelNameWithoutExtensionHere'

# LoRA rank; set it to 0 to train from scratch or to perform full fine-tuning
lora_r = 12

# Whether to wrap the model with torch.compile
# (NOTE: this name shadows Python's built-in compile() inside the notebook)
compile = False

In [None]:
# Report the size of the training set and how many passes over it the run makes.
# A "pseudo-epoch" is the number of iterations whose batches add up to as many
# tokens as the training set holds (batches are drawn at random, so it is not
# a true epoch).
print(f"Data in tokens: {len(train_data)}")
iters4epoch = len(train_data)//(batch_size * block_size)
print(f"Number of iters for one pseudo-epoch : {iters4epoch}")
if iters4epoch > 0:
    print(f"Number of pseudo-epochs : {max_iters / iters4epoch:.2f}")
else:
    # Fewer tokens than one batch consumes: the ratio is undefined, so avoid dividing by zero
    print("Number of pseudo-epochs : n/a (training set is smaller than one batch)")

In [None]:
# Define the full model architecture, with the LoRA adapter layer implemented alongside it
class LayerNorm(nn.Module):
    """Layer normalisation whose bias term can be switched off.

    Normalises over the trailing dimension of size `ndim` using a learnable
    scale (`weight`, initialised to ones) and, when `bias` is truthy, a
    learnable shift (`bias`, initialised to zeros).
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Apply layer normalisation to `input` with epsilon 1e-5."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )

class Head(nn.Module):
    """One head of causal self-attention.

    Projects the input to keys, queries and values of width `head_size` and
    lets every position attend to itself and to earlier positions only.
    PyTorch's fused `scaled_dot_product_attention` is used when the installed
    version provides it; otherwise an explicit implementation of the same
    computation is used.

    Reads the module-level `n_embd` and `dropout` settings.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # True when the fused attention function exists in this PyTorch build
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        # Used by the fallback path only; the fused function takes dropout_p directly
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd); returns (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        v = self.value(x) # (B, T, head_size)

        if self.flash:
            # Fused path; dropout is only applied while training
            return torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True
            )

        # Fallback path: softmax(q k^T / sqrt(head_size)) v with a causal mask
        scores = (q @ k.transpose(-2, -1)) * (k.size(-1) ** -0.5)  # (B, T, T)
        causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
        scores = scores.masked_fill(~causal, float('-inf'))  # hide future positions
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ v
    

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel.

    Runs `num_heads` independent `Head` modules on the same input,
    concatenates their outputs along the feature axis and projects the result
    to `n_embd` features (module-level setting), followed by dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. This equals
        # n_embd whenever n_embd is divisible by the number of heads, and keeps
        # the projection valid when it is not.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all head outputs for `x`."""
        # Concatenate the outputs from each head
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, GELU, project back, dropout.

    Both linear layers are bias-free. The dropout probability comes from the
    module-level `dropout` setting.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim, bias=False),
            nn.GELU(),
            nn.Linear(hidden_dim, n_embd, bias=False),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`."""
        return self.net(x)

class LinearLoRA(nn.Module):
    """Wrap a linear layer with a trainable low-rank (LoRA) update.

    The wrapped layer's weight is frozen (its bias, if any, is left as it is).
    The output is `original_layer(x) + x @ lora_a @ lora_b`, where `lora_a` has
    shape (in_features, rank) and `lora_b` has shape (rank, out_features).
    `lora_b` starts at zero, so a freshly wrapped layer produces exactly the
    output of the original layer.
    """

    def __init__(self, original_layer, rank=8):
        super().__init__()
        original_layer.weight.requires_grad_(False)
        self.original_layer = original_layer
        self.rank = rank

        in_dim = original_layer.in_features
        out_dim = original_layer.out_features
        self.lora_a = nn.Parameter(torch.randn(in_dim, rank))
        self.lora_b = nn.Parameter(torch.randn(rank, out_dim))

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the adapter: Kaiming-uniform `lora_a`, all-zero `lora_b`."""
        nn.init.zeros_(self.lora_b)
        nn.init.kaiming_uniform_(self.lora_a, a=np.sqrt(5))

    def forward(self, x):
        """Return the frozen layer's output plus the low-rank correction."""
        base_out = self.original_layer(x)
        low_rank_delta = (x @ self.lora_a) @ self.lora_b
        return base_out + low_rank_delta
    
class Block(nn.Module):
    """Pre-norm transformer block: self-attention, then a feed-forward MLP.

    Each sub-layer is applied to a layer-normalised copy of its input and the
    result is added back to that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Every head gets an equal share of the embedding width
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd, bias=False)
        self.ln2 = nn.LayerNorm(n_embd, bias=False)

    def forward(self, x):
        """Run the attention and feed-forward sub-layers, each with a residual add."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class GPT(nn.Module):
    """GPT language model with optional LoRA adapters.

    The architecture is sized by the module-level settings `vocab_size`,
    `n_embd`, `block_size`, `n_head` and `n_layer`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd, bias=False)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            idx: Integer tensor of token ids with shape (B, T).
            targets: Optional integer tensor of target ids with shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        # Build the position indices on the input's own device, so the model
        # does not depend on the notebook-level `device` variable
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd)
        x = self.blocks(x) # (B, T, n_embd)
        x = self.ln_f(x) # (B, T, n_embd)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # Compute loss if targets are provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively after the context `idx`.

        Runs without gradient tracking, since sampling is never back-propagated
        through. Returns `idx` extended to shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] # Crop to the last block_size tokens
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] # Focus on the last time step
            probs = F.softmax(logits, dim=-1) # Convert to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # Sample from the distribution
            idx = torch.cat((idx, idx_next), dim=1) # Append sampled index to the sequence
        return idx

    def activate_lora(self, r=8, heads_only=False, freeze_others=True):
        """Insert LoRA adapters of rank `r` into the model.

        Args:
            r: Rank of the adapters; also stored as `self.lora_rank`.
            heads_only: When True only the attention "query", "key" and
                "value" projections are wrapped; otherwise every nn.Linear is.
            freeze_others: When True every non-LoRA parameter is frozen.
        """
        self.lora_rank = r
        self.replace_multihead_attention_recursion(heads_only)
        if freeze_others:
            self.freeze_parameters_except_lora_and_bias()

    def replace_multihead_attention_recursion(self, heads_only=False, model=None):
        """Recursively wrap the selected linear layers of `model` in LinearLoRA.

        `model` defaults to `self`. Layers that are already wrapped are left
        alone, so calling this more than once does not nest adapters.
        """
        parent = self if model is None else model
        # Snapshot the children: the loop replaces attributes on `parent`
        for name, module in list(parent.named_children()):
            if isinstance(module, LinearLoRA):
                # Already adapted; descending would wrap its inner layer again
                continue

            if heads_only and name in {"query", "key", "value"}:
                setattr(parent, name, LinearLoRA(module, rank=self.lora_rank))
            elif isinstance(module, nn.Linear) and not heads_only:
                setattr(parent, name, LinearLoRA(module, rank=self.lora_rank))
            else:
                # Recursive call for child modules
                self.replace_multihead_attention_recursion(heads_only, model=module)

    def freeze_parameters_except_lora_and_bias(self):
        """Make only the LoRA parameters trainable.

        Despite the method's name, bias parameters are frozen as well: a
        parameter stays trainable only if its name contains "lora_".
        """
        for name, param in self.named_parameters():
            param.requires_grad = "lora_" in name

In [None]:
# Get random batch of data
def get_batch(split):
    """Draw a random batch of (input, target) windows from one split.

    Args:
        split: 'train' selects the training data; any other value selects the
            validation data.

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) on
        `device`, where y is x shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()

    def window(offset):
        # Copy one block_size-long slice out of the memmap as an int64 tensor
        return torch.from_numpy(source[offset:offset + block_size].astype(np.int64))

    x = torch.stack([window(start) for start in starts])
    y = torch.stack([window(start + 1) for start in starts])
    return x.to(device), y.to(device)

# Estimate loss on train and val splits
@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches of each split.

    Puts the global `model` in eval mode for the measurement and back in
    train mode afterwards. Returns a dict with keys 'train' and 'val', each
    holding the mean loss as a 0-d tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages


# Helper function to make large numbers of parameters human-readable
def human_readable(num):
    """Format a number with a thousands suffix, e.g. 10_700_000 -> '11M'.

    The value is divided by 1000 until it drops below 1000 and is then
    rendered without decimals, followed by '', 'K', 'M', 'G', 'T' or 'P'.
    Values too large for the biggest suffix stay expressed in 'P' instead of
    raising IndexError.
    """
    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    magnitude = 0
    # Never step past the last known suffix
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, suffixes[magnitude])

In [None]:
# load the language model
def load_model():
    """
    Load the pre-trained model named by `model_name` from `MODEL_DIR`.

    Two checkpoint layouts are accepted: a dict with 'lora_rank' and
    'state_dict' entries, or a bare state dict. Keys carrying the
    "_orig_mod." prefix that torch.compile adds are normalised, so checkpoints
    saved from compiled and uncompiled models both load, whatever the value of
    the `compile` flag.

    Returns:
        (model, has_lora): the model moved to `device` (wrapped by
        torch.compile when `compile` is set and compilation is available), and
        True when the checkpoint records a LoRA rank greater than zero.

    Raises:
        FileNotFoundError: if "<MODEL_DIR>/<model_name>.pth" does not exist.
    """
    model_path = os.path.join(MODEL_DIR, f"{model_name}.pth")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file '{model_path}' not found.")

    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(model_path, map_location=device)
    if 'lora_rank' in checkpoint:
        r = checkpoint['lora_rank']
        state_dict = checkpoint['state_dict']
    else:
        r = -1
        state_dict = checkpoint

    # Drop the prefix torch.compile puts in front of every parameter name
    state_dict = {key.replace("_orig_mod.", ""): value for key, value in state_dict.items()}

    model = GPT()
    if r > 0:
        # The adapters must exist before the weights are loaded, otherwise the
        # LoRA entries of the state dict would have no matching parameters
        model.activate_lora(r)
    model.load_state_dict(state_dict)
    model = model.to(device)

    if compile:
        print("Compiling the model...\n")
        try:
            model = torch.compile(model)  # requires PyTorch 2.0
        except Exception as e:
            # Compilation is an optimisation only: report the problem and keep the eager model
            print(f"torch.compile failed ({e}); continuing with the uncompiled model.")

    return model, (r > 0)

In [None]:
# Initialize model and move it to the device (GPU)
if len(model_name) == 0:
    # No checkpoint requested: build a fresh model and place it on the device
    model = GPT()
    m = model.to(device)
    r_exists = False

    # Optionally wrap the fresh model with torch.compile
    if compile:
        print("compiling the model... (takes a ~minute)")
        model = torch.compile(model)
else:
    # Fine-tuning: restore the checkpoint (it may already carry LoRA adapters)
    print("Loading model...\n")
    model, r_exists = load_model()

# Add LoRA adapters unless they are disabled or already came with the checkpoint
if not r_exists and lora_r > 0:
    print("Activating LoRA...")
    model.activate_lora(lora_r)
    model = model.to(device)

# Count only the parameters that will receive gradient updates
num_parameters = sum(
    map(torch.numel, filter(lambda tensor: tensor.requires_grad, model.parameters()))
)
num_parameters_hr = human_readable(num_parameters)
print(f'The model has {num_parameters_hr} trainable parameters')

# Training

In [None]:
# AdamW optimizer over all model parameters, starting at `learning_rate`
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# Scheduler that multiplies the learning rate by 0.1 at each milestone in `miles`
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer=optimizer,
    milestones=miles,
    gamma=0.1,
)

In [None]:
# Timestamp of this run; it becomes part of the checkpoint file name so that
# experiments can be told apart
now = datetime.datetime.now()
date_hour = now.strftime("%Y-%m-%d_%H-%M")

# Train
# Start training timer
start_time = time.time()

# Training loop
# The counter is called `step` rather than `iter`: a notebook-level variable
# named `iter` would shadow Python's built-in iter() for the rest of the session.
for step in range(max_iters):

    # evaluate the model on the train and val splits and log the losses
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f'iter {step:5d} | train loss {losses["train"]:.4f} | val loss {losses["val"]:.4f}')

    # train the model for one iteration
    xb, yb = get_batch('train')

    # forward pass
    logits, loss = model(xb, yb)

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Step the scheduler
    scheduler.step()

# End training timer
end_time = time.time()
print(f'Training time: {(end_time - start_time) / 60}  min')

# Save the trained model, together with the LoRA rank (-1 when LoRA was never activated)
model_path = f"{num_parameters_hr}_{date_hour}.pth"
checkpoint = {
    'lora_rank': getattr(model, "lora_rank", -1),
    'state_dict': model.state_dict()
}

torch.save(checkpoint, model_path)
print(f"Model saved to {model_path}\n")

# Evaluation

In [None]:
# Open the encoded test split (test.bin in the working directory) as a read-only memory map of uint16 ids
test_data = np.memmap('test.bin', dtype=np.uint16, mode='r')

In [None]:
# Evaluate example "line execution counting"
def _extract_counts(text):
    """Return, as floats, every number that directly follows "# " in the part of `text` before its first blank line."""
    first_section = text.split('\n\n')[0].replace("\n", "")
    return [float(match.group()) for match in re.finditer(r"(?<=# )-?\d+(\.\d+)?", first_section)]


def evaluate_example(model, example, max_new_tokens=30):
    """Run the model on one "# count" example and extract expected and generated results.

    The example is split on the "# count" marker: the text before it (plus the
    marker itself) is the prompt, and the numbers after it are the expected
    results.

    Args:
        model: Model exposing `generate(idx, max_new_tokens=...)`.
        example: Full text of one example, prompt and expected answer included.
        max_new_tokens: Generation budget; overridden with 22 when the prompt
            does not contain the substring "for".

    Returns:
        (prompt_text, real_results, generated_results). The two results are
        lists of floats; generated_results is the string "error" when
        generation or decoding fails.
    """
    # Split example and determine maximum new tokens allowed
    splited_example = example.split("# count")
    prompt_text = splited_example[0] + "# count"
    if "for" not in splited_example[0]:
        max_new_tokens = 22

    # Encode prompt and prepare for evaluation
    encoded_example = torch.tensor(encode(prompt_text), dtype=torch.long).unsqueeze(0).to(device)

    # Extract real results from example
    real_results = _extract_counts(splited_example[-1])

    # Generate response from model and extract generated results
    try:
        # Sampling needs no gradients; skipping them saves memory during evaluation
        with torch.no_grad():
            generated = model.generate(encoded_example, max_new_tokens=max_new_tokens)
        response = decode(generated[0].tolist())
        generated_results = _extract_counts(response.split("# count")[-1])
    except Exception:
        # Catch Exception only (not a bare except), so that Ctrl-C still
        # interrupts a long evaluation run instead of being recorded as "error"
        generated_results = "error"
    return prompt_text, real_results, generated_results



# Write results to file
def write_results_to_file(output_file, prompt, real_results, generated_results):
    """Save the evaluation results as a CSV file with one row per example.

    The three sequences become the columns 'Prompt', 'Real_Results' and
    'Generated_Results'; no index column is written.
    """
    columns = {
        'Prompt': prompt,
        'Real_Results': real_results,
        'Generated_Results': generated_results,
    }
    pd.DataFrame(columns).to_csv(output_file, index=False)

In [None]:
# Evaluation Loop

# Decode the whole test split back to text; examples are separated by blank lines
examples = decode(test_data).split("\n\n")
examples = [example for example in examples if example]
# Taking a subset of the examples for short "aimed for verification purposes" evaluations
example_subset = examples[:5000]
# Start evaluation process
prompt = []
real_results = []
generated_results = []

# Iterate through examples and evaluate the model on each one
for example in tqdm(example_subset):
    prompt_text, real_result, result = evaluate_example(model, example)
    prompt.append(prompt_text)
    real_results.append(real_result)
    generated_results.append(result)

# Calculate and print accuracy (an example counts as correct when the generated
# list of numbers equals the expected one exactly)
correct_count = sum(1 for real, generated in zip(real_results, generated_results) if real == generated)
# An empty test set would otherwise raise ZeroDivisionError
accuracy = correct_count / len(generated_results) if generated_results else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")

# Store accuracy in a file
with open("accuracy.txt", 'w') as f:
    f.write(f"Accuracy: {accuracy * 100:.2f}%\n")

# Store predictions in a CSV file (done after accuracy.txt has been closed)
write_results_to_file("predictions.csv", prompt, real_results, generated_results)