In [12]:
import random
import os
import pickle
import time
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [13]:
# Set the random seed for reproducibility
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device set to {device}.")

Device set to cpu.


In [14]:
# Helper functions to load and save data
def save_data(data, file_path):
    with open(file_path, 'w') as f:
        f.write(data)

def load_data(file_path):
    with open(file_path, 'r') as f:
        return f.read()

In [15]:
DATA_DIR="./"


In [16]:
# Attempt to derive vocab_size from the dataset

meta_path = os.path.join(DATA_DIR, 'meta.pkl')
vocab_size = None

if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    print(f"found vocab_size = {vocab_size} (inside {meta_path})")
else:
    print("Meta file not found. Please ensure the meta.pkl file is present in the data directory.")

# Encode and decode functions for character-level Tokenzation 
def encode(s):
    return [meta['stoi'][c] for c in s]

def decode(l):
    return ''.join([meta['itos'][i] for i in l])

found vocab_size = 36 (inside ./meta.pkl)


In [17]:
# Load data
train_data = load_data(os.path.join(DATA_DIR, 'train.txt'))
val_data = load_data(os.path.join(DATA_DIR, 'val.txt'))
test_data = load_data(os.path.join(DATA_DIR, 'test.txt'))

# Encode data
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

# Save encoded data to bin files, make sure to choose "Files only" on the persistence option of the session so that you don't encode data each time
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile( 'train.bin')
val_ids.tofile( 'val.bin')
test_ids.tofile('test.bin')

print("Encoded data saved as binary files.")

Encoded data saved as binary files.


In [18]:
del(train_ids)
del(val_ids)
del(test_ids)

In [19]:
# Load encoded data
train_data = np.memmap("./train.bin", dtype=np.uint16, mode='r')
val_data = np.memmap("./val.bin", dtype=np.uint16, mode='r')

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
n_embd=372
n_head=6
n_layer=6
dropout=0
vocab_size=53
block_size=256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class Head(nn.Module):
    """One head of self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B, T, 16)
        q = self.query(x) # (B, T, 16)
        v = self.value(x)
        
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True)
            
        return out

class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity."""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd, bias=False),
            nn.GELU(),
            nn.Linear( 4 * n_embd, n_embd, bias=False),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ Transformer block: communication followed by feedforward."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd, bias=False)
        self.ln2 = nn.LayerNorm(n_embd, bias=False)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPT(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd, bias=False) 
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:] # (B, T)
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [21]:


class LoraHead(Head):
    """
    Extends MultiHeadAttention with LoRA (Low-Rank Adaptation) matrices.
    LoRA enhances efficiency by only updating the query and value matrices.
    This class adds LoRA matrices and applies LoRA logic in the forward method.

    Parameters:
    - r (int): Rank for LoRA matrices.
    - config: Configuration of the Roberta Model.
    """
    
    def __init__(self, r=8):
        head_size = n_embd // n_head
        super().__init__(head_size=head_size)
        
        self.lora_query_matrix_A = nn.Parameter(torch.zeros(r, head_size))
        self.lora_query_matrix_B = nn.Parameter(torch.randn(n_embd, r))
        self.lora_value_matrix_A = nn.Parameter(torch.zeros(r, head_size))
        self.lora_value_matrix_B = nn.Parameter(torch.randn(n_embd, r))
        self.lora_key_matrix_A = nn.Parameter(torch.zeros(r, head_size))
        self.lora_key_matrix_B = nn.Parameter(torch.randn(n_embd, r))

    def lora_query(self, x):
        """
        Applies LoRA to the query component. Computes a modified query output by adding 
        the LoRA adaptation to the standard query output. Requires the regular linear layer 
        to be frozen before training.
        """
        lora_query_weights = torch.matmul(self.lora_query_matrix_B, self.lora_query_matrix_A)

        return self.query(x) + F.linear(x, lora_query_weights.T)

    def lora_value(self, x):
        """
        Applies LoRA to the value component. Computes a modified value output by adding 
        the LoRA adaptation to the standard value output. Requires the regular linear layer 
        to be frozen before training.
        """
        lora_value_weights = torch.matmul(self.lora_value_matrix_B, self.lora_value_matrix_A)


        return self.value(x) +  F.linear(x, lora_value_weights.T)

    def lora_key(self, x):
        """
        Applies LoRA to the key component. Computes a modified value output by adding 
        the LoRA adaptation to the standard value output. Requires the regular linear layer 
        to be frozen before training.
        """
        lora_key_weights = torch.matmul(self.lora_key_matrix_B, self.lora_key_matrix_A)

        
        return self.key(x) +  F.linear(x, lora_key_weights.T)

    def forward(self, x):
        B, T, C = x.shape
        k = self.lora_key(x)
        q = self.lora_query(x)
        v = self.lora_value(x)

        out = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True
        )

        return out
class LoraGPT(nn.Module):
    
    
    def __load_model(self)->GPT:
        model_path = "./10M_2024-07-21_08-16.pth"


        model = GPT()
        print("Compiling the model...\n")
        try:
            model = torch.compile(model)  # requires PyTorch 2.0
        except Exception as e:
            pass
        model.load_state_dict(torch.load(model_path,map_location=self.device))

        m = model.to(self.device)
        return m
    
    def __init__(self,  r=8,device='cuda'):
        
        
        super().__init__()
        self.lora_rank = r
        self.device = device
        self.model=self.__load_model()
        self.replace_multihead_attention_recursion(self.model)
        self.freeze_parameters_except_lora_and_bias()
        
        
    def forward(self, x,targets):
        return self.model(x,targets)
   
        
    def replace_multihead_attention_recursion(self,model):
        """
        Replaces RobertaSelfAttention with LoraRobertaSelfAttention in the model.
        This method applies the replacement recursively to all sub-components.

        Parameters
        ----------
        model : nn.Module
            The PyTorch module or model to be modified.
        """
        for name, module in model.named_children():
            if isinstance(module, Head):
                # Replace RobertaSelfAttention with LoraRobertaSelfAttention
                new_layer = LoraHead(r=self.lora_rank)
                new_layer.load_state_dict(module.state_dict(), strict=False)
                setattr(model, name, new_layer)
            else:
                # Recursive call for child modules
                self.replace_multihead_attention_recursion(module)
                
                
    def freeze_parameters_except_lora_and_bias(self):
        """
        Freezes all model parameters except for specific layers and types based on the configuration.
        Parameters in LoRA layers, the finetune head, bias parameters, embeddings, and layer norms 
        can be set as trainable based on class settings.
        """
        for name, param in self.model.named_parameters():
            print(name)
            is_trainable = (
                "lora_" in name 
                
                #(self.train_layer_norms and "LayerNorm" in name)
            )
            param.requires_grad = is_trainable
        

In [22]:

batch_size = 1
eval_iters = 1000
import torch
model_path = "./10M_2024-07-21_08-16.pth"
model = LoraGPT(r=10,device=device)
print("Compiling the model...\n")
try:
    model = torch.compile(model)  # requires PyTorch 2.0
except Exception as e:
    pass


Compiling the model...

_orig_mod.token_embedding_table.weight
_orig_mod.position_embedding_table.weight
_orig_mod.blocks.0.sa.heads.0.lora_query_matrix_A
_orig_mod.blocks.0.sa.heads.0.lora_query_matrix_B
_orig_mod.blocks.0.sa.heads.0.lora_value_matrix_A
_orig_mod.blocks.0.sa.heads.0.lora_value_matrix_B
_orig_mod.blocks.0.sa.heads.0.lora_key_matrix_A
_orig_mod.blocks.0.sa.heads.0.lora_key_matrix_B
_orig_mod.blocks.0.sa.heads.0.key.weight
_orig_mod.blocks.0.sa.heads.0.query.weight
_orig_mod.blocks.0.sa.heads.0.value.weight
_orig_mod.blocks.0.sa.heads.1.lora_query_matrix_A
_orig_mod.blocks.0.sa.heads.1.lora_query_matrix_B
_orig_mod.blocks.0.sa.heads.1.lora_value_matrix_A
_orig_mod.blocks.0.sa.heads.1.lora_value_matrix_B
_orig_mod.blocks.0.sa.heads.1.lora_key_matrix_A
_orig_mod.blocks.0.sa.heads.1.lora_key_matrix_B
_orig_mod.blocks.0.sa.heads.1.key.weight
_orig_mod.blocks.0.sa.heads.1.query.weight
_orig_mod.blocks.0.sa.heads.1.value.weight
_orig_mod.blocks.0.sa.heads.2.lora_query_matrix_A

  model.load_state_dict(torch.load(model_path,map_location=self.device))


In [23]:



num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(num_parameters)

468720


In [24]:
# Get random batch of data
import model_def as m

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - m.block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+m.block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+m.block_size]).astype(np.int64)) for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Estimate loss on train and val splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters) 
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


# Helper function to make large numbers of parameters human-readable
def human_readable(num):
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, ['', 'K', 'M', 'G', 'T', 'P'][magnitude])

In [25]:
# Initialize optimizer
max_iters = 1  # Maximum number of iterations
learning_rate = 1e-3 # Initial Learning rate value
miles = [int(max_iters * m) for m in [0.7, 0.8, 0.9]]  # Milestones for learning rate decay as fractions of max_iters
eval_interval = 100 # Evaluation interval
eval_iters = 10000  # Number of iterations for evaluation
batch_size=256
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Initialize learning rate scheduler
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=miles, gamma=0.1)

In [26]:
# Get current date and hour to get track of experiments
now = datetime.datetime.now()
date_hour = now.strftime("%Y-%m-%d_%H-%M")

# Train
# Start training timer
start_time = time.time()

# Training loop
for iter in range(max_iters):

    # evaluate the model on the train and val splits and log the losses
    if iter % eval_interval == 0:
        print("calc loss...")
        losses = estimate_loss()
        print(f'iter {iter:5d} | train loss {losses["train"]:.4f} | val loss {losses["val"]:.4f}')
        
    print("get batch...")
    # train the model for one iteration
    xb, yb = get_batch('train')

    # forward pass
    print('forward pass...')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    print('backward ...')
    loss.backward()
    print('optimizer step...')
    optimizer.step()

    # Step the scheduler
    
    
    
    
    scheduler.step()

# End training timer
end_time = time.time()
print(f'Training time: {(end_time - start_time) / 60}  min')

# Save the trained model
torch.save(model.state_dict(), f"{num_parameters}_{date_hour}.pth")

calc loss...


In [None]:
test_data = np.memmap('test.bin', dtype=np.uint16, mode='r')

In [None]:
def evaluate_example(example, model, max_new_tokens=30):

    # Split example and determine maximum new tokens allowed
    splited_example = example.split("# reformulation")
    if not ("for" in splited_example[0]):
        max_new_tokens = 22
    # Encode prompt and prepare for evaluation
    encoded_example = torch.tensor(encode(splited_example[0] + "# reformulation"), dtype=torch.long).unsqueeze(0).to(device)
    prompt_text = splited_example[0] + "# reformulation"

    result_example = splited_example[-1]

    #print("result: ==>",result_example)

    # Extract real results from example
    #real_results = [float(match.group()) for match in re.finditer(r"(?<=# )-?\d+(\.\d+)?", result_example.split('\n\n')[0].replace("\n", ""))]

    # Generate response from model and extract generated results
    response = decode(model.generate(encoded_example, max_new_tokens=max_new_tokens)[0].tolist())
    splited_response = response.split("# reformulation")
    result_response = splited_response[-1]
    #generated_results = [float(match.group()) for match in re.finditer(r"(?<=# )-?\d+(\.\d+)?", result_response.split('\n\n')[0].replace("\n", ""))]

    return prompt_text, result_example, result_response



# Write results to file
def write_results_to_file(output_file, prompt, real_results, generated_results):
    df = pd.DataFrame({
        'Prompt': prompt,
        'Real_Results': real_results,
        'Generated_Results': generated_results
    })
    df.to_csv(output_file, index=False)



def evaluate_pair(real, generated_result):
    # Determine the length of the shorter and longer strings
    min_len = min(len(real), len(generated_result))
    max_len = max(len(real), len(generated_result))

    # Count the number of matching characters at the same index
    match_count = sum(1 for i in range(min_len) if real[i] == generated_result[i])

    # Calculate the ratio of matches to the length of the longer string
    ratio = match_count / max_len
    return ratio

# Evaluation Loop

# Split examples and initialize lists for results
examples = decode(test_data).split("\n\n")
examples = [example for example in examples if example]

# Start evaluation process
prompt = []
real_results = []
generated_results = []

# Iterate through examples and evaluate the model on each one
for example in tqdm(examples):
    prompt_text, real_result, result = evaluate_example(example, model)
    prompt.append(prompt_text)
    real_results.append(real_result)
    generated_results.append(result)

# Calculate and print accuracy
score=0

for real,generated in zip(real_results, generated_results):
  score+=evaluate_pair(real,generated)
accuracy = score / len(generated_results)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Store accuracy in a file
with open("accuracy.txt", 'w') as f:
    f.write(f"Accuracy: {accuracy * 100:.2f}%\n")

# Store predictions in a CSV file
    write_results_to_file("predictions.csv", prompt, real_results, generated_results)