In [1]:
import random
import os
import pickle
import time
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Reproducibility: seed Python, NumPy and PyTorch with the same value
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Prefer CUDA when a GPU is visible; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device set to {device}.")

Device set to cpu.


In [3]:
# Helper functions to load and save data
def save_data(data, file_path):
    """Write the string `data` to `file_path`, replacing any existing content.

    The file is always written as UTF-8 so the result does not depend on the
    locale's default encoding of the machine running the notebook.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(data)

def load_data(file_path):
    """Return the full text content of `file_path` as a single string.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale's default encoding of the machine running the notebook.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

In [4]:
# Directory searched for meta.pkl and the train/val/test text files
DATA_DIR = "./"


In [5]:
# Try to read vocab_size from the dataset's metadata pickle

meta_path = os.path.join(DATA_DIR, 'meta.pkl')
vocab_size = None

if not os.path.exists(meta_path):
    # vocab_size stays None and `meta` is left undefined in this case
    print("Meta file not found. Please ensure the meta.pkl file is present in the data directory.")
else:
    # NOTE(review): pickle.load can execute arbitrary code; only open a meta.pkl you trust
    with open(meta_path, 'rb') as meta_file:
        meta = pickle.load(meta_file)
    vocab_size = meta['vocab_size']
    print(f"found vocab_size = {vocab_size} (inside {meta_path})")

# Encode and decode functions for character-level tokenization
def encode(s):
    """Translate each character of `s` into its integer id via `meta['stoi']`.

    Returns a list of ids in the same order as the characters of `s`.
    Raises KeyError for a character that is missing from the table.
    """
    ids = []
    for ch in s:
        ids.append(meta['stoi'][ch])
    return ids

def decode(l):
    """Translate a sequence of integer ids back into text via `meta['itos']`.

    Returns the looked-up strings concatenated in order.
    Raises KeyError for an id that is missing from the table.
    """
    pieces = []
    for token_id in l:
        pieces.append(meta['itos'][token_id])
    return ''.join(pieces)

found vocab_size = 36 (inside ./meta.pkl)


In [6]:
# For each split: read the raw text, encode it to ids, and dump the ids as a
# uint16 binary file in the working directory.
# Tip: choose "Files only" on the persistence option of the session so that
# the data does not have to be encoded again each time.

# Training split
train_data = load_data(os.path.join(DATA_DIR, 'train.txt'))
train_ids = np.array(encode(train_data), dtype=np.uint16)
train_ids.tofile('train.bin')

# Validation split
val_data = load_data(os.path.join(DATA_DIR, 'val.txt'))
val_ids = np.array(encode(val_data), dtype=np.uint16)
val_ids.tofile('val.bin')

# Test split
test_data = load_data(os.path.join(DATA_DIR, 'test.txt'))
test_ids = np.array(encode(test_data), dtype=np.uint16)
test_ids.tofile('test.bin')

print("Encoded data saved as binary files.")

Encoded data saved as binary files.


In [7]:
# Drop the in-memory id arrays; the training cells read the .bin files instead
del train_ids, val_ids, test_ids

In [8]:
# Map the encoded train/val files read-only as uint16 arrays
# (this rebinds train_data / val_data from the raw text to the id arrays)
train_data, val_data = (
    np.memmap(bin_path, dtype=np.uint16, mode='r')
    for bin_path in ("./train.bin", "./val.bin")
)

In [10]:
from lora_model import LoraGPT

batch_size = 256
eval_iters = 1000
# NOTE(review): model_path is not used anywhere in this cell — confirm whether
# LoraGPT loads this checkpoint on its own or whether a load step is missing.
model_path = "./10M_2024-07-21_08-16.pth"


model = LoraGPT(r=10, device=device)
print("Compiling the model...\n")
try:
    model = torch.compile(model)  # requires PyTorch 2.0
except Exception as e:
    # Compilation is best-effort: keep the uncompiled model, but say why it
    # failed instead of silently swallowing the error.
    print(f"torch.compile failed, continuing with the uncompiled model: {e!r}")


Compiling the model...

_orig_mod.token_embedding_table.weight
_orig_mod.position_embedding_table.weight
_orig_mod.blocks.0.sa.heads.0.lora_query_matrix_B
_orig_mod.blocks.0.sa.heads.0.lora_query_matrix_A
_orig_mod.blocks.0.sa.heads.0.lora_value_matrix_B
_orig_mod.blocks.0.sa.heads.0.lora_value_matrix_A
_orig_mod.blocks.0.sa.heads.0.key.weight
_orig_mod.blocks.0.sa.heads.0.query.weight
_orig_mod.blocks.0.sa.heads.0.value.weight
_orig_mod.blocks.0.sa.heads.1.lora_query_matrix_B
_orig_mod.blocks.0.sa.heads.1.lora_query_matrix_A
_orig_mod.blocks.0.sa.heads.1.lora_value_matrix_B
_orig_mod.blocks.0.sa.heads.1.lora_value_matrix_A
_orig_mod.blocks.0.sa.heads.1.key.weight
_orig_mod.blocks.0.sa.heads.1.query.weight
_orig_mod.blocks.0.sa.heads.1.value.weight
_orig_mod.blocks.0.sa.heads.2.lora_query_matrix_B
_orig_mod.blocks.0.sa.heads.2.lora_query_matrix_A
_orig_mod.blocks.0.sa.heads.2.lora_value_matrix_B
_orig_mod.blocks.0.sa.heads.2.lora_value_matrix_A
_orig_mod.blocks.0.sa.heads.2.key.weight


In [None]:



# Total element count over the parameters that have requires_grad set
num_parameters = 0
for param in model.parameters():
    if param.requires_grad:
        num_parameters += param.numel()

print(num_parameters)

10105433


In [None]:
# Get random batch of data
import model_def as m

def get_batch(split):
    """Draw a random batch of input/target windows from one data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. `batch_size` random start offsets are sampled, and for each
    one a window of `m.block_size` ids is taken as input, with the window
    shifted right by one position as target.

    Returns a pair `(x, y)` of int64 tensors of shape
    (batch_size, m.block_size), both moved to `device`.
    """
    source = train_data if split == 'train' else val_data
    window = m.block_size
    starts = torch.randint(len(source) - window, (batch_size,))

    def _window_at(offset):
        # Copy one window out of the array and widen uint16 -> int64
        return torch.from_numpy(source[offset:offset + window].astype(np.int64))

    inputs = torch.stack([_window_at(start) for start in starts])
    targets = torch.stack([_window_at(start + 1) for start in starts])
    return inputs.to(device), targets.to(device)

# Estimate loss on train and val splits
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iters` random batches per split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns a dict with keys 'train' and 'val', each holding the mean loss
    as a 0-dim tensor.
    """
    def _mean_loss(split):
        # One loss value per sampled batch, averaged at the end
        per_batch = torch.zeros(eval_iters)
        for idx in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[idx] = batch_loss.item()
        return per_batch.mean()

    model.eval()
    results = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return results


# Helper function to make large numbers of parameters human-readable
def human_readable(num):
    """Format a number with a thousands suffix ('', K, M, G, T, P).

    The value is divided by 1000 until its magnitude drops below 1000 or the
    largest suffix ('P') is reached, then rendered with no decimals.

    Args:
        num: int or float to format.

    Returns:
        A string such as '999', '10M' or '-2K'. Values of 1e18 and above keep
        the 'P' suffix (e.g. '1000P') instead of raising IndexError.
    """
    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    magnitude = 0
    # Stop at the last available suffix so very large inputs cannot index past the list
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, suffixes[magnitude])

In [None]:
# Training hyperparameters
max_iters = 60000      # total number of optimisation steps
learning_rate = 1e-3   # starting learning rate
eval_interval = 10000  # steps between loss evaluations
eval_iters = 500       # batches averaged per evaluation
batch_size = 256

# The learning rate is multiplied by gamma at 70%, 80% and 90% of max_iters
miles = [int(max_iters * fraction) for fraction in (0.7, 0.8, 0.9)]

# Optimizer and its step-wise learning-rate schedule
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=miles, gamma=0.1)

In [None]:
# Timestamp (to the minute) used to tag this run's checkpoint file
now = datetime.datetime.now()
date_hour = now.strftime("%Y-%m-%d_%H-%M")

# Start training timer
start_time = time.time()

# Training loop. The counter is named `step` rather than `iter` so the
# builtin iter() is not shadowed in the notebook namespace afterwards.
for step in range(max_iters):

    # Every eval_interval steps, log the averaged train and val losses
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f'iter {step:5d} | train loss {losses["train"]:.4f} | val loss {losses["val"]:.4f}')

    # One optimisation step on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Advance the learning-rate schedule once per training step
    scheduler.step()

# End training timer
end_time = time.time()
print(f'Training time: {(end_time - start_time) / 60}  min')

# Save the trained weights, tagged with the parameter count and the timestamp.
# NOTE(review): if `model` is the torch.compile wrapper, the saved keys presumably
# carry the `_orig_mod.` prefix seen in the parameter listing above — confirm
# before loading the file into an uncompiled model.
torch.save(model.state_dict(), f"{num_parameters}_{date_hour}.pth")

KeyboardInterrupt: 

In [None]:
# Map the encoded test file read-only as a uint16 array
test_data = np.memmap(filename='test.bin', mode='r', dtype=np.uint16)

In [None]:
def evaluate_example(example, model, max_new_tokens=30):
    """Run the model on one test example and return prompt, target and output.

    The example is split on the "# reformulation" marker. The text before the
    first marker plus the marker itself is the prompt; the text after the last
    marker is the expected result. The model's generation is decoded and the
    text after its last marker is taken as the generated result.

    Args:
        example: one raw example string.
        model: object exposing `generate(ids, max_new_tokens=...)`.
        max_new_tokens: generation budget; replaced by 22 when the prompt
            part does not contain the substring "for".

    Returns:
        Tuple `(prompt_text, expected_result, generated_result)` of strings.
    """
    marker = "# reformulation"
    parts = example.split(marker)
    head = parts[0]

    # Prompts without "for" get the smaller generation budget
    if "for" not in head:
        max_new_tokens = 22

    prompt_text = head + marker
    expected_result = parts[-1]

    # Encode the prompt as a single-row batch on the target device
    prompt_ids = torch.tensor(encode(prompt_text), dtype=torch.long).unsqueeze(0).to(device)

    # Generate, decode the first (only) row, and keep what follows the last marker
    generated_ids = model.generate(prompt_ids, max_new_tokens=max_new_tokens)[0].tolist()
    generated_text = decode(generated_ids)
    generated_result = generated_text.split(marker)[-1]

    return prompt_text, expected_result, generated_result



# Write results to file
def write_results_to_file(output_file, prompt, real_results, generated_results):
    """Save prompts, expected results and generated results as a CSV file.

    The three sequences become the columns 'Prompt', 'Real_Results' and
    'Generated_Results' (in that order); the row index is not written.
    """
    columns = {}
    columns['Prompt'] = prompt
    columns['Real_Results'] = real_results
    columns['Generated_Results'] = generated_results

    table = pd.DataFrame(columns)
    table.to_csv(output_file, index=False)



def evaluate_pair(real, generated_result):
    """Score how closely `generated_result` matches `real`, position by position.

    Counts the indices at which both sequences hold the same element and
    divides by the length of the longer one, so extra or missing trailing
    characters lower the score.

    Args:
        real: expected string (or other indexable sequence).
        generated_result: produced string to compare against `real`.

    Returns:
        A float in [0, 1]. Two empty inputs are treated as a perfect match
        (1.0) instead of raising ZeroDivisionError.
    """
    longest = max(len(real), len(generated_result))
    if longest == 0:
        # Nothing expected and nothing generated: identical by definition
        return 1.0

    # zip stops at the shorter input, which is exactly the overlapping range
    match_count = sum(1 for expected, got in zip(real, generated_result) if expected == got)
    return match_count / longest

# Evaluation loop

# Decode the test ids back to text; examples are separated by blank lines,
# and empty fragments are discarded
examples = decode(test_data).split("\n\n")
examples = [example for example in examples if example]

# Per-example outputs, kept in the same order as `examples`
prompt = []
real_results = []
generated_results = []

# Run the model on every example and collect prompt, target and generation
for example in tqdm(examples):
    prompt_text, real_result, result = evaluate_example(example, model)
    prompt.append(prompt_text)
    real_results.append(real_result)
    generated_results.append(result)

# Accuracy is the mean per-example character-match ratio
score = 0

for real, generated in zip(real_results, generated_results):
    score += evaluate_pair(real, generated)
# Report 0.0 instead of dividing by zero when there were no examples
accuracy = score / len(generated_results) if generated_results else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")

# Store accuracy in a file
with open("accuracy.txt", 'w') as f:
    f.write(f"Accuracy: {accuracy * 100:.2f}%\n")

# Store predictions in a CSV file (done after accuracy.txt has been closed)
write_results_to_file("predictions.csv", prompt, real_results, generated_results)