In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.amp import autocast
from datasets import load_dataset
import matplotlib as plt
import os
import sys

In [2]:
# Load the dataset
#https://huggingface.co/datasets/allenai/math_qa
import datasets

ds = load_dataset("math_qa.py")
# Access the different splits
train_data = ds['train']
validation_data = ds['validation']
test_data = ds['test']

# Access the first training example
#print(train_data[0])
# Combine corresponding elements of "Problem" and "options"
training_question = list([p + " " + o for p, o in zip(train_data[:]["Problem"], train_data[:]["options"])])
training_answer = list(train_data[:]['Rationale'])
validation_question = list([p + " " + o for p, o in zip(validation_data[:]["Problem"], train_data[:]["options"])])
validation_answer = list(validation_data[:]['Rationale'])
print(training_question[0])
print(validation_answer[134])

the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ? a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these
"final number = initial number + 25 % ( original number ) = 80 + 25 % ( 80 ) = 80 + 20 = 100 . answer e"


In [3]:
class makeDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        # Extract the input_ids and attention_mask for the question
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]

        # Extract the labels (input_ids for the answer)
        labels = self.targets['input_ids'][idx]

        # Return the input and output as a dictionary
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [7]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # Default to GPT small
GPTmodel = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank of the low-rank adaptation matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules = ["c_attn", "c_proj"]
)

# Prepare model for LoRA tuning
model = get_peft_model(GPTmodel, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

#tokenize dataset
tokenizer.pad_token = tokenizer.eos_token
maxLength = 64
tokenized_training_question = tokenizer(training_question, truncation=True, padding='max_length', return_tensors="pt", max_length = maxLength)
tokenized_training_answer = tokenizer(training_answer, truncation=True, padding=True, return_tensors="pt", max_length = maxLength)
tokenized_validation_question = tokenizer(validation_question, truncation=True, padding='max_length', return_tensors="pt", max_length = maxLength)
tokenized_validation_answer = tokenizer(validation_answer, truncation=True, padding=True, return_tensors="pt", max_length = maxLength)

print(f"Tokenized Training Questions Shape: {tokenized_training_question['input_ids'].shape}")
print(f"Tokenized Training Answers Shape: {tokenized_training_answer['input_ids'].shape}")

# Make sure it's divisible by batch size so last batch works fine
batch_size = 32
train_dataset = makeDataset(tokenized_training_question, tokenized_training_answer)
val_dataset = makeDataset(tokenized_validation_question, tokenized_validation_answer)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Define training arguments
num_epochs = 1  # Number of training epochs
training_args = TrainingArguments(
    output_dir='./results',        # Directory to save model checkpoints
    num_train_epochs=num_epochs,            # Number of training epochs
    per_device_train_batch_size=batch_size, # Batch size per device
    per_device_eval_batch_size=batch_size,  # Batch size for evaluation
    warmup_steps=2,              # Number of warmup steps
    weight_decay=0.01,             # Weight decay
    logging_dir='./logs',          # Directory to save logs
    logging_steps=10,              # Log every X steps
)

# Create Trainer instance
trainer = Trainer(
    model=model,                     # The model you are fine-tuning
    args=training_args,              # Training arguments
    train_dataset=train_dataset,     # Your training dataset
    eval_dataset=val_dataset,
)

# Get model sizes
def print_model_size(path):
    size = 0
    for f in os.scandir(path):
        size += os.path.getsize(f)
    print(f"Model size: {(size / 1e6):.2} MB")

def print_trainable_parameters(model, label):
    parameters, trainable = 0, 0    
    for _, p in model.named_parameters():
        parameters += p.numel()
        trainable += p.numel() if p.requires_grad else 0
    print(f"{label} trainable parameters: {trainable:,}/{parameters:,} ({100 * trainable / parameters:.2f}%)")

#Fine-tune the model
print(f"Model is on device: {next(model.parameters()).device}")
print_model_size(training_args.output_dir)
print_trainable_parameters(model, "Before training")
trainer.train()

trainer.save_model("./mathLora-model")



Using device: cuda


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Tokenized Training Questions Shape: torch.Size([29837, 64])
Tokenized Training Answers Shape: torch.Size([29837, 64])
Training dataset size: 29837
Validation dataset size: 4475
Model is on device: cuda:0
Model size: 0.037 MB
Before training trainable parameters: 811,008/125,250,816 (0.65%)


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
10,8.3032
20,7.9176
30,7.2457
40,6.4123
50,5.6686
60,5.3441
70,5.2116
80,5.1279
90,5.0693
100,5.0142


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model_name = "mathLora-model"  # Replace with your model's path
model = AutoModelForCausalLM.from_pretrained(model_name)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 0
batch_size = 32
with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if(i+batch_size >= len(val_dataset)):
            break
        batch = val_dataset[i:i + batch_size]
        # Get input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None  # Optional

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels= batch['labels'])
        
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")

KeyboardInterrupt: 

In [None]:
# Load the default GPT-2 Small model and tokenizer
GPTmodel = "gpt2"  # This points to the default GPT-2 Small model
tokenizer = AutoTokenizer.from_pretrained(GPTmodel)
model = AutoModelForCausalLM.from_pretrained(GPTmodel)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 0
batch_size = 8  # Adjust based on your memory constraints

loss_hist = []
with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if(i+batch_size >= len(val_dataset)):
            break
        batch = val_dataset[i:i + batch_size]
        # Get input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None  # Optional

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels= batch['labels'])
        
        loss = outputs.loss
        loss_hist.append(loss)
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")



tensor(14.2888)
tensor(14.9782)
tensor(15.1596)
tensor(14.1524)
tensor(14.4816)
tensor(14.5441)
tensor(14.2112)
tensor(14.7085)
tensor(13.4263)
tensor(14.1028)
tensor(15.1642)
tensor(15.2056)
tensor(14.7975)
tensor(13.5777)
tensor(14.9535)
tensor(15.0056)
tensor(15.0133)
tensor(14.6804)
tensor(14.3289)
tensor(14.1136)
tensor(14.9169)
tensor(14.7038)
tensor(14.5362)
tensor(15.1375)
tensor(14.9126)
tensor(14.8563)
tensor(14.3381)
tensor(14.8082)
tensor(14.2995)
tensor(14.7002)
tensor(14.8756)
tensor(15.0656)
tensor(14.5521)
tensor(14.6520)
tensor(14.3047)
tensor(14.7539)
tensor(14.2916)
tensor(14.9533)
tensor(14.4720)
tensor(15.0784)
tensor(14.3151)
tensor(14.1776)
tensor(14.9997)
tensor(14.7585)
tensor(14.1314)
tensor(15.3629)
tensor(14.5348)
tensor(14.7129)
tensor(14.2549)
tensor(14.2690)
tensor(14.9771)
tensor(14.5972)
tensor(14.8759)
tensor(14.3634)
tensor(14.2969)
tensor(14.1811)
tensor(15.1462)
tensor(14.9558)
tensor(14.5929)
tensor(15.2540)
tensor(15.2891)
tensor(14.1834)
tensor(1