In [16]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.amp import autocast
from datasets import load_dataset
import matplotlib as plt
import matplotlib.pyplot as plt
import os
import sys

In [17]:
# Load dataset - https://huggingface.co/datasets/itzme091/financial-qa-10K-modified
dataset = load_dataset("itzme091/financial-qa-10K-modified")
# Split the dataset into train (80%) and temp (20%)
train_val_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Split temp (20%) into validation (10%) and test (10%)
val_test_split = train_val_split["test"].train_test_split(test_size=0.5, seed=42)

train_data = train_val_split["train"]
validation_data = val_test_split["train"]
test_data = val_test_split["test"]

print(f"Train size: {len(train_data)}, Validation size: {len(validation_data)}, Test size: {len(test_data)}") # Print dataset sizes
print(train_data[0]) # Print sample Q/A

Train size: 5600, Validation size: 700, Test size: 700
{'answer': 'Reduced demand for services, primarily from macroeconomic conditions', 'question': 'With respect to FDX company What factors contributed to the decrease in total average daily volume for FedEx Ground in 2023?'}


In [18]:
class FinancialQADataset(Dataset):

    def __init__(self, inputs, targets):
        """
        Args:
            inputs (dict): A dictionary containing the financial questions.
            targets (dict): A dictionary containing the corresponding answers.
        """
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        # Return the number of samples in the dataset
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the data sample to fetch.

        Returns:
            dict: A dictionary containing the question, attention mask, and the answer.
        """
        # Extract the question (input data)
        question_input_ids = self.inputs['input_ids'][idx]  # Tokenized version of the question
        attention_mask = self.inputs['attention_mask'][
            idx]  # Attention mask for the question tokens (shows which tokens in answer are relevant)
        # Extract the corresponding answer (target data)
        answer_labels = self.targets['input_ids'][idx]  # Tokenized version of the answer

        # Return the question, attention mask, and answer as a dictionary
        return {
            'input_ids': question_input_ids,
            'attention_mask': attention_mask,
            'labels': answer_labels  # Answer as the label
        }

In [19]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
GPTmodel = GPT2LMHeadModel.from_pretrained(model_name) 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [20]:
# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, # Causal language modeling
    r=8,  # Rank of the low-rank adaptation matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["c_attn", "c_proj"]  # Applying LoRA to attention and projection layers
)

In [21]:
# Prepare model for LoRA tuning
model = get_peft_model(GPTmodel, lora_config) # Creates new version of GPT-2 model that incorporates LoRA modifications
device = torch.device("mps")
model.to(device)



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
       

In [22]:
# Define maximum sequence length for financial data
max_length = 128  # Increased length to accommodate detailed financial Q&A 
                  # specifies the maximum number of tokens that the tokenizer will generate for each input text

# Extract questions and answers from the dataset
training_question = train_data["question"]
training_answer = train_data["answer"]

validation_question = validation_data["question"]
validation_answer = validation_data["answer"]

# Tokenize dataset
tokenizer.pad_token = tokenizer.eos_token # Defines padding token for GPT-2, which lacks it
tokenized_training_question = tokenizer(
    training_question,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=max_length
)
tokenized_training_answer = tokenizer(
    training_answer,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=max_length
)
tokenized_validation_question = tokenizer(
    validation_question,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=max_length
)
tokenized_validation_answer = tokenizer(
    validation_answer,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=max_length
)

print(f"Tokenized Training Questions Shape: {tokenized_training_question['input_ids'].shape}")
print(f"Tokenized Training Answers Shape: {tokenized_training_answer['input_ids'].shape}")

Tokenized Training Questions Shape: torch.Size([5600, 128])
Tokenized Training Answers Shape: torch.Size([5600, 128])


In [23]:
batch_size = 32
train_dataset = FinancialQADataset(tokenized_training_question, tokenized_training_answer)
val_dataset = FinancialQADataset(tokenized_validation_question, tokenized_validation_answer)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Training dataset size: 5600
Validation dataset size: 700


In [24]:
print("MPS available:", torch.backends.mps.is_available())
print("MPS device:", torch.device("mps" if torch.backends.mps.is_available() else "cpu"))

MPS available: True
MPS device: mps


In [25]:
epochs = 1

training_args = TrainingArguments(
    output_dir='./financial_model_results', # Specifies directory where model checkpoints will be saved
    overwrite_output_dir=True, # Allows the overwriting of the contents of the output_dir
    num_train_epochs=epochs, # Specifies number of times the model will go through the entire dataset
    per_device_train_batch_size=batch_size,  
    per_device_eval_batch_size=batch_size,
    warmup_steps=2, # Defines the number of steps to perform learning rate warmup (0 -> lr)
    logging_dir='./logs',
    logging_steps=50, # Specifies how often to log training information
    fp16=False,                 
    bf16=False,                 
    no_cuda=True,               
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Trainer(
    model=model,                     
    args=training_args,             
    train_dataset=train_dataset,    
    eval_dataset=val_dataset,       
    tokenizer=tokenizer,            
)

# Print details about the training setup
print(f"Training will run for {epochs} epochs.")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Logging directory: {training_args.logging_dir}")
print(f"Checkpoints will be saved in: {training_args.output_dir}")

Training will run for 1 epochs.
Batch size: 32
Logging directory: ./logs
Checkpoints will be saved in: ./financial_model_results


  trainer = Trainer(


In [26]:
# Get model sizes
def print_model_size(path):
    size = 0
    for f in os.scandir(path):
        size += os.path.getsize(f)
    print(f"Model size: {(size / 1e6):.2f} MB")

def print_trainable_parameters(model, label):
    parameters, trainable = 0, 0    
    for _, p in model.named_parameters():
        parameters += p.numel()
        trainable += p.numel() if p.requires_grad else 0
    print(f"{label} trainable parameters: {trainable:,}/{parameters:,} ({100 * trainable / parameters:.2f}%)")

#Fine-tune the model
print(f"Model is on device: {next(model.parameters()).device}")
print_model_size(training_args.output_dir)
print_trainable_parameters(model, "Before training")

Model is on device: cpu
Model size: 0.00 MB
Before training trainable parameters: 811,008/125,250,816 (0.65%)


In [27]:
#Training the model
trainer.train()
trainer.save_model("./FinancialLora-model")

Step,Training Loss
50,7.0745
100,2.2045
150,1.889


In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model_name = "FinancialLora-model"
model = AutoModelForCausalLM.from_pretrained(model_name) # Loads the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name) # Load tokenizer

# Check if a GPU or MPS is available, and move model to that device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 0
batch_size = 8

with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if i + batch_size >= len(val_dataset):
            break
        batch = val_dataset[i:i + batch_size]
        
        # Get input_ids and attention_mask from the batch and move to device
        input_ids = batch['input_ids'].to(device) # Extracts tokenized text from device
        attention_mask = batch['attention_mask'].to(device) if 'attention_mask' in batch else None # Extracts attention mask
        labels = batch['labels'].to(device) # Extracts target labels

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")

Average Loss: 1.7885
Perplexity: 5.9807


In [32]:
# Load the default GPT-2 Small model and tokenizer
GPTmodel = "gpt2"  # This points to the default GPT-2 Small model
tokenizer = AutoTokenizer.from_pretrained(GPTmodel)
model = AutoModelForCausalLM.from_pretrained(GPTmodel)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 1
batch_size = 8  # Adjust based on your memory constraints

loss_hist = []
with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if(i+batch_size >= len(val_dataset)):
            break
        batch = val_dataset[i:i + batch_size]
        # Get input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels= batch['labels'])
        
        loss = outputs.loss
        loss_hist.append(loss)
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")

Average Loss: 11.3487
Perplexity: 84853.9844


In [33]:
import pickle
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base GPT-2 model
base_model = AutoModelForCausalLM.from_pretrained("gpt2") # Unmodified base GPT-2 model

# Load the fine-tuned model with LoRA layers
model = PeftModel.from_pretrained(base_model, "FinancialLora-model")

# Initialize a dictionary to store LoRA weights
lora_weights = {}

# Iterate through model parameters and extract LoRA layers
count = 0
for name, param in model.named_parameters():
    if 'lora' in name:  # Check if the parameter is part of LoRA layers
        count += 1 
        print(f"Extracting LoRA Layer: {name}, Shape: {param.shape}")
        lora_weights[name] = param.detach().cpu().numpy()  # Detach and store weights as numpy arrays

# Output the number of LoRA-modified parameters
print(f"The number of modified parameters is {count}.")

# Save the LoRA weights to a file
with open("lora_weights.pkl", "wb") as f:
    pickle.dump(lora_weights, f)

print("LoRA weights extracted and saved to lora_weights.pkl")

Extracting LoRA Layer: base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight, Shape: torch.Size([8, 768])
Extracting LoRA Layer: base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight, Shape: torch.Size([2304, 8])
Extracting LoRA Layer: base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight, Shape: torch.Size([8, 768])
Extracting LoRA Layer: base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight, Shape: torch.Size([768, 8])
Extracting LoRA Layer: base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight, Shape: torch.Size([8, 3072])
Extracting LoRA Layer: base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight, Shape: torch.Size([768, 8])
Extracting LoRA Layer: base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight, Shape: torch.Size([8, 768])
Extracting LoRA Layer: base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight, Shape: torch.Size([2304, 8])
Extracting LoRA Layer: base_model.model.transfo