<a href="https://colab.research.google.com/github/Mehulsoni26/Uncertainty_Quantification_LLMs/blob/main/GPT2_Finetuning_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [2]:
import os
dir_path = '/content/drive/MyDrive/Uncertainty_Quantification_LLMs'
os.chdir(dir_path)

In [3]:
%%capture
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1
!pip install torch

In [4]:
import torch
import gc
import time
import evaluate
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset, load_dataset
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [5]:
df_stress_index_hf_prompt=pd.read_csv('./Data/df_stress_index_hf_prompt.csv',index_col=False)

In [None]:
df_stress_index_hf_prompt

In [46]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

gpt2_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", add_eos_token=False)
gpt_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [8]:
df_stress_index_hf_prompt.head()

Unnamed: 0,news_sentiment,broker_count_imbalance,volume_indicator,benchmark_price_difference,trade_count_imbalance,one_sided_trade_indicator,tranche_size_indicator,stress_index,stress_index_bucket,row_prompt,answer_label
0,0.393144,0.124362,0.677889,0.497776,-0.720698,-0.680076,0.724869,0.551515,0.5-0.6,###Instruction:\nYou are fixed income corporat...,H
1,0.772496,0.369376,-0.877238,0.139748,0.851864,-0.257519,0.586063,0.623805,0.6-0.7,###Instruction:\nYou are fixed income corporat...,D
2,-0.189353,0.190013,-0.485352,-0.364914,-0.183833,-0.235899,0.356616,0.305677,0.3-0.4,###Instruction:\nYou are fixed income corporat...,F
3,0.21909,-0.57793,0.87617,0.495462,0.182495,0.09434,0.106455,0.599768,0.5-0.6,###Instruction:\nYou are fixed income corporat...,F
4,0.038766,0.474529,0.345526,-0.15337,-0.020692,0.238055,0.432594,0.594587,0.5-0.6,###Instruction:\nYou are fixed income corporat...,I


In [9]:
feature_cols=['news_sentiment', 'broker_count_imbalance', 'volume_indicator',
       'benchmark_price_difference', 'trade_count_imbalance',
       'one_sided_trade_indicator', 'tranche_size_indicator']
stress_index_col ='stress_index'

In [10]:
df_stress_index_hf_prompt[feature_cols] = df_stress_index_hf_prompt[feature_cols].apply(lambda x: round(x,3))

In [36]:
df_stress_index_hf_prompt[stress_index_col] = df_stress_index_hf_prompt[stress_index_col].apply(lambda x: round(x,3))

In [11]:
def generate_prompt(row):
  base_instruction = """###Instruction:
  You are fixed income corporate bond trader. Use the feature values that I will provide and select the correct option for classifying the stress index (range of 0 to 1) into one of the defined buckets.
  Feature descriptions:
  - news_sentiment: Scale from -1 (negative) to +1 (positive) reflecting the sentiment in news about the bond issuer over the past 7 days.
  - broker_count_imbalance: Difference in broker count buying or selling the same security in the last 2 days, ranging from -1 (more selling) to +1 (more buying).
  - volume_indicator: Indicates if a bond is heavily traded in the last 2 days, with -1 for high selling volume and +1 for high buying volume.
  - benchmark_price_difference: Compares a bond's quoted price to the benchmark, ranging from -1 (below benchmark) to +1 (above benchmark), indicating price stress.
  - trade_count_imbalance: Difference in buy and sell trades of a security over 2 days, from -1 (more sells) to +1 (more buys).
  - one_sided_trade_indicator: Imbalance in buy or sell trades over 7 days, with -1 for predominantly sell trades and +1 for buy trades.
  - tranche_size_indicator: Assesses bond stress by the direction of trades and tranche size, ranging from 0 (large tranche) to 1 (small tranche).
  """

  base_instruction+=f'''
  ###Input:
  Given the feature value dict as:
  {str(row[feature_cols].to_dict())}
  Return the stress index value as a number between 0 and 1.
  ###Response:
  '''
  return base_instruction

In [12]:
print(df_stress_index_hf_prompt.head(5).apply(generate_prompt,axis=1)[0])

###Instruction:
  You are fixed income corporate bond trader. Use the feature values that I will provide and select the correct option for classifying the stress index (range of 0 to 1) into one of the defined buckets.
  Feature descriptions:
  - news_sentiment: Scale from -1 (negative) to +1 (positive) reflecting the sentiment in news about the bond issuer over the past 7 days.
  - broker_count_imbalance: Difference in broker count buying or selling the same security in the last 2 days, ranging from -1 (more selling) to +1 (more buying).
  - volume_indicator: Indicates if a bond is heavily traded in the last 2 days, with -1 for high selling volume and +1 for high buying volume.
  - benchmark_price_difference: Compares a bond's quoted price to the benchmark, ranging from -1 (below benchmark) to +1 (above benchmark), indicating price stress.
  - trade_count_imbalance: Difference in buy and sell trades of a security over 2 days, from -1 (more sells) to +1 (more buys).
  - one_sided_trade

In [13]:
df_stress_index_hf_prompt['instruction_prompt_regression'] = df_stress_index_hf_prompt.apply(generate_prompt,axis=1)

In [14]:
len(gpt2_tokenizer.tokenize(df_stress_index_hf_prompt.head(5).apply(generate_prompt,axis=1)[0]))

451

In [15]:
from transformers import GPT2Model, GPT2Config

# Load pre-trained model configuration
model_name = 'gpt2-medium'  # You can choose from 'gpt2', 'gpt2-medium', 'gpt2-large', etc.
config = GPT2Config.from_pretrained(model_name)

# Load pre-trained model
model = GPT2Model.from_pretrained(model_name, config=config)

In [16]:
import torch
import torch.nn as nn

class GPT2ForRegression(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        # Add a new linear layer for regression
        self.regression_head = nn.Linear(config.n_embd, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state
        # Use the representation of the [CLS] token for regression
        cls_representation = last_hidden_states[:, 0]
        regression_output = self.regression_head(cls_representation)
        return regression_output

# Initialize the modified model
model_for_regression = GPT2ForRegression(config)


In [17]:
def custom_loss_function(predictions, targets):
    # Example: Mean Squared Error
    mse_loss = nn.MSELoss()
    return mse_loss(predictions, targets)

In [18]:
optimizer = torch.optim.Adam(model_for_regression.parameters(), lr=5e-5)

In [19]:
# Calculate the number of parameters
num_parameters = sum(p.numel() for p in model_for_regression.parameters())

print(f"The model {model_name} has {num_parameters:,} tunable parameters.")

The model gpt2-medium has 354,824,193 tunable parameters.


In [60]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer

class InstructionDataset(Dataset):
    def __init__(self, encodings, labels):
        """
        Initializes the dataset.
        :param encodings: Tokenized instructions.
        :param labels: Corresponding outputs for the instructions.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a single data point from the dataset.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the size of the dataset.
        """
        return len(self.labels)

# Example usage
if __name__ == "__main__":
    # Load your dataset (this is just a placeholder)
    df_subset = df_stress_index_hf_prompt.head(500)
    instructions = df_subset['instruction_prompt_regression'].tolist()
    outputs = df_subset['stress_index'].tolist()

    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    # Tokenize the instructions and outputs
    encodings = gpt2_tokenizer(instructions, padding=True, truncation=True, return_tensors="pt")
    # labels = gpt2_tokenizer(outputs, padding=True, truncation=True, return_tensors="pt")["input_ids"]
    labels = outputs

    # Create the dataset
    dataset = InstructionDataset(encodings, labels)

    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create the DataLoader
    batch_size = 2
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory = True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory = True)



In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_for_regression = model_for_regression.to(device)
# gpt2_tokenizer = gpt2_tokenizer.to(device)

In [62]:
device

device(type='cuda')

In [63]:
num_epochs = 2

In [64]:
save_every_n_steps = 100
save_every_n_epochs = 1


In [65]:
torch.cuda.empty_cache()

In [72]:
import torch
from torch import nn, optim

# Assuming the following are already defined:
# model: an instance of a PyTorch neural network
# criterion: loss function, e.g., nn.CrossEntropyLoss()
# optimizer: optimization algorithm, e.g., optim.Adam(model.parameters(), lr=learning_rate)
# train_loader: DataLoader for the training set
# val_loader: DataLoader for the validation set
# num_epochs: number of epochs to train for

# Function to perform a training epoch
def train_epoch(model, data_loader, criterion, optimizer):
    model = model.to(device)
    model.train()  # Set the model to training mode
    total_loss = 0
    global epoch, save_every_n_steps, save_every_n_epochs
    for step, data in enumerate(data_loader):
      inputs = data['input_ids']
      labels = data['labels']

      inputs = inputs.to(device)
      labels = labels.to(device)
      # Forward pass
      outputs = model(inputs)
      # print(outputs.size(), labels.size())
      # loss = criterion(outputs, labels)
      labels = labels.to(torch.float64)
      outputs = outputs.to(torch.float64)
      outputs = outputs.squeeze()
      loss = custom_loss_function(outputs,labels)

      # Backward pass and optimization
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Accumulate loss
      total_loss += loss.item()


      if (step+1)%save_every_n_steps == 0:
        checkpoint_filename = f"train_checkpoint_epoch_{epoch+1}_step_{step+1}.pth.tar"
        save_checkpoint(model, optimizer, epoch+1, step+1, filename=checkpoint_filename)
        logging.info(f"Saved checkpoint to {checkpoint_filename} after step {step+1} of epoch {epoch+1}")
        logging.info(f'Epoch {epoch+1}/{num_epochs}, Step {step+1} - Training loss: {total_loss:.4f}')

    return total_loss / len(data_loader)

# Function to evaluate the model
def evaluate(model, data_loader, criterion):
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    global epoch, save_every_n_steps, save_every_n_epochs
    with torch.no_grad():  # Disable gradient calculation
      for step, data in enumerate(data_loader):
        inputs = data['input_ids']
        labels = data['labels']

        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        labels = labels.to(torch.float64)
        outputs = outputs.to(torch.float64)
        # loss = criterion(outputs, labels)
        outputs = outputs.squeeze()
        loss = custom_loss_function(outputs,labels)

        # Accumulate loss
        total_loss += loss.item()

        if (step+1)%save_every_n_steps == 0:
          checkpoint_filename = f"validation_checkpoint_epoch_{epoch+1}_step_{step+1}.pth.tar"
          save_checkpoint(model, optimizer, epoch+1, step+1, filename=checkpoint_filename)
          logging.info(f"Saved checkpoint to {checkpoint_filename} after step {step+1} of epoch {epoch+1}")
          logging.info(f'Epoch {epoch+1}/{num_epochs}, Step {step+1} - Validation loss: {total_loss:.4f}')
    return total_loss / len(data_loader)


In [73]:
criterion = nn.MSELoss()

In [74]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename='./gpt2_training_log.log', filemode='w')

In [75]:
def save_checkpoint(model, optimizer, epoch, step, filename="checkpoint.pth.tar"):
    state = {
        'epoch': epoch,
        'step': step,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, filename)

In [77]:

# Training and evaluation loop
for epoch in range(num_epochs):

      # Train for one epoch
      train_loss = train_epoch(model_for_regression, train_dataloader, criterion, optimizer)
      print(f'Epoch {epoch+1}/{num_epochs} - Training loss: {train_loss:.4f}')
      torch.cuda.empty_cache()
      # Evaluate on the training set
      train_eval_loss = evaluate(model_for_regression, train_dataloader, criterion)
      print(f'Epoch {epoch+1}/{num_epochs} - Training Evaluation loss: {train_eval_loss:.4f}')

      torch.cuda.empty_cache()

      # Evaluate on the validation set
      val_loss = evaluate(model_for_regression, val_dataloader, criterion)
      print(f'Epoch {epoch+1}/{num_epochs} - Validation loss: {val_loss:.4f}')
      torch.cuda.empty_cache()

      logging.info(f'Epoch {epoch+1}/{num_epochs} - Training loss: {train_loss:.4f}')
      logging.info(f'Epoch {epoch+1}/{num_epochs} - Training Evaluation loss: {train_eval_loss:.4f}')
      logging.info(f'Epoch {epoch+1}/{num_epochs} - Validation loss: {val_loss:.4f}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/2 - Training loss: 0.0297
Epoch 1/2 - Training Evaluation loss: 0.0215
Epoch 1/2 - Validation loss: 0.0149
Epoch 2/2 - Training loss: 0.0318
Epoch 2/2 - Training Evaluation loss: 0.0597
Epoch 2/2 - Validation loss: 0.0549


In [78]:
model_checkpoint_loaded = torch.load('train_checkpoint_epoch_2_step_200.pth.tar')

In [79]:
type(model_loaded)

dict

In [83]:
model_for_regression.load_state_dict(model_loaded['state_dict'])

<All keys matched successfully>

In [84]:
optimizer.load_state_dict(model_loaded['optimizer'])

In [90]:
model_for_regression.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
  for step, data in enumerate(val_dataloader):
    inputs = data['input_ids']
    labels = data['labels']

    inputs = inputs.to(device)
    labels = labels.to(device)

    outputs = model_for_regression(inputs)
    labels = labels.to(torch.float64)
    outputs = outputs.to(torch.float64)
    # loss = criterion(outputs, labels)
    outputs = outputs.squeeze()

    # print(f"Step {step+1}:")
    # print(f"Inputs:\n")
    print((gpt2_tokenizer.batch_decode(inputs)))

    print("Outputs:\n")
    print(outputs)

    print(f"Labels:\n")
    print(labels)

    # print("Outputs decoded:")
    # print(gpt2_tokenizer.decode(outputs.argmax(dim=-1).cpu().numpy()))
    if step >= 2:
      break

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


["###Instruction:\n  You are fixed income corporate bond trader. Use the feature values that I will provide and select the correct option for classifying the stress index (range of 0 to 1) into one of the defined buckets.\n  Feature descriptions:\n  - news_sentiment: Scale from -1 (negative) to +1 (positive) reflecting the sentiment in news about the bond issuer over the past 7 days.\n  - broker_count_imbalance: Difference in broker count buying or selling the same security in the last 2 days, ranging from -1 (more selling) to +1 (more buying).\n  - volume_indicator: Indicates if a bond is heavily traded in the last 2 days, with -1 for high selling volume and +1 for high buying volume.\n  - benchmark_price_difference: Compares a bond's quoted price to the benchmark, ranging from -1 (below benchmark) to +1 (above benchmark), indicating price stress.\n  - trade_count_imbalance: Difference in buy and sell trades of a security over 2 days, from -1 (more sells) to +1 (more buys).\n  - one_s