<a href="https://colab.research.google.com/github/Mehulsoni26/Uncertainty_Quantification_LLMs/blob/main/Code/Uncertainty_Quantification_phi_batch_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls

drive  sample_data


In [None]:
import os
dir_path = '/content/drive/MyDrive'
os.chdir(dir_path)

In [None]:
# !git clone https://github.com/Abhi23run/Uncertainty_Quantification_LLMs.git

In [None]:
os.chdir(dir_path+'/Uncertainty_Quantification_LLMs')

In [None]:
%%capture
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1
!pip install torch

In [None]:
import torch
import gc
import time
import evaluate
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset, load_dataset
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
df_stress_index = pd.read_csv('./Data/data_with_stress_index.csv',usecols=lambda col:col not in ['Unnamed: 0'])

In [None]:
n_bins = 10
bin_width = 1/n_bins

df_stress_index.loc[:,'stress_index_bucket']=pd.cut(df_stress_index['stress_index'],\
                                                    bins=n_bins,\
                                                    labels=[str(np.round(x,1))+'-'+str(np.round(x+bin_width,1)) \
                                                            for x in np.arange(0,1,bin_width)])

In [None]:
def shuffle_list(original_list):
    # Create a copy of the list
    shuffled_list = original_list.copy()

    # Shuffle the copy
    random.shuffle(shuffled_list)

    return shuffled_list

list_stress_index_bucket=[str(np.round(x,1))+'-'+str(np.round(x+bin_width,1)) for x in np.arange(0,1,bin_width)]
list_option_choices=[chr(x) for x in range(65,65+len(list_stress_index_bucket))]

In [None]:
import getpass

# Prompt for the Hugging Face token
hf_token = getpass.getpass("Enter your Hugging Face token: ")

import os
os.environ['HUGGINGFACE_TOKEN'] = hf_token

Enter your Hugging Face token: ··········


In [None]:
# model_id =  "NousResearch/Llama-2-7b-hf"
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# model_id = "mistralai/Mistral-7B-v0.1"
model_id = "microsoft/phi-2"
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto",token=hf_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id,token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
choice_prompt=''''''
for i,j in zip(list_option_choices,shuffle_list(list_stress_index_bucket)):
    choice_prompt+=f'({i}) {j}\n'

In [None]:
def get_values_before_key(sorted_dict, key): ###updated the function to include the logits not including the true label
    values_before_key = []
    for k, v in sorted_dict.items():
        if k == key:
            # values_before_key.append(v)
            break
        values_before_key.append(v)
    return values_before_key

In [None]:
##Softmax function to convert logits to probabilities
def softmax(x):
    e_x=np.exp(x-np.max(x))
    return e_x/e_x.sum()

In [None]:
def lac(true_label, label_softmax_dict):
    lac_score = 1.0 - label_softmax_dict[true_label]
    return lac_score

In [None]:
def aps(true_label, label_softmax_dict):
    sorted_softmax_dict = dict(sorted(label_to_softmax_dict.items(), key=lambda item: item[1], reverse=True))
    high_labels = get_values_before_key(sorted_softmax_dict, true_label)
    aps_score = sum(high_labels)
    return aps_score

In [None]:
!ls

Code  Data  output_logits.pkl  README.md


In [None]:
df_stress_index_hf_prompt=pd.read_csv('/content/drive/MyDrive/Uncertainty_Quantification_LLMs/Data/df_stress_index_hf_prompt.csv',index_col=False,skiprows=[1]).head(2000)

In [None]:
data_splits = {'test': df_stress_index_hf_prompt}
df_stress_index_hf_dataset = {split: Dataset.from_pandas(pd.DataFrame(df_stress_index_hf_prompt[['row_prompt','answer_label','stress_index_bucket']])) for split, data in data_splits.items()}
df_stress_index_hf_dataset_inference=df_stress_index_hf_dataset['test']


In [None]:
df_stress_index_hf_dataset_inference

Dataset({
    features: ['row_prompt', 'answer_label', 'stress_index_bucket'],
    num_rows: 250
})

In [None]:
def process_batch(batch):
    """
    Tokenizes a batch of text inputs.
    Args:
        batch: A dictionary with a key 'text' containing a list of text inputs.
    Returns:
        A dictionary with keys 'input_ids', 'attention_mask' (and possibly others like 'token_type_ids' depending on the tokenizer).
    """
    # Tokenize the text inputs in the batch. This automatically applies padding and truncation.
    # Adjust `padding` and `truncation` as needed.
    tokenized_inputs = tokenizer(batch["row_prompt"], padding=True, truncation=True, return_tensors="pt")

    return tokenized_inputs

def test_data_collator(batch):
  input_ids=torch.stack([example['input_ids'] for example in batch])
  attention_mask=torch.stack([example['attention_mask'] for example in batch])

  return {'input_ids':input_ids, 'attention_mask':attention_mask}

In [None]:
tokens_of_interest= list_option_choices #the option choices corresponding to stress index buckets

token_indices = tokenizer.convert_tokens_to_ids(tokens_of_interest)

In [None]:
token_indices

[32, 33, 34, 35, 36, 37, 38, 39, 40, 41]

In [None]:
batch_size=4
from torch.utils.data import DataLoader
from tqdm import tqdm

df_stress_index_hf_dataset_batched=df_stress_index_hf_dataset_inference.map(process_batch,batched=True,batch_size=batch_size)
df_stress_index_hf_dataset_batched.set_format(type='torch')

test_dataloader=DataLoader(
    df_stress_index_hf_dataset_batched,
    batch_size=batch_size,
    collate_fn=test_data_collator
)

output_labels=[]
for batch in tqdm(test_dataloader):
  input_ids=batch['input_ids']
  attention_mask=batch['attention_mask']


  with torch.no_grad():
    result=model(input_ids=input_ids,attention_mask=attention_mask,return_dict=True)
    next_token_logits=result.logits.detach().cpu()[:,-1,:]
    indices_in_logits = [{token: next_token_logits[i,token_idx].item() for token, token_idx in zip(tokens_of_interest, token_indices)} for i in range(next_token_logits.shape[0])]
    label_to_softmax_dict=[dict(zip(tokens_of_interest,softmax(np.array(list(indices_in_logits[i].values()))))) for i in range(len(indices_in_logits))]
    output_labels.append(label_to_softmax_dict)
    del input_ids
    del attention_mask
    del result
    gc.collect()
    torch.cuda.empty_cache()

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

100%|██████████| 500/500 [19:40<00:00,  2.36s/it]


In [40]:
output_labels=[item for sublist in output_labels for item in sublist]

In [41]:
import pickle
with open('./output_labels.pkl','wb') as f:
  pickle.dump(output_labels,f)

In [42]:
with open('./output_labels.pkl','rb') as f:
  output_labels=pickle.load(f)

In [58]:
lac_score_list=[]
aps_score_list=[]

##First 1500 rows as calibration dataset
for i,j in zip(output_labels[:1500],df_stress_index_hf_dataset_inference['answer_label'][:1500]):
  lac_score_list.append(lac(j,i))
  aps_score_list.append(aps(j,i))

In [59]:
with open('./lac_score_list.pkl','wb') as f:
  pickle.dump(lac_score_list,f)

with open('./aps_score_list.pkl','wb') as f:
  pickle.dump(aps_score_list,f)

In [60]:
with open('./lac_score_list.pkl','rb') as f:
  lac_score_list=pickle.load(f)

with open('./aps_score_list.pkl','rb') as f:
  aps_score_list=pickle.load(f)

In [85]:
n=len(lac_score_list)

# user specified error rate
alpha=0.2
import math

def calculate_threshold(data,alpha):
    sorted_data = sorted(data)
    n = len(sorted_data)
    percentile = math.ceil(((n + 1)*(1-alpha)))/(n)  # Subtracting 1 because list indices start at 0
    # index = min(index, N - 1)  # Ensure the index does not exceed the last index of the list
    return np.percentile(sorted_data, percentile*100)

In [96]:
lac_threshold=calculate_threshold(lac_score_list,alpha)

In [97]:
aps_threshold=calculate_threshold(aps_score_list,alpha)

In [95]:
tokens_of_interest

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

In [104]:
predicion_set_lac=[]
for i_test in (output_labels[1500:]):
  predicion_set_i_lac=[]
  for token in tokens_of_interest:
    if lac(token,i_test)<=lac_threshold:
      predicion_set_i_lac.append(token)
  predicion_set_lac.append(predicion_set_i_lac)

prediction_set_aps=[]

for i_test in (output_labels[1500:]):
  predicion_set_i_aps=[]
  for token in tokens_of_interest:
    if aps(token,i_test)<=aps_threshold:
      predicion_set_i_aps.append(token)
  prediction_set_aps.append(predicion_set_i_aps)

In [105]:
with open('./predicion_set_lac.pkl','wb') as f:
  pickle.dump(predicion_set_lac,f)

with open('./predicion_set_aps.pkl','wb') as f:
  pickle.dump(prediction_set_aps,f)

In [106]:
with open('./predicion_set_lac.pkl','rb') as f:
  predicion_set_lac=pickle.load(f)

with open('./predicion_set_aps.pkl','rb') as f:
  prediction_set_aps=pickle.load(f)

In [107]:
predicion_set_lac[:10]

[['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J'],
 ['A', 'B', 'C', 'D', 'E', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J'],
 ['A', 'B', 'C', 'D', 'E', 'H', 'I']]

In [108]:
prediction_set_aps[:10]

[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']]