## Step 1: BetterPrompt for perplexity scores 

In [None]:
import os
os.environ['OPENAI_API_KEY'] = "" # fill in your own key 
import betterprompt
# Score one sample prompt first; `prompt` and `perplexity` are overwritten by the loop below.
prompt = "This is a sample prompt"
perplexity = betterprompt.calculate_perplexity(prompt)

# Get perplexity scores for prompt paraphrases
prompt_paraphrase_file = "../../UW-Health-Prompt/prompt_paraphrase_v3.txt"
# Read through a context manager so the file handle is closed as soon as the
# lines are loaded (the bare open(...).readlines() left it open).
with open(prompt_paraphrase_file) as paraphrase_fh:
    # One paraphrase per line; readlines() keeps each trailing newline.
    subject_matter_prompt1 = paraphrase_fh.readlines()

# Maps the line index of each paraphrase to its perplexity.
prompt_score = {}
for k, v in enumerate(subject_matter_prompt1):
    prompt = v
    perplexity = betterprompt.calculate_perplexity(prompt)
    prompt_score[k] = perplexity

# Print every perplexity in scientific notation, in file order.
for v in prompt_score.values():
    print(f"{v:.2e}")
    

In [None]:
# Order the (prompt index, perplexity) pairs from lowest to highest perplexity.
# Despite its name, `sorted_dict` is a list of tuples.
sorted_dict = sorted(prompt_score.items(), key=lambda pair: pair[1])

# Keep the prompt indices of the five lowest-perplexity entries.
smallest_keys = [pair[0] for pair in sorted_dict[:5]]

# Show the selected indices.
print(smallest_keys)

import numpy as np 
# Mean perplexity over every scored paraphrase.
avg_sc = np.mean(list(prompt_score.values()))
print(f"Avg Perplexity: {avg_sc:.2e}")

# Collect the text of the selected prompts, in the order given by `smallest_keys`.
# Iterate the keys directly: the former `for _, i in enumerate(...)` never used
# the counter and rebound the notebook-global `_`, which IPython otherwise uses
# for the last cell output.
lowest_prompts = []
for i in smallest_keys:
    print(f"Prompt {i} with Perplexity: {prompt_score[i]:.2e} \n > : {subject_matter_prompt1[i]}  === ")
    lowest_prompts.append(subject_matter_prompt1[i])

## Step 2: Define self-consistency metrics

In [1]:

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder from the same pretrained checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint, output_hidden_states=True)

from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Args:
        set1: First set of hashable items.
        set2: Second set of hashable items.

    Returns:
        A float in [0, 1]. When both sets are empty the union is empty and the
        ratio is undefined; 1.0 is returned by convention, since two empty sets
        are identical.
    """
    intersection = len(set1.intersection(set2))
    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    union = len(set1) + len(set2) - intersection
    if union == 0:
        # Both inputs are empty; avoid ZeroDivisionError.
        return 1.0
    return intersection / union

def measure_self_consistency_auto_jaccard(response1, response2,mode="jaccard"):
    """Score the lexical overlap of two responses with the Jaccard index.

    Each response is lower-cased, split with ``word_tokenize`` and reduced to
    its set of distinct tokens; the two sets are then compared with
    ``jaccard_similarity``.

    Args:
        response1: First response text.
        response2: Second response text.
        mode: Accepted but not read by this function.

    Returns:
        The value returned by ``jaccard_similarity`` for the two token sets.
    """
    vocab_first, vocab_second = (
        set(word_tokenize(text.lower())) for text in (response1, response2)
    )
    return jaccard_similarity(vocab_first, vocab_second)

def measure_self_consistency_auto_bert(string1, string2, model, tokenizer):
    """Return the cosine similarity between the pooled model outputs of two strings.

    Each string is tokenized (truncated to at most 256 tokens), passed through
    ``model``, and represented by the ``pooler_output`` of the result. The two
    representations are compared with ``cosine_similarity``.

    Args:
        string1: First text.
        string2: Second text.
        model: Callable model whose output exposes ``pooler_output``.
        tokenizer: Tokenizer callable that returns keyword inputs for ``model``.

    Returns:
        Entry ``[0][0]`` of the similarity matrix, i.e. the similarity of the
        first row of each embedding (assumes one string per argument, so the
        matrix is 1x1 — TODO confirm for batched input).
    """
    # Run the forward passes inside no_grad: this is inference only, and the
    # original code ran the model *outside* the context, so autograd recorded a
    # graph for every call.
    with torch.no_grad():
        output1 = model(**tokenizer(string1, return_tensors='pt', max_length=256, padding=True, truncation=True))
        output2 = model(**tokenizer(string2, return_tensors='pt', max_length=256, padding=True, truncation=True))

    # Convert the pooled representations to NumPy for the similarity routine.
    embeddings1 = output1.pooler_output.detach().numpy()
    embeddings2 = output2.pooler_output.detach().numpy()

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(embeddings1, embeddings2)

    return cosine_sim[0][0]  # Return the cosine similarity value

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
## Function for chatgpt API call 

def call_chatgpt_system(chatgpt_input_content,system_prompt):
    """Send one system + user message pair to the chat endpoint and return the reply text.

    Args:
        chatgpt_input_content: Text sent as the "user" message.
        system_prompt: Text sent as the "system" message.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    # No visible cell imports `openai`, so on a fresh kernel the call below
    # raised NameError. Importing here returns the already-loaded module when
    # one exists, so configuration set on it elsewhere is preserved.
    import openai

    # NOTE(review): `engine=` looks like a deployment name (Azure-style usage);
    # confirm the API type/base are configured before this is called.
    response = openai.ChatCompletion.create(
        engine="test_chatgpt",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chatgpt_input_content},
        ],
        temperature=0.5,
        max_tokens=160,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    # Only the first choice is used.
    output = response['choices'][0]['message']['content']

    return output

In [None]:
# Few-shot self-consistency pipeline; the setup below targets the UW MyChart message use case.

# Per-category results, filled in by the evaluation loop later in this cell.
prompt_categories = dict()

# Flag consumed by the few-shot branch of the evaluation loop.
if_fewshot = "yes"
# One of: medication, general, paperwork, results
category = "results"

def extract_input_example(category, base_dir="../../UW-Health-Prompt/"):
    """Load the patient/doctor example pairs for one message category.

    Reads ``<base_dir>/<category>_example.txt`` line by line. Every "||" is
    removed from a line, and the line is split on "<Doctor>". Lines that do not
    split into exactly two parts are skipped.

    Args:
        category: Category name used to build the file name.
        base_dir: Directory that holds the example files. Defaults to the
            location the notebook originally hard-coded.

    Returns:
        dict mapping the text after the first "<Patient>" marker (the whole
        left part if the marker is absent) to the text after "<Doctor>".
        Values keep the line's trailing newline. A repeated key keeps the
        last value seen.
    """
    import os  # local import keeps the function self-contained

    example_path = os.path.join(base_dir, category + "_example.txt")
    example_dict = {}
    # Context manager closes the file; the original left the handle open.
    with open(example_path) as example_file:
        for line in example_file:
            cleaned_string = line.replace("||", "")
            doc_pat = cleaned_string.split("<Doctor>")
            if len(doc_pat) == 2:
                key = doc_pat[0].split("<Patient>", 1)[-1]
                example_dict[key] = doc_pat[-1]
    print(f"Extract {len(example_dict)} examples for self-consistency measures")
    return example_dict

categories = ['general', 'medication', 'results', 'paperwork']

# Number of completions sampled for each (prompt, example) pair.
consistency_sample = 5

# Interpret the flag explicitly. The former `if if_fewshot:` was true for any
# non-empty string, so setting the flag to "no" would still have enabled
# few-shot prompting.
use_fewshot = str(if_fewshot).strip().lower() in ("yes", "y", "true", "1")

for category in categories:
    # Context manager closes the few-shot file once its text is read.
    with open("../../UW-Health-Prompt/"+category+"_fewshot.txt") as fewshot_fh:
        fewshot_file = fewshot_fh.read()
    example_dict = extract_input_example(category)

    # prompt -> one list per example, each holding `consistency_sample` scores
    prompt_scores = {}
    # prompt -> one list per example, each holding the matching raw completions
    prompt_output = {}

    for prompt in lowest_prompts:
        prompt_scores[prompt] = []
        prompt_output[prompt] = []
        for eg_input, eg_output in example_dict.items():
            # With few-shot enabled, the few-shot text is prepended to the example input.
            chatgpt_input = fewshot_file + eg_input if use_fewshot else eg_input
            tmp = []
            tmp_output = []
            for sample_idx in range(consistency_sample):
                # The candidate prompt is sent as the system message.
                output = call_chatgpt_system(chatgpt_input, prompt)
                # Similarity between the reference reply and the generated reply.
                score = measure_self_consistency_auto_bert(eg_output, output, model, tokenizer)
                tmp.append(score)
                tmp_output.append(output)
            prompt_scores[prompt].append(tmp)
            prompt_output[prompt].append(tmp_output)

    print(category)
    prompt_categories[category] = [prompt_scores, prompt_output]
    

In [None]:
# Aggregate results from the four categories and run bootstrap 
# NOTE(review): `best_sc_prompt` is not defined in any visible cell; it is
# presumably a {prompt: {category: [scores]}} mapping built elsewhere — confirm
# and add the defining cell so the notebook runs on a fresh kernel.
aggr_results = {}

for prompt_key, per_category in best_sc_prompt.items():
    # Concatenate this prompt's per-category sequences into one flat list.
    aggr_results[prompt_key] = [
        score for category_scores in per_category.values() for score in category_scores
    ]

import random
# Per-prompt summary of the aggregated scores, reported as percentages.
for prompt_key, scores in aggr_results.items():
    print(f"current prompt: {prompt_key}")
    print(f"AVG: {np.mean(scores)*100:.2f} STD: {np.std(scores)*100:.2f}")

# NOTE(review): not read anywhere in the visible cells; only one resample is drawn below.
bootstrapped_n = 100

# Size the resample from the shortest score list so every drawn index is valid
# for every prompt. The original used `len(v)` from the loop variable leaked by
# the loop above, which breaks when `aggr_results` is empty and can raise
# IndexError when the lists differ in length.
num_samples = min((len(scores) for scores in aggr_results.values()), default=0)

# Private, seeded generator: the resample is reproducible across runs and the
# global `random` state is left untouched.
BOOTSTRAP_SEED = 0
bootstrap_rng = random.Random(BOOTSTRAP_SEED)

# One list of indices drawn with replacement, shared by all prompts.
bootstrapp_index = [bootstrap_rng.randint(0, num_samples - 1) for _ in range(num_samples)]

# Apply the same resampled indices to every prompt's score list.
aggr_bootstrap_results = {
    prompt_key: [scores[idx] for idx in bootstrapp_index]
    for prompt_key, scores in aggr_results.items()
}
        
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean with a two-sided Student-t confidence interval.

    Args:
        data: Sequence of numeric observations.
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        Tuple ``(mean, lower, upper)`` where the bounds are the mean minus/plus
        the standard error scaled by the t quantile with ``n - 1`` degrees of
        freedom.
    """
    samples = np.array(data) * 1.0
    center = np.mean(samples)
    std_err = scipy.stats.sem(samples)
    # Two-sided interval: use the (1 + confidence) / 2 quantile.
    t_quantile = scipy.stats.t.ppf((1 + confidence) / 2., len(samples) - 1)
    half_width = std_err * t_quantile
    return center, center - half_width, center + half_width

# Confidence interval of the mean score for each prompt's resampled scores.
aggr_ci_results = {}
for k,v in aggr_bootstrap_results.items():
    print(f"Prompt: {k}")
    # mean_confidence_interval returns (mean, lower, upper) in that order; the
    # original unpacked it as (mean, upper, lower), so the two bounds were
    # swapped in both the stored list and the printed line.
    mean, lower, upper = mean_confidence_interval(v)
    # Stored as [mean, upper, lower], matching the labels printed below.
    aggr_ci_results[k] = [mean, upper, lower]
    print(f"Mean: {mean:.4f} Upper: {upper:.4f} Lower: {lower:.4f}")
    
    
    