# Importing Data

## Loading the preprocessed medquad data

In [74]:
# load medQuad preprocessed dataset
import json

# Open the JSON file
with open("train.json", 'r') as f:
    # Load the contents of the file into a variable
    json_data = f.read()

# Parse the JSON data into a Python dictionary
dataset_raw = json.loads(json_data)

In [75]:
print(type(dataset_raw))

<class 'dict'>


In [76]:
dataset_raw.keys

<function dict.keys>

## Creating a questions dictionary (lookup table)

In [77]:
questions = {}
for k in dataset_raw:
  for qa in range(len(dataset_raw[k]["QAs"])):
    questions[str(dataset_raw[k]["QAs"][qa]["question"])] = str(dataset_raw[k]["QAs"][qa]["answer"])

In [78]:
print(len(questions))
questions["Is Gitelman syndrome inherited ?"]

12629


'This condition is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. The parents of an individual with an autosomal recessive condition each carry one copy of the mutated gene, but they typically do not show signs and symptoms of the condition.'

# Embedding Starts

In [34]:
from InstructorEmbedding import INSTRUCTOR

## Load the instructOR embeddor model

In [35]:
model = INSTRUCTOR('hkunlp/instructor-large')

Downloading (…)c7233/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)9fb15c7233/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)b15c7233/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)c7233/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

Downloading (…)15c7233/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [79]:
question = list(questions.keys())

## Preprocessing

In [80]:
import numpy as np

def make_inst_ques(instruction, question):
    instruction = np.array(instruction)
    
    question = np.array(question)
    question = question.reshape(-1,1)

    instruction_question = np.hstack((instruction, question))
    instruction_question = instruction_question.tolist()
    return instruction_question

#### Domain Specific stuff

In [81]:
question = list(questions.keys())
instruction = [["Represent the Medicine sentence for retrieval: "]]*len(question)  # Domain instruction inspired from their readme
instruction_question_train = make_inst_ques(instruction, question)

## Preprocess the entire test set

In [1]:
import pandas as pd
df = pd.read_csv("new_dataset.csv")
test_questions = {}

for index, row in df.iterrows():
    test_questions[row["Extracted_Question"]] = row["Extracted_Answer"]

test_question = list(set(df["Extracted_Question"].values.tolist()))
instruction = [["Represent the Medicine sentence for retrieval: "]]*len(test_question)
instruction_question_test = make_inst_ques(instruction, test_question)

NameError: name 'make_inst_ques' is not defined

In [6]:
list(test_questions.keys())[1385]

"Do you have information about When your child's treatment stops working (Also called: End of life care - children; Palliative care - children; Advance care planning - children)"

In [7]:
import pickle
with open('test_ques_ans.pickle', 'wb') as file:
    pickle.dump(test_questions, file)

In [8]:
with open('test_ques_ans.pickle', 'rb') as file:
    loaded_dict = pickle.load(file)
len(loaded_dict)

1805

In [106]:
len(instruction_question_test), len(instruction_question_test[0])

(1805, 2)

## Performing the embedding

In [107]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

query_embeddings = model.encode(instruction_question_test)
corpus_embeddings = model.encode(instruction_question_train)

### Perform Cosine Similarity

In [108]:
similarities = cosine_similarity(query_embeddings,corpus_embeddings)

### Get top K

In [109]:
def get_top_k_similarities(similarities, k):
    top_k_indices = np.argsort(similarities, axis=1)[:, -k:][:, ::-1]
    return top_k_indices

In [110]:
top_k_indices = get_top_k_similarities(similarities, k=3)

## Testing it out

In [117]:
for i in range(len(top_k_indices[:5])):
    print("Test Question:", test_question[i])
    print()
    for ind in top_k_indices[i]:
        print("Similar Questions:")
        print(question[ind])
        print("Answer:", questions[question[ind]])
    print("\n")

Test Question: What is the outlook for Benign positional vertigo - aftercare ? (Also called: Vertigo - positional - aftercare; Benign paroxysmal positional vertigo - aftercare; BPPV - aftercare)

Similar Questions:
What is the outlook for Orthostatic Hypotension ?
Answer: The prognosis for individuals with orthostatic hypotension depends on the underlying cause of the condition.
Similar Questions:
What is the outlook for Occipital Neuralgia ?
Answer: Occipital neuralgia is not a life-threatening condition. Many individuals will improve with therapy involving heat, rest, anti-inflammatory medications, and muscle relaxants. Recovery is usually complete after the bout of pain has ended and the nerve damage repaired or lessened.
Similar Questions:
What is the outlook for Binswanger's Disease ?
Answer: BD is a progressive disease; there is no cure. Changes may be sudden or gradual and then progress in a stepwise manner. BD can often coexist with Alzheimer's disease. Behaviors that slow the 

In [118]:
dict_to_save = {
    "similarities": similarities,
    "test_question": test_question,
    "question": question,
    "questions": questions
}
torch.save(dict_to_save, "similarities.pth")

# Testing on GPT

## Get Top-k similarities

In [None]:
def get_top_k_similarities(similarities, k):
    top_k_indices = np.argsort(similarities, axis=1)[:, -k:][:, ::-1]
    return top_k_indices

## Loading stuff up

In [None]:
import torch
from tqdm.notebook import tqdm
import numpy as np

device = torch.device(("cuda:0" if torch.cuda.is_available() else "cpu"))
preLoaded = torch.load("/content/drive/MyDrive/291_I00/similarities.pth")

similarities = preLoaded["similarities"]
test_question = preLoaded["test_question"]
question = preLoaded["question"]
questions = preLoaded["questions"]
top_k_indices = get_top_k_similarities(similarities, k=2)

### Getting random 100 question indices

In [None]:
import numpy as np
#Set the seed value
np.random.seed(42069)

random_numbers = np.random.randint(0, 1805, size=100)
print(random_numbers)

## Openai API

In [None]:
import openai

# Set your OpenAI API key
openai.api_key = 'ENTER_YOUR_API_KEY_HERE'

def answer_question_dynamic(prompt):
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=300,
        temperature=0.5,
        n=1,
        stop=None
    )
    rephrased_sentence = response.choices[0].text.strip().split('\n')[0]
    return rephrased_sentence

## Dynamic Prompting

In [None]:
prompt = "Imagine that you are a expert medical professional. You are in a clinic and patients asks you medical questions.\nThe topics for the questions could include Treatment, Diagnosis, Side Effects associated with diseases, drugs and other medical entities such as tests.\nYour job is to answer them not more than 300 words, in simple yet, informative language.\nLet's start:"
s_prompt = "\n\nSimilarly, answer the following question: -"
CUTOFF = 300

predicted_answers = []
for i in tqdm(random_numbers):  # loop over test questions
    test = test_question[i]  # extract current test question
    train = ""
    c = 0
    for k, ind in enumerate(top_k_indices[i]):  # get k similar questions and answers from train
        train += f"\n\nQuestion {k+1}. {question[ind]}"  # Add the question
        train += f"\nYour Answer: {(' '.join(questions[question[ind]].split(' ')[:CUTOFF]) )}" # Add the answer
        c = k+1
    test = f"\n\nQuestion {c+1}. {test}" + "\nYour Answer: "
    final = prompt + train + s_prompt + test

    answer = answer_question_dynamic(final)

    predicted_answers.append(answer)

    if i%100 == 0:
      print("saving at",i)
      torch.save({"pred":predicted_answers},"/content/drive/MyDrive/291_I00/predictedDavinci_dynamic.pth")
    # Perform operations or print the values as needed
torch.save({"pred":predicted_answers},"/content/drive/MyDrive/291_I00/predictedDavinci_dynamic.pth")

0it [00:00, ?it/s]

In [None]:
predicted_answers[:10]

['What are the signs and symptoms of Normal pressure hydrocephalus? Normal pressure hydrocephalus is a condition that results in the accumulation of cerebrospinal fluid (CSF) in the brain. The signs and symptoms of normal pressure hydrocephalus include: Headache Dementia Urinary incontinence',
 'Skin blushing is a common problem, affecting around 80% of people at some time in their lives. It can be triggered by a wide range of situations, including: Embarrassment, anger, fear, and sexual situations.',
 'Pneumococcal infections are caused by bacteria called pneumococci. These bacteria can cause infections in the lungs, ears, sinuses, and other parts of the body. ',
 'Breathing difficulty (also called shortness of breath, breathlessness, or dyspnea) is usually due to a problem with the lungs, heart, or blood vessels.',
 'Celiac disease (also called sprue, nontropical sprue, and gluten intolerance) is a disorder of the small intestine that occurs in genetically predisposed people of all a

In [None]:
import torch
pred = torch.load("/content/drive/MyDrive/291_I00/predictedDavinci_dynamic.pth")["pred"]

In [None]:
test_queses = []
test_answers = []
for i in tqdm(random_numbers):  # loop over test questions
    test = test_question[i]  # extract current test question
    test_queses.append(test)
    test_answers.append(loaded_dict[test])

## Metrics

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import  word_tokenize
import numpy as np
from rouge import Rouge

# See this for input references - https://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.sentence_bleu
# A Caption should be a list of strings.
# Reference Captions are list of actual captions - list(list(str))
# Predicted Caption is the string caption based on your model's output - list(str)
# Make sure to process your captions before evaluating bleu scores -
# Converting to lower case, Removing tokens like <start>, <end>, padding etc.

def bleu1(reference_captions, predicted_caption):
    return 100 * sentence_bleu(reference_captions, predicted_caption,
                               weights=(1, 0, 0, 0), smoothing_function=SmoothingFunction().method1)


def bleu4(reference_captions, predicted_caption):
    return 100 * sentence_bleu(reference_captions, predicted_caption,
                               weights=(0, 0, 0, 1), smoothing_function=SmoothingFunction().method1)

rouge = Rouge()

# pred_caption = word_tokenize(pred_caption.lower())

ground_truth = test_answers
generated_ans = pred

Bleu1 = []
Bleu4 = []
rouge_1_score = []
rouge_2_score = []
rouge_L_score = []
c=0
for i in range(len(generated_ans)):
    grndAns = ground_truth[i]
    gen_ans = generated_ans[i]
    
    # BLEU SCORES
    gen_ans_ = word_tokenize(gen_ans.lower())
    grndAns_ = word_tokenize(grndAns.lower())
    bleu1_ = bleu1(grndAns_, gen_ans_)
    bleu4_ = bleu4(grndAns_, gen_ans_)
    Bleu1.append(bleu1_)
    Bleu4.append(bleu4_)
    
    # Rouge
    try:
      scores = rouge.get_scores(gen_ans, grndAns)
      r1s = scores[0]['rouge-1']['f'] # f1 score
      r2s = scores[0]['rouge-2']['f']
      rLs = scores[0]['rouge-l']['f']
      rouge_1_score.append(r1s)
      rouge_2_score.append(r2s)
      rouge_L_score.append(rLs)
    except:
      c+=1
      continue
    
print(c)
print("BLEU 1 Gram: ", np.mean(Bleu1))
print("BLEU 4 Gram: ", np.mean(Bleu4))
print("ROUGE 1 Gram:", np.mean(rouge_1_score))
print("ROUGE 2 Gram:", np.mean(rouge_2_score))
print("ROUGE L Gram:", np.mean(rouge_L_score))