In [None]:
#load MedMCQA dataset
from datasets import load_dataset
import os

# Load the dataset
medmcqa = load_dataset("bigbio/med_qa", trust_remote_code=True)

# Directory to save processed data
current_path = os.getcwd()
repo_name = "Oracle24hr"
repo_root = current_path.split(repo_name)[0]
print(repo_root)

# Define the path to the data directory and JSON file in the repo root
output_dir = os.path.join(repo_root,repo_name, 'data')
os.makedirs(output_dir, exist_ok=True)

# Save each split as a JSONL file in the output directory
for split in ["train", "validation", "test"]:
    file_path = os.path.join(output_dir, f"{split}.json")
    medmcqa[split].to_json(file_path)


In [10]:
import json
import os

# Get the root of the repository (assumes this script is in a subdirectory of the repo)
current_path = os.getcwd()
repo_name = "Oracle24hr"
repo_root = current_path.split(repo_name)[0]
print(repo_root)

# Define the path to the data directory and JSON file in the repo root
data_dir = os.path.join(repo_root,repo_name, 'data')
file_path = os.path.join(data_dir, 'test.json')

# Read the JSON file line by line
qa_data = []
try:
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                item = json.loads(line)
                question = item.get('question')
                answer = item.get('answer')
                qa_data.append({'question': question, 'answer': answer})
except FileNotFoundError:
    print(f"File not found: {file_path}")


c:\Users\phili\Documents\Projects\


In [15]:
qa_data[1]

{'question': "A 67-year-old man with transitional cell carcinoma of the bladder comes to the physician because of a 2-day history of ringing sensation in his ear. He received this first course of neoadjuvant chemotherapy 1 week ago. Pure tone audiometry shows a sensorineural hearing loss of 45 dB. The expected beneficial effect of the drug that caused this patient's symptoms is most likely due to which of the following actions?",
 'answer': 'Cross-linking of DNA'}

In [18]:
import random
import pdb
import pandas as pd
import torch
import math
import numpy as np
import re
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from transformers import LlamaTokenizer
from huggingface_hub import login
login(token = "YourToken")
device = torch.device("cpu")
def load_models(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    return [model,tokenizer]
def option_exists(new_op,old_ops):
    new_op = new_op.strip(". ").lower()
    for old_op in old_ops:
        old_op = old_op.strip(". ").lower()
        if (new_op in old_op or old_op in new_op):
            return(1)
    return(0)

def create_options(instring, num_unique_ops = 4, options_generate_limit = 15):
    unique_options = []
    all_options = []
    op_to_reason = {}

    inputs = tokenizer(instring, return_tensors="pt",padding=True)
    attention_mask = inputs["attention_mask"]
    inputs = inputs.to(device)

    print("Creating Options")

    while(len(all_options) < options_generate_limit and len(unique_options) < num_unique_ops):

        if not len(all_options):
            # outputs = greedy_model.generate([instring])[0]
            outputs = model.generate(
                    inputs.input_ids, 
                    max_new_tokens = 1024, 
                    repetition_penalty = 1.1, 
                    attention_mask=attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    )
        else:
            # outputs = model.generate([instring])[0]
            outputs = model.generate(
                    inputs.input_ids, 
                    max_new_tokens = 1024, 
                    do_sample = True, 
                    temperature = 1, 
                    repetition_penalty = 1.1,
                    attention_mask=attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    )

        # text_output = outputs.generated_text.strip()
        text_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        text_output = text_output.split(instring)[-1]
        text_output = text_output.strip()
        #print("Text Output : ",text_output)

        if reasoning_delimiter in text_output:
            op_reasoning, op_text  = text_output.split(reasoning_delimiter)[0], text_output.split(reasoning_delimiter)[1]
            op_reasoning, op_text = op_reasoning.strip(), op_text.strip().split("\n")[0].strip()

            print("Option Created :",op_text)

            if not option_exists(op_text,unique_options):
                print("New Option Accepted", op_text)
                unique_options.append(op_text)
                op_to_reason[op_text] = op_reasoning
            else:
                print("Option already exists.. Discarding..")
            
        else:
            op_text = "<parsing error>"
            print(op_text)

        all_options.append(op_text)

    print("Final Options :", unique_options)

    return [unique_options,op_to_reason]
model, tokenizer = load_models("meta-llama/Llama-3.2-1B")
file_path = os.path.join('.', 'prompts', 'open_ended_ClinicR.txt')
with open(file_path, 'r') as f:
    few_shot_cot = f.read()
reasoning_delimiter = "Answer: "
output_delimiter = "Q:"
instruction = "Use just the given patient history to answer the question. Do not assume any further information about the patient. Strictly Limit your response to 200 words."
question = qa_data[1]['question']
f_instring = f'''{few_shot_cot}\n\n{instruction}\nQ: {question}\nA: Let's think step-by-step.'''
uniq_options, op_to_reason = create_options(f_instring)
if (len(uniq_options) < 4):
    print("Not enough Options")

else:
    options_text = ""
    for op_num,op in enumerate(uniq_options):
        options_text += f"({chr(ord('A') + op_num)}) {op} "

    options_text = options_text.strip() + '\n'
    with open("prompts/MCQ_Eliminative.txt", 'r') as f:
        backward_prompt = f.read()
    backward_ques = question
    b_instring = f"{backward_prompt}\n\nQ: {backward_ques}\n{options_text}A: Let's think step-by-step."

    b_inputs = tokenizer(b_instring, return_tensors="pt",padding=True)
    attention_mask = b_inputs["attention_mask"]
    b_inputs = b_inputs.to(device)

    b_output = model.generate(
                    b_inputs.input_ids, 
                    max_new_tokens = 1024, 
                    repetition_penalty = 1.1, 
                    attention_mask=attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    )
    b_output = tokenizer.batch_decode(b_output, skip_special_tokens=True)[0]
    b_output = b_output.split(b_instring)[-1]
    b_output = b_output.strip()

    if b_output.strip() == "":
        print("output empty")
        b_output = "<empty>"

    b_answer = re.findall(r"\([A-D]\)",b_output)
    if len(b_answer):
        b_answer = b_answer[0]
        option = ord(b_answer[1]) - ord('A') + 1
        b_answer_idx = ord(b_answer[1]) - ord('A')
        output = b_answer + " " + op_to_reason[uniq_options[b_answer_idx]] + "\nAnswer: " + uniq_options[b_answer_idx]
    else:
        b_answer ="<parsing error>" 
        option = 0
        output = "<parsing error>"

Creating Options
Option Created : cisplatin induced hearing loss.
New Option Accepted cisplatin induced hearing loss.
Option Created : hearing impairment is directly due to cisplatin toxicity
New Option Accepted hearing impairment is directly due to cisplatin toxicity
Option Created : Neoadjuvant chemotherapy
New Option Accepted Neoadjuvant chemotherapy
<parsing error>
Option Created : after complete remission
New Option Accepted after complete remission
Final Options : ['cisplatin induced hearing loss.', 'hearing impairment is directly due to cisplatin toxicity', 'Neoadjuvant chemotherapy', 'after complete remission']


In [13]:
with open("prompts/MCQ_Eliminative.txt", 'r') as f:
        backward_prompt = f.read()
if (len(uniq_options) < 4):
    print("Not enough Options")

else:
    options_text = ""
    for op_num,op in enumerate(uniq_options):
        options_text += f"({chr(ord('A') + op_num)}) {op} "

    options_text = options_text.strip() + '\n'

    backward_ques = question
    b_instring = f"{backward_prompt}\n\nQ: {backward_ques}\n{options_text}A: Let's think step-by-step."

    b_inputs = tokenizer(b_instring, return_tensors="pt")
    b_inputs = b_inputs.to(device)

    b_output = model.generate(b_inputs.input_ids, max_new_tokens = 1024, repetition_penalty = 1.1)
    b_output = tokenizer.batch_decode(b_output, skip_special_tokens=True)[0]
    b_output = b_output.split(b_instring)[-1]
    b_output = b_output.strip()

    if b_output.strip() == "":
        print("output empty")
        b_output = "<empty>"

    b_answer = re.findall(r"\([A-D]\)",b_output)
    if len(b_answer):
        b_answer = b_answer[0]
        option = ord(b_answer[1]) - ord('A') + 1
        b_answer_idx = ord(b_answer[1]) - ord('A')
        output = b_answer + " " + op_to_reason[uniq_options[b_answer_idx]] + "\nAnswer: " + uniq_options[b_answer_idx]
    else:
        b_answer ="<parsing error>" 
        option = 0
        output = "<parsing error>"
print("The output is:",output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The output is: (A) The patient has transitional cell carcinoma of the bladder which is a cancerous growth of the urinary system. Neoadjuvant chemotherapy is used in advanced stage cancers to shrink the tumor before surgery. It is thought that chemotherapeutic drugs cause necrosis of tumor cells resulting in more complete destruction of tumor than surgical resection alone. In this case, the patient has undergone neoadjuvant chemotherapy prior to presenting to the physician. Since the patient had a hearing loss, the drug that was responsible for this patient's symptoms is probably ototoxic. Ototoxicity is the side effect of some chemotherapy drugs that causes irreversible damage to the inner ear. Some examples of ototoxic drugs include cisplatin, methotrexate, cyclophosphamide, and vinblastine. Hearing loss due to ototoxicity may occur anywhere from days to months after the administration of the drug. Therefore, patients who develop hearing loss should be evaluated for ototoxicity.
Answe

In [16]:
qa_data[1]

{'question': "A 67-year-old man with transitional cell carcinoma of the bladder comes to the physician because of a 2-day history of ringing sensation in his ear. He received this first course of neoadjuvant chemotherapy 1 week ago. Pure tone audiometry shows a sensorineural hearing loss of 45 dB. The expected beneficial effect of the drug that caused this patient's symptoms is most likely due to which of the following actions?",
 'answer': 'Cross-linking of DNA'}