In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [None]:
import os
from datasets import load_dataset
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from huggingface_hub import login
import torch
import random

In [None]:
# Log in to the Hugging Face Hub. No token is passed here, so no credential is
# hard-coded in the notebook.
login()

In [None]:
# @title prompts
# Topic-classification prompt: asks the model to answer with a number 1-20
# naming the part a question belongs to.
# The f-string's doubled braces leave a literal `{question}` placeholder in the
# resulting string, and `\n` is a real newline.
# NOTE(review): no other cell visible in this notebook references this prompt.
prompt_get_part = f'''

## Question
{{question}} \n

Here is a medical question. Select one of the following topics that this question would best be classified as:

Topic List = [
    "Part 1: The Profession of Medicine",
    "Part 2: Cardinal Manifestations and Presentation of Diseases",
    "Part 3: Pharmacology",
    "Part 4: Oncology and Hematology",
    "Part 5: Infectious Diseases",
    "Part 6: Disorders of the Cardiovascular System",
    "Part 7: Disorders of the Respiratory System",
    "Part 8: Critical Care Medicine",
    "Part 9: Disorders of the Kidney and Urinary Tract",
    "Part 10: Disorders of the Gastrointestinal System",
    "Part 11: Immune-Mediated, Inflammatory, and Rheumatologic Disorders",
    "Part 12: Endocrinology and Metabolism",
    "Part 13: Neurologic Disorders",
    "Part 14: Poisoning, Drug Overdose, and Envenomation",
    "Part 15: Disorders Associated with Environmental Exposures",
    "Part 16: Genes, the Environment, and Disease",
    "Part 17: Global Medicine",
    "Part 18: Aging",
    "Part 19: Consultative Medicine",
    "Part 20: Frontiers"
]

Respond with a number, 1-20 representing the part that the question most likely belongs in. Only respond with this number.
'''

# Evaluation prompt templates. Each is an f-string whose doubled braces leave
# literal `{question}`, `{choices}`, `{examples}`, `{context}` or `{answer}`
# placeholders that are filled later with `str.format`.
# Of these, the visible cells only use `prompt_eval_bare_fully`,
# `prompt_eval_bare_fully_with_examples` and `prompt_example`.

# Question + choices with a chain-of-thought instruction and the "[A]" answer format.
prompt_eval = f'''
## Question
{{question}} \n

## Choices
{{choices}} \n

Here is a medical question, some answer choices. Think through the problem step by step and provide your selection like this:
model generated chain of thought explanation. Therefore, the answer is [final model answer (e.g. A,B,C,D)] for instance [A].
\n
'''

# Question + choices under markdown headers, with no instruction text.
prompt_eval_bare = f'''
## Question
{{question}} \n

## Choices
{{choices}} \n

\n
'''

# Question + choices only, no headers or instructions (used when NSHOT == 0).
prompt_eval_bare_fully = f'''
{{question}} \n
{{choices}}
'''

# Few-shot examples followed by the bare question + choices (used when NSHOT != 0).
prompt_eval_bare_fully_with_examples = f'''
{{examples}} \n
{{question}} \n
{{choices}}
'''

# Few-shot examples, then question + choices with the chain-of-thought instruction.
prompt_eval_with_examples = f'''
## Examples
{{examples}}

Above are examples for medical Q&A.

## Question
{{question}} \n

## Choices
{{choices}} \n

Here is a medical question, some answer choices. Think through the problem step by step and provide your selection like this:
model generated chain of thought explanation. Therefore, the answer is [final model answer (e.g. A,B,C,D)] for instance [A].
\n
'''

# Context + question + choices, no headers or instructions.
prompt_eval_context_bare = f'''
{{context}} \n
{{question}} \n
{{choices}}
'''
# Context + question + choices with the chain-of-thought instruction.
prompt_eval_with_context = f'''
## Context
{{context}} \n

## Question
{{question}} \n

## Choices
{{choices}} \n

Here is some context from a textbook, a medical question, some answer choices. Think through the problem step by step and provide your selection like this:
model generated chain of thought explanation. Therefore, the answer is [final model answer (e.g. A,B,C,D)] for instance [A].
\n '''

# Few-shot examples + context + question + choices with the chain-of-thought instruction.
prompt_eval_with_context_and_examples = f'''
## Examples
{{examples}}

Above are examples of medical Q&A.

## Context
{{context}} \n

## Question
{{question}} \n

## Choices
{{choices}} \n

Here is some context from a textbook, a medical question, some answer choices. Think through the problem step by step and provide your selection like this:
model generated chain of thought explanation. Therefore, the answer is [final model answer (e.g. A,B,C,D)] for instance [A].
\n '''

# Layout of a single few-shot example (question, choices, answer); this is the
# template passed to extract_samples as `model_prompt`.
prompt_example = f'''
## Question
{{question}} \n

## Choices
{{choices}} \n

## Answer
{{answer}} \n
'''

# Five worked multiple-choice examples with chain-of-thought answers that end in
# a bracketed letter (e.g. "[A]"). Each entry carries "question" and "answer"
# keys, the same keys format_examples reads.
# NOTE(review): no other cell visible in this notebook references this dict.
gpt_chain_of_thoughts_MEDPROMPT = {
    "prompt_name": "gpt_chain_of_thoughts",
    "response_type": "MC",
    "examples": [
        {
            "question": """A 6-year-old girl is brought to the emergency department because of right elbow swelling and pain 30 minutes after falling onto her outstretched right arm. She has been unable to move her right elbow since the fall. Physical examination shows bruising, swelling, and tenderness of the right elbow; range of motion is limited by pain. An x-ray of the right arm shows a supracondylar fracture of the humerus with anterior displacement of the proximal fragment. Further evaluation is most likely to show which of the following findings?

A. Absent distal radial pulse
B. Radial deviation of the wrist
C. Inability to abduct shoulder
D. Inability to flex the elbow
""",
            "answer": """A supracondylar fracture of the humerus, especially with anterior displacement of the proximal fragment, is a common and serious injury in children. It can cause injury to the brachial artery, leading to compromised blood flow to the forearm and hand. Therefore, the most likely additional finding in this case would be an absent distal radial pulse due to this potential vascular compromise. This is a serious complication and requires immediate orthopedic consultation and likely surgical intervention to prevent long-term damage.
So, the answer is [A].""",
        },
        {
            "question": """Two weeks after undergoing low anterior resection for rectal cancer, a 52-year-old man comes to the physician because of swelling in both feet. He has not had any fever, chills, or shortness of breath. His temperature is 36°C (96.8°F) and pulse is 88/min. Physical examination shows a normal thyroid and no jugular venous distention. Examination of the lower extremities shows bilateral non-pitting edema that extends from the feet to the lower thigh, with deep flexion creases. His skin is warm and dry, and there is no erythema or rash. Microscopic examination of the interstitial space in this patient's lower extremities would be most likely to show the presence of which of the following?

A. Acellular, protein-poor fluid
B. Lymphocytic, hemosiderin-rich fluid
C. Lipid-rich, protein-rich fluid
D. Protein-rich, glycosaminoglycan-rich fluid
""",
            "answer": """This patient's bilateral, symmetric lower extremity swelling extending to the thighs, nonpitting quality, and absence of systemic symptoms (eg, shortness of breath, orthopnea) are suggestive of lymphedema. Lymphedema develops due to a blockage in the lymphatic system that prevents lymph fluid from draining adequately. In this case, the patient's recent surgery (low anterior resection for rectal cancer) probably involved extensive lymph node dissection, leading to secondary lymphedema.

Microscopically, lymph is a protein-rich fluid that also contains lipids, lymphocytes, and other substances. It does not typically contain significant amounts of glycosaminoglycans or hemosiderin.

Therefore, the answer is [C].""",
        },
        {
            "question": """A 34-year-old gravida 2, para 1 woman at 37+6 weeks of gestation presents for elective cesarean delivery. She says she has been having increased fatigue over the past few weeks. Past medical history includes gestational hypertension for which she has been taking an antihypertensive drug twice daily since week 24. Her vital signs include: temperature 36.7°C (98.0°F), blood pressure 120/75 mm Hg, pulse 127/min. Physical examination reveals generalized pallor. Her laboratory results reveal microcytic, hypochromic anemia with anisocytosis, hemoglobin of 9 g/dL, a differential with 14% lymphocytes, an ESR of 22 mm/hr, and a reticulocyte production index of 3.1. A direct antiglobulin test is positive. LFTs, creatinine, ferritin level, vitamin B12 level, coagulation studies, and urinalysis are normal. Which of the following is the most likely diagnosis in this patient?

A. Preeclampsia
B. Hereditary spherocytosis
C. HELLP syndrome
D. Drug-induced immune hemolytic reaction
""",
            "answer": """The patient's microcytic, hypochromic anemia and a high reticulocyte count are indicative of increased erythropoiesis, which, in this clinical scenario, is likely due to hemolysis. The positive direct antiglobulin test (Coomb's test) confirms that the hemolysis is immune-mediated. Given that other causes of immune hemolysis such as autoimmune disorders and infections are unlikely in this patient, the most probable cause of hemolysis is a drug-induced immune hemolytic reaction. Drugs can cause immune hemolysis by acting as haptens that bind to the red blood cell membrane, causing the formation of antibodies against the drug-cell complex (immune complex type) or by inducing the formation of autoantibodies against red blood cells (autoimmune type). While any drug can potentially cause immune hemolysis, common culprits include penicillins, cephalosporins, quinidine, and antihypertensives like methyldopa.

Hence, the answer is [D].""",
        },
        {
            "question": """You are counseling a pregnant woman who plans to breast-feed exclusively regarding her newborn's nutritional requirements. The child was born at home and the mother only plans for her newborn to receive vaccinations but no other routine medical care. Which vitamins should be given to the newborn?

A. Folic acid
B. Vitamin K
C. Vitamin D
D. Vitamin K and Vitamin D
""",
            "answer": """The American Academy of Pediatrics (AAP) recommends that all newborns receive a single dose of intramuscular vitamin K at birth. This is because newborns have low stores of vitamin K at birth, leading to a risk of vitamin K deficiency bleeding, also known as hemorrhagic disease of the newborn. This is a potentially life-threatening condition that can cause bleeding into the brain. Because this baby was born at home and the mother plans to avoid routine medical care, this baby may not have received this essential vitamin.

Additionally, the AAP recommends that all infants and children, including those who are breastfed, have a minimum intake of 400 IU/day of vitamin D beginning soon after birth. While breast milk is the best source of most nutrients, it is usually deficient in vitamin D.

The other option, folic acid, is not routinely supplemented in newborns as they usually receive adequate amounts from breast milk or formula.

Therefore, the answer is [D].""",
        },
        {
            "question": """An investigator is studying nutritional deficiencies in humans. A group of healthy volunteers are started on a diet deficient in pantothenic acid. After 4 weeks, several of the volunteers develop irritability, abdominal cramps, and burning paresthesias of their feet. These symptoms are fully reversed after reintroduction of pantothenic acid to their diet. The function of which of the following enzymes was most likely impaired in the volunteers during the study?

A. Methionine synthase
B. Alpha-ketoglutarate dehydrogenase
C. Glutathione reductase
D. Dopamine beta-hydroxylase
""",
            "answer": """Pantothenic acid is also known as vitamin B5, which is a component of coenzyme A (CoA) and phosphopantetheine. CoA is required for the synthesis and oxidation of fatty acids, and the metabolism of carbohydrates and proteins. One of the enzymes that requires CoA for its function is alpha-ketoglutarate dehydrogenase, a key enzyme in the citric acid cycle (also known as Krebs cycle or TCA cycle).

This enzyme catalyzes the conversion of alpha-ketoglutarate to succinyl CoA, with the concomitant reduction of NAD+ to NADH. Deficiencies in pantothenic acid would impair the function of this enzyme, leading to the observed symptoms.

So, the answer is [B].""",
        },
    ],
}

In [None]:
# @title utility
def extract_samples(task, numShot, model_prompt):
    """Return `numShot` randomly drawn few-shot examples rendered with `model_prompt`.

    Args:
        task: task name forwarded to `task_load`.
        numShot: number of distinct examples to draw; `random.sample` raises
            ValueError if it exceeds the number of available questions.
        model_prompt: template string with `{question}`, `{choices}` and
            `{answer}` placeholders.

    Returns:
        A list of formatted example strings, in the order they were drawn.

    NOTE(review): the examples are requested from the 'train' split, but
    `task_load` as written in this notebook ignores its `split` argument, so
    they come from the same file as the evaluation questions.
    """
    all_questions, all_choices, all_answers = task_load(task, 'train')
    picked = random.sample(range(len(all_questions)), numShot)
    return [
        model_prompt.format(
            question=all_questions[idx],
            choices=format_choices(all_choices[idx]),
            answer=all_answers[idx],
        )
        for idx in picked
    ]

def translate(text,src,des):
    """Translate `text` from language `src` to language `des` with the translator model.

    Reads the module-level `translator_model` and `translator_tokenizer`.

    Args:
        text: the text to translate (str).
        src: name of the source language, e.g. 'persian'.
        des: name of the destination language, e.g. 'english'.

    Returns:
        The content of the last message in the pipeline's generated chat,
        i.e. the translator's reply.
    """
    # The original concatenation had no separators, so the model received
    # "...the givenpersianquestion toenglishplease do not...". Spaces and
    # sentence breaks are restored here.
    instruction = (
        "you are an expert translator in medical domain. "
        f"please translate the given {src} question to {des}. "
        "please do not provide any additional information or explanations.\n"
        "text: "
    )

    # Build the text-generation pipeline once and reuse it while the translator
    # objects are unchanged; this function runs twice per evaluated question.
    cached = getattr(translate, "_pipe_cache", None)
    if (cached is None
            or cached[0] is not translator_model
            or cached[1] is not translator_tokenizer):
        cached = (
            translator_model,
            translator_tokenizer,
            pipeline("text-generation", model=translator_model, tokenizer=translator_tokenizer),
        )
        translate._pipe_cache = cached
    pipe = cached[2]

    pipe_output = pipe([{"role": "user", "content": instruction + text}],
                       max_new_tokens=1024,
                       eos_token_id=[translator_tokenizer.eos_token_id],
                       do_sample=False)
    return pipe_output[0]["generated_text"][-1]["content"]

def resume_the_test(question_list, answer_choices_list, correct_answer_list,bgn):
    """Run the evaluation from question index `bgn` onward, saving each result.

    For every remaining question a prompt is built (with few-shot examples when
    the global NSHOT is non-zero), sent through `run_inference`, and appended
    to 'fa_pipeline_result.xlsx' immediately, so results are saved one question
    at a time.

    Args:
        question_list: all questions of the task.
        answer_choices_list: per-question mapping of choice label to text.
        correct_answer_list: per-question correct label.
        bgn: number of leading questions to skip (those already answered).

    Reads the globals NSHOT, TASK, STOP_GEN, ENGINE, ENGINE_TEMPERATURE,
    MAX_TOKEN_OUTPUT, tokenizer, model and the prompt templates.
    """
    results_path = 'fa_pipeline_result.xlsx'
    remaining = zip(question_list[bgn:], answer_choices_list[bgn:], correct_answer_list[bgn:])

    for position, (question, answer_choices, correct_answer) in tqdm(enumerate(remaining)):
        context = ""

        # str.format ignores keyword arguments the template has no placeholder
        # for, so `context` can be passed to both templates.
        extra_fields = {}
        if NSHOT == 0:
            template = prompt_eval_bare_fully
        else:
            template = prompt_eval_bare_fully_with_examples
            extra_fields["examples"] = ("\n").join(extract_samples(TASK, NSHOT, prompt_example))

        model_prompt = template.format(
            question=question,
            choices=format_choices(answer_choices),
            context=filterContext(context),
            **extra_fields
        )

        AI_answer = run_inference(model_prompt, ENGINE, ENGINE_TEMPERATURE, MAX_TOKEN_OUTPUT, tokenizer, model)
        append_record_to_excel(results_path, question, answer_choices,
                               correct_answer, model_prompt, AI_answer)

        # Stop once STOP_GEN questions have been processed in this call.
        if position + 1 == STOP_GEN:
            break


def append_record_to_excel(file_path, Question, question_choices,
                           correct_answer, model_prompt, AI_answer):
    """Append one evaluation record to the Excel file at `file_path`.

    The file is created when it does not exist yet; otherwise it is read,
    extended by one row and written back.

    Args:
        file_path: path of the .xlsx results file.
        Question: question text.
        question_choices: answer choices; containers (dict/list/tuple/set) are
            stored as their `str()` representation.
        correct_answer: expected answer label.
        model_prompt: prompt that was sent to the model.
        AI_answer: model output to record.
    """
    # A spreadsheet cell holds a scalar. Handing a raw dict to the writer
    # depends on the Excel engine pandas picks (openpyxl, for one, is expected
    # to reject it), so serialise containers explicitly.
    if isinstance(question_choices, (dict, list, tuple, set)):
        question_choices = str(question_choices)

    new_record_df = pd.DataFrame([{
        'Question': Question,
        'question_choices': question_choices,
        'correct_answer': correct_answer,
        'model_prompt':  model_prompt,
        'AI_answer': AI_answer
    }])

    try:
        existing_df = pd.read_excel(file_path)
    except FileNotFoundError:
        # First record: start a new file.
        updated_df = new_record_df
    else:
        updated_df = pd.concat([existing_df, new_record_df], ignore_index=True)

    updated_df.to_excel(file_path, index=False)

def format_choices(choices):
    """Render a label -> text mapping as one "[label] : text" line per choice.

    Lines follow the mapping's iteration order and are joined with newlines;
    an empty mapping yields an empty string.
    """
    return "\n".join(f'[{label}] : {text}' for label, text in choices.items())

def format_examples(examples):
    """Render example rows as "## Question ... ## Answer ..." entries, one per line.

    Each row must provide "question" and "answer" keys. The rendered entries
    are joined with newlines.
    """
    rendered = [
        f'## Question {row["question"]} \n ## Answer {row["answer"]}'
        for row in examples
    ]
    return "\n".join(rendered)

def get_ds_from_df(df,task):
    """Convert a task spreadsheet frame into a `Dataset` with string-typed columns.

    Args:
        df: DataFrame read from the task's Excel file.
        task: 'kopp', or any name containing 'mmlu'.

    Returns:
        `Dataset.from_pandas` of a copy of `df` whose task columns are cast to
        str. The caller's frame is left unmodified.

    Raises:
        ValueError: if `task` is not a known task (message "TASK NOT FOUND").
        KeyError: if an expected column is missing from `df`.
    """
    if task == 'kopp':
        text_columns = ['Question', 'Option1', 'Option2', 'Option3', 'Option4',
                        'Topic', 'Source', 'Correct answer']
    elif "mmlu" in task:
        # The original wrote the cast of 'option3' into a new 'Option3' column,
        # leaving 'option3' itself uncast; every option column is now cast in place.
        text_columns = ['question', 'option1', 'option2', 'option3', 'option4', 'answer']
    else:
        raise ValueError("TASK NOT FOUND")

    typed_df = df.astype({column: str for column in text_columns})
    return Dataset.from_pandas(typed_df)

def task_load(task, split):
    """Load questions, answer choices and correct answers for `task` from Excel.

    'kopp' is read from 'kopp.xlsx'; any task whose name contains 'mmlu' is
    read from '<task>_fa.xlsx'. The numeric answer column (1-4) is mapped to
    the letters 'A'-'D'.

    Args:
        task: task name.
        split: accepted but not used by this function.

    Returns:
        (questions, answer_choices, correct_answers), where each entry of
        answer_choices is a dict with keys "A".."D".

    Raises:
        Exception: "TASK NOT FOUND" for an unknown task.
    """
    if task=="kopp":
        workbook = task+'.xlsx'
        question_col = 'Question'
        option_cols = ('Option1', 'Option2', 'Option3', 'Option4')
        answer_col = 'Correct answer'
    elif "mmlu" in task:
        workbook = task+'_fa.xlsx'
        question_col = 'question'
        option_cols = ('option1', 'option2', 'option3', 'option4')
        answer_col = 'answer'
    else:
        raise Exception("TASK NOT FOUND")

    ds = get_ds_from_df(pd.read_excel(workbook), task)

    questions, answer_choices, correct_answers = [], [], []
    for row_index in range(len(ds)):
        row = ds[row_index]
        questions.append(row[question_col])
        answer_choices.append(dict(zip("ABCD", (row[col] for col in option_cols))))
        # 1 -> 'A', 2 -> 'B', ... (ord('A') == 65)
        correct_answers.append(chr(int(row[answer_col])+64))
    return questions, answer_choices, correct_answers

def filterContext(context):
    """Truncate `context` right after its first "</end>" tag.

    The text up to and including the first tag is returned; a context without
    the tag is returned unchanged.
    """
    end_tag = "</end>"
    tag_start = context.find(end_tag)
    if tag_start == -1:
        return context
    return context[:tag_start + len(end_tag)]

def run_inference(content, engine, temp=0.0001, max_tokens_output=1024, tokenizer=None, model=None):
    """Answer a Persian prompt with an English-language model via translation.

    The prompt is translated Persian -> English, sent to `model` as a single
    user chat message, and the generated continuation is translated back
    English -> Persian.

    Args:
        content: the prompt text.
        engine: accepted for interface compatibility; not used here.
        temp: sampling temperature. A value that is None or <= 0 selects greedy
            decoding instead of sampling.
        max_tokens_output: maximum number of new tokens to generate.
        tokenizer: tokenizer providing `apply_chat_template` and `decode`.
        model: causal LM providing `generate` and `device`.

    Returns:
        The model's reply, translated to Persian.
    """
    content = translate(content,'persian','english')
    messages = [{"role": "user", "content": f"{content}"}]

    # return_dict=True also yields the attention mask, so generate() does not
    # have to guess it (the recorded run warned that it was not set).
    inputs = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True
                ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    terminators = [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # Pass the pad token explicitly, falling back to EOS when the tokenizer
    # defines none (the same fallback generate() applied with a warning).
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Sampling needs a strictly positive temperature; otherwise decode greedily.
    if temp is not None and temp > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temp}
    else:
        sampling_kwargs = {"do_sample": False}

    outputs = model.generate(
              **inputs,
              max_new_tokens=max_tokens_output,
              eos_token_id=terminators,
              pad_token_id=pad_token_id,
              **sampling_kwargs)
    # Drop the prompt tokens; keep only the newly generated continuation.
    response = outputs[0][prompt_length:]
    return translate(tokenizer.decode(response, skip_special_tokens=True),"english","persian")

In [None]:
# @title model setting

# Run configuration and model loading. Everything defined here is a global
# that the utility functions above read at call time.
print("RUNNING NORMAL IMPLEMENTATION")
ENGINE = "dmis-lab/llama-3-meerkat-8b-v1.0"  # model that answers the questions
TRANSLATOR_ENGINE = "CohereForAI/aya-expanse-8b"  # model used by translate()
SPLIT = "test"  # passed to task_load, which currently ignores it
ENGINE_TEMPERATURE = 0.000000001  # forwarded to run_inference as the sampling temperature
MAX_TOKEN_OUTPUT = 1024  # forwarded to run_inference as max_new_tokens
NSHOT = 0  # number of few-shot examples per prompt; 0 selects the example-free template
STOP_GEN = 10000000 ## For testing purposes: stop after STOP_GEN questions in one resume_the_test call
device = "cuda" if torch.cuda.is_available() else "cpu"  # only used to place the translator model

## Load the answering model and its tokenizer (bfloat16, placement chosen by device_map="auto")
model_path = ENGINE
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
        ENGINE,
        torch_dtype=torch.bfloat16,
        device_map="auto")
model.eval()

# Load the translator model and its tokenizer (bfloat16, moved to `device`)
translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATOR_ENGINE)
translator_model = AutoModelForCausalLM.from_pretrained(
    TRANSLATOR_ENGINE,
    torch_dtype=torch.bfloat16
).to(device)
translator_model.eval()
## Report which answering model is in use
print("Model Running: " + ENGINE)

RUNNING NORMAL IMPLEMENTATION


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Model Running: dmis-lab/llama-3-meerkat-8b-v1.0


In [None]:
# @title Load the test
# Task to evaluate. Options: 'mmlu-anatomy', 'mmlu-professional_medicine',
# 'mmlu-college_biology', 'mmlu-college_medicine', 'mmlu-clinical_knowledge',
# 'mmlu-medical_genetics'.
TASK = 'mmlu-professional_medicine'
# Load every question, its answer choices and its correct label for the task.
question_list, answer_choices_list, correct_answer_list = task_load(TASK, SPLIT)
print(f"{TASK} loaded succesfully. Now conducting evaluation on {len(question_list)} samples.")

mmlu-professional_medicine loaded succesfully. Now conducting evaluation on 272 samples.


In [None]:
# Google Colab session time is limited and this test takes days to complete,
# so the final result is assembled from the output of several sessions.
# Set `bgn` to the number of questions already answered in previous sessions;
# resume_the_test skips that many questions and appends the rest to the results file.
resume_the_test(question_list, answer_choices_list, correct_answer_list,bgn = 0)

0it [00:00, ?it/s]Device set to use cuda:0
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Device set to use cuda:0
1it [00:32, 32.32s/it]Device set to use cuda:0
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
2it [00:43, 19.99s/it]Device set to use cuda:0
The attention mask and the pad token id were not set. As a consequence, you may

In [None]:
# Measure accuracy over the saved results.
# The pipeline does not write an 'AI_chosen_answer' column: add it to the Excel
# file manually (the chosen letter, or 'invalid' when no choice can be read
# from the model output) before running this cell.
df = pd.read_excel('fa_pipeline_result.xlsx')
if 'AI_chosen_answer' not in df.columns:
    raise KeyError("'AI_chosen_answer' column not found in fa_pipeline_result.xlsx; "
                   "add it manually before measuring the accuracy")

number_of_questions = len(df)
number_of_invalid_answers = 0
number_of_correct_answers = 0
# Zip the two columns instead of iterrows(): same scalar comparisons, without
# building a Series per row.
for AI_chosen_answer, correct_answer in zip(df['AI_chosen_answer'], df['correct_answer']):
    if AI_chosen_answer == 'invalid':
       number_of_invalid_answers += 1
    elif AI_chosen_answer == correct_answer:
       number_of_correct_answers += 1

# Invalid answers count as wrong: the denominator is the number of all questions.
# An empty results file gives nan instead of a ZeroDivisionError.
accuracy = number_of_correct_answers/number_of_questions if number_of_questions else float('nan')
print('#Questions: '+str(number_of_questions))
print('#Correct answers: '+str(number_of_correct_answers))
print('#Invalid answers: '+str(number_of_invalid_answers))
print('Accuracy: '+str(accuracy))

#Questions: 272
#Correct answers: 157
#Invalid answers: 38
Accuracy: 0.5772058823529411
