# MedQA evaluation process
This notebook shows the evaluation process after SFT with MedQA.

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling,TrainingArguments,Trainer
import pandas as pd
from transformers import AutoModelForCausalLM
from datasets import Dataset

## Load the based model and the SFT model with LORA

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "./BioMistral-7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)


In [None]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(model, "biomistral-biomistral-7B-MedmcQA-SFT/checkpoint-10000")

## Make the few-shot prompting for QA evaluation
Test the prompt for both base model and sft model

In [None]:
model_device = next(model.parameters()).device
#  Telling the model how to answer the question
eval_prompt = """
### Question:
Best prognostic factor for head injury is ？

### Option：
 'A': 'A.Glasgow coma scale',
 'B': 'B.Age',
 'C': 'C.Mode of injury',
 'D': 'D.CT'

### Answer:
'A': 'A.Glasgow coma scale'

### Question:
Congenital hydrocele is best treated by ?

### Option：
 'A': 'Eversion of sac',
 'B': 'Excision of sac',
 'C': "Lord's procedure",
 'D': 'Herniotomy',

### Answer:

"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to(model_device)

model.eval()

with torch.no_grad():
    # generate output using base model
    output = model.generate(**model_input, max_new_tokens=300)
    # decode output
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    print(answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



### Question: 
Best prognostic factor for head injury is ？

### Option：
 'A': 'A.Glasgow coma scale',
 'B': 'B.Age',
 'C': 'C.Mode of injury',
 'D': 'D.CT'

### Answer:
'A': 'A.Glasgow coma scale'

### Question:
Congenital hydrocele is best treated by ?

### Option：
 'A': 'Eversion of sac',
 'B': 'Excision of sac',
 'C': "Lord's procedure",
 'D': 'Herniotomy',
 
### Answer:

'B': 'Excision of sac'


In [None]:
eval_prompt = """
Choose the right choice:

### Question:
Best prognostic factor for head injury is ？

### Option：
 'A': 'A.Glasgow coma scale',
 'B': 'B.Age',
 'C': 'C.Mode of injury',
 'D': 'D.CT'

### Answer:
'A': 'A.Glasgow coma scale'

### Question:
Secondary amyloidosis occurs in ?

### Option：
 'A': 'Chronic osteomyelitis',
 'B': 'Rheumatoid ahritis',
 'C': 'Leprosy',
 'D': 'All'

### Answer:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# generate output using ft_model
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=10)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Choose the right choice:

### Question: 
Best prognostic factor for head injury is ？

### Option：
 'A': 'A.Glasgow coma scale',
 'B': 'B.Age',
 'C': 'C.Mode of injury',
 'D': 'D.CT'

### Answer:
'A': 'A.Glasgow coma scale'

### Question: 
Secondary amyloidosis occurs in ?

### Option：
 'A': 'Chronic osteomyelitis',
 'B': 'Rheumatoid ahritis',
 'C': 'Leprosy',
 'D': 'All'

### Answer:
'C': 'Leprosy',



## Loading a separate test dataset

In [None]:
import pandas as pd
df = pd.read_csv('train_dataset.csv')[:100]
df.tail()

Unnamed: 0,question,A,B,C,D,cop,Answer
95,Which of the following is/are true of blunt re...,Blunt renal trauma and penetrating renal injur...,Blunt renal trauma must be evaluated by contra...,Blunt renal trauma requires exploration only w...,Any kidney fractured by blunt renal trauma mus...,B,Blunt renal trauma must be evaluated by contra...
96,All of the following can lead to increased Int...,Meningitis,Subarachnoid Hemorrhage,Subdural hemorrhage,Migraine,C,Subdural hemorrhage
97,Chronic lymphoedema of the limb is predisposed...,Thickening of the skin,Recurrent soft tissue infections,Marjolin' s ulcer,Sarcoma,B,Recurrent soft tissue infections
98,Secondary amyloidosis occurs in ?,Chronic osteomyelitis,Rheumatoid ahritis,Leprosy,All,C,Leprosy
99,Lahsal classification is used for:,Cleft lip and palate,Tumor staging,Neurological assessment of trauma patient,None of the above,D,None of the above


## Build the evaluation process
Iterate using the model to QA the test dataset to generate answers, and record the answers for final accuracy

In [None]:
from tqdm import tqdm
import pandas as pd

# Read the CSV file
df = pd.read_csv('test_dataset.csv')
model_device = next(model.parameters()).device

# Define the evaluation function
def evaluate_question(question, options, model, tokenizer, model_device):
    option_str = '\n'.join([f"'{k}': '{v}'" for k, v in options.items()])
    eval_prompt = f"""
    ### Question:
    Best prognostic factor for head injury is ?

    ### Options:
     'A': 'A.Glasgow coma scale',
     'B': 'B.Age',
     'C': 'C.Mode of injury',
     'D': 'D.CT'

    ### Answer:
    'A': 'A.Glasgow coma scale'

    ### Question:
    {question} ?

    ### Options:
    \n{option_str}

    ### Answer:
    """
    model_input = tokenizer(eval_prompt, return_tensors="pt").to(model_device)
    with torch.no_grad():
        output = model.generate(**model_input, max_new_tokens=20)
        answer = tokenizer.decode(output[0], skip_special_tokens=True)
        # Parse the generated answer
        generated_answer = answer.split("### Answer:")[1].strip().split("'")[1]
    return generated_answer

# Iterate over the dataset and evaluate each question
results = []
num_correct = 0
tqdm_pbar = tqdm(total=len(df), desc="Evaluating")
for index, row in df.iterrows():

    question = row['question']
    options = {
        'A': row['A'],
        'B': row['B'],
        'C': row['C'],
        'D': row['D']
    }
    correct_answer = row['cop']
    generated_answer = evaluate_question(question, options, model, tokenizer, model_device=next(model.parameters()).device)

    # Check if the answer is correct
    is_correct = "Yes" if generated_answer == correct_answer else "No"
    results.append({
        'Question': question,
        'Correct Answer': correct_answer,
        'Generated Answer': generated_answer,
        'Is Correct': is_correct
    })
    if is_correct == "Yes":
        num_correct += 1
    tqdm_pbar.update(1)  # Update the progress bar

tqdm_pbar.close()  # Close the progress bar after completion

# Calculate accuracy
accuracy = num_correct / len(df) * 100


Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   0%|          | 1/1000 [00:00<11:09,  1.49it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   0%|          | 2/1000 [00:02<18:04,  1.09s/it][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   0%|          | 3/1000 [00:02<15:21,  1.08it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   0%|          | 4/1000 [00:03<14:03,  1.18it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   0%|          | 5/1000 [00:04<13:43,  1.21it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   1%|          | 6/1000 [00:05<13:09,  1.26it/s][ASetting `pad_token_id` to `eos_token_id`:None for open-end generation.

Evaluating:   1%|          | 7/1000 [00:06<14:28,  1.14it/s][

In [1]:
accuracy

43.5

In [None]:
# Look the result
results[:10]

[{'Question': 'All of the following are surgical options for morbid obesity except -',
  'Correct Answer': 'C',
  'Generated Answer': 'A',
  'Is Correct': 'No'},
 {'Question': 'A 60 yr old chronic smoker presents with painless gross hematuria of 1 day duration. Investigation of choice to know the cause of hematuria',
  'Correct Answer': 'C',
  'Generated Answer': 'A',
  'Is Correct': 'No'},
 {'Question': 'An Isograft indicates transfer of tissues between -',
  'Correct Answer': 'B',
  'Generated Answer': 'A',
  'Is Correct': 'No'},
 {'Question': 'Which of the following muscle is not a hybrid muscle',
  'Correct Answer': 'A',
  'Generated Answer': 'A',
  'Is Correct': 'Yes'},
 {'Question': 'Delayed union of fracture of a bone follo-wing a surgical treatment may be due to',
  'Correct Answer': 'C',
  'Generated Answer': 'A',
  'Is Correct': 'No'},
 {'Question': 'During extraction of the upper first molar, the mesio buccal root is missing and is suspected to have been pushed into the maxi