In [1]:
# To clone the PubMedQA Benchmark repo
#!git clone https://github.com/pubmedqa/pubmedqa.git

Cloning into 'pubmedqa'...
remote: Enumerating objects: 40, done.
remote: Counting objects: 100% (3/3), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 40 (delta 0), reused 1 (delta 0), pack-reused 37 (from 1)
Receiving objects: 100% (40/40), 704.87 KiB | 13.55 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [1]:
# Libraries
import json
import os
import re
import time

import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Set model path
model_path = "/mnt/models/MeLLaMA-13B"

# PubmedQA data file path
pubmedqa_data_filepath = "/home/dyh2111/moeme/model/pubmedqa/data/ori_pqal.json"

# Ground truth
gt_data_filepath = "/home/dyh2111/moeme/model/pubmedqa/data/test_ground_truth.json"


In [8]:
# Load Model

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model with architecture access
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)
print("Model loaded successfully!")

Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


Model loaded successfully!


In [9]:
# Benchmark functions V1

# Test prompt on the model
def prompt_model(model, tokenizer, prompt, max_new_tokens=10):
    """Run `prompt` through `model` and return the decoded first sequence.

    Args:
        model: Object exposing `.device` and `.generate(**inputs, max_new_tokens=...)`.
        tokenizer: Callable that encodes text (called with `return_tensors="pt"`)
            and exposes `.decode(ids, skip_special_tokens=True)`.
        prompt: Text to feed to the model.
        max_new_tokens: Generation budget forwarded to `model.generate`.

    Returns:
        Whatever `tokenizer.decode` returns for the first generated sequence.
        NOTE(review): for causal LMs this presumably includes the prompt text
        followed by the continuation - confirm for the model in use.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Move the encoded inputs to the device the model reports.
    encoded = encoded.to(model.device)
    generated = model.generate(max_new_tokens=max_new_tokens, **encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Prompt function for PubMedQA

# NOTE(review): these two assignments repeat, with identical values, the ones
# made in the configuration cell near the top of the notebook; consider keeping
# a single definition.

# Ground-truth answers (test_ground_truth.json)
gt_data_filepath = "/home/dyh2111/moeme/model/pubmedqa/data/test_ground_truth.json"

# PubMedQA question file (ori_pqal.json)
pubmedqa_data_filepath = "/home/dyh2111/moeme/model/pubmedqa/data/ori_pqal.json"


# Whole-word matcher for the three PubMedQA labels. The word boundaries keep
# "no" from matching inside words such as "not", "know" or "unknown".
_PUBMEDQA_LABEL_RE = re.compile(r"\b(yes|no|maybe)\b")


def _extract_generated_answer(full_response, full_prompt, marker="ANSWER:"):
    """Return the lower-cased, stripped text that follows the prompt in the response."""
    if full_response.startswith(full_prompt):
        generated = full_response[len(full_prompt):]
    else:
        # The decoded text did not reproduce the prompt verbatim, so slicing by
        # len(full_prompt) would cut at the wrong place. Skip as many markers as
        # the prompt itself contains and keep what follows the last of them.
        generated = full_response.split(marker, full_prompt.count(marker))[-1]
    return generated.strip().lower()


def _parse_pubmedqa_label(answer_part, head_chars=20):
    """Map generated text to one of 'yes', 'no' or 'maybe'."""
    # Prefer the first label that starts within the opening characters. The
    # search runs on the full text so truncation cannot fabricate a word
    # boundary (e.g. "...no|thing").
    first = _PUBMEDQA_LABEL_RE.search(answer_part)
    if first and first.start() < head_chars:
        return first.group(1)

    # Otherwise fall back to a whole-word majority vote; ties and the absence
    # of any label resolve to "maybe".
    counts = {"yes": 0, "no": 0, "maybe": 0}
    for label in _PUBMEDQA_LABEL_RE.findall(answer_part):
        counts[label] += 1
    if counts["yes"] > counts["no"] and counts["yes"] > counts["maybe"]:
        return "yes"
    if counts["no"] > counts["yes"] and counts["no"] > counts["maybe"]:
        return "no"
    return "maybe"


def pubmed_prompt(model, tokenizer, pmid, data_file):
    """Ask the model one PubMedQA question and return 'yes', 'no' or 'maybe'.

    Args:
        model: Model object forwarded to `prompt_model`.
        tokenizer: Tokenizer object forwarded to `prompt_model`.
        pmid: Key of the question inside `data_file`.
        data_file: Mapping from PMID to a record with a "QUESTION" entry and,
            for records whose "reasoning_required_pred" is "yes", a "CONTEXTS"
            entry (a list of strings or a single string).

    Returns:
        The parsed label: the first whole-word yes/no/maybe that starts within
        the first 20 characters of the generated text, otherwise the label
        with the strict majority of whole-word occurrences, otherwise "maybe".

    Raises:
        KeyError: If `pmid`, "QUESTION", or a required "CONTEXTS" entry is missing.
    """
    prompt = "Given the QUESTION, return an ANSWER using only [yes,no,maybe]."
    entry = data_file[pmid]

    # NOTE(review): in the PubMedQA release "reasoning_required_pred" appears to
    # be an annotation label rather than a "needs context" flag, so gating the
    # context on it may leak label information into the prompt - confirm intent.
    if entry.get("reasoning_required_pred") == "yes":
        contexts = entry["CONTEXTS"]
        # CONTEXTS may be a list of passages; flatten it to one string.
        if isinstance(contexts, list):
            contexts = " ".join(contexts)
        question = "QUESTION: " + contexts + " " + entry["QUESTION"]
    else:
        # Question only, without the abstract context.
        question = "QUESTION: " + entry["QUESTION"]

    # Combine prompt and question
    full_prompt = prompt + "\n\n" + question + "\n\nANSWER:"

    print(full_prompt)

    # Get model output
    full_response = prompt_model(model, tokenizer, full_prompt)

    print(f"Full Response: {full_response}")

    # Keep only what the model generated, then map it to a label.
    answer_part = _extract_generated_answer(full_response, full_prompt)
    prediction = _parse_pubmedqa_label(answer_part)

    print(f"Final Answer: {prediction}")
    return prediction

In [None]:
# Run Benchmark
#
# Reads the PubMedQA questions and the ground-truth answers, asks the model
# every ground-truth PMID via pubmed_prompt, and writes the predictions to a
# JSON file keyed by PMID.

# Load the dataset
print(f"Loading PubMedQA data from {pubmedqa_data_filepath}")
with open(pubmedqa_data_filepath, 'r', encoding="utf-8") as f:
    pubmedqa_data = json.load(f)

# Load the ground truth
print(f"Loading PubMedQA GT from {gt_data_filepath}")
with open(gt_data_filepath, 'r', encoding="utf-8") as f:
    gt_data = json.load(f)

print(f"Starting predictions. {len(pubmedqa_data.keys())} questions found. {len(gt_data.keys())} Answers found.")

# Fail before the long inference loop instead of part-way through it: every
# ground-truth PMID must have a question record.
missing_pmids = [pmid for pmid in gt_data if pmid not in pubmedqa_data]
if missing_pmids:
    raise KeyError(
        f"{len(missing_pmids)} ground-truth PMIDs are missing from the dataset, "
        f"e.g. {missing_pmids[:5]}"
    )

# Initialize predictions dictionary
preds = {}
output_file = "mellama_pubmedqa_predictions.json"

# Display Counter (1-based index of the question being processed)
counter = 0

# Record start time
start_time = time.time()

# Process each PMID
try:
    for counter, pmid in enumerate(tqdm(gt_data.keys()), start=1):
        print(f"Question: {counter}")
        question_start = time.time()
        prediction = pubmed_prompt(model, tokenizer, pmid, pubmedqa_data)
        preds[pmid] = prediction
        question_elapsed_time = (time.time() - question_start)  # sec
        print(f"Inference Time: {question_elapsed_time}sec")
except BaseException:
    # Interrupts and crashes would otherwise discard every prediction made so
    # far. Keep them in a separate file so a previous complete result is never
    # overwritten by a partial one, then let the error propagate.
    if preds:
        partial_file = output_file + ".partial"
        with open(partial_file, 'w', encoding="utf-8") as f:
            json.dump(preds, f, indent=2)
        print(f"Run interrupted; {len(preds)} partial predictions saved to {partial_file}")
    raise

# Calculate elapsed time
elapsed_time = (time.time() - start_time) / 60  # minutes

print("Predictions complete")
print(f"Elapsed time: {elapsed_time:.2f} mins")

# Save predictions to file
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(preds, f, indent=2)

print(f"Predictions saved to {output_file}")

Loading PubMedQA data from /home/dyh2111/moeme/model/pubmedqa/data/ori_pqal.json
Loading PubMedQA GT from /home/dyh2111/moeme/model/pubmedqa/data/test_ground_truth.json
Starting predictions. 1000 questions found. 500 Answers found.


  0%|          | 0/500 [00:00<?, ?it/s]

Question: 1
Given the QUESTION, return an ANSWER using only [yes,no,maybe].

QUESTION: Dyschesia can be provoked by inappropriate defecation movements. The aim of this prospective study was to demonstrate dysfunction of the anal sphincter and/or the musculus (m.) puborectalis in patients with dyschesia using anorectal endosonography. Twenty consecutive patients with a medical history of dyschesia and a control group of 20 healthy subjects underwent linear anorectal endosonography (Toshiba models IUV 5060 and PVL-625 RT). In both groups, the dimensions of the anal sphincter and the m. puborectalis were measured at rest, and during voluntary squeezing and straining. Statistical analysis was performed within and between the two groups. The anal sphincter became paradoxically shorter and/or thicker during straining (versus the resting state) in 85% of patients but in only 35% of control subjects. Changes in sphincter length were statistically significantly different (p<0.01, chi(2) test) i