In [1]:
# load dependencies - make sure to activate moeme ENV and kernel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [1]:
# To clone the PubMedQA Benchmark repo
#!git clone https://github.com/pubmedqa/pubmedqa.git

Cloning into 'pubmedqa'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 40 (delta 0), reused 1 (delta 0), pack-reused 37 (from 1)[K
Receiving objects: 100% (40/40), 704.87 KiB | 13.55 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [2]:
# Libraries
import os
import json
import torch
import time
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
# Constants + Load Model

# Directory that holds the MeLLaMA-13B checkpoint and tokenizer files.
model_path = "/mnt/models/MeLLaMA-13B"

# The tokenizer comes from the same directory as the weights.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Half-precision weights with automatic device placement. The recorded run of
# this cell reports that some parameters were offloaded to the CPU.
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
print("Model loaded successfully!")

Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Model loaded successfully!


In [21]:
# Benchmark functions

# Test prompt on the model
def prompt_model(model, tokenizer, prompt, max_new_tokens=50):
    """Run one generation pass for ``prompt`` and return the decoded text.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable that encodes ``prompt`` (``return_tensors="pt"``) and
            offers ``decode``.
        prompt: Text to feed to the model.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        The decoded first sequence returned by ``model.generate``, with
        special tokens skipped.

    Note:
        ``pubmed_prompt`` slices the prompt off the front of this return
        value, i.e. it treats the result as prompt + continuation.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)
    generated = model.generate(max_new_tokens=max_new_tokens, **encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Prompt function for PubMedQA
def pubmed_prompt(model, tokenizer, pmid, data_file):
    """Ask the model one PubMedQA question and return "yes", "no" or "maybe".

    Args:
        model: Passed through to ``prompt_model``.
        tokenizer: Passed through to ``prompt_model``.
        pmid: Key of the record to evaluate inside ``data_file``.
        data_file: Mapping of PMID -> record. Each record must provide
            "QUESTION"; "CONTEXTS" (a list of strings or a single string) is
            read only when "reasoning_required_pred" equals "yes".

    Returns:
        One of "yes", "no", "maybe". "maybe" is also the fallback when no
        label can be recognised in the generated text.

    Raises:
        KeyError: If ``pmid`` or a required record field is missing.
    """
    record = data_file[pmid]
    prompt = "Given the QUESTION, return an ANSWER using only [yes,no,maybe]."

    if record.get("reasoning_required_pred") == "yes":
        # Reasoning-required questions get the abstract passages prepended.
        contexts = record["CONTEXTS"]
        if isinstance(contexts, list):
            contexts = " ".join(contexts)
        question = "QUESTION: " + contexts + " " + record["QUESTION"]
    else:
        # Everything else is asked without context.
        question = "QUESTION: " + record["QUESTION"]

    full_prompt = prompt + "\n\n" + question + "\n\nANSWER:"
    full_response = prompt_model(model, tokenizer, full_prompt)

    if full_response.startswith(full_prompt):
        answer_part = full_response[len(full_prompt):]
    else:
        # The decoded text did not reproduce the prompt exactly, so slicing by
        # length would cut at the wrong offset. Fall back to the first
        # "ANSWER:" marker (the instruction line says "ANSWER" without a colon).
        marker_at = full_response.find("ANSWER:")
        if marker_at >= 0:
            answer_part = full_response[marker_at + len("ANSWER:"):]
        else:
            answer_part = full_response[len(full_prompt):]

    return _extract_pubmedqa_label(answer_part.strip())


def _extract_pubmedqa_label(answer_text, head_chars=20):
    """Map free-form generated text to "yes", "no" or "maybe".

    Labels are matched as whole words, so "not", "know" or "unknown" no longer
    count as "no". The first label that lies entirely within the first
    ``head_chars`` characters wins; otherwise the strictly most frequent of
    "yes"/"no" across the whole text is used, and "maybe" is the fallback.
    """
    import re  # local import: the notebook's import cells do not load `re`

    found = [
        (match.end(), match.group(1))
        for match in re.finditer(r"\b(yes|no|maybe)\b", answer_text.lower())
    ]
    if not found:
        return "maybe"

    first_end, first_label = found[0]
    if first_end <= head_chars:
        return first_label

    tally = {"yes": 0, "no": 0, "maybe": 0}
    for _, label in found:
        tally[label] += 1

    if tally["yes"] > tally["no"] and tally["yes"] > tally["maybe"]:
        return "yes"
    if tally["no"] > tally["yes"] and tally["no"] > tally["maybe"]:
        return "no"
    return "maybe"

In [None]:
# Run Benchmark

# PubMedQA data file path. Defaults to the original location; set the
# PUBMEDQA_DATA_FILE environment variable to point at another checkout.
pubmedqa_data_filepath = os.environ.get(
    "PUBMEDQA_DATA_FILE",
    "/home/dyh2111/moeme/model/pubmedqa/data/ori_pqal.json",
)

# Load the dataset. The encoding is explicit so that reading does not depend
# on the locale's default.
print(f"Loading PubMedQA data from {pubmedqa_data_filepath}")
with open(pubmedqa_data_filepath, 'r', encoding="utf-8") as f:
    pubmedqa_data = json.load(f)

print(f"Starting predictions. {len(pubmedqa_data.keys())} questions found.")

# PMID -> predicted label
preds = {}

# Record start time
start_time = time.time()

# One model call per PMID; tqdm renders the progress bar.
for pmid in tqdm(pubmedqa_data.keys()):
    preds[pmid] = pubmed_prompt(model, tokenizer, pmid, pubmedqa_data)

# Calculate elapsed time
elapsed_time = (time.time() - start_time) / 60  # minutes

print("Predictions complete")
print(f"Elapsed time: {elapsed_time:.2f} mins")

# Save predictions to file (written relative to the current working directory)
output_file = "mellama_pubmedqa_predictions.json"
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(preds, f, indent=2)

print(f"Predictions saved to {output_file}")

Loading PubMedQA data from /home/dyh2111/moeme/model/pubmedqa/data/ori_pqal.json
Starting predictions. 1000 questions found.


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Location of the baseline MeLLaMA-13B checkpoint.
model_path = "/mnt/models/MeLLaMA-13B"

# Baseline MeLLaMA-13B: tokenizer first, then the weights.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Half-precision weights with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

# Test prompt on the model
# Note: 4096 Max token length

def prompt_model(model, tokenizer, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` and return the decoded first sequence.

    The prompt is encoded with ``return_tensors="pt"``, moved to
    ``model.device`` and handed to ``model.generate`` together with
    ``max_new_tokens``. Special tokens are skipped when decoding.
    """
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    sequences = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return text

# Prompt function for PubMedQA

def pubmed_prompt(model, tokenizer, PMID, data_file):
    """Ask the model one PubMedQA question and return "yes", "no" or "maybe".

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable encoder that also offers ``decode``.
        PMID: Key of the record to evaluate inside ``data_file``.
        data_file: Already-parsed PubMedQA JSON (mapping of PMID -> record).
            "CONTEXTS" is read only when "reasoning_required_pred" is "yes".

    Returns:
        The first of "yes" / "no" / "maybe" that appears as a whole word in
        the generated continuation, or "maybe" when none appears.
    """
    import re  # local import: this cell has no import block of its own

    record = data_file[PMID]
    prompt = "Given the QUESTION, return an ANSWER using only [yes,no,maybe]."

    if record.get("reasoning_required_pred") == "yes":
        contexts = record["CONTEXTS"]
        if isinstance(contexts, list):
            contexts = " ".join(contexts)
        question = "QUESTION: " + contexts + " " + record["QUESTION"]
    else:
        question = "QUESTION: " + record["QUESTION"]

    # Instruction and question are sent as one text, not as a batch of two.
    full_prompt = prompt + "\n\n" + question + "\n\nANSWER:"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode only the continuation: the returned sequence starts with the
    # prompt tokens, which already contain the words "yes", "no" and "maybe".
    prompt_length = inputs["input_ids"].shape[-1]
    output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Filter to only yes / no / maybe (whole words, first occurrence wins).
    match = re.search(r"\b(yes|no|maybe)\b", output.lower())
    return match.group(1) if match else "maybe"
    

# PubMedQA eval location.
# NOTE(review): this path is relative, so it resolves against the kernel's
# working directory; confirm it matches where the notebook is run from.
pubmedqa_data_filepath = "moeme/model/pubmedqa/data/ori_pqal.json"
with open(pubmedqa_data_filepath, encoding="utf-8") as f:
    pubmedqa_data = json.load(f)

print(f"Starting Predictions. {len(pubmedqa_data.keys())} Questions Found")

# PMID -> predicted label; one entry is added per question.
preds = {}
start_time = time.time()
for pmid in pubmedqa_data:
    preds[pmid] = pubmed_prompt(model, tokenizer, pmid, pubmedqa_data)
elapsed_time = (time.time() - start_time) / 60  # minutes
print("Predictions Complete")
print(f"Elapsed Time: {elapsed_time:.2f} mins")

In [3]:
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Set model path
model_path = "/mnt/models/MeLLaMA-13B"

# Load tokenizer and model (half precision, automatic device placement)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.eval()

# Load PubMedQA dataset (labeled subset).
# NOTE(review): the `pqa_labeled` configuration appears to ship only a
# "train" split on the Hub - confirm. "test" is still used when it exists,
# so the original behaviour is kept wherever it worked.
_pqa_labeled = load_dataset("pubmed_qa", "pqa_labeled")
dataset = _pqa_labeled["test"] if "test" in _pqa_labeled else _pqa_labeled["train"]

# Define function to extract final decision from model output
def extract_answer(text):
    """Return the first PubMedQA label found in ``text``.

    Labels are matched case-insensitively and only as whole words, so "not",
    "know" or "unknown" are not mistaken for "no". When several labels occur,
    the one that appears first in the text wins.

    Args:
        text: Model output to inspect.

    Returns:
        "yes", "no" or "maybe", or "unknown" when none of them occurs.
    """
    import re  # local import: the cell's import block does not load `re`

    match = re.search(r"\b(yes|no|maybe)\b", text.lower())
    return match.group(1) if match else "unknown"

# Two separate, initially empty accumulators: predicted labels and gold labels.
predictions, references = [], []

ModuleNotFoundError: No module named 'datasets'

In [None]:


# ⏱️ Start timer
start_time = time.time()

# Iterate over the dataset
for example in tqdm(dataset):
    question = example["question"]
    context = example["context"]
    reference = example["final_decision"].lower()

    # NOTE(review): in the Hugging Face PubMedQA schema `context` appears to be
    # a dict whose "contexts" entry is a list of passages - confirm. Flatten it
    # so the prompt holds the passage text rather than a dict/list repr.
    if isinstance(context, dict):
        context = context.get("contexts", context)
    if isinstance(context, (list, tuple)):
        context = " ".join(context)

    # Construct the prompt
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"

    # Tokenize and move to device. There is one prompt per call, so no padding
    # is requested; tokenizers without a pad token would reject padding=True.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=10)

    # Decode only the newly generated tokens. The returned sequence begins with
    # the prompt, and the answer must not be extracted from the question text.
    prompt_length = inputs["input_ids"].shape[-1]
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Extract the answer
    prediction = extract_answer(decoded)

    # Append to lists
    predictions.append(prediction)
    references.append(reference)

# End timer
end_time = time.time()
elapsed = end_time - start_time

# Calculate accuracy and macro F1 score
accuracy = accuracy_score(references, predictions)
macro_f1 = f1_score(references, predictions, average='macro', labels=["yes", "no", "maybe"])

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Elapsed Time: {elapsed:.2f} seconds")
# max(..., 1) keeps the average defined when the dataset is empty.
print(f"Avg Time per Example: {elapsed / max(len(dataset), 1):.2f} seconds")


In [17]:
# Display the kernel's current working directory.
os.getcwd()

'/home/dyh2111/moeme/model'

In [13]:
# Change the kernel's working directory; relative paths used by cells run
# after this one resolve against it.
os.chdir('/home/dyh2111/moeme/model')

In [None]:
# PubMedQA data file: moeme/model/pubmedqa/data/ori_pqal.json