In [1]:
# Cardiac EXPERT EVAL - Evals PubMedQA

Model was trained on blood_heart_circulation_qa training data. 17 epochs /160 steps

In [2]:
import os
import json
import datetime
from tqdm import tqdm
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)

In [3]:
# Change paths as needed
model_path = "moeme/model/cardiovascular_expert_model/"

#Test data is pubmedqa
data_path = "moeme/model/model_eval/pubmedqa/data/ori_pqal.json"
ground_truth = "moeme/model/model_eval/pubmedqa/data/test_ground_truth.json"

# Save predictions
output_file = "expert1_predictions.json"

limit = None  # e.g., 100 for testing subset
verbose = False  # or False

#Enable bfloat16 or 4-bit if supported by GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [4]:
def pubmed_prompt(model_pipeline, pmid, data_file, verbose=False):
    intro = "TASK: Your task is to answer biomedical questions using the given abstract. Only output yes, no, or maybe as answer."
    
    contexts = data_file[pmid].get("CONTEXTS", "")
    if isinstance(contexts, list):
        contexts = " ".join(contexts)
    
    question = data_file[pmid]["QUESTION"]
    full_prompt = f"{intro} INPUT: {contexts} {question} OUTPUT:"
    
    if verbose:
        print(f"\nPrompt: {full_prompt}")
    
    output = model_pipeline(full_prompt, max_new_tokens=20)
    full_response = output[0]['generated_text']
    answer_part = full_response[len(full_prompt):].strip().lower()
    
    for ans in ["yes", "no", "maybe"]:
        if ans in answer_part[:20]:
            prediction = ans
            break
    else:
        counts = {ans: answer_part.count(ans) for ans in ["yes", "no", "maybe"]}
        prediction = max(counts, key=counts.get)

    if verbose:
        print(f"Model Answer: {prediction} | True Answer: {data_file[pmid]['final_decision']}")
    
    return prediction


In [5]:
# Load PubMedQA JSON
with open(data_path, 'r') as f:
    pubmedqa_data = json.load(f)

with open(ground_truth, 'r') as f:
    gt_data = json.load(f)

# Backup output file if it exists
if os.path.exists(output_file):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.rename(output_file, f"{os.path.splitext(output_file)[0]}_{timestamp}.json")

# Limit dataset for test runs
pmids = list(gt_data.keys())
if limit:
    pmids = pmids[:limit]

# Run predictions
results = {}
correct = 0
total = 0

for pmid in tqdm(pmids, desc="Evaluating PubMedQA"):
    prediction = pubmed_prompt(pipe, pmid, pubmedqa_data, verbose=verbose)
    results[pmid] = prediction

    # Save progress
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    if prediction == pubmedqa_data[pmid]["final_decision"]:
        correct += 1
    total += 1
    acc = correct / total
    tqdm.write(f"[{total}] Accuracy: {acc:.4f}")


Evaluating PubMedQA:   0%|          | 1/500 [00:01<14:50,  1.78s/it]

[1] Accuracy: 1.0000


Evaluating PubMedQA:   0%|          | 2/500 [00:03<13:24,  1.62s/it]

[2] Accuracy: 1.0000


Evaluating PubMedQA:   1%|          | 3/500 [00:04<12:56,  1.56s/it]

[3] Accuracy: 1.0000


Evaluating PubMedQA:   1%|          | 4/500 [00:06<12:43,  1.54s/it]

[4] Accuracy: 1.0000


Evaluating PubMedQA:   1%|          | 5/500 [00:07<12:38,  1.53s/it]

[5] Accuracy: 1.0000


Evaluating PubMedQA:   1%|          | 6/500 [00:09<12:30,  1.52s/it]

[6] Accuracy: 1.0000


Evaluating PubMedQA:   1%|▏         | 7/500 [00:10<12:27,  1.52s/it]

[7] Accuracy: 1.0000


Evaluating PubMedQA:   2%|▏         | 8/500 [00:12<12:23,  1.51s/it]

[8] Accuracy: 1.0000


Evaluating PubMedQA:   2%|▏         | 9/500 [00:13<12:21,  1.51s/it]

[9] Accuracy: 1.0000


Evaluating PubMedQA:   2%|▏         | 10/500 [00:15<12:17,  1.51s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[10] Accuracy: 1.0000


Evaluating PubMedQA:   2%|▏         | 11/500 [00:16<12:14,  1.50s/it]

[11] Accuracy: 1.0000


Evaluating PubMedQA:   2%|▏         | 12/500 [00:18<12:11,  1.50s/it]

[12] Accuracy: 1.0000


Evaluating PubMedQA:   3%|▎         | 13/500 [00:19<12:10,  1.50s/it]

[13] Accuracy: 1.0000


Evaluating PubMedQA:   3%|▎         | 14/500 [00:21<12:08,  1.50s/it]

[14] Accuracy: 1.0000


Evaluating PubMedQA:   3%|▎         | 15/500 [00:22<12:08,  1.50s/it]

[15] Accuracy: 1.0000


Evaluating PubMedQA:   3%|▎         | 16/500 [00:24<12:05,  1.50s/it]

[16] Accuracy: 1.0000


Evaluating PubMedQA:   3%|▎         | 17/500 [00:25<12:07,  1.51s/it]

[17] Accuracy: 1.0000


Evaluating PubMedQA:   4%|▎         | 18/500 [00:27<12:06,  1.51s/it]

[18] Accuracy: 1.0000


Evaluating PubMedQA:   4%|▍         | 19/500 [00:28<12:04,  1.51s/it]

[19] Accuracy: 1.0000


Evaluating PubMedQA:   4%|▍         | 20/500 [00:30<12:02,  1.50s/it]

[20] Accuracy: 1.0000


Evaluating PubMedQA:   4%|▍         | 21/500 [00:31<11:59,  1.50s/it]

[21] Accuracy: 1.0000


Evaluating PubMedQA:   4%|▍         | 22/500 [00:33<11:59,  1.50s/it]

[22] Accuracy: 1.0000


Evaluating PubMedQA:   5%|▍         | 23/500 [00:34<11:59,  1.51s/it]

[23] Accuracy: 1.0000


Evaluating PubMedQA:   5%|▍         | 24/500 [00:36<12:00,  1.51s/it]

[24] Accuracy: 1.0000


Evaluating PubMedQA:   5%|▌         | 25/500 [00:37<12:00,  1.52s/it]

[25] Accuracy: 1.0000


Evaluating PubMedQA:   5%|▌         | 26/500 [00:39<12:00,  1.52s/it]

[26] Accuracy: 1.0000


Evaluating PubMedQA:   5%|▌         | 27/500 [00:40<11:56,  1.51s/it]

[27] Accuracy: 0.9630


Evaluating PubMedQA:   6%|▌         | 28/500 [00:42<11:51,  1.51s/it]

[28] Accuracy: 0.9643


Evaluating PubMedQA:   6%|▌         | 29/500 [00:43<11:48,  1.50s/it]

[29] Accuracy: 0.9655


Evaluating PubMedQA:   6%|▌         | 30/500 [00:45<11:45,  1.50s/it]

[30] Accuracy: 0.9333


Evaluating PubMedQA:   6%|▌         | 31/500 [00:46<11:42,  1.50s/it]

[31] Accuracy: 0.9355


Evaluating PubMedQA:   6%|▋         | 32/500 [00:48<11:39,  1.49s/it]

[32] Accuracy: 0.9375


Evaluating PubMedQA:   7%|▋         | 33/500 [00:49<11:37,  1.49s/it]

[33] Accuracy: 0.9091


Evaluating PubMedQA:   7%|▋         | 34/500 [00:51<11:36,  1.49s/it]

[34] Accuracy: 0.9118


Evaluating PubMedQA:   7%|▋         | 35/500 [00:52<11:35,  1.50s/it]

[35] Accuracy: 0.9143


Evaluating PubMedQA:   7%|▋         | 36/500 [00:54<11:34,  1.50s/it]

[36] Accuracy: 0.9167


Evaluating PubMedQA:   7%|▋         | 37/500 [00:55<11:34,  1.50s/it]

[37] Accuracy: 0.9189


Evaluating PubMedQA:   8%|▊         | 38/500 [00:57<11:36,  1.51s/it]

[38] Accuracy: 0.9211


Evaluating PubMedQA:   8%|▊         | 39/500 [00:58<11:33,  1.50s/it]

[39] Accuracy: 0.9231


Evaluating PubMedQA:   8%|▊         | 40/500 [01:00<11:29,  1.50s/it]

[40] Accuracy: 0.9250


Evaluating PubMedQA:   8%|▊         | 41/500 [01:01<11:28,  1.50s/it]

[41] Accuracy: 0.9268


Evaluating PubMedQA:   8%|▊         | 42/500 [01:03<11:26,  1.50s/it]

[42] Accuracy: 0.9286


Evaluating PubMedQA:   9%|▊         | 43/500 [01:04<11:20,  1.49s/it]

[43] Accuracy: 0.9302


Evaluating PubMedQA:   9%|▉         | 44/500 [01:06<11:18,  1.49s/it]

[44] Accuracy: 0.9318


Evaluating PubMedQA:   9%|▉         | 45/500 [01:07<11:19,  1.49s/it]

[45] Accuracy: 0.9333


Evaluating PubMedQA:   9%|▉         | 46/500 [01:09<11:17,  1.49s/it]

[46] Accuracy: 0.9348


Evaluating PubMedQA:   9%|▉         | 47/500 [01:10<11:16,  1.49s/it]

[47] Accuracy: 0.9362


Evaluating PubMedQA:  10%|▉         | 48/500 [01:12<11:15,  1.49s/it]

[48] Accuracy: 0.9375


Evaluating PubMedQA:  10%|▉         | 49/500 [01:13<11:14,  1.50s/it]

[49] Accuracy: 0.9388


Evaluating PubMedQA:  10%|█         | 50/500 [01:15<11:13,  1.50s/it]

[50] Accuracy: 0.9400


Evaluating PubMedQA:  10%|█         | 51/500 [01:16<11:12,  1.50s/it]

[51] Accuracy: 0.9412


Evaluating PubMedQA:  10%|█         | 52/500 [01:18<11:12,  1.50s/it]

[52] Accuracy: 0.9423


Evaluating PubMedQA:  11%|█         | 53/500 [01:19<11:11,  1.50s/it]

[53] Accuracy: 0.9245


Evaluating PubMedQA:  11%|█         | 54/500 [01:21<11:10,  1.50s/it]

[54] Accuracy: 0.9259


Evaluating PubMedQA:  11%|█         | 55/500 [01:22<11:08,  1.50s/it]

[55] Accuracy: 0.9273


Evaluating PubMedQA:  11%|█         | 56/500 [01:24<11:06,  1.50s/it]

[56] Accuracy: 0.9286


Evaluating PubMedQA:  11%|█▏        | 57/500 [01:25<11:07,  1.51s/it]

[57] Accuracy: 0.9298


Evaluating PubMedQA:  12%|█▏        | 58/500 [01:27<11:07,  1.51s/it]

[58] Accuracy: 0.9310


Evaluating PubMedQA:  12%|█▏        | 59/500 [01:28<11:05,  1.51s/it]

[59] Accuracy: 0.9322


Evaluating PubMedQA:  12%|█▏        | 60/500 [01:30<11:03,  1.51s/it]

[60] Accuracy: 0.9333


Evaluating PubMedQA:  12%|█▏        | 61/500 [01:31<10:58,  1.50s/it]

[61] Accuracy: 0.9344


Evaluating PubMedQA:  12%|█▏        | 62/500 [01:33<10:58,  1.50s/it]

[62] Accuracy: 0.9355


Evaluating PubMedQA:  13%|█▎        | 63/500 [01:34<10:58,  1.51s/it]

[63] Accuracy: 0.9365


Evaluating PubMedQA:  13%|█▎        | 64/500 [01:36<10:55,  1.50s/it]

[64] Accuracy: 0.9375


Evaluating PubMedQA:  13%|█▎        | 65/500 [01:37<10:53,  1.50s/it]

[65] Accuracy: 0.9385


Evaluating PubMedQA:  13%|█▎        | 66/500 [01:39<10:51,  1.50s/it]

[66] Accuracy: 0.9394


Evaluating PubMedQA:  13%|█▎        | 67/500 [01:40<10:49,  1.50s/it]

[67] Accuracy: 0.9403


Evaluating PubMedQA:  14%|█▎        | 68/500 [01:42<10:48,  1.50s/it]

[68] Accuracy: 0.9412


Evaluating PubMedQA:  14%|█▍        | 69/500 [01:43<10:44,  1.49s/it]

[69] Accuracy: 0.9420


Evaluating PubMedQA:  14%|█▍        | 70/500 [01:45<10:46,  1.50s/it]

[70] Accuracy: 0.9429


Evaluating PubMedQA:  14%|█▍        | 71/500 [01:46<10:43,  1.50s/it]

[71] Accuracy: 0.9437


Evaluating PubMedQA:  14%|█▍        | 72/500 [01:48<10:38,  1.49s/it]

[72] Accuracy: 0.9444


Evaluating PubMedQA:  15%|█▍        | 73/500 [01:49<10:37,  1.49s/it]

[73] Accuracy: 0.9452


Evaluating PubMedQA:  15%|█▍        | 74/500 [01:51<10:37,  1.50s/it]

[74] Accuracy: 0.9459


Evaluating PubMedQA:  15%|█▌        | 75/500 [01:52<10:35,  1.50s/it]

[75] Accuracy: 0.9467


Evaluating PubMedQA:  15%|█▌        | 76/500 [01:54<10:35,  1.50s/it]

[76] Accuracy: 0.9474


Evaluating PubMedQA:  15%|█▌        | 77/500 [01:55<10:31,  1.49s/it]

[77] Accuracy: 0.9481


Evaluating PubMedQA:  16%|█▌        | 78/500 [01:57<10:30,  1.49s/it]

[78] Accuracy: 0.9487


Evaluating PubMedQA:  16%|█▌        | 79/500 [01:58<10:30,  1.50s/it]

[79] Accuracy: 0.9494


Evaluating PubMedQA:  16%|█▌        | 80/500 [02:00<10:28,  1.50s/it]

[80] Accuracy: 0.9500


Evaluating PubMedQA:  16%|█▌        | 81/500 [02:01<10:28,  1.50s/it]

[81] Accuracy: 0.9506


Evaluating PubMedQA:  16%|█▋        | 82/500 [02:03<10:25,  1.50s/it]

[82] Accuracy: 0.9512


Evaluating PubMedQA:  17%|█▋        | 83/500 [02:04<10:22,  1.49s/it]

[83] Accuracy: 0.9518


Evaluating PubMedQA:  17%|█▋        | 84/500 [02:06<10:20,  1.49s/it]

[84] Accuracy: 0.9524


Evaluating PubMedQA:  17%|█▋        | 85/500 [02:07<10:21,  1.50s/it]

[85] Accuracy: 0.9529


Evaluating PubMedQA:  17%|█▋        | 86/500 [02:09<10:20,  1.50s/it]

[86] Accuracy: 0.9535


Evaluating PubMedQA:  17%|█▋        | 87/500 [02:10<10:15,  1.49s/it]

[87] Accuracy: 0.9540


Evaluating PubMedQA:  18%|█▊        | 88/500 [02:12<10:15,  1.49s/it]

[88] Accuracy: 0.9545


Evaluating PubMedQA:  18%|█▊        | 89/500 [02:13<10:14,  1.50s/it]

[89] Accuracy: 0.9551


Evaluating PubMedQA:  18%|█▊        | 90/500 [02:15<10:14,  1.50s/it]

[90] Accuracy: 0.9556


Evaluating PubMedQA:  18%|█▊        | 91/500 [02:16<10:10,  1.49s/it]

[91] Accuracy: 0.9560


Evaluating PubMedQA:  18%|█▊        | 92/500 [02:18<10:09,  1.49s/it]

[92] Accuracy: 0.9565


Evaluating PubMedQA:  19%|█▊        | 93/500 [02:19<10:07,  1.49s/it]

[93] Accuracy: 0.9570


Evaluating PubMedQA:  19%|█▉        | 94/500 [02:21<10:06,  1.49s/it]

[94] Accuracy: 0.9574


Evaluating PubMedQA:  19%|█▉        | 95/500 [02:22<10:08,  1.50s/it]

[95] Accuracy: 0.9579


Evaluating PubMedQA:  19%|█▉        | 96/500 [02:24<10:09,  1.51s/it]

[96] Accuracy: 0.9583


Evaluating PubMedQA:  19%|█▉        | 97/500 [02:25<10:09,  1.51s/it]

[97] Accuracy: 0.9588


Evaluating PubMedQA:  20%|█▉        | 98/500 [02:27<10:08,  1.51s/it]

[98] Accuracy: 0.9592


Evaluating PubMedQA:  20%|█▉        | 99/500 [02:28<10:05,  1.51s/it]

[99] Accuracy: 0.9596


Evaluating PubMedQA:  20%|██        | 100/500 [02:30<10:04,  1.51s/it]

[100] Accuracy: 0.9600


Evaluating PubMedQA:  20%|██        | 101/500 [02:31<10:02,  1.51s/it]

[101] Accuracy: 0.9604


Evaluating PubMedQA:  20%|██        | 102/500 [02:33<10:03,  1.52s/it]

[102] Accuracy: 0.9608


Evaluating PubMedQA:  21%|██        | 103/500 [02:34<09:59,  1.51s/it]

[103] Accuracy: 0.9612


Evaluating PubMedQA:  21%|██        | 104/500 [02:36<09:56,  1.51s/it]

[104] Accuracy: 0.9615


Evaluating PubMedQA:  21%|██        | 105/500 [02:37<09:54,  1.50s/it]

[105] Accuracy: 0.9619


Evaluating PubMedQA:  21%|██        | 106/500 [02:39<09:54,  1.51s/it]

[106] Accuracy: 0.9528


Evaluating PubMedQA:  21%|██▏       | 107/500 [02:40<09:54,  1.51s/it]

[107] Accuracy: 0.9533


Evaluating PubMedQA:  22%|██▏       | 108/500 [02:42<09:49,  1.50s/it]

[108] Accuracy: 0.9537


Evaluating PubMedQA:  22%|██▏       | 109/500 [02:43<09:46,  1.50s/it]

[109] Accuracy: 0.9541


Evaluating PubMedQA:  22%|██▏       | 110/500 [02:45<09:47,  1.51s/it]

[110] Accuracy: 0.9545


Evaluating PubMedQA:  22%|██▏       | 111/500 [02:46<09:44,  1.50s/it]

[111] Accuracy: 0.9550


Evaluating PubMedQA:  22%|██▏       | 112/500 [02:48<09:47,  1.52s/it]

[112] Accuracy: 0.9554


Evaluating PubMedQA:  23%|██▎       | 113/500 [02:49<09:44,  1.51s/it]

[113] Accuracy: 0.9558


Evaluating PubMedQA:  23%|██▎       | 114/500 [02:51<09:41,  1.51s/it]

[114] Accuracy: 0.9561


Evaluating PubMedQA:  23%|██▎       | 115/500 [02:52<09:38,  1.50s/it]

[115] Accuracy: 0.9565


Evaluating PubMedQA:  23%|██▎       | 116/500 [02:54<09:36,  1.50s/it]

[116] Accuracy: 0.9569


Evaluating PubMedQA:  23%|██▎       | 117/500 [02:55<09:34,  1.50s/it]

[117] Accuracy: 0.9573


Evaluating PubMedQA:  24%|██▎       | 118/500 [02:57<09:32,  1.50s/it]

[118] Accuracy: 0.9576


Evaluating PubMedQA:  24%|██▍       | 119/500 [02:58<09:30,  1.50s/it]

[119] Accuracy: 0.9580


Evaluating PubMedQA:  24%|██▍       | 120/500 [03:00<09:30,  1.50s/it]

[120] Accuracy: 0.9583


Evaluating PubMedQA:  24%|██▍       | 121/500 [03:01<09:26,  1.50s/it]

[121] Accuracy: 0.9587


Evaluating PubMedQA:  24%|██▍       | 122/500 [03:03<09:26,  1.50s/it]

[122] Accuracy: 0.9590


Evaluating PubMedQA:  25%|██▍       | 123/500 [03:04<09:26,  1.50s/it]

[123] Accuracy: 0.9512


Evaluating PubMedQA:  25%|██▍       | 124/500 [03:06<09:25,  1.50s/it]

[124] Accuracy: 0.9516


Evaluating PubMedQA:  25%|██▌       | 125/500 [03:07<09:22,  1.50s/it]

[125] Accuracy: 0.9520


Evaluating PubMedQA:  25%|██▌       | 126/500 [03:09<09:24,  1.51s/it]

[126] Accuracy: 0.9524


Evaluating PubMedQA:  25%|██▌       | 127/500 [03:11<09:25,  1.51s/it]

[127] Accuracy: 0.9528


Evaluating PubMedQA:  26%|██▌       | 128/500 [03:12<09:25,  1.52s/it]

[128] Accuracy: 0.9531


Evaluating PubMedQA:  26%|██▌       | 129/500 [03:14<09:24,  1.52s/it]

[129] Accuracy: 0.9535


Evaluating PubMedQA:  26%|██▌       | 130/500 [03:15<09:23,  1.52s/it]

[130] Accuracy: 0.9538


Evaluating PubMedQA:  26%|██▌       | 131/500 [03:17<09:21,  1.52s/it]

[131] Accuracy: 0.9542


Evaluating PubMedQA:  26%|██▋       | 132/500 [03:18<09:16,  1.51s/it]

[132] Accuracy: 0.9545


Evaluating PubMedQA:  27%|██▋       | 133/500 [03:20<09:13,  1.51s/it]

[133] Accuracy: 0.9549


Evaluating PubMedQA:  27%|██▋       | 134/500 [03:21<09:10,  1.51s/it]

[134] Accuracy: 0.9552


Evaluating PubMedQA:  27%|██▋       | 135/500 [03:23<09:08,  1.50s/it]

[135] Accuracy: 0.9556


Evaluating PubMedQA:  27%|██▋       | 136/500 [03:24<09:05,  1.50s/it]

[136] Accuracy: 0.9559


Evaluating PubMedQA:  27%|██▋       | 137/500 [03:26<09:10,  1.52s/it]

[137] Accuracy: 0.9562


Evaluating PubMedQA:  28%|██▊       | 138/500 [03:27<09:10,  1.52s/it]

[138] Accuracy: 0.9565


Evaluating PubMedQA:  28%|██▊       | 139/500 [03:29<09:06,  1.51s/it]

[139] Accuracy: 0.9568


Evaluating PubMedQA:  28%|██▊       | 140/500 [03:30<09:03,  1.51s/it]

[140] Accuracy: 0.9571


Evaluating PubMedQA:  28%|██▊       | 141/500 [03:32<09:01,  1.51s/it]

[141] Accuracy: 0.9574


Evaluating PubMedQA:  28%|██▊       | 142/500 [03:33<08:59,  1.51s/it]

[142] Accuracy: 0.9577


Evaluating PubMedQA:  29%|██▊       | 143/500 [03:35<08:56,  1.50s/it]

[143] Accuracy: 0.9580


Evaluating PubMedQA:  29%|██▉       | 144/500 [03:36<08:53,  1.50s/it]

[144] Accuracy: 0.9583


Evaluating PubMedQA:  29%|██▉       | 145/500 [03:38<08:54,  1.50s/it]

[145] Accuracy: 0.9586


Evaluating PubMedQA:  29%|██▉       | 146/500 [03:39<09:00,  1.53s/it]

[146] Accuracy: 0.9589


Evaluating PubMedQA:  29%|██▉       | 147/500 [03:41<08:58,  1.52s/it]

[147] Accuracy: 0.9592


Evaluating PubMedQA:  30%|██▉       | 148/500 [03:42<08:56,  1.52s/it]

[148] Accuracy: 0.9595


Evaluating PubMedQA:  30%|██▉       | 149/500 [03:44<08:53,  1.52s/it]

[149] Accuracy: 0.9597


Evaluating PubMedQA:  30%|███       | 150/500 [03:45<08:51,  1.52s/it]

[150] Accuracy: 0.9600


Evaluating PubMedQA:  30%|███       | 151/500 [03:47<08:49,  1.52s/it]

[151] Accuracy: 0.9603


Evaluating PubMedQA:  30%|███       | 152/500 [03:48<08:44,  1.51s/it]

[152] Accuracy: 0.9605


Evaluating PubMedQA:  31%|███       | 153/500 [03:50<08:43,  1.51s/it]

[153] Accuracy: 0.9608


Evaluating PubMedQA:  31%|███       | 154/500 [03:51<08:41,  1.51s/it]

[154] Accuracy: 0.9610


Evaluating PubMedQA:  31%|███       | 155/500 [03:53<08:38,  1.50s/it]

[155] Accuracy: 0.9613


Evaluating PubMedQA:  31%|███       | 156/500 [03:54<08:36,  1.50s/it]

[156] Accuracy: 0.9615


Evaluating PubMedQA:  31%|███▏      | 157/500 [03:56<08:34,  1.50s/it]

[157] Accuracy: 0.9618


Evaluating PubMedQA:  32%|███▏      | 158/500 [03:57<08:32,  1.50s/it]

[158] Accuracy: 0.9620


Evaluating PubMedQA:  32%|███▏      | 159/500 [03:59<08:30,  1.50s/it]

[159] Accuracy: 0.9623


Evaluating PubMedQA:  32%|███▏      | 160/500 [04:00<08:30,  1.50s/it]

[160] Accuracy: 0.9625


Evaluating PubMedQA:  32%|███▏      | 161/500 [04:02<08:30,  1.51s/it]

[161] Accuracy: 0.9627


Evaluating PubMedQA:  32%|███▏      | 162/500 [04:03<08:28,  1.51s/it]

[162] Accuracy: 0.9630


Evaluating PubMedQA:  33%|███▎      | 163/500 [04:05<08:24,  1.50s/it]

[163] Accuracy: 0.9632


Evaluating PubMedQA:  33%|███▎      | 164/500 [04:06<08:24,  1.50s/it]

[164] Accuracy: 0.9634


Evaluating PubMedQA:  33%|███▎      | 165/500 [04:08<08:24,  1.51s/it]

[165] Accuracy: 0.9636


Evaluating PubMedQA:  33%|███▎      | 166/500 [04:09<08:29,  1.53s/it]

[166] Accuracy: 0.9639


Evaluating PubMedQA:  33%|███▎      | 167/500 [04:11<08:28,  1.53s/it]

[167] Accuracy: 0.9641


Evaluating PubMedQA:  34%|███▎      | 168/500 [04:12<08:22,  1.51s/it]

[168] Accuracy: 0.9643


Evaluating PubMedQA:  34%|███▍      | 169/500 [04:14<08:19,  1.51s/it]

[169] Accuracy: 0.9645


Evaluating PubMedQA:  34%|███▍      | 170/500 [04:15<08:17,  1.51s/it]

[170] Accuracy: 0.9647


Evaluating PubMedQA:  34%|███▍      | 171/500 [04:17<08:15,  1.51s/it]

[171] Accuracy: 0.9649


Evaluating PubMedQA:  34%|███▍      | 172/500 [04:18<08:12,  1.50s/it]

[172] Accuracy: 0.9651


Evaluating PubMedQA:  35%|███▍      | 173/500 [04:20<08:11,  1.50s/it]

[173] Accuracy: 0.9653


Evaluating PubMedQA:  35%|███▍      | 174/500 [04:21<08:08,  1.50s/it]

[174] Accuracy: 0.9655


Evaluating PubMedQA:  35%|███▌      | 175/500 [04:23<08:09,  1.51s/it]

[175] Accuracy: 0.9600


Evaluating PubMedQA:  35%|███▌      | 176/500 [04:24<08:08,  1.51s/it]

[176] Accuracy: 0.9602


Evaluating PubMedQA:  35%|███▌      | 177/500 [04:26<08:07,  1.51s/it]

[177] Accuracy: 0.9605


Evaluating PubMedQA:  36%|███▌      | 178/500 [04:28<08:06,  1.51s/it]

[178] Accuracy: 0.9607


Evaluating PubMedQA:  36%|███▌      | 179/500 [04:29<08:04,  1.51s/it]

[179] Accuracy: 0.9609


Evaluating PubMedQA:  36%|███▌      | 180/500 [04:31<08:02,  1.51s/it]

[180] Accuracy: 0.9611


Evaluating PubMedQA:  36%|███▌      | 181/500 [04:32<08:00,  1.51s/it]

[181] Accuracy: 0.9613


Evaluating PubMedQA:  36%|███▋      | 182/500 [04:34<08:00,  1.51s/it]

[182] Accuracy: 0.9615


Evaluating PubMedQA:  37%|███▋      | 183/500 [04:35<07:57,  1.51s/it]

[183] Accuracy: 0.9617


Evaluating PubMedQA:  37%|███▋      | 184/500 [04:37<07:55,  1.51s/it]

[184] Accuracy: 0.9620


Evaluating PubMedQA:  37%|███▋      | 185/500 [04:38<07:53,  1.50s/it]

[185] Accuracy: 0.9622


Evaluating PubMedQA:  37%|███▋      | 186/500 [04:40<07:51,  1.50s/it]

[186] Accuracy: 0.9570


Evaluating PubMedQA:  37%|███▋      | 187/500 [04:41<07:50,  1.50s/it]

[187] Accuracy: 0.9572


Evaluating PubMedQA:  38%|███▊      | 188/500 [04:43<07:48,  1.50s/it]

[188] Accuracy: 0.9574


Evaluating PubMedQA:  38%|███▊      | 189/500 [04:44<07:47,  1.50s/it]

[189] Accuracy: 0.9577


Evaluating PubMedQA:  38%|███▊      | 190/500 [04:46<07:46,  1.50s/it]

[190] Accuracy: 0.9526


Evaluating PubMedQA:  38%|███▊      | 191/500 [04:47<07:44,  1.50s/it]

[191] Accuracy: 0.9529


Evaluating PubMedQA:  38%|███▊      | 192/500 [04:49<07:42,  1.50s/it]

[192] Accuracy: 0.9531


Evaluating PubMedQA:  39%|███▊      | 193/500 [04:50<07:40,  1.50s/it]

[193] Accuracy: 0.9534


Evaluating PubMedQA:  39%|███▉      | 194/500 [04:52<07:38,  1.50s/it]

[194] Accuracy: 0.9536


Evaluating PubMedQA:  39%|███▉      | 195/500 [04:53<07:38,  1.50s/it]

[195] Accuracy: 0.9538


Evaluating PubMedQA:  39%|███▉      | 196/500 [04:55<07:35,  1.50s/it]

[196] Accuracy: 0.9541


Evaluating PubMedQA:  39%|███▉      | 197/500 [04:56<07:36,  1.51s/it]

[197] Accuracy: 0.9543


Evaluating PubMedQA:  40%|███▉      | 198/500 [04:58<07:32,  1.50s/it]

[198] Accuracy: 0.9545


Evaluating PubMedQA:  40%|███▉      | 199/500 [04:59<07:31,  1.50s/it]

[199] Accuracy: 0.9548


Evaluating PubMedQA:  40%|████      | 200/500 [05:01<07:29,  1.50s/it]

[200] Accuracy: 0.9550


Evaluating PubMedQA:  40%|████      | 201/500 [05:02<07:27,  1.50s/it]

[201] Accuracy: 0.9552


Evaluating PubMedQA:  40%|████      | 202/500 [05:04<07:25,  1.49s/it]

[202] Accuracy: 0.9554


Evaluating PubMedQA:  41%|████      | 203/500 [05:05<07:24,  1.50s/it]

[203] Accuracy: 0.9507


Evaluating PubMedQA:  41%|████      | 204/500 [05:07<07:23,  1.50s/it]

[204] Accuracy: 0.9510


Evaluating PubMedQA:  41%|████      | 205/500 [05:08<07:26,  1.51s/it]

[205] Accuracy: 0.9512


Evaluating PubMedQA:  41%|████      | 206/500 [05:10<07:26,  1.52s/it]

[206] Accuracy: 0.9515


Evaluating PubMedQA:  41%|████▏     | 207/500 [05:11<07:24,  1.52s/it]

[207] Accuracy: 0.9517


Evaluating PubMedQA:  42%|████▏     | 208/500 [05:13<07:20,  1.51s/it]

[208] Accuracy: 0.9519


Evaluating PubMedQA:  42%|████▏     | 209/500 [05:14<07:18,  1.51s/it]

[209] Accuracy: 0.9522


Evaluating PubMedQA:  42%|████▏     | 210/500 [05:16<07:16,  1.50s/it]

[210] Accuracy: 0.9524


Evaluating PubMedQA:  42%|████▏     | 211/500 [05:17<07:15,  1.51s/it]

[211] Accuracy: 0.9526


Evaluating PubMedQA:  42%|████▏     | 212/500 [05:19<07:14,  1.51s/it]

[212] Accuracy: 0.9528


Evaluating PubMedQA:  43%|████▎     | 213/500 [05:20<07:12,  1.51s/it]

[213] Accuracy: 0.9531


Evaluating PubMedQA:  43%|████▎     | 214/500 [05:22<07:11,  1.51s/it]

[214] Accuracy: 0.9533


Evaluating PubMedQA:  43%|████▎     | 215/500 [05:23<07:09,  1.51s/it]

[215] Accuracy: 0.9535


Evaluating PubMedQA:  43%|████▎     | 216/500 [05:25<07:11,  1.52s/it]

[216] Accuracy: 0.9537


Evaluating PubMedQA:  43%|████▎     | 217/500 [05:26<07:09,  1.52s/it]

[217] Accuracy: 0.9539


Evaluating PubMedQA:  44%|████▎     | 218/500 [05:28<07:06,  1.51s/it]

[218] Accuracy: 0.9541


Evaluating PubMedQA:  44%|████▍     | 219/500 [05:29<07:03,  1.51s/it]

[219] Accuracy: 0.9543


Evaluating PubMedQA:  44%|████▍     | 220/500 [05:31<07:01,  1.50s/it]

[220] Accuracy: 0.9545


Evaluating PubMedQA:  44%|████▍     | 221/500 [05:32<06:59,  1.50s/it]

[221] Accuracy: 0.9548


Evaluating PubMedQA:  44%|████▍     | 222/500 [05:34<06:59,  1.51s/it]

[222] Accuracy: 0.9550


Evaluating PubMedQA:  45%|████▍     | 223/500 [05:35<06:58,  1.51s/it]

[223] Accuracy: 0.9552


Evaluating PubMedQA:  45%|████▍     | 224/500 [05:37<06:57,  1.51s/it]

[224] Accuracy: 0.9509


Evaluating PubMedQA:  45%|████▌     | 225/500 [05:38<06:55,  1.51s/it]

[225] Accuracy: 0.9511


Evaluating PubMedQA:  45%|████▌     | 226/500 [05:40<06:55,  1.52s/it]

[226] Accuracy: 0.9513


Evaluating PubMedQA:  45%|████▌     | 227/500 [05:41<06:53,  1.51s/it]

[227] Accuracy: 0.9515


Evaluating PubMedQA:  46%|████▌     | 228/500 [05:43<06:51,  1.51s/it]

[228] Accuracy: 0.9518


Evaluating PubMedQA:  46%|████▌     | 229/500 [05:44<06:49,  1.51s/it]

[229] Accuracy: 0.9520


Evaluating PubMedQA:  46%|████▌     | 230/500 [05:46<06:46,  1.51s/it]

[230] Accuracy: 0.9522


Evaluating PubMedQA:  46%|████▌     | 231/500 [05:47<06:46,  1.51s/it]

[231] Accuracy: 0.9524


Evaluating PubMedQA:  46%|████▋     | 232/500 [05:49<06:45,  1.51s/it]

[232] Accuracy: 0.9526


Evaluating PubMedQA:  47%|████▋     | 233/500 [05:50<06:44,  1.52s/it]

[233] Accuracy: 0.9528


Evaluating PubMedQA:  47%|████▋     | 234/500 [05:52<06:42,  1.52s/it]

[234] Accuracy: 0.9530


Evaluating PubMedQA:  47%|████▋     | 235/500 [05:53<06:40,  1.51s/it]

[235] Accuracy: 0.9489


Evaluating PubMedQA:  47%|████▋     | 236/500 [05:55<06:37,  1.51s/it]

[236] Accuracy: 0.9492


Evaluating PubMedQA:  47%|████▋     | 237/500 [05:56<06:35,  1.50s/it]

[237] Accuracy: 0.9451


Evaluating PubMedQA:  48%|████▊     | 238/500 [05:58<06:34,  1.50s/it]

[238] Accuracy: 0.9454


Evaluating PubMedQA:  48%|████▊     | 239/500 [05:59<06:32,  1.50s/it]

[239] Accuracy: 0.9456


Evaluating PubMedQA:  48%|████▊     | 240/500 [06:01<06:30,  1.50s/it]

[240] Accuracy: 0.9458


Evaluating PubMedQA:  48%|████▊     | 241/500 [06:02<06:28,  1.50s/it]

[241] Accuracy: 0.9461


Evaluating PubMedQA:  48%|████▊     | 242/500 [06:04<06:26,  1.50s/it]

[242] Accuracy: 0.9463


Evaluating PubMedQA:  49%|████▊     | 243/500 [06:05<06:25,  1.50s/it]

[243] Accuracy: 0.9465


Evaluating PubMedQA:  49%|████▉     | 244/500 [06:07<06:25,  1.50s/it]

[244] Accuracy: 0.9467


Evaluating PubMedQA:  49%|████▉     | 245/500 [06:08<06:24,  1.51s/it]

[245] Accuracy: 0.9469


Evaluating PubMedQA:  49%|████▉     | 246/500 [06:10<06:23,  1.51s/it]

[246] Accuracy: 0.9472


Evaluating PubMedQA:  49%|████▉     | 247/500 [06:12<06:25,  1.52s/it]

[247] Accuracy: 0.9474


Evaluating PubMedQA:  50%|████▉     | 248/500 [06:13<06:25,  1.53s/it]

[248] Accuracy: 0.9476


Evaluating PubMedQA:  50%|████▉     | 249/500 [06:15<06:26,  1.54s/it]

[249] Accuracy: 0.9478


Evaluating PubMedQA:  50%|█████     | 250/500 [06:16<06:24,  1.54s/it]

[250] Accuracy: 0.9480


Evaluating PubMedQA:  50%|█████     | 251/500 [06:18<06:21,  1.53s/it]

[251] Accuracy: 0.9482


Evaluating PubMedQA:  50%|█████     | 252/500 [06:19<06:19,  1.53s/it]

[252] Accuracy: 0.9484


Evaluating PubMedQA:  51%|█████     | 253/500 [06:21<06:16,  1.53s/it]

[253] Accuracy: 0.9486


Evaluating PubMedQA:  51%|█████     | 254/500 [06:22<06:15,  1.53s/it]

[254] Accuracy: 0.9488


Evaluating PubMedQA:  51%|█████     | 255/500 [06:24<06:12,  1.52s/it]

[255] Accuracy: 0.9490


Evaluating PubMedQA:  51%|█████     | 256/500 [06:25<06:12,  1.53s/it]

[256] Accuracy: 0.9492


Evaluating PubMedQA:  51%|█████▏    | 257/500 [06:27<06:11,  1.53s/it]

[257] Accuracy: 0.9494


Evaluating PubMedQA:  52%|█████▏    | 258/500 [06:28<06:10,  1.53s/it]

[258] Accuracy: 0.9496


Evaluating PubMedQA:  52%|█████▏    | 259/500 [06:30<06:08,  1.53s/it]

[259] Accuracy: 0.9498


Evaluating PubMedQA:  52%|█████▏    | 260/500 [06:31<06:06,  1.53s/it]

[260] Accuracy: 0.9500


Evaluating PubMedQA:  52%|█████▏    | 261/500 [06:33<06:05,  1.53s/it]

[261] Accuracy: 0.9502


Evaluating PubMedQA:  52%|█████▏    | 262/500 [06:34<06:04,  1.53s/it]

[262] Accuracy: 0.9504


Evaluating PubMedQA:  53%|█████▎    | 263/500 [06:36<06:03,  1.53s/it]

[263] Accuracy: 0.9506


Evaluating PubMedQA:  53%|█████▎    | 264/500 [06:38<06:00,  1.53s/it]

[264] Accuracy: 0.9508


Evaluating PubMedQA:  53%|█████▎    | 265/500 [06:39<05:59,  1.53s/it]

[265] Accuracy: 0.9509


Evaluating PubMedQA:  53%|█████▎    | 266/500 [06:41<05:56,  1.52s/it]

[266] Accuracy: 0.9511


Evaluating PubMedQA:  53%|█████▎    | 267/500 [06:42<05:55,  1.53s/it]

[267] Accuracy: 0.9513


Evaluating PubMedQA:  54%|█████▎    | 268/500 [06:44<05:54,  1.53s/it]

[268] Accuracy: 0.9515


Evaluating PubMedQA:  54%|█████▍    | 269/500 [06:45<05:52,  1.52s/it]

[269] Accuracy: 0.9517


Evaluating PubMedQA:  54%|█████▍    | 270/500 [06:47<05:50,  1.52s/it]

[270] Accuracy: 0.9519


Evaluating PubMedQA:  54%|█████▍    | 271/500 [06:48<05:46,  1.51s/it]

[271] Accuracy: 0.9520


Evaluating PubMedQA:  54%|█████▍    | 272/500 [06:50<05:43,  1.51s/it]

[272] Accuracy: 0.9522


Evaluating PubMedQA:  55%|█████▍    | 273/500 [06:51<05:42,  1.51s/it]

[273] Accuracy: 0.9524


Evaluating PubMedQA:  55%|█████▍    | 274/500 [06:53<05:40,  1.51s/it]

[274] Accuracy: 0.9526


Evaluating PubMedQA:  55%|█████▌    | 275/500 [06:54<05:38,  1.51s/it]

[275] Accuracy: 0.9527


Evaluating PubMedQA:  55%|█████▌    | 276/500 [06:56<05:38,  1.51s/it]

[276] Accuracy: 0.9529


Evaluating PubMedQA:  55%|█████▌    | 277/500 [06:57<05:36,  1.51s/it]

[277] Accuracy: 0.9495


Evaluating PubMedQA:  56%|█████▌    | 278/500 [06:59<05:34,  1.51s/it]

[278] Accuracy: 0.9460


Evaluating PubMedQA:  56%|█████▌    | 279/500 [07:00<05:32,  1.50s/it]

[279] Accuracy: 0.9427


Evaluating PubMedQA:  56%|█████▌    | 280/500 [07:02<05:30,  1.50s/it]

[280] Accuracy: 0.9393


Evaluating PubMedQA:  56%|█████▌    | 281/500 [07:03<05:28,  1.50s/it]

[281] Accuracy: 0.9359


Evaluating PubMedQA:  56%|█████▋    | 282/500 [07:05<05:29,  1.51s/it]

[282] Accuracy: 0.9326


Evaluating PubMedQA:  57%|█████▋    | 283/500 [07:06<05:28,  1.51s/it]

[283] Accuracy: 0.9329


Evaluating PubMedQA:  57%|█████▋    | 284/500 [07:08<05:28,  1.52s/it]

[284] Accuracy: 0.9296


Evaluating PubMedQA:  57%|█████▋    | 285/500 [07:09<05:26,  1.52s/it]

[285] Accuracy: 0.9263


Evaluating PubMedQA:  57%|█████▋    | 286/500 [07:11<05:25,  1.52s/it]

[286] Accuracy: 0.9231


Evaluating PubMedQA:  57%|█████▋    | 287/500 [07:12<05:23,  1.52s/it]

[287] Accuracy: 0.9233


Evaluating PubMedQA:  58%|█████▊    | 288/500 [07:14<05:21,  1.52s/it]

[288] Accuracy: 0.9201


Evaluating PubMedQA:  58%|█████▊    | 289/500 [07:15<05:18,  1.51s/it]

[289] Accuracy: 0.9170


Evaluating PubMedQA:  58%|█████▊    | 290/500 [07:17<05:18,  1.52s/it]

[290] Accuracy: 0.9138


Evaluating PubMedQA:  58%|█████▊    | 291/500 [07:18<05:15,  1.51s/it]

[291] Accuracy: 0.9107


Evaluating PubMedQA:  58%|█████▊    | 292/500 [07:20<05:13,  1.51s/it]

[292] Accuracy: 0.9075


Evaluating PubMedQA:  59%|█████▊    | 293/500 [07:21<05:11,  1.51s/it]

[293] Accuracy: 0.9044


Evaluating PubMedQA:  59%|█████▉    | 294/500 [07:23<05:10,  1.51s/it]

[294] Accuracy: 0.9014


Evaluating PubMedQA:  59%|█████▉    | 295/500 [07:24<05:08,  1.50s/it]

[295] Accuracy: 0.8983


Evaluating PubMedQA:  59%|█████▉    | 296/500 [07:26<05:07,  1.51s/it]

[296] Accuracy: 0.8953


Evaluating PubMedQA:  59%|█████▉    | 297/500 [07:27<05:05,  1.50s/it]

[297] Accuracy: 0.8923


Evaluating PubMedQA:  60%|█████▉    | 298/500 [07:29<05:03,  1.50s/it]

[298] Accuracy: 0.8893


Evaluating PubMedQA:  60%|█████▉    | 299/500 [07:30<05:03,  1.51s/it]

[299] Accuracy: 0.8863


Evaluating PubMedQA:  60%|██████    | 300/500 [07:32<05:02,  1.51s/it]

[300] Accuracy: 0.8833


Evaluating PubMedQA:  60%|██████    | 301/500 [07:33<05:00,  1.51s/it]

[301] Accuracy: 0.8804


Evaluating PubMedQA:  60%|██████    | 302/500 [07:35<04:58,  1.51s/it]

[302] Accuracy: 0.8775


Evaluating PubMedQA:  61%|██████    | 303/500 [07:36<04:56,  1.51s/it]

[303] Accuracy: 0.8746


Evaluating PubMedQA:  61%|██████    | 304/500 [07:38<04:55,  1.51s/it]

[304] Accuracy: 0.8750


Evaluating PubMedQA:  61%|██████    | 305/500 [07:39<04:53,  1.51s/it]

[305] Accuracy: 0.8721


Evaluating PubMedQA:  61%|██████    | 306/500 [07:41<04:52,  1.51s/it]

[306] Accuracy: 0.8693


Evaluating PubMedQA:  61%|██████▏   | 307/500 [07:42<04:51,  1.51s/it]

[307] Accuracy: 0.8664


Evaluating PubMedQA:  62%|██████▏   | 308/500 [07:44<04:49,  1.51s/it]

[308] Accuracy: 0.8636


Evaluating PubMedQA:  62%|██████▏   | 309/500 [07:46<04:48,  1.51s/it]

[309] Accuracy: 0.8608


Evaluating PubMedQA:  62%|██████▏   | 310/500 [07:47<04:47,  1.51s/it]

[310] Accuracy: 0.8581


Evaluating PubMedQA:  62%|██████▏   | 311/500 [07:49<04:45,  1.51s/it]

[311] Accuracy: 0.8553


Evaluating PubMedQA:  62%|██████▏   | 312/500 [07:50<04:44,  1.51s/it]

[312] Accuracy: 0.8526


Evaluating PubMedQA:  63%|██████▎   | 313/500 [07:52<04:41,  1.51s/it]

[313] Accuracy: 0.8498


Evaluating PubMedQA:  63%|██████▎   | 314/500 [07:53<04:40,  1.51s/it]

[314] Accuracy: 0.8471


Evaluating PubMedQA:  63%|██████▎   | 315/500 [07:55<04:38,  1.51s/it]

[315] Accuracy: 0.8444


Evaluating PubMedQA:  63%|██████▎   | 316/500 [07:56<04:37,  1.51s/it]

[316] Accuracy: 0.8418


Evaluating PubMedQA:  63%|██████▎   | 317/500 [07:58<04:35,  1.50s/it]

[317] Accuracy: 0.8391


Evaluating PubMedQA:  64%|██████▎   | 318/500 [07:59<04:33,  1.51s/it]

[318] Accuracy: 0.8365


Evaluating PubMedQA:  64%|██████▍   | 319/500 [08:01<04:32,  1.51s/it]

[319] Accuracy: 0.8339


Evaluating PubMedQA:  64%|██████▍   | 320/500 [08:02<04:31,  1.51s/it]

[320] Accuracy: 0.8313


Evaluating PubMedQA:  64%|██████▍   | 321/500 [08:04<04:29,  1.50s/it]

[321] Accuracy: 0.8287


Evaluating PubMedQA:  64%|██████▍   | 322/500 [08:05<04:27,  1.50s/it]

[322] Accuracy: 0.8261


Evaluating PubMedQA:  65%|██████▍   | 323/500 [08:07<04:26,  1.51s/it]

[323] Accuracy: 0.8235


Evaluating PubMedQA:  65%|██████▍   | 324/500 [08:08<04:25,  1.51s/it]

[324] Accuracy: 0.8210


Evaluating PubMedQA:  65%|██████▌   | 325/500 [08:10<04:24,  1.51s/it]

[325] Accuracy: 0.8185


Evaluating PubMedQA:  65%|██████▌   | 326/500 [08:11<04:21,  1.50s/it]

[326] Accuracy: 0.8160


Evaluating PubMedQA:  65%|██████▌   | 327/500 [08:13<04:19,  1.50s/it]

[327] Accuracy: 0.8135


Evaluating PubMedQA:  66%|██████▌   | 328/500 [08:14<04:17,  1.50s/it]

[328] Accuracy: 0.8110


Evaluating PubMedQA:  66%|██████▌   | 329/500 [08:16<04:16,  1.50s/it]

[329] Accuracy: 0.8085


Evaluating PubMedQA:  66%|██████▌   | 330/500 [08:17<04:15,  1.50s/it]

[330] Accuracy: 0.8061


Evaluating PubMedQA:  66%|██████▌   | 331/500 [08:19<04:13,  1.50s/it]

[331] Accuracy: 0.8036


Evaluating PubMedQA:  66%|██████▋   | 332/500 [08:20<04:13,  1.51s/it]

[332] Accuracy: 0.8042


Evaluating PubMedQA:  67%|██████▋   | 333/500 [08:22<04:11,  1.51s/it]

[333] Accuracy: 0.8018


Evaluating PubMedQA:  67%|██████▋   | 334/500 [08:23<04:11,  1.51s/it]

[334] Accuracy: 0.7994


Evaluating PubMedQA:  67%|██████▋   | 335/500 [08:25<04:10,  1.52s/it]

[335] Accuracy: 0.7970


Evaluating PubMedQA:  67%|██████▋   | 336/500 [08:26<04:10,  1.52s/it]

[336] Accuracy: 0.7946


Evaluating PubMedQA:  67%|██████▋   | 337/500 [08:28<04:08,  1.52s/it]

[337] Accuracy: 0.7923


Evaluating PubMedQA:  68%|██████▊   | 338/500 [08:29<04:06,  1.52s/it]

[338] Accuracy: 0.7899


Evaluating PubMedQA:  68%|██████▊   | 339/500 [08:31<04:05,  1.53s/it]

[339] Accuracy: 0.7876


Evaluating PubMedQA:  68%|██████▊   | 340/500 [08:32<04:04,  1.53s/it]

[340] Accuracy: 0.7853


Evaluating PubMedQA:  68%|██████▊   | 341/500 [08:34<04:03,  1.53s/it]

[341] Accuracy: 0.7830


Evaluating PubMedQA:  68%|██████▊   | 342/500 [08:35<04:02,  1.54s/it]

[342] Accuracy: 0.7807


Evaluating PubMedQA:  69%|██████▊   | 343/500 [08:37<04:00,  1.53s/it]

[343] Accuracy: 0.7813


Evaluating PubMedQA:  69%|██████▉   | 344/500 [08:38<03:57,  1.52s/it]

[344] Accuracy: 0.7791


Evaluating PubMedQA:  69%|██████▉   | 345/500 [08:40<03:55,  1.52s/it]

[345] Accuracy: 0.7797


Evaluating PubMedQA:  69%|██████▉   | 346/500 [08:41<03:53,  1.52s/it]

[346] Accuracy: 0.7775


Evaluating PubMedQA:  69%|██████▉   | 347/500 [08:43<03:51,  1.52s/it]

[347] Accuracy: 0.7752


Evaluating PubMedQA:  70%|██████▉   | 348/500 [08:44<03:49,  1.51s/it]

[348] Accuracy: 0.7730


Evaluating PubMedQA:  70%|██████▉   | 349/500 [08:46<03:47,  1.51s/it]

[349] Accuracy: 0.7708


Evaluating PubMedQA:  70%|███████   | 350/500 [08:47<03:45,  1.51s/it]

[350] Accuracy: 0.7686


Evaluating PubMedQA:  70%|███████   | 351/500 [08:49<03:44,  1.50s/it]

[351] Accuracy: 0.7664


Evaluating PubMedQA:  70%|███████   | 352/500 [08:50<03:43,  1.51s/it]

[352] Accuracy: 0.7642


Evaluating PubMedQA:  71%|███████   | 353/500 [08:52<03:41,  1.51s/it]

[353] Accuracy: 0.7620


Evaluating PubMedQA:  71%|███████   | 354/500 [08:54<03:40,  1.51s/it]

[354] Accuracy: 0.7599


Evaluating PubMedQA:  71%|███████   | 355/500 [08:55<03:40,  1.52s/it]

[355] Accuracy: 0.7577


Evaluating PubMedQA:  71%|███████   | 356/500 [08:57<03:39,  1.52s/it]

[356] Accuracy: 0.7556


Evaluating PubMedQA:  71%|███████▏  | 357/500 [08:58<03:36,  1.52s/it]

[357] Accuracy: 0.7535


Evaluating PubMedQA:  72%|███████▏  | 358/500 [09:00<03:34,  1.51s/it]

[358] Accuracy: 0.7514


Evaluating PubMedQA:  72%|███████▏  | 359/500 [09:01<03:33,  1.51s/it]

[359] Accuracy: 0.7493


Evaluating PubMedQA:  72%|███████▏  | 360/500 [09:03<03:31,  1.51s/it]

[360] Accuracy: 0.7472


Evaluating PubMedQA:  72%|███████▏  | 361/500 [09:04<03:28,  1.50s/it]

[361] Accuracy: 0.7452


Evaluating PubMedQA:  72%|███████▏  | 362/500 [09:06<03:27,  1.50s/it]

[362] Accuracy: 0.7431


Evaluating PubMedQA:  73%|███████▎  | 363/500 [09:07<03:26,  1.51s/it]

[363] Accuracy: 0.7410


Evaluating PubMedQA:  73%|███████▎  | 364/500 [09:09<03:25,  1.51s/it]

[364] Accuracy: 0.7390


Evaluating PubMedQA:  73%|███████▎  | 365/500 [09:10<03:24,  1.52s/it]

[365] Accuracy: 0.7370


Evaluating PubMedQA:  73%|███████▎  | 366/500 [09:12<03:22,  1.51s/it]

[366] Accuracy: 0.7350


Evaluating PubMedQA:  73%|███████▎  | 367/500 [09:13<03:21,  1.51s/it]

[367] Accuracy: 0.7330


Evaluating PubMedQA:  74%|███████▎  | 368/500 [09:15<03:19,  1.51s/it]

[368] Accuracy: 0.7310


Evaluating PubMedQA:  74%|███████▍  | 369/500 [09:16<03:18,  1.51s/it]

[369] Accuracy: 0.7290


Evaluating PubMedQA:  74%|███████▍  | 370/500 [09:18<03:16,  1.51s/it]

[370] Accuracy: 0.7297


Evaluating PubMedQA:  74%|███████▍  | 371/500 [09:19<03:14,  1.51s/it]

[371] Accuracy: 0.7278


Evaluating PubMedQA:  74%|███████▍  | 372/500 [09:21<03:13,  1.51s/it]

[372] Accuracy: 0.7258


Evaluating PubMedQA:  75%|███████▍  | 373/500 [09:22<03:12,  1.51s/it]

[373] Accuracy: 0.7239


Evaluating PubMedQA:  75%|███████▍  | 374/500 [09:24<03:10,  1.51s/it]

[374] Accuracy: 0.7219


Evaluating PubMedQA:  75%|███████▌  | 375/500 [09:25<03:09,  1.52s/it]

[375] Accuracy: 0.7200


Evaluating PubMedQA:  75%|███████▌  | 376/500 [09:27<03:08,  1.52s/it]

[376] Accuracy: 0.7181


Evaluating PubMedQA:  75%|███████▌  | 377/500 [09:28<03:06,  1.51s/it]

[377] Accuracy: 0.7162


Evaluating PubMedQA:  76%|███████▌  | 378/500 [09:30<03:04,  1.51s/it]

[378] Accuracy: 0.7143


Evaluating PubMedQA:  76%|███████▌  | 379/500 [09:31<03:02,  1.51s/it]

[379] Accuracy: 0.7124


Evaluating PubMedQA:  76%|███████▌  | 380/500 [09:33<03:00,  1.51s/it]

[380] Accuracy: 0.7105


Evaluating PubMedQA:  76%|███████▌  | 381/500 [09:34<02:59,  1.51s/it]

[381] Accuracy: 0.7087


Evaluating PubMedQA:  76%|███████▋  | 382/500 [09:36<02:57,  1.50s/it]

[382] Accuracy: 0.7068


Evaluating PubMedQA:  77%|███████▋  | 383/500 [09:37<02:55,  1.50s/it]

[383] Accuracy: 0.7050


Evaluating PubMedQA:  77%|███████▋  | 384/500 [09:39<02:55,  1.51s/it]

[384] Accuracy: 0.7031


Evaluating PubMedQA:  77%|███████▋  | 385/500 [09:40<02:54,  1.51s/it]

[385] Accuracy: 0.7013


Evaluating PubMedQA:  77%|███████▋  | 386/500 [09:42<02:52,  1.51s/it]

[386] Accuracy: 0.6995


Evaluating PubMedQA:  77%|███████▋  | 387/500 [09:43<02:50,  1.51s/it]

[387] Accuracy: 0.6977


Evaluating PubMedQA:  78%|███████▊  | 388/500 [09:45<02:49,  1.51s/it]

[388] Accuracy: 0.6959


Evaluating PubMedQA:  78%|███████▊  | 389/500 [09:46<02:47,  1.51s/it]

[389] Accuracy: 0.6941


Evaluating PubMedQA:  78%|███████▊  | 390/500 [09:48<02:46,  1.51s/it]

[390] Accuracy: 0.6923


Evaluating PubMedQA:  78%|███████▊  | 391/500 [09:49<02:45,  1.51s/it]

[391] Accuracy: 0.6905


Evaluating PubMedQA:  78%|███████▊  | 392/500 [09:51<02:43,  1.52s/it]

[392] Accuracy: 0.6888


Evaluating PubMedQA:  79%|███████▊  | 393/500 [09:52<02:41,  1.51s/it]

[393] Accuracy: 0.6870


Evaluating PubMedQA:  79%|███████▉  | 394/500 [09:54<02:39,  1.51s/it]

[394] Accuracy: 0.6853


Evaluating PubMedQA:  79%|███████▉  | 395/500 [09:55<02:38,  1.50s/it]

[395] Accuracy: 0.6835


Evaluating PubMedQA:  79%|███████▉  | 396/500 [09:57<02:36,  1.51s/it]

[396] Accuracy: 0.6818


Evaluating PubMedQA:  79%|███████▉  | 397/500 [09:58<02:35,  1.51s/it]

[397] Accuracy: 0.6801


Evaluating PubMedQA:  80%|███████▉  | 398/500 [10:00<02:33,  1.51s/it]

[398] Accuracy: 0.6784


Evaluating PubMedQA:  80%|███████▉  | 399/500 [10:01<02:31,  1.50s/it]

[399] Accuracy: 0.6767


Evaluating PubMedQA:  80%|████████  | 400/500 [10:03<02:29,  1.50s/it]

[400] Accuracy: 0.6750


Evaluating PubMedQA:  80%|████████  | 401/500 [10:04<02:28,  1.50s/it]

[401] Accuracy: 0.6733


Evaluating PubMedQA:  80%|████████  | 402/500 [10:06<02:27,  1.51s/it]

[402] Accuracy: 0.6716


Evaluating PubMedQA:  81%|████████  | 403/500 [10:08<02:26,  1.51s/it]

[403] Accuracy: 0.6700


Evaluating PubMedQA:  81%|████████  | 404/500 [10:09<02:26,  1.52s/it]

[404] Accuracy: 0.6683


Evaluating PubMedQA:  81%|████████  | 405/500 [10:11<02:24,  1.52s/it]

[405] Accuracy: 0.6667


Evaluating PubMedQA:  81%|████████  | 406/500 [10:12<02:23,  1.53s/it]

[406] Accuracy: 0.6650


Evaluating PubMedQA:  81%|████████▏ | 407/500 [10:14<02:21,  1.53s/it]

[407] Accuracy: 0.6634


Evaluating PubMedQA:  82%|████████▏ | 408/500 [10:15<02:20,  1.52s/it]

[408] Accuracy: 0.6618


Evaluating PubMedQA:  82%|████████▏ | 409/500 [10:17<02:18,  1.52s/it]

[409] Accuracy: 0.6601


Evaluating PubMedQA:  82%|████████▏ | 410/500 [10:18<02:16,  1.51s/it]

[410] Accuracy: 0.6585


Evaluating PubMedQA:  82%|████████▏ | 411/500 [10:20<02:14,  1.51s/it]

[411] Accuracy: 0.6569


Evaluating PubMedQA:  82%|████████▏ | 412/500 [10:21<02:13,  1.51s/it]

[412] Accuracy: 0.6553


Evaluating PubMedQA:  83%|████████▎ | 413/500 [10:23<02:11,  1.51s/it]

[413] Accuracy: 0.6538


Evaluating PubMedQA:  83%|████████▎ | 414/500 [10:24<02:10,  1.52s/it]

[414] Accuracy: 0.6522


Evaluating PubMedQA:  83%|████████▎ | 415/500 [10:26<02:09,  1.53s/it]

[415] Accuracy: 0.6506


Evaluating PubMedQA:  83%|████████▎ | 416/500 [10:27<02:07,  1.52s/it]

[416] Accuracy: 0.6490


Evaluating PubMedQA:  83%|████████▎ | 417/500 [10:29<02:06,  1.52s/it]

[417] Accuracy: 0.6475


Evaluating PubMedQA:  84%|████████▎ | 418/500 [10:30<02:04,  1.52s/it]

[418] Accuracy: 0.6459


Evaluating PubMedQA:  84%|████████▍ | 419/500 [10:32<02:02,  1.51s/it]

[419] Accuracy: 0.6444


Evaluating PubMedQA:  84%|████████▍ | 420/500 [10:33<02:01,  1.52s/it]

[420] Accuracy: 0.6429


Evaluating PubMedQA:  84%|████████▍ | 421/500 [10:35<01:59,  1.52s/it]

[421] Accuracy: 0.6413


Evaluating PubMedQA:  84%|████████▍ | 422/500 [10:36<01:58,  1.52s/it]

[422] Accuracy: 0.6398


Evaluating PubMedQA:  85%|████████▍ | 423/500 [10:38<01:56,  1.51s/it]

[423] Accuracy: 0.6383


Evaluating PubMedQA:  85%|████████▍ | 424/500 [10:39<01:55,  1.51s/it]

[424] Accuracy: 0.6368


Evaluating PubMedQA:  85%|████████▌ | 425/500 [10:41<01:53,  1.52s/it]

[425] Accuracy: 0.6353


Evaluating PubMedQA:  85%|████████▌ | 426/500 [10:42<01:52,  1.52s/it]

[426] Accuracy: 0.6338


Evaluating PubMedQA:  85%|████████▌ | 427/500 [10:44<01:50,  1.51s/it]

[427] Accuracy: 0.6347


Evaluating PubMedQA:  86%|████████▌ | 428/500 [10:45<01:48,  1.51s/it]

[428] Accuracy: 0.6332


Evaluating PubMedQA:  86%|████████▌ | 429/500 [10:47<01:47,  1.52s/it]

[429] Accuracy: 0.6317


Evaluating PubMedQA:  86%|████████▌ | 430/500 [10:48<01:45,  1.51s/it]

[430] Accuracy: 0.6302


Evaluating PubMedQA:  86%|████████▌ | 431/500 [10:50<01:44,  1.51s/it]

[431] Accuracy: 0.6288


Evaluating PubMedQA:  86%|████████▋ | 432/500 [10:52<01:43,  1.52s/it]

[432] Accuracy: 0.6273


Evaluating PubMedQA:  87%|████████▋ | 433/500 [10:53<01:41,  1.51s/it]

[433] Accuracy: 0.6259


Evaluating PubMedQA:  87%|████████▋ | 434/500 [10:55<01:39,  1.51s/it]

[434] Accuracy: 0.6244


Evaluating PubMedQA:  87%|████████▋ | 435/500 [10:56<01:38,  1.52s/it]

[435] Accuracy: 0.6230


Evaluating PubMedQA:  87%|████████▋ | 436/500 [10:58<01:36,  1.51s/it]

[436] Accuracy: 0.6216


Evaluating PubMedQA:  87%|████████▋ | 437/500 [10:59<01:35,  1.51s/it]

[437] Accuracy: 0.6201


Evaluating PubMedQA:  88%|████████▊ | 438/500 [11:01<01:33,  1.51s/it]

[438] Accuracy: 0.6187


Evaluating PubMedQA:  88%|████████▊ | 439/500 [11:02<01:32,  1.51s/it]

[439] Accuracy: 0.6173


Evaluating PubMedQA:  88%|████████▊ | 440/500 [11:04<01:30,  1.51s/it]

[440] Accuracy: 0.6159


Evaluating PubMedQA:  88%|████████▊ | 441/500 [11:05<01:29,  1.51s/it]

[441] Accuracy: 0.6145


Evaluating PubMedQA:  88%|████████▊ | 442/500 [11:07<01:27,  1.52s/it]

[442] Accuracy: 0.6131


Evaluating PubMedQA:  89%|████████▊ | 443/500 [11:08<01:26,  1.52s/it]

[443] Accuracy: 0.6117


Evaluating PubMedQA:  89%|████████▉ | 444/500 [11:10<01:25,  1.52s/it]

[444] Accuracy: 0.6104


Evaluating PubMedQA:  89%|████████▉ | 445/500 [11:11<01:23,  1.52s/it]

[445] Accuracy: 0.6090


Evaluating PubMedQA:  89%|████████▉ | 446/500 [11:13<01:22,  1.52s/it]

[446] Accuracy: 0.6076


Evaluating PubMedQA:  89%|████████▉ | 447/500 [11:14<01:20,  1.52s/it]

[447] Accuracy: 0.6063


Evaluating PubMedQA:  90%|████████▉ | 448/500 [11:16<01:19,  1.52s/it]

[448] Accuracy: 0.6049


Evaluating PubMedQA:  90%|████████▉ | 449/500 [11:17<01:17,  1.52s/it]

[449] Accuracy: 0.6036


Evaluating PubMedQA:  90%|█████████ | 450/500 [11:19<01:15,  1.52s/it]

[450] Accuracy: 0.6022


Evaluating PubMedQA:  90%|█████████ | 451/500 [11:20<01:14,  1.52s/it]

[451] Accuracy: 0.6009


Evaluating PubMedQA:  90%|█████████ | 452/500 [11:22<01:12,  1.52s/it]

[452] Accuracy: 0.5996


Evaluating PubMedQA:  91%|█████████ | 453/500 [11:23<01:11,  1.52s/it]

[453] Accuracy: 0.5982


Evaluating PubMedQA:  91%|█████████ | 454/500 [11:25<01:10,  1.52s/it]

[454] Accuracy: 0.5969


Evaluating PubMedQA:  91%|█████████ | 455/500 [11:26<01:08,  1.52s/it]

[455] Accuracy: 0.5956


Evaluating PubMedQA:  91%|█████████ | 456/500 [11:28<01:06,  1.52s/it]

[456] Accuracy: 0.5943


Evaluating PubMedQA:  91%|█████████▏| 457/500 [11:29<01:05,  1.51s/it]

[457] Accuracy: 0.5930


Evaluating PubMedQA:  92%|█████████▏| 458/500 [11:31<01:03,  1.51s/it]

[458] Accuracy: 0.5917


Evaluating PubMedQA:  92%|█████████▏| 459/500 [11:32<01:01,  1.51s/it]

[459] Accuracy: 0.5904


Evaluating PubMedQA:  92%|█████████▏| 460/500 [11:34<01:00,  1.51s/it]

[460] Accuracy: 0.5891


Evaluating PubMedQA:  92%|█████████▏| 461/500 [11:35<00:58,  1.51s/it]

[461] Accuracy: 0.5879


Evaluating PubMedQA:  92%|█████████▏| 462/500 [11:37<00:57,  1.51s/it]

[462] Accuracy: 0.5866


Evaluating PubMedQA:  93%|█████████▎| 463/500 [11:38<00:55,  1.51s/it]

[463] Accuracy: 0.5853


Evaluating PubMedQA:  93%|█████████▎| 464/500 [11:40<00:54,  1.51s/it]

[464] Accuracy: 0.5841


Evaluating PubMedQA:  93%|█████████▎| 465/500 [11:42<00:53,  1.52s/it]

[465] Accuracy: 0.5828


Evaluating PubMedQA:  93%|█████████▎| 466/500 [11:43<00:51,  1.51s/it]

[466] Accuracy: 0.5815


Evaluating PubMedQA:  93%|█████████▎| 467/500 [11:45<00:49,  1.51s/it]

[467] Accuracy: 0.5803


Evaluating PubMedQA:  94%|█████████▎| 468/500 [11:46<00:48,  1.51s/it]

[468] Accuracy: 0.5791


Evaluating PubMedQA:  94%|█████████▍| 469/500 [11:48<00:46,  1.50s/it]

[469] Accuracy: 0.5778


Evaluating PubMedQA:  94%|█████████▍| 470/500 [11:49<00:45,  1.50s/it]

[470] Accuracy: 0.5766


Evaluating PubMedQA:  94%|█████████▍| 471/500 [11:51<00:43,  1.50s/it]

[471] Accuracy: 0.5754


Evaluating PubMedQA:  94%|█████████▍| 472/500 [11:52<00:42,  1.51s/it]

[472] Accuracy: 0.5742


Evaluating PubMedQA:  95%|█████████▍| 473/500 [11:54<00:40,  1.51s/it]

[473] Accuracy: 0.5729


Evaluating PubMedQA:  95%|█████████▍| 474/500 [11:55<00:39,  1.51s/it]

[474] Accuracy: 0.5717


Evaluating PubMedQA:  95%|█████████▌| 475/500 [11:57<00:37,  1.51s/it]

[475] Accuracy: 0.5705


Evaluating PubMedQA:  95%|█████████▌| 476/500 [11:58<00:36,  1.51s/it]

[476] Accuracy: 0.5693


Evaluating PubMedQA:  95%|█████████▌| 477/500 [12:00<00:34,  1.51s/it]

[477] Accuracy: 0.5681


Evaluating PubMedQA:  96%|█████████▌| 478/500 [12:01<00:33,  1.51s/it]

[478] Accuracy: 0.5669


Evaluating PubMedQA:  96%|█████████▌| 479/500 [12:03<00:31,  1.52s/it]

[479] Accuracy: 0.5658


Evaluating PubMedQA:  96%|█████████▌| 480/500 [12:04<00:30,  1.51s/it]

[480] Accuracy: 0.5646


Evaluating PubMedQA:  96%|█████████▌| 481/500 [12:06<00:28,  1.52s/it]

[481] Accuracy: 0.5634


Evaluating PubMedQA:  96%|█████████▋| 482/500 [12:07<00:27,  1.52s/it]

[482] Accuracy: 0.5622


Evaluating PubMedQA:  97%|█████████▋| 483/500 [12:09<00:25,  1.51s/it]

[483] Accuracy: 0.5611


Evaluating PubMedQA:  97%|█████████▋| 484/500 [12:10<00:24,  1.51s/it]

[484] Accuracy: 0.5599


Evaluating PubMedQA:  97%|█████████▋| 485/500 [12:12<00:22,  1.51s/it]

[485] Accuracy: 0.5588


Evaluating PubMedQA:  97%|█████████▋| 486/500 [12:13<00:21,  1.50s/it]

[486] Accuracy: 0.5576


Evaluating PubMedQA:  97%|█████████▋| 487/500 [12:15<00:19,  1.51s/it]

[487] Accuracy: 0.5565


Evaluating PubMedQA:  98%|█████████▊| 488/500 [12:16<00:18,  1.51s/it]

[488] Accuracy: 0.5553


Evaluating PubMedQA:  98%|█████████▊| 489/500 [12:18<00:16,  1.50s/it]

[489] Accuracy: 0.5542


Evaluating PubMedQA:  98%|█████████▊| 490/500 [12:19<00:14,  1.50s/it]

[490] Accuracy: 0.5531


Evaluating PubMedQA:  98%|█████████▊| 491/500 [12:21<00:13,  1.51s/it]

[491] Accuracy: 0.5519


Evaluating PubMedQA:  98%|█████████▊| 492/500 [12:22<00:12,  1.51s/it]

[492] Accuracy: 0.5508


Evaluating PubMedQA:  99%|█████████▊| 493/500 [12:24<00:10,  1.51s/it]

[493] Accuracy: 0.5497


Evaluating PubMedQA:  99%|█████████▉| 494/500 [12:25<00:09,  1.52s/it]

[494] Accuracy: 0.5486


Evaluating PubMedQA:  99%|█████████▉| 495/500 [12:27<00:07,  1.51s/it]

[495] Accuracy: 0.5475


Evaluating PubMedQA:  99%|█████████▉| 496/500 [12:28<00:06,  1.51s/it]

[496] Accuracy: 0.5464


Evaluating PubMedQA:  99%|█████████▉| 497/500 [12:30<00:04,  1.50s/it]

[497] Accuracy: 0.5453


Evaluating PubMedQA: 100%|█████████▉| 498/500 [12:31<00:03,  1.50s/it]

[498] Accuracy: 0.5442


Evaluating PubMedQA: 100%|█████████▉| 499/500 [12:33<00:01,  1.52s/it]

[499] Accuracy: 0.5431


Evaluating PubMedQA: 100%|██████████| 500/500 [12:34<00:00,  1.51s/it]

[500] Accuracy: 0.5420





In [6]:
# Evaluation.py - Original PubMed Scorer

import json
from sklearn.metrics import accuracy_score, f1_score

#load results
with open(output_file, 'r') as f:
    predictions = json.load(f)


pmids = list(gt_data)
truth = [gt_data[pmid] for pmid in pmids]
preds = [predictions[pmid] for pmid in pmids]

acc = accuracy_score(truth, preds)
maf = f1_score(truth, preds, average='macro')

print('Accuracy %f' % acc)
print('Macro-F1 %f' % maf)


Accuracy 0.542000
Macro-F1 0.260790
