# Experiments 

Challenge 3: Memorization of outdated medical knowledge in LLMs
Group: C

Paper: Facts Fade Fast: Evaluating Memorization of Outdated Medical Knowledge in Large Language Models
Reproducibility task:

Read the paper: https://arxiv.org/pdf/2509.04304

Reproduce parts of their experiments investigating when LLMs predict outdated medical knowledge using this repository: https://github.com/jvladika/MedChange

Reproducibility target: Your goal is to reproduce parts of the result in Table 2. Specifically, you **evaluate the Qwen 2.5 7B and the BioMistral 7B model** on the **Changed Knowledge Dataset**. Your task is to reproduce the precision, recall F1colums for (b) outdated Lab. and (c) Latest Labels, as well as the F1 diff colum for those two LLMs.

Notes from TA:
✅ Code and datasets available in the repo

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import re

## Set up dataframes

In [2]:
# Load the scraped studies' data and extract all DOIs from the date string
# Studies with multiple iterations have the same DOI, but end with "v2", "v3", etc.

df = pd.read_csv("./FullScraped.csv", index_col=0)
dois = [d.split("doi: ")[1].strip() if "doi: " in d else "" for d in df.date.tolist()]


In [3]:
#Load the generated questions and labels from the text fiel
with open("generated_questions_labels.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

questions = list()
labels = list()

for idx,line in enumerate(lines):
    #if idx in missing_indices:
    #    continue
    line = line.split("\n")[0]
    line = line.split(" ||| ")
    questions.append(line[0].split("QUESTION: ")[1].strip())
    labels.append(line[1].split("LABEL: ")[1].strip())
    


In [4]:
# Group the questions by DOI, so that multiple iterations of the same study are together in a group
doigroups = list()
indices = {}
for idx in range(len(dois)):
    d = dois[idx]
    if d not in indices:
        indices[d] = idx 

doisorted = sorted(dois)

# Iterate through the sorted DOIs and group them by common prefix. "Pivot" is the earliest study in the group.
idx = 0
while idx < len(dois):
    pivot = doisorted[idx]
    if pivot == "":
        idx += 1
        continue
    doigroup = list()
    while idx < len(dois) and doisorted[idx].startswith(pivot):
        doigroup.append(indices[doisorted[idx]])
        idx += 1
    doigroups.append(doigroup)

doigroups = [list(set(dg)) for dg in doigroups]
doigroups = sorted(doigroups)

def remove_duplicates(lst):
    seen = set()
    result = []
    for item in lst:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

doigroups = [remove_duplicates(dg) for dg in doigroups]
doigroups = sorted(doigroups)

In [5]:
# Create a mapping of study iterations (by index) to their labels, for studies with multiple iterations
long_doigroups = list()
long_indices = list()
for idx,dg in enumerate(doigroups):
    if len(dg) > 1:
        long_doigroups.append(dg)
        long_indices.append(idx)

verdict_map = list()
for ld in long_doigroups:
    instance_verdicts = {}
    sorted_ld = sorted(ld)
    for idx in sorted_ld:
        instance_verdicts[idx] = labels[idx]
    verdict_map.append(instance_verdicts)

print(len(verdict_map))
#verdict_map

1535


In [19]:
## Generated answers of all models.

filenames = [#'llama33-70b_answers.txt',
 #'mistral-24b_answers.txt',
 #'gpt4o-mini_answers.txt',
 'qwen25-7b_answers.txt',
 #'deepsek-v3_answers.txt',
 #'olmo_gguf_answers_13b.txt',
 #'pmcllama_answers.txt',
 #'biomistral_answers.txt', 
 #'biomistral_answers_repro.txt', 
 'qwen_answers_repro.txt'
 ]

filenames = ["GeneratedAnswers/"+f for f in filenames]

In [20]:
for file in filenames:
    print(file)

GeneratedAnswers/qwen25-7b_answers.txt
GeneratedAnswers/qwen_answers_repro.txt


## Get MedChangeQA results (change knowledge subset)

In [8]:
## Get all the labels of MedChangeQA

final_labels = list()
final_keys = list()
outdated_keys = list()
outdated_labels = list()  
newest_labels = list()   
newest_keys = list()

for verdict_dict in verdict_map:
    final_label = verdict_dict[list(verdict_dict.keys())[0]]
    final_labels.append(final_label)
    final_keys.append(list(verdict_dict.keys())[0])

    verdict_changed = False
    for key in verdict_dict.keys():
        if verdict_dict[key] != final_label:

            outdated_keys.append(key)
            outdated_labels.append(verdict_dict[key])

            newest_labels.append(verdict_dict[list(verdict_dict.keys())[0]])
            newest_keys.append(list(verdict_dict.keys())[0])

            verdict_changed = True
            break

# MedChangeQA is a subset where 512 questions changed their verdict over time
len(final_labels), len(newest_labels) 

(1535, 512)

In [16]:

## Get final evaluation scores of the predicted LLM answers

def get_predictions_change_knowledge(filename, label_type):
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
        
    #if "biomistral" in filename or "pmcllama" in filename:
        #print("biomistral or pcmllama")
        #lines = lines[1::3]
    
    # Get all the predicted labels on the entire dataset MedRevQA
    llm_predicted_labels = list()
    tokens_supported = ("SUPPORTED", "Supported:", "EFFECTIVE",
                        ": Supported","(supported)", "(Supported)", "Effective:",
                         ": YES", "YES:", "POSITIVE", "(YES)", "SUPPORTed")
    
    tokens_nei =("NOT ENOUGH INFORMATION:", "NOT ENOUGH INFORMATION", "NOT_ENOUGH_INFORMATION"
                "Not Enough Information:", "Not enough information." )


    for line in lines:
        #line = re.sub(r'[^\w\s]','',line_notnorm).strip()
        
        if "REFUTED" in line or "REF UTED":
            llm_predicted_labels.append("REFUTED")
        elif any(tok in line for tok in tokens_nei):
            llm_predicted_labels.append("NOT ENOUGH INFORMATION")
        elif any(tok in line for tok in tokens_supported):
            llm_predicted_labels.append("SUPPORTED")
        else:
            llm_predicted_labels.append("NO LABEL FOUND")
            #print(line)
            #print(line_notnorm)
            #print('.....')
        
    print(f'len llm predicted labels: {len(llm_predicted_labels)}')
    print(f'len "NO LABEL FOUND": {llm_predicted_labels.count('NO LABEL FOUND')}')


    # Get only the changed-knowledge subset (MedChangeQA)
    #llm_predicted_medchange = np.array(llm_predicted_labels)[np.array(outdated_keys)]
    #llm_predicted_medchange = np.array(llm_predicted_labels, dtype=object)[np.array(outdated_keys, dtype=int)]

    mapper = {"SUPPORTED": 0, "REFUTED": 2, "NOT ENOUGH INFORMATION": 1}
        
    pred_all = np.array(llm_predicted_labels, dtype=object)[np.array(outdated_keys, dtype=int)]

    gold_all = np.array([mapper[g] for g in (outdated_labels if label_type=="OUTDATED" else newest_labels)], dtype=int)

    print(f'len(pred_all): {len(pred_all)}')
    print(f'len(gold_all): {len(gold_all)}')


    common_len = min(len(pred_all), len(gold_all))
    if len(pred_all) != len(gold_all):
        print(f"[LENGTH MISMATCH] pred={len(pred_all)} vs gold={len(gold_all)} → {common_len}")
    pred_slice = pred_all[:common_len]
    gold_slice = gold_all[:common_len]

    #  Mask  (discart 'NO LABEL FOUND')
    mask = (pred_slice != "NO LABEL FOUND")
    dropped = int((~mask).sum())
    if dropped:
        masked_pos = np.flatnonzero(~mask)

        subset_keys = np.array(outdated_keys, dtype=int)[:common_len]
        masked_items = subset_keys[masked_pos]

        is_triplet_used = ("biomistral" in filename) or ("pmcllama" in filename)

        print(f"[NO LABEL FOUND] {dropped} points masked. Showing up to 50:")
        for i, pos in enumerate(masked_pos[:50], start=1):
            item_idx = int(masked_items[i-1])  
            if 0 <= item_idx < len(lines):
                preview = lines[item_idx].rstrip("\n")
            else:
                preview = "<out of range in current 'lines' view>"

            if is_triplet_used:
                file_line = item_idx * 3 + 2  
            else:
                file_line = item_idx + 1

            if len(preview) > 220:
                preview = preview[:220] + "…"
            print(f"  - subset_pos={int(pos)} | item_idx={item_idx} | file_line={file_line} | {preview}")

    
    y_test = gold_slice[mask]    
    y_pred = np.array([mapper[p] for p in pred_slice[mask]], dtype=int)

    # Calculate precision, recall, and F1 score with macro averaging
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    ## get precision, recall, f1, accuracy rounded to 4 decimal places

    print(f"Precision: {precision_macro:.4f} | Recall: {recall_macro:.4f} | F1: {f1_macro:.4f}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    #MIRIAM ADDED THIS RETURN 
    return f1_macro


FOR PMC
Initially without normalization we have NO label found -> 391
When we remove punctuation and white spaces -> 391 -> not worthy
Include "Supported" -> 303
Include "EFFECTIVE" and "POSITIVE" -> 181
Include all in the tockens array -> 129



In [21]:
for file in filenames: 
    print('-----')
    print(file)
    print('-----')
    print(" Latest")
    f1_latest = get_predictions_change_knowledge(file, label_type='NEWEST')
    print(" Outdated")
    f1_outdated = get_predictions_change_knowledge(file, label_type='OUTDATED')
    print('')
    print(f"F1 diff: {(f1_latest - f1_outdated)*100}")
    print('')



-----
GeneratedAnswers/qwen25-7b_answers.txt
-----
 Latest
len llm predicted labels: 16501
len "NO LABEL FOUND": 0
len(pred_all): 512
len(gold_all): 512
Precision: 0.0853 | Recall: 0.3333 | F1: 0.1358
Accuracy: 0.2559
 Outdated
len llm predicted labels: 16501
len "NO LABEL FOUND": 0
len(pred_all): 512
len(gold_all): 512
Precision: 0.0801 | Recall: 0.3333 | F1: 0.1291
Accuracy: 0.2402

F1 diff: 0.6687810990966714

-----
GeneratedAnswers/qwen_answers_repro.txt
-----
 Latest
len llm predicted labels: 16501
len "NO LABEL FOUND": 0
len(pred_all): 512
len(gold_all): 512
Precision: 0.0853 | Recall: 0.3333 | F1: 0.1358
Accuracy: 0.2559
 Outdated
len llm predicted labels: 16501
len "NO LABEL FOUND": 0
len(pred_all): 512
len(gold_all): 512
Precision: 0.0801 | Recall: 0.3333 | F1: 0.1291
Accuracy: 0.2402

F1 diff: 0.6687810990966714



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
