In [2]:
import pandas as pd
import numpy as np

#### Setup and prepare

In [3]:
# Load the scraped studies' data and extract all DOIs from the date string
# Studies with multiple iterations have the same DOI, but end with "v2", "v3", etc.

df = pd.read_csv("./FullScraped.csv", index_col=0)
dois = [d.split("doi: ")[1].strip() if "doi: " in d else "" for d in df.date.tolist()]


In [4]:

# Load the generated questions and labels from the text fiel
with open("all_questions_labels.txt", "r") as f:
    lines = f.readlines()

questions = list()
labels = list()

for idx,line in enumerate(lines):
    #if idx in missing_indices:
    #    continue
    line = line.split("\n")[0]
    line = line.split(" ||| ")
    questions.append(line[0].split("QUESTION: ")[1].strip())
    labels.append(line[1].split("LABEL: ")[1].strip())
    


In [5]:
# Group the questions by DOI, so that multiple iterations of the same study are together in a group
doigroups = list()
indices = {}
for idx in range(len(dois)):
    d = dois[idx]
    if d not in indices:
        indices[d] = idx 

doisorted = sorted(dois)

# Iterate through the sorted DOIs and group them by common prefix. "Pivot" is the earliest study in the group.
idx = 0
while idx < len(dois):
    pivot = doisorted[idx]
    if pivot == "":
        idx += 1
        continue
    doigroup = list()
    while idx < len(dois) and doisorted[idx].startswith(pivot):
        doigroup.append(indices[doisorted[idx]])
        idx += 1
    doigroups.append(doigroup)

doigroups = [list(set(dg)) for dg in doigroups]
doigroups = sorted(doigroups)

def remove_duplicates(lst):
    seen = set()
    result = []
    for item in lst:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

doigroups = [remove_duplicates(dg) for dg in doigroups]
doigroups = sorted(doigroups)

In [6]:
# Create a mapping of study iterations (by index) to their labels, for studies with multiple iterations
long_doigroups = list()
long_indices = list()
for idx,dg in enumerate(doigroups):
    if len(dg) > 1:
        long_doigroups.append(dg)
        long_indices.append(idx)

verdict_map = list()
for ld in long_doigroups:
    instance_verdicts = {}
    sorted_ld = sorted(ld)
    for idx in sorted_ld:
        instance_verdicts[idx] = labels[idx]
    verdict_map.append(instance_verdicts)

verdict_map

[{24: 'REFUTED', 15807: 'REFUTED'},
 {65: 'SUPPORTED',
  304: 'SUPPORTED',
  8136: 'REFUTED',
  11578: 'SUPPORTED',
  13866: 'SUPPORTED',
  13994: 'SUPPORTED',
  15205: 'REFUTED'},
 {175: 'SUPPORTED',
  10551: 'NOT ENOUGH INFORMATION',
  10837: 'NOT ENOUGH INFORMATION',
  15050: 'NOT ENOUGH INFORMATION'},
 {180: 'SUPPORTED',
  3893: 'SUPPORTED',
  6174: 'NOT ENOUGH INFORMATION',
  15140: 'NOT ENOUGH INFORMATION'},
 {328: 'SUPPORTED',
  2046: 'SUPPORTED',
  8701: 'SUPPORTED',
  15091: 'NOT ENOUGH INFORMATION'},
 {357: 'NOT ENOUGH INFORMATION',
  8253: 'NOT ENOUGH INFORMATION',
  16383: 'NOT ENOUGH INFORMATION'},
 {360: 'NOT ENOUGH INFORMATION',
  2969: 'NOT ENOUGH INFORMATION',
  4174: 'NOT ENOUGH INFORMATION'},
 {370: 'NOT ENOUGH INFORMATION', 8339: 'NOT ENOUGH INFORMATION'},
 {401: 'REFUTED', 11004: 'REFUTED', 14926: 'NOT ENOUGH INFORMATION'},
 {408: 'NOT ENOUGH INFORMATION', 1020: 'NOT ENOUGH INFORMATION'},
 {427: 'SUPPORTED',
  6590: 'SUPPORTED',
  8625: 'SUPPORTED',
  10556: 'SUPPO

In [None]:
## Generated answers of all models.

filenames = ['llama33-70b_answers.txt',
 'mistral-24b_answers.txt',
 'gpt4o-mini_answers.txt',
 'qwen25-7b_answers.txt',
 'deepsek-v3_answers.txt',
 'olmo_gguf_answers_13b.txt',
 ]

filenames = ["GeneratedAnswers/"+f for f in filenames]

### Get MedRevQA (full dataset) scores

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Get final evaluation scores of the predicted LLM answers

def get_predictions(filename, label_type):
    with open(filename, "r") as f:
        lines = f.readlines()
        
    if "olmo" in filename:
        lines = lines[1::3]
    
    # Get all the predicted labels on the entire dataset MedRevQA
    llm_predicted_labels = list()
    for line in lines:
        if "SUPPORTED" in line:
            llm_predicted_labels.append("SUPPORTED")
        elif "REFUTED" in line:
            llm_predicted_labels.append("REFUTED")
        elif "NOT ENOUGH INFORMATION" in line:
            llm_predicted_labels.append("NOT ENOUGH INFORMATION")
        else:
            llm_predicted_labels.append("NOT ENOUGH INFORMATION")
        

    # Get the full dataset predictions
    llm_predicted_medrevqa = np.array(llm_predicted_labels)

    mapper = {"SUPPORTED": 0, "REFUTED": 2, "NOT ENOUGH INFORMATION": 1}
    predicted_labels = list()
    gold_labels = list()

    for label in llm_predicted_medrevqa:
        predicted_labels.append(mapper[label])

    # Take all the labels (16501)
    for label in labels:
        gold_labels.append(mapper[label])
    

    # Make predictions
    y_pred = np.array(predicted_labels)
    y_test = np.array(gold_labels)

    # Calculate precision, recall, and F1 score with macro averaging
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    ## get precision, recall, f1, accuracy rounded to 4 decimal places

    print(f"Precision: {precision_macro:.4f} | Recall: {recall_macro:.4f} | F1: {f1_macro:.4f}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


for filename in filenames:
    print(f"Results for {filename} (full dataset):")
    get_predictions(filename, label_type="NEWEST")
    print()

Results for llama33-70b_answers.txt (full dataset):
Precision: 0.5275 | Recall: 0.4591 | F1: 0.3934
Accuracy: 0.4661

Results for mistral-24b_answers.txt (full dataset):
Precision: 0.5060 | Recall: 0.4628 | F1: 0.4568
Accuracy: 0.5192

Results for gpt4o-mini_answers.txt (full dataset):
Precision: 0.5263 | Recall: 0.4513 | F1: 0.4293
Accuracy: 0.5211

Results for qwen25-7b_answers.txt (full dataset):
Precision: 0.4640 | Recall: 0.4235 | F1: 0.3875
Accuracy: 0.4569

Results for deepsek-v3_answers.txt (full dataset):
Precision: 0.5617 | Recall: 0.4622 | F1: 0.4380
Accuracy: 0.5409

Results for olmo_gguf_answers_13b.txt (full dataset):
Precision: 0.4347 | Recall: 0.4247 | F1: 0.3790
Accuracy: 0.4316



### Get MedChangeQA results (changed knowledge subset)

In [11]:
## Get all the labels of MedChangeQA

final_labels = list()
final_keys = list()
outdated_keys = list()
outdated_labels = list()  
newest_labels = list()   
newest_keys = list()

for verdict_dict in verdict_map:
    final_label = verdict_dict[list(verdict_dict.keys())[0]]
    final_labels.append(final_label)
    final_keys.append(list(verdict_dict.keys())[0])

    verdict_changed = False
    for key in verdict_dict.keys():
        if verdict_dict[key] != final_label:

            outdated_keys.append(key)
            outdated_labels.append(verdict_dict[key])

            newest_labels.append(verdict_dict[list(verdict_dict.keys())[0]])
            newest_keys.append(list(verdict_dict.keys())[0])

            verdict_changed = True
            break

# MedChangeQA is a subset where 512 questions changed their verdict over time
len(final_labels), len(newest_labels) 

(1535, 512)

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Get final evaluation scores of the predicted LLM answers

def get_predictions(filename, label_type):
    with open(filename, "r") as f:
        lines = f.readlines()
        
    if "biomistral" in filename or "pmcllama" in filename:
        lines = lines[1::3]
    
    # Get all the predicted labels on the entire dataset MedRevQA
    llm_predicted_labels = list()
    for line in lines:
        if "SUPPORTED" in line:
            llm_predicted_labels.append("SUPPORTED")
        elif "REFUTED" in line:
            llm_predicted_labels.append("REFUTED")
        elif "NOT ENOUGH INFORMATION" in line:
            llm_predicted_labels.append("NOT ENOUGH INFORMATION")
        

    # Get only the changed-knowledge subset (MedChangeQA)
    llm_predicted_medchange = np.array(llm_predicted_labels)[np.array(outdated_keys)]

    mapper = {"SUPPORTED": 0, "REFUTED": 2, "NOT ENOUGH INFORMATION": 1}
    predicted_labels = list()
    gold_labels = list()

    for label in llm_predicted_medchange:
        predicted_labels.append(mapper[label])

    # One experiment uses outdated labels as gold labels, another one uses newest (latest) labels
    if label_type == "OUTDATED":
        for label in outdated_labels:
            gold_labels.append(mapper[label])
    elif label_type == "NEWEST":
        for label in newest_labels:
            gold_labels.append(mapper[label])

    # Make predictions
    y_pred = np.array(predicted_labels)
    y_test = np.array(gold_labels)

    # Calculate precision, recall, and F1 score with macro averaging
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    ## get precision, recall, f1, accuracy rounded to 4 decimal places

    print(f"Precision: {precision_macro:.4f} | Recall: {recall_macro:.4f} | F1: {f1_macro:.4f}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


In [13]:
# Get scores using outdated labels
for filename in filenames:
    print(f"Results for {filename}:")
    get_predictions(filename, label_type="OUTDATED")
    print()

Results for llama33-70b_answers.txt:
Precision: 0.3893 | Recall: 0.3658 | F1: 0.2664
Accuracy: 0.3223

Results for mistral-24b_answers.txt:
Precision: 0.3820 | Recall: 0.3764 | F1: 0.3388
Accuracy: 0.3867

Results for gpt4o-mini_answers.txt:
Precision: 0.4553 | Recall: 0.3886 | F1: 0.3408
Accuracy: 0.3945

Results for qwen25-7b_answers.txt:
Precision: 0.4262 | Recall: 0.3711 | F1: 0.3076
Accuracy: 0.3535

Results for deepsek-v3_answers.txt:
Precision: 0.3419 | Recall: 0.3355 | F1: 0.2927
Accuracy: 0.3574

Results for olmo_gguf_answers_13b.txt:
Precision: 0.3623 | Recall: 0.3529 | F1: 0.2932
Accuracy: 0.3203



In [14]:
# Get scores using newest labels
for filename in filenames:
    print(f"Results for {filename} (newest labels):")
    get_predictions(filename, label_type="NEWEST")
    print()

Results for llama33-70b_answers.txt (newest labels):
Precision: 0.4281 | Recall: 0.3933 | F1: 0.3415
Accuracy: 0.4570

Results for mistral-24b_answers.txt (newest labels):
Precision: 0.3685 | Recall: 0.3555 | F1: 0.3377
Accuracy: 0.4043

Results for gpt4o-mini_answers.txt (newest labels):
Precision: 0.3524 | Recall: 0.3448 | F1: 0.3107
Accuracy: 0.4043

Results for qwen25-7b_answers.txt (newest labels):
Precision: 0.2712 | Recall: 0.3082 | F1: 0.2602
Accuracy: 0.3730

Results for deepsek-v3_answers.txt (newest labels):
Precision: 0.3579 | Recall: 0.3427 | F1: 0.3114
Accuracy: 0.3926

Results for olmo_gguf_answers_13b.txt (newest labels):
Precision: 0.3550 | Recall: 0.3568 | F1: 0.3318
Accuracy: 0.3984



### Get n-gram counts over the years

In [None]:
import requests

payload = {
    'index': 'v4_dolma-v1_7_llama',
    'query_type': 'count',
    'query': 'Cochrane Database of Systematic Reviews',
}

## Use the infi-gram API to get n-gram counts over the years
result = requests.post('https://api.infini-gram.io/', json=payload).json()
print(result)

df = pd.read_csv("./FullQA.csv", index_col=0)
titles = df.title.tolist()

changed_titles = np.array(titles)[np.array(outdated_keys)]
changed_titles = [c[:-1] for c in changed_titles]
changed_titles = [c.replace("?", "") for c in changed_titles]
changed_titles = [c.replace("  ", " ") for c in changed_titles]
changed_titles = [c.replace("   ", " ") for c in changed_titles]
changed_titles


In [None]:

import time

def count_ngram(query):
    payload = {
        'index': 'v4_dolma-v1_7_llama',
        'query_type': 'count',
        'query': query,
    }
    result = requests.post('https://api.infini-gram.io/', json=payload).json()
    print(result)
    return result

json_results = list()

for title in changed_titles:
    json_results.append(count_ngram(title))
    time.sleep(0.5)
json_results


cleaned_titles = np.array(titles)
cleaned_titles = [c[:-1] for c in cleaned_titles]
cleaned_titles = [c.replace("?", "") for c in cleaned_titles]
cleaned_titles = [c.replace("  ", " ") for c in cleaned_titles]
cleaned_titles = [c.replace("   ", " ") for c in cleaned_titles]
cleaned_titles



In [None]:
import time

def count_ngram(query):
    payload = {
        'index': 'v4_dolma-v1_7_llama',
        'query_type': 'count',
        'query': query,
    }
    result = requests.post('https://api.infini-gram.io/', json=payload).json()
    print(result)
    return result

full_json_results = list()

for title in cleaned_titles:
    full_json_results.append(count_ngram(title))
    time.sleep(0.8)


c = 0
counts = list()
for js in full_json_results:
    try:
        counts.append(js["count"])
    except:
        counts.append(0)
        c+=1
print(c)
counts = np.array(counts)
len(counts)

In [None]:
full_latest_ids = [dg[0] for dg in doigroups]
full_latest_ids = sorted(full_latest_ids)
len(full_latest_ids)

years = list()
ids_per_year = dict()
for y in range(2000, 2025):
    ids_per_year[str(y)] = list()

df = pd.read_csv("./FullScraped.csv", index_col=0)
dates = df.date.tolist()
dates = [d.split("Cochrane Database Syst Rev.")[1].strip()[:4] if "Cochrane Database Syst Rev." in d else "" for d in dates]

for fli in full_latest_ids:
    year = dates[fli]
    ids_per_year[year].append(fli)
    years.append(dates[fli])


year_counts = dict(zip(ids_per_year.keys(), [0]*len(ids_per_year.keys())))
year_sums = dict(zip(ids_per_year.keys(), [0]*len(ids_per_year.keys())))

year_countlists = dict()

for year in ids_per_year.keys():
    year_countlists[year] = list()

for idx in range(len(dates)):
    year = dates[idx]
    count = counts[idx]

    year_counts[year] += 1
    year_sums[year] += count

    year_countlists[year].append(count)

year_countlists


for key, value in year_sums.items():
    print(key, value/year_counts[key])
    print(key, np.median(np.array(year_countlists[key])))
    print("  ")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

''' Plot the n-gram counts over the years '''

# Data provided
data = [
    (2000, 42.111000991080275, 20.0),
    (2001, 52.63333333333333, 23.0),
    (2002, 47.3978021978022, 27.0),
    (2003, 48.988188976377955, 28.0),
    (2004, 54.72140221402214, 28.0),
    (2005, 44.83970856102004, 28.0),
    (2006, 54.95674740484429, 30.0),
    (2007, 37.6675567423231, 21.0),
    (2008, 42.50354609929078, 24.0),
    (2009, 39.649532710280376, 22.0),
    (2010, 40.01033591731266, 21.0),
    (2011, 39.78203434610304, 21.0),
    (2012, 39.76604146100691, 22.0),
    (2013, 42.10798122065728, 20.0),
    (2014, 32.301384451544195, 18.0),
    (2015, 28.74308300395257, 16.0),
    (2016, 25.783352337514252, 14.0),
    (2017, 27.019354838709678, 13.0),
    (2018, 22.353204172876303, 11.0),
    (2019, 20.89090909090909, 8.0),
    (2020, 21.214776632302407, 8.0),
    (2021, 17.03731343283582, 4.0),
    (2022, 14.085470085470085, 3.0),
    (2023, 17.06746987951807, 1.0),
    (2024, 32.65217391304348, 0.0)
]

# Separate the data into lists
years = [item[0] for item in data]
count1 = [item[1] for item in data]
count2 = [item[2] for item in data]

# Create the plot
plt.figure(figsize=(5.5, 3.8))  # Adjust figure size for better readability

# Define a color palette suitable for publications
color_palette = ['#0072B2', '#D55E00']  # Blue and Orange - Colorblind friendly

# Plot the two line charts with distinct colors and markers
plt.plot(years, count1, marker='o', linestyle='-', color=color_palette[0], label='Mean Average')
plt.plot(years, count2, marker='s', linestyle='--', color=color_palette[1], label='Median')

# Add labels and title with a more formal style
plt.xlabel('Year of study publication', fontsize=10, fontweight='bold')
plt.ylabel('N-gram count in Dolma', fontsize=10, fontweight='bold')
plt.title('Mentions of SLR titles from different years', fontsize=11, fontweight='bold')

# Add a legend to distinguish the two lines
plt.legend(fontsize=10, loc='upper right')

# Add gridlines for better readability, with a subtle style
plt.grid(True, linestyle=':', alpha=0.7)

# Customize the axes
xtick_labels = [str(year) for year in years] # Start with all years as strings
xtick_labels[-1] = "Jan 2024" # Change the last label

plt.xticks(years, xtick_labels, fontsize=9) # Apply both the positions (years) and the custom labels
plt.yticks(fontsize=10)

# Make the plot look cleaner
plt.gca().spines['top'].set_visible(False)    # Remove top spine
plt.gca().spines['right'].set_visible(False)  # Remove right spine
plt.gca().xaxis.set_tick_params(rotation=40, direction='out')  
plt.gca().yaxis.set_tick_params(direction='out')

# Adjust layout to prevent labels from overlapping
plt.tight_layout()

# Save the plot as a high-resolution TIFF file (for publications)
#plt.savefig('year_vs_counts_plot.tiff', format='tiff', dpi=300)  # 300 dpi is good for print

# Show the plot
plt.show()