In [None]:
from pymed import PubMed
import xml.etree.ElementTree as ET
import csv
import re

# 1. Retrieve article metadata and store it in a csv file

In [None]:
# PubMed search expression: records with MEDLINE status that are tagged with
# neither the "curated" nor the "automated" indexing method, narrowed by the
# filters and the 2015 publication-date range named in the query itself.
query = r'("medlinestatus medline"[All Fields] NOT ("indexingmethod curated"[All Fields] OR "indexingmethod automated"[All Fields])) AND ((fha[Filter]) AND (classicalarticle[Filter] OR introductoryjournalarticle[Filter]) AND (english[Filter]) AND (2015:2015[pdat]))'

# pymed client; the tool name and contact address are handed to the PubMed
# constructor.
pubmed = PubMed(tool="HumanIndexingSamples", email="nstrauc3@smail.uni-koeln.de")

# Materialise the hits into a list. pymed hands back a lazy, single-pass
# iterator here (per pymed's implementation -- confirm for the installed
# version), so without list() a second run of the export cell would iterate
# over nothing and silently write no rows.
results = list(pubmed.query(query, max_results=500))

In [None]:
# (Re)create articles.csv containing only the header row; the article rows are
# appended afterwards. The encoding is pinned to UTF-8 because the file is read
# back with encoding="utf-8" when the LLM is prompted, so writing with the
# platform default could produce a file that cannot be decoded later.
with open('articles.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    field = ["id", "title", "publication_date", "abstract", "MeSH", "Year"]

    writer.writerow(field)

In [None]:
# XPath expressions evaluated against each article's XML element.
x_path_descriptor = ".//MeshHeading/DescriptorName"
x_path_completed_date = ".//DateCompleted/Year"

# Append one row per article to articles.csv (the header row is written by a
# separate step). The file is opened once for the whole loop and written as
# UTF-8 because it is read back with encoding="utf-8".
with open('articles.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for article in results:

        # Plain metadata fields exposed by the article object.
        # NOTE(review): pymed's pubmed_id is reported to sometimes contain
        # several newline-separated ids -- confirm whether only the first
        # line should be kept.
        article_id = article.pubmed_id
        title = article.title
        publication_date = article.publication_date
        abstract = article.abstract

        xml_element = article.xml

        # Read the element text directly instead of regex-matching the
        # serialised XML: this yields unescaped text and simply skips an
        # element without text instead of failing on a missing regex match.
        mesh_terms = [
            item.text
            for item in xml_element.findall(x_path_descriptor)
            if item.text
        ]

        # Recomputed for every article so a record without DateCompleted can
        # never inherit the year of the previous article. If several matches
        # exist the last one wins; without any match the Year cell stays empty.
        completed_years = [
            item.text
            for item in xml_element.findall(x_path_completed_date)
            if item.text
        ]
        date_completed = completed_years[-1] if completed_years else ""

        writer.writerow([
            article_id,
            title,
            publication_date,
            abstract,
            "|".join(mesh_terms),
            date_completed,
        ])

# 2. Prompt LLM

In [None]:
import ollama

In [None]:
# (Re)create keywords.csv with only its header row. Mode "w" (not "a") keeps a
# re-run from appending a second header below existing rows, which the mapping
# step would then read as a data row and fail on when converting "Year" to int.
with open("keywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["LLM_Keywords", "Year"])

In [None]:
# For every exported article, ask the local model for keywords based on title
# and abstract, collect the streamed answer and append it to keywords.csv
# together with the article's "Year" value.
with open(r"articles.csv", newline="", encoding="utf-8") as articles_file:
    reader = csv.DictReader(articles_file)

    for row in reader:
        title, abstract, year = row["title"], row["abstract"], row["Year"]

        # Chat transcript: a fixed system role plus the per-article request.
        messages = [
            {
                "role": "system",
                "content": "You are an expert in scientific literature analysis.",
            },
            {
                "role": "user",
                "content": f"Generate a list of keywords that best describe the following document, based on its title and abstract. Output only a numbered list with one keyword per line. The number of keywords should be variable, with an average target of 6–7 keywords, but may be more or fewer as appropriate, and must never exceed 15. Focus on the main concepts, topics, and important ideas presented. Prioritize specific, unique terms that accurately capture the document’s key themes, and avoid generic or overly broad terms.Use only English words.\nDo not include the names of individuals, organizations, conferences, symposiums, workshops, journals, or events, as these are unsuitable as descriptive keywords.\nTitle: {title}\nAbstract: {abstract}",
            },
        ]

        # Temperature 0 and a fixed seed are passed as generation options.
        stream = ollama.chat(
            model="HammerAI/openhermes-2.5-mistral",
            stream=True,
            options={"temperature": 0.0, "seed": 0},
            messages=messages,
        )

        # Echo any "thinking" text as it arrives; only the content is kept.
        pieces = []
        for chunk in stream:
            reply = chunk["message"]
            if hasattr(reply, "thinking") and reply.get("thinking"):
                print(reply["thinking"], end="", flush=True)
            pieces.append(reply["content"])
        message = "".join(pieces)

        # Appended per article, so finished answers are already on disk while
        # the remaining articles are still being processed.
        with open("keywords.csv", "a", newline="") as keywords_file:
            csv.writer(keywords_file).writerow([message, year])

# 3. Create embeddings for LLM keywords and MeSH terms | Map LLM keywords to the appropriate MeSH terms

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
def create_corpus(path_to_xml: str) -> list[str]:
    """Collect the descriptor names found in a MeSH descriptor XML file.

    Every ``DescriptorName`` element at any depth below the root is visited and
    the text of its ``String`` child is taken as the name.

    Args:
        path_to_xml: Path of the XML file to parse.

    Returns:
        The distinct, non-empty names in order of first appearance.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(path_to_xml).getroot()

    names = []
    for elem in root.findall(".//DescriptorName"):
        # findtext returns None when there is no <String> child; such elements
        # (and blank names) are skipped instead of raising AttributeError.
        name = (elem.findtext("String") or "").strip()
        if name:
            names.append(name)

    # ".//DescriptorName" also matches names nested inside other elements
    # (presumably cross-references between records -- confirm against the MeSH
    # XML layout), so the same name can occur repeatedly. dict.fromkeys drops
    # the repeats while keeping first-seen order, so no term is embedded twice.
    return list(dict.fromkeys(names))

In [None]:
# One descriptor file per year: desc2015.xml, desc2016.xml, desc2017.xml.
mesh_files = {release: f"desc{release}.xml" for release in (2015, 2016, 2017)}

# Sentence-embedding model used for both the MeSH terms and the LLM keywords.
embedder = SentenceTransformer("lokeshch19/ModernPubMedBERT")

# Per-year lookup tables: term list and the matching embedding tensor.
mesh_corpora, mesh_embeddings = {}, {}

for year, path in mesh_files.items():
    descriptors = create_corpus(path)
    mesh_corpora[year] = descriptors
    # Index i of the tensor belongs to descriptors[i].
    mesh_embeddings[year] = embedder.encode_document(
        descriptors,
        convert_to_tensor=True,
    )

In [None]:
# (Re)create mapped_terms.csv with only its header row. newline="" is what the
# csv module expects for files handed to csv.writer; without it, platforms that
# translate "\n" on write end up with doubled carriage returns in the file.
with open("mapped_terms.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Original_Keywords", "Mapped_Terms", "Scores"])

In [None]:
# For every LLM answer, map each keyword to the single most similar MeSH term
# of the matching year and append keywords, terms and scores to
# mapped_terms.csv.
with open("keywords.csv", "r") as f:
    reader = csv.DictReader(f)

    for row in reader:

        # The year selects which corpus/embedding pair is searched; a year
        # without an entry in mesh_corpora raises KeyError.
        year = int(row["Year"])
        keywords = row["LLM_Keywords"]

        corpus = mesh_corpora[year]
        corpus_emb = mesh_embeddings[year]

        mapped_terms = []
        score_terms = []

        # Normalise the answer to one keyword per line: strip the "1. "
        # numbering of a numbered list, otherwise treat ", " as the separator.
        if "1." in keywords:
            keywords = re.sub(r"\d+\.\s", "", keywords)
        else:
            keywords = re.sub(r",\s", "\n", keywords)

        # NOTE(review): a blank line in the answer is embedded and mapped like
        # any other keyword -- confirm whether blanks should be dropped.
        for keyword in keywords.split("\n"):

            query_emb = embedder.encode_query(
                keyword, convert_to_tensor=True
            )

            # Row 0 holds the scores of this keyword against every corpus term.
            similarity_scores = embedder.similarity(
                query_emb, corpus_emb
            )[0]

            # Keep only the best-scoring term.
            scores, indices = torch.topk(similarity_scores, k=1)

            for score, idx in zip(scores, indices):
                mapped_terms.append(corpus[idx])
                score_terms.append(f"{score:.4f}")

        # newline="" is required for csv.writer output; these rows contain
        # embedded line breaks, which would otherwise be rewritten on platforms
        # that translate "\n". A separate handle name avoids shadowing the
        # input file `f`.
        with open("mapped_terms.csv", "a", newline="") as out_file:
            writer = csv.writer(out_file)
            writer.writerow([keywords, "\n".join(mapped_terms), "\n".join(score_terms)])

## 4. Analysis of the results

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [None]:
# Number of keywords per document, collected separately for the MeSH terms
# ("MeSH" column, "|"-separated) and the LLM keywords ("Original_Keywords"
# column, one per line).
length = {"MeSH_Keywords" :[] ,
         "LLM_Keywords" : []
         }


with open("Ergebnisse.csv", "r") as f:
    reader = csv.DictReader(f)

    for row in reader:

        llm_keywords = row["Original_Keywords"]
        mesh_keywords = row["MeSH"]

        # Only non-blank entries count: "".split(...) returns [""], which would
        # otherwise register one keyword for a document that has none.
        length["MeSH_Keywords"].append(
            sum(1 for term in mesh_keywords.split("|") if term.strip())
        )
        length["LLM_Keywords"].append(
            sum(1 for keyword in llm_keywords.split("\n") if keyword.strip())
        )

# Guard against a file without data rows (would divide by zero).
average_mesh = (
    sum(length["MeSH_Keywords"]) / len(length["MeSH_Keywords"])
    if length["MeSH_Keywords"] else 0.0
)
average_llm = (
    sum(length["LLM_Keywords"]) / len(length["LLM_Keywords"])
    if length["LLM_Keywords"] else 0.0
)

print(f"On Average Documents were assigned {average_mesh} MeSH Keywords.")
print(f"On Average The Open LLM assigned {average_llm} Keywords.")

In [None]:
# Histogram of the similarity scores in the "Scores" column, then a bar chart
# of the bin counts saved as Verteilung_Kosinus.png.
bins = {
    "0<": 0,
    "0–0.25": 0,
    "0.26–0.50": 0,
    "0.51–0.75": 0,
    "0.76–1.00": 0
}

with open("Ergebnisse.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        scores = row["Scores"].split("\n")

        for score in scores:
            # A blank entry (e.g. an empty cell) carries no score.
            if not score.strip():
                continue
            score = float(score)

            # Contiguous bins: each branch covers everything above the previous
            # upper edge, so values such as 0.2550 or 0.5042 (between the edges
            # printed in the labels) are counted instead of silently dropped.
            if score <= 0.0:
                bins["0<"] += 1
            elif score <= 0.25:
                bins["0–0.25"] += 1
            elif score <= 0.50:
                bins["0.26–0.50"] += 1
            elif score <= 0.75:
                bins["0.51–0.75"] += 1
            else:
                bins["0.76–1.00"] += 1


labels = list(bins.keys())
values = list(bins.values())


plt.figure(figsize=(6, 4))
plt.bar(labels, values, color="blue", edgecolor="black")

plt.xlabel("Score-Bereich")
plt.ylabel("Anzahl")
plt.title("Verteilung Kosinusähnlichkeit")
plt.tight_layout()

plt.savefig('Verteilung_Kosinus.png')

plt.show()

In [None]:
# Terms present in both the MeSH set and the mapped LLM set (intersection)
found = 0
# Total MeSH keywords
total_mesh = 0
# Total distinct mapped LLM keywords
total_llm_keywords = 0

with open("Ergebnisse.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        llm_mapped_keywords = row["Mapped_Terms"]
        true_keywords = row["MeSH"]

        # Compare case-insensitively on trimmed terms. Blank entries are left
        # out so an empty cell neither counts as a term nor "matches" another
        # empty cell.
        llm_set = {
            keyword.strip().lower()
            for keyword in llm_mapped_keywords.split("\n")
            if keyword.strip()
        }
        true_set = {
            keyword.strip().lower()
            for keyword in true_keywords.split("|")
            if keyword.strip()
        }

        found += len(true_set & llm_set)
        total_mesh += len(true_set)
        total_llm_keywords += len(llm_set)


# Each ratio falls back to 0.0 when its denominator is zero (no rows, no terms,
# or no matches at all) instead of raising ZeroDivisionError.
recall = found / total_mesh if total_mesh else 0.0
precision = found / total_llm_keywords if total_llm_keywords else 0.0
f1_score = (
    2 * precision * recall / (precision + recall)
    if (precision + recall) else 0.0
)


print(f"Recall: {recall:.2%}")
print(f"Precision: {precision:.2%}")
print(f"F1-Score: {f1_score:.2%}")

In [None]:
# Bar chart of the three totals (LLM keywords, MeSH keywords, matches); the
# legend box carries the per-document averages. Saved as Übersicht_Keywords.png.
labels = ["LLM Keywords", "MeSH Keywords", "Richtig"]
values = [total_llm_keywords, total_mesh, found]
colors = ["orange", "blue", "green"]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(labels, values, color=colors, edgecolor="black")

# A white patch serves purely as a carrier for the legend text.
legend_patch = patches.Patch(color='white', label=f"Ø Keywords pro Dokument:\nMeSH={average_mesh:.1f}, LLM={average_llm:.1f}")
ax.legend(handles=[legend_patch])

ax.set_ylabel("Anzahl Keywords")
ax.set_title("Übersicht Keywords")

fig.tight_layout()
fig.savefig('Übersicht_Keywords.png')
plt.show()

# Same three numbers once more as plain text.
for caption, count in (
    ("LLM_Keywords", total_llm_keywords),
    ("MeSH_Keywords", total_mesh),
    ("Richtig", found),
):
    print(f"{caption}: {count}")

In [None]:
# Bar chart of recall, precision and F1 on a fixed 0..1 axis, saved as
# Keywords_Matching.png.
metrics = ["Recall", "Precision", "F1-Score"]
values = [recall, precision, f1_score]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(metrics, values, color=["yellow", "red", "black"], edgecolor="black")
ax.set_ylim(0, 1)  # 0–100%
ax.set_ylabel("Wert")
ax.set_title("Keyword-Matching Performance")

# Write the percentage just above each bar.
for position, value in enumerate(values):
    ax.text(position, value + 0.02, f"{value:.1%}", ha="center")

fig.tight_layout()
fig.savefig('Keywords_Matching.png')
plt.show()