In [1]:
!python -m spacy download en_core_web_sm

import json
import os
import re
from collections import Counter
from difflib import SequenceMatcher

import pandas as pd
import spacy
from tqdm import tqdm

# Input folder: the per-model "<model_tag>.jsonl" assertion files are read from here.
ASSERTION_DIR = "assertion"
# Output folder: the evaluation CSVs are written here.
OUT_DIR = "eval"
# Create the output folder up front; exist_ok keeps re-running this cell harmless.
os.makedirs(name=OUT_DIR, exist_ok=True)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:02[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Hard-coded UMLS predicate vocabulary (all lower-case).
# Predicates are lower-cased and stripped before being compared against this set.
UMLS_PREDICATES = {
    "affects",
    "associated with",
    "causes",
    "complicates",
    "contraindicated with",
    "disrupts",
    "inhibits",
    "interacts with",
    "manages",
    "precedes",
    "prevents",
    "produces",
    "promotes",
    "stimulates",
    "treats",
    "increases",
    "decreases",
    "enhances",
    "induces",
    "leads to",
    "negatively regulates",
    "positively regulates",
    "regulates",
}

In [3]:
# Load assertions
def load_assertions(model_tag, assertion_dir=None):
    """Read one model's assertion file into a flat DataFrame.

    The file ``<assertion_dir>/<model_tag>.jsonl`` is parsed line by line. Each
    non-blank line must be a JSON object with a ``pmid`` key; its ``assertion``
    value is iterated and every item (a dict) becomes one output row.

    Args:
        model_tag: File stem of the JSONL file; also stored in the ``model`` column.
        assertion_dir: Folder to read from. When ``None`` (the default) the
            module-level ``ASSERTION_DIR`` is used.

    Returns:
        pandas.DataFrame with one row per assertion. Each row carries ``pmid``
        and ``model`` plus whatever keys the assertion dict contains (keys in
        the assertion dict win on a name clash).

    Raises:
        FileNotFoundError: If the JSONL file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``pmid``.
    """
    if assertion_dir is None:
        assertion_dir = ASSERTION_DIR
    path = os.path.join(assertion_dir, f"{model_tag}.jsonl")

    rows = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines are not JSON and would make json.loads raise; skip them.
            if not line.strip():
                continue
            obj = json.loads(line)
            pmid = obj['pmid']
            # A record whose assertion list is absent or null contributes no rows.
            for a in obj.get('assertion') or []:
                rows.append({'pmid': pmid, 'model': model_tag, **a})
    return pd.DataFrame(rows)

# Read each model's assertions, then stack them into one frame with a fresh 0..n-1 index
dfs = list(map(load_assertions, ["gpt4o", "claude", "llama"]))
df = pd.concat(dfs, ignore_index=True)

# spaCy English pipeline; check_np_type calls it to find the root token of a phrase
nlp = spacy.load("en_core_web_sm")

In [4]:
# Label reported by check_np_type for each POS tag of the root token;
# any tag not listed here is reported as "phrase".
_ROOT_POS_LABELS = {
    "NOUN": "noun",
    "PROPN": "noun",
    "VERB": "verb",
    "ADJ": "adjective",
    "NUM": "numeric",
    "PRON": "pronoun",
    "ADV": "adverb",
    "AUX": "auxiliary",
}


# POS check for subject/object
def check_np_type(text):
    """Classify a subject/object string by the part of speech of its root token.

    The text is run through the module-level ``nlp`` pipeline and the first
    token that is its own head is taken as the root. Its ``pos_`` tag is then
    mapped to a coarse label.

    Args:
        text: The phrase to classify. Anything that is not a non-blank string
            (``None``, NaN, ``""``, whitespace only) is not parsed.

    Returns:
        One of ``"noun"`` (NOUN or PROPN), ``"verb"``, ``"adjective"``,
        ``"numeric"``, ``"pronoun"``, ``"adverb"``, ``"auxiliary"``,
        ``"phrase"`` (any other tag), or ``"unknown"`` when there is nothing
        to parse or no root token is found.
    """
    # Missing fields show up as NaN/None in the DataFrame; nlp() only accepts str.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    doc = nlp(text)
    # Only the first self-headed token is considered, even if there are several.
    root = next((token for token in doc if token.head == token), None)
    if root is None:
        return "unknown"
    return _ROOT_POS_LABELS.get(root.pos_, "phrase")

# Predicate match quality and similarity
def predicate_match(pred, threshold=0.8):
    """Grade how well a predicate matches the ``UMLS_PREDICATES`` vocabulary.

    The predicate is lower-cased and stripped before comparison.

    Args:
        pred: Predicate text. A non-string value (``None``, NaN, ...) is
            graded ``"non_umls"`` instead of raising.
        threshold: Minimum ``SequenceMatcher.ratio()`` against any vocabulary
            entry for the predicate to count as ``"similar"``. Defaults to 0.8.

    Returns:
        ``"exact"`` if the normalised predicate is in the vocabulary,
        ``"similar"`` if at least one entry reaches ``threshold``,
        otherwise ``"non_umls"``.
    """
    # Missing predicates arrive as NaN/None, which have no .lower().
    if not isinstance(pred, str):
        return "non_umls"

    pred_low = pred.lower().strip()
    if pred_low in UMLS_PREDICATES:
        return "exact"

    # Argument order (predicate first, vocabulary entry second) is kept as-is
    # because ratio() is not guaranteed to be symmetric.
    if any(
        SequenceMatcher(None, pred_low, std).ratio() >= threshold
        for std in UMLS_PREDICATES
    ):
        return "similar"
    return "non_umls"

def predicate_topk(pred, topk=3):
    """Return the ``topk`` vocabulary predicates most similar to ``pred``.

    Similarity is ``SequenceMatcher.ratio()`` between the lower-cased, stripped
    predicate and each entry of ``UMLS_PREDICATES``.

    Args:
        pred: Predicate text. A non-string value (``None``, NaN, ...) yields
            an empty list instead of raising.
        topk: Maximum number of candidates to return. Defaults to 3.

    Returns:
        List of ``(predicate, ratio)`` tuples, highest ratio first. Entries with
        equal ratio are ordered alphabetically, so the result does not depend
        on set iteration order and is identical from run to run.
    """
    if not isinstance(pred, str):
        return []

    pred_low = pred.lower().strip()
    scores = [(p, SequenceMatcher(None, pred_low, p).ratio()) for p in UMLS_PREDICATES]
    # Secondary key on the name makes ties deterministic.
    scores.sort(key=lambda item: (-item[1], item[0]))
    return scores[:topk]

# Condition type analysis
# Population / experiment cues are matched as substrings so inflected forms
# ("outpatients", "experimental") still hit.
_POPULATION_TERMS = ("patients", "subjects", "mice", "cohort")
_EXPERIMENT_TERMS = ("in vitro", "in vivo", "experiment", "study")
# Logical cues are short function words, so they are matched as whole words only;
# a plain substring test would fire on "specific", "modified" or "underlying".
_LOGICAL_WORDS_RE = re.compile(r"\b(?:if|when|under|provided)\b")


def condition_type(cond):
    """Classify a condition string by the kind of qualifier it expresses.

    Categories are checked in a fixed order and the first hit wins:
    population, then experiment, then logical.

    Args:
        cond: Condition text. ``None``, NaN, any other non-string value, and
            empty or whitespace-only strings are all treated as "no condition".

    Returns:
        ``"none"``, ``"population"``, ``"experiment"``, ``"logical"`` or
        ``"other"``.
    """
    if not isinstance(cond, str) or not cond.strip():
        return "none"

    cond = cond.lower()
    if any(term in cond for term in _POPULATION_TERMS):
        return "population"
    if any(term in cond for term in _EXPERIMENT_TERMS):
        return "experiment"
    if _LOGICAL_WORDS_RE.search(cond):
        return "logical"
    return "other"

In [5]:
# Apply checks
# A field that is absent from an assertion shows up as NaN in the frame. Every
# text column is blanked with fillna("") before mapping (previously only
# `condition` was), so each checker always receives a string.
df["subj_type"] = df["subject"].fillna("").map(check_np_type)
df["obj_type"] = df["object"].fillna("").map(check_np_type)
df["predicate_match"] = df["predicate"].fillna("").map(predicate_match)
df["predicate_top3"] = df["predicate"].fillna("").map(predicate_topk)
df["condition_type"] = df["condition"].fillna("").map(condition_type)

# Save detailed (one row per assertion, including the derived columns above)
df.to_csv(f"{OUT_DIR}/assertion_detailed.csv", index=False)

In [6]:
# Summary statistics
def _label_counts(values):
    """Return ``[(label, count), ...]`` for ``values``, most frequent first."""
    return Counter(values).most_common()


# One row per model; each cell holds the full label distribution of that column.
summary = df.groupby("model").agg({
    col: _label_counts
    for col in ("predicate_match", "subj_type", "obj_type", "condition_type")
})

summary.to_csv(f"{OUT_DIR}/assertion_summary.csv")
print("\n=== Assertion Evaluation Summary ===")
# The cells are long lists; lift the column-width cap so they are not cut off with "...".
with pd.option_context("display.max_colwidth", None):
    print(summary)
# Report the folder actually written to rather than a hard-coded name.
print(f"\nSaved evaluation results to {OUT_DIR}/assertion_*.csv")


=== Assertion Evaluation Summary ===
                                      predicate_match  \
model                                                   
claude  [(non_umls, 284), (exact, 49), (similar, 28)]   
gpt4o   [(non_umls, 193), (exact, 86), (similar, 26)]   
llama   [(non_umls, 303), (exact, 29), (similar, 25)]   

                                                subj_type  \
model                                                       
claude  [(noun, 321), (verb, 34), (numeric, 3), (adjec...   
gpt4o   [(noun, 277), (verb, 23), (phrase, 2), (adject...   
llama   [(noun, 324), (verb, 20), (numeric, 6), (phras...   

                                                 obj_type  \
model                                                       
claude  [(noun, 283), (verb, 30), (phrase, 27), (adjec...   
gpt4o   [(noun, 254), (verb, 25), (phrase, 14), (adjec...   
llama   [(noun, 277), (verb, 30), (phrase, 21), (adjec...   

                                           condition_type  
mode