In [1]:
from src.config import (
    GPT_KEY, CLAUDE_KEY, LLMAPI_KEY,
    GPT_MODEL, CLAUDE_MODEL, LLMAPI_MODEL,
    CSV_PATH, FIND_DIR, COMP_DIR,
    MAX_TOKENS, TEMPERATURE
)
from src.llm_clients import GPTClient, ClaudeClient, LlamaAPIClient
from src.schemas import ResolvedSentence, CompletionDoc, PartialResolved
from src.prompts import get_prompt
from src.json_utils import parse_or_fix
from src.utils import split_sents, ensure_dir_exists

import pandas as pd
import json
import tqdm.auto as tqdm
from typing import List

In [2]:
# Make sure the completion output directory (COMP_DIR) is available.
ensure_dir_exists(COMP_DIR)

# Read the first two CSV columns, relabelled as "pmid" and "abstract".
df_abs = pd.read_csv(
    CSV_PATH,
    header=0,
    usecols=[0, 1],
    names=["pmid", "abstract"],
)

# pmid -> abstract lookup table built from that frame.
ABS_CACHE = (
    df_abs
    .set_index("pmid")["abstract"]
    .to_dict()
)

# Index the finding sentence ids of one model, keyed by PMID.
def load_findings(tag: str, find_dir: "str | None" = None) -> dict[str, set[int]]:
    """Load finding-sentence ids from ``<find_dir>/<tag>.jsonl``.

    Every non-blank line must be a JSON object with a ``"pmid"`` key and a
    ``"finding_ids"`` list.

    Args:
        tag: File stem of the JSONL file to read.
        find_dir: Directory that contains the file. Defaults to ``FIND_DIR``.

    Returns:
        Mapping from each record's ``pmid`` to the set of its ``finding_ids``.
        When a pmid occurs on several lines, the last line wins.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: A non-blank line is not valid JSON.
        KeyError: A record lacks ``"pmid"`` or ``"finding_ids"``.
    """
    base_dir = FIND_DIR if find_dir is None else find_dir
    out: dict[str, set[int]] = {}
    with open(f"{base_dir}/{tag}.jsonl", "r", encoding="utf-8") as fr:
        for line in fr:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            j = json.loads(line)
            out[j["pmid"]] = set(j["finding_ids"])
    return out

# Finding sentence ids per model tag, read via load_findings.
find_ids = {
    tag: load_findings(tag)
    for tag in ["gpt4o", "claude", "llama"]
}

# One LLM client per model tag; the keys mirror those of find_ids.
clients = dict(
    gpt4o=GPTClient(model=GPT_MODEL, key=GPT_KEY),
    claude=ClaudeClient(model=CLAUDE_MODEL, key=CLAUDE_KEY),
    llama=LlamaAPIClient(model=LLMAPI_MODEL, key=LLMAPI_KEY),
)

In [3]:
# Load the prompt pieces for the "completion" task.
# `fewshot` is prepended to every request by build_msgs; `system_prompt` is not
# referenced anywhere else in the visible notebook code.
# NOTE(review): confirm the system prompt reaches the model some other way
# (e.g. inside `fewshot` or inside the clients).
system_prompt, fewshot = get_prompt("completion")

def build_msgs(pmid: str, sid: int, sent: str, context: str) -> list[dict]:
    """Assemble the chat messages for resolving one sentence.

    The few-shot messages come first, followed by a single user message
    that holds the full context, the target sentence labelled with its id,
    and the "Return JSON:" cue.

    Args:
        pmid: Accepted for call-site symmetry; not used when building the text.
        sid: Sentence id shown in the ``Sentence [sid]`` label.
        sent: The sentence to resolve.
        context: Text placed under the ``Context:`` heading.

    Returns:
        A new list: ``fewshot`` plus the user message.
    """
    prompt_text = f"Context:\n{context}\n\nSentence [{sid}]:\n{sent}\n\nReturn JSON:"
    request = {"role": "user", "content": prompt_text}
    return fewshot + [request]

In [6]:
def run_completion(model_tag: str):
    """Resolve every finding sentence for one model and write the results as JSONL.

    For each PMID in ``find_ids[model_tag]`` that has an abstract in
    ``ABS_CACHE``, the abstract is split into sentences and every in-range
    finding sentence is sent to the model together with the full abstract.
    The parsed parts are collected into one ``CompletionDoc`` that is written
    as a line of ``{COMP_DIR}/{model_tag}.jsonl``.

    A failure on a single sentence is printed and recorded in
    ``{COMP_DIR}/{model_tag}.fail.txt`` as ``pmid<TAB>sid<TAB>error``; it does
    not abort the run. Both files are opened in write mode, so files from a
    previous run are overwritten.

    Args:
        model_tag: Key into ``clients`` and ``find_ids``.
    """
    cli = clients[model_tag]
    out_path = f"{COMP_DIR}/{model_tag}.jsonl"
    fail_path = f"{COMP_DIR}/{model_tag}.fail.txt"

    with open(out_path, "w", encoding="utf-8") as fw, \
         open(fail_path, "w", encoding="utf-8") as fail_log:

        for pmid_str, ids in tqdm.tqdm(find_ids[model_tag].items(), desc=f"{model_tag}"):
            pmid_int = int(pmid_str)
            if pmid_int not in ABS_CACHE:
                continue

            context = ABS_CACHE[pmid_int]
            # An empty CSV cell is loaded by pandas as NaN (a float), which is
            # not an abstract; treat it like a missing PMID.
            if not isinstance(context, str):
                continue

            sents = split_sents(context)
            completed = []

            for sid in sorted(ids):
                if sid < 0 or sid >= len(sents):
                    continue

                msgs = build_msgs(pmid_str, sid, sents[sid], context)
                try:
                    raw = cli.run(msgs, task_id=f"{pmid_str}-{sid}")
                    part_list = parse_or_fix(raw, cli, msgs, target_class=List[PartialResolved])
                    for part in part_list:
                        # The id comes from the model. A negative value would
                        # silently index from the end of `sents`, so require a
                        # real in-range index before trusting it; otherwise
                        # fall back to the sentence that was actually queried.
                        id_in_range = 0 <= part.id < len(sents)
                        completed.append(
                            ResolvedSentence(
                                id=part.id,
                                original=sents[part.id] if id_in_range else sents[sid],
                                resolved=part.resolved
                            )
                        )
                except Exception as e:
                    print(f"[{model_tag}][{pmid_str}][{sid}] failed → {e}")
                    # Collapse line breaks so the fail log keeps one record per line.
                    reason = " ".join(str(e).splitlines())
                    fail_log.write(f"{pmid_str}\t{sid}\t{reason}\n")

            doc = CompletionDoc(pmid=pmid_str, sentences=completed)
            fw.write(doc.model_dump_json(ensure_ascii=False) + "\n")

    print(f"{model_tag} sentence complete -> {out_path}")

In [7]:
# Run the completion pass for each model, one after another.
for model_tag in ("gpt4o", "claude", "llama"):
    run_completion(model_tag)

gpt4o:   0%|          | 0/100 [00:00<?, ?it/s]

[Retry 1] Invalid JSON: Parsed JSON is empty or invalid content
-> Raw output: {} 
[Retry 2] Invalid JSON: Parsed JSON is empty or invalid content
-> Raw output: {} 
[Retry 3] Invalid JSON: Parsed JSON is empty or invalid content
-> Raw output: {} 
[gpt4o][16133256][11] failed ->Final JSON parse failed: Parsed JSON is empty or invalid content
gpt4o sentence complete -> completion/gpt4o.jsonl


claude:   0%|          | 0/100 [00:00<?, ?it/s]

[Retry 1] Invalid JSON: No valid JSON object found. Sample: [}...
-> Raw output: [] 
[Retry 2] Invalid JSON: No valid JSON object found. Sample: [}...
-> Raw output: [] 
[Retry 3] Invalid JSON: No valid JSON object found. Sample: [}...
-> Raw output: [] 
[claude][33095090][3] failed ->Final JSON parse failed: No valid JSON object found. Sample: [}...
[Retry 1] Invalid JSON: Unterminated string starting at: line 1 column 23 (char 22)
-> Raw output: [{"id": 3, "resolved": "Whole cell viscoelasticity depends strongly on time, frequency, and strain.}, {"id": 3, "resolved": "Comparison of wild-type and mutant strains under identical conditions generally produced significant differences in whole cell viscoelasticity."}] 
[Retry 1] Invalid JSON: Unterminated string starting at: line 1 column 23 (char 22)
-> Raw output: [{"id": 2, "resolved": "Eighty-six percent of spontaneous splenic arteriovenous fistulas occur in women.}, {"id": 2, "resolved": "Fifty-five percent of spontaneous splenic arte

llama:   0%|          | 0/100 [00:00<?, ?it/s]

[Retry 1] Invalid JSON: Expecting ',' delimiter: line 1 column 77 (char 76)
-> Raw output: [{"id": 12, "resolved": "Deactivation of IR channels was also slowed by Rb+.")] 
llama sentence complete -> completion/llama.jsonl
