In [1]:
# finding_pipeline.py
from src.config import (
    GPT_KEY, CLAUDE_KEY, LLMAPI_KEY,
    GPT_MODEL, CLAUDE_MODEL, LLMAPI_MODEL,
    CSV_PATH, FIND_DIR
)
from src.llm_clients import GPTClient, ClaudeClient, LlamaAPIClient
from src.schemas import Sentence, FindingDoc
from src.prompts import get_prompt
from src.json_utils import parse_or_fix
from src.utils import split_sents, ensure_dir_exists

import pandas as pd
import tqdm.auto as tqdm
import json

In [2]:
# Prepare the output directory (FIND_DIR) before any result file is opened in it.
ensure_dir_exists(FIND_DIR)

# Load abstracts: only the first two CSV columns are read (the first row is
# the header) and only the first 100 rows are kept for this run.
df = pd.read_csv(
    CSV_PATH,
    header=0,
    usecols=[0, 1],
).head(100)

In [3]:
# Initialize one LLM client per model; the key is the short tag that
# run_finding() receives and also uses as the output file stem.
clients = dict(
    gpt4o=GPTClient(model=GPT_MODEL, key=GPT_KEY),
    claude=ClaudeClient(model=CLAUDE_MODEL, key=CLAUDE_KEY),
    llama=LlamaAPIClient(model=LLMAPI_MODEL, key=LLMAPI_KEY),
)

In [4]:
# Load prompt
# get_prompt("finding") is unpacked into two values. `fewshot` is later
# concatenated with a list of message dicts in build_task_msgs, so it is
# expected to be list-like.
# NOTE(review): `system_prompt` is not referenced again in the visible cells —
# confirm it reaches the model some other way (e.g. already inside `fewshot`).
system_prompt, fewshot = get_prompt("finding")

def build_task_msgs(pmid: str, sent_list: list[str]) -> list[dict]:
    """Build the chat messages for one abstract.

    Every sentence is rendered on its own line as ``[index] sentence`` with
    0-based indices, placed under a ``PMID:`` line in a single user message,
    and that message is appended to the few-shot examples.

    Args:
        pmid: Identifier printed on the ``PMID:`` line of the prompt.
        sent_list: Sentences of the abstract, in order.

    Returns:
        A new list made of the module-level ``fewshot`` messages followed by
        the user message; ``fewshot`` itself is not modified.
    """
    lines = [f"[{idx}] {sentence}" for idx, sentence in enumerate(sent_list)]
    sentence_block = "\n".join(lines)
    user_msg = {
        "role": "user",
        "content": f"PMID: {pmid}\nSentences:\n{sentence_block}\n\nReturn JSON:",
    }
    return fewshot + [user_msg]

def run_finding(tag: str):
    """Run finding detection with one model over every abstract in ``df``.

    For each (pmid, abstract) row the abstract is split into sentences,
    turned into chat messages, sent to the client registered under ``tag``,
    and the reply is parsed into a ``FindingDoc``. Each parsed document is
    written as one JSON line to ``{FIND_DIR}/{tag}.jsonl``. The file is
    opened in "w" mode, so output from a previous run is overwritten.

    Any exception raised while processing a single abstract (sentence
    splitting, message building, the model call, or parsing) is printed and
    that abstract is skipped, so one bad row or one failed request does not
    abort the whole run.

    Args:
        tag: Key into the module-level ``clients`` dict; also used as the
            output file stem and in progress/log messages.

    Raises:
        KeyError: If ``tag`` is not a key of ``clients``.
    """
    cli = clients[tag]
    out_path = f"{FIND_DIR}/{tag}.jsonl"
    with open(out_path, "w", encoding="utf-8") as fw:
        for pmid, abs_txt in tqdm.tqdm(df.itertuples(index=False),
                                       total=len(df), desc=f"{tag} run"):
            try:
                # The whole per-document pipeline is guarded, not just the
                # parsing step: a malformed abstract or a failed model call
                # is logged and skipped like a parse failure.
                sents = split_sents(abs_txt)
                msgs = build_task_msgs(pmid, sents)
                raw = cli.run(msgs)
                doc = parse_or_fix(raw, cli, msgs, target_class=FindingDoc)
            except Exception as err:
                print(f"[{tag}][{pmid}] failed → {err}")
                continue
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            fw.write(json.dumps(doc.model_dump(), ensure_ascii=False) + "\n")
    print(f"{tag} detect finding -> {out_path}")

In [5]:
# Run models: one pass per client tag, in the same fixed order.
for model_tag in ("gpt4o", "claude", "llama"):
    run_finding(model_tag)

gpt4o run:   0%|          | 0/100 [00:00<?, ?it/s]

gpt4o detect finding -> finding/gpt4o.jsonl


claude run:   0%|          | 0/100 [00:00<?, ?it/s]

claude detect finding -> finding/claude.jsonl


llama run:   0%|          | 0/100 [00:00<?, ?it/s]

llama detect finding -> finding/llama.jsonl
