In [1]:
import os
import time
import re
import pandas as pd
from openai import OpenAI
from difflib import SequenceMatcher
from tqdm import tqdm

In [2]:
# Load the abstracts that the rest of the notebook annotates.
df = pd.read_csv("../02_extract_results_CITATIONS/raw_abstracts.csv")
print(f"Loaded {len(df)} tagged abstracts.")
print(df.head())

# Load API key from environment.
# SECURITY: a literal secret key used to be assigned to os.environ here. Any key
# that was ever saved in this notebook must be treated as leaked and revoked.
# Export OPENAI_API_KEY in the shell (or a local, untracked .env) before starting
# the kernel instead of writing it into a cell.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this notebook."
    )
client = OpenAI(api_key=api_key)

Loaded 693 tagged abstracts.
       PMID                                           Abstract Status
0   8057077  Inward rectifier (IR) K+ channels of bovine pu...  valid
1  27168519  Self-harm (SH; intentional self-poisoning or s...  valid
2  20203436  Fecal samples from Ruddy Shelduck, Tadorna fer...  valid
3  27353385  Young women, especially adolescents, often lac...  valid
4  34657444                                [Figure: see text].  valid


In [3]:
# System message sent with every chat request; the four instruction lines are
# joined with newlines and carry no leading or trailing whitespace.
SYSTEM_PROMPT = "\n".join([
    "You are a biomedical expert. Given a full abstract, identify the sentence(s)",
    "that clearly describe the key factual findings or results of the study.",
    "Do not label background, objectives, methods, or interpretative statements.",
    "Return each result sentence exactly as it appears, one per line.",
])

def build_prompt(text):
    """Build the user message for one abstract.

    The fixed two-line instruction is followed by a blank line and then
    ``text``. The assembled message is stripped as a whole, so whitespace at
    the very end of ``text`` is removed as well.
    """
    header = (
        "Below is a biomedical abstract. Identify and return only the factual result sentences.\n"
        "Return each sentence exactly as it appears, one per line.\n"
    )
    message = f"{header}\n{text}\n"
    return message.strip()

In [4]:
def query_gpt(abstract, max_retries=3, pause=2):
    """Ask the chat model for the result sentences of one abstract.

    Args:
        abstract: Abstract text. Empty, whitespace-only or non-string values
            (e.g. a pandas NaN from a blank CSV cell) are not sent to the API.
        max_retries: Maximum number of API attempts.
        pause: Base delay in seconds between attempts; it doubles after each
            failed attempt.

    Returns:
        A ``(sentences, count)`` tuple: the non-empty response lines joined by
        newlines, and how many such lines there are. ``("", 0)`` is returned
        when the abstract is unusable, the reply is empty or starts with an
        apology, or every attempt failed.
    """
    # Nothing meaningful to ask about; avoid sending "nan" or an empty prompt.
    if not isinstance(abstract, str) or not abstract.strip():
        return "", 0

    prompt = build_prompt(abstract)
    for i in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system",  "content": SYSTEM_PROMPT},
                    {"role": "user",    "content": prompt}
                ],
                temperature=0,
                max_tokens=512,
            )
            # The message content can be None; treat that as an empty reply
            # rather than raising AttributeError and retrying the same request.
            txt = (resp.choices[0].message.content or "").strip()
            # Filter out apology
            if txt.lower().startswith(("i'm sorry","sorry","unfortunately")):
                return "", 0
            lines = [L.strip() for L in txt.split("\n") if L.strip()]
            return "\n".join(lines), len(lines)
        except Exception as e:
            # Deliberately broad: one failing abstract must not abort the whole run.
            print(f" GPT retry {i+1}/{max_retries} failed: {e}")
            # Back off exponentially, and do not sleep after the final attempt.
            if i + 1 < max_retries:
                time.sleep(pause * (2 ** i))
    return "", 0

In [5]:
def split_sentences(text):
    """Split ``text`` into a list of stripped, non-empty sentences.

    The text is first cut on runs of newlines. Each piece is then cut at
    whitespace that follows '.', '!' or '?' and precedes an uppercase letter
    or a digit.
    """
    boundary = re.compile(r'(?<=[\.!?])\s+(?=[A-Z0-9])')
    pieces = (
        chunk.strip()
        for paragraph in re.split(r'\n+', text)
        for chunk in boundary.split(paragraph)
    )
    return [piece for piece in pieces if piece]


import string
def normalize(s):
    """Return ``s`` lower-cased, with ASCII punctuation removed and outer whitespace stripped."""
    # Mapping every punctuation character to None makes translate() delete it.
    drop_punct = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = s.lower()
    return lowered.translate(drop_punct).strip()

def close_match(s, candidates, thresh=0.6):
    """Return True when ``s`` matches any string in ``candidates``.

    Both sides are compared after ``normalize``. A match is either substring
    containment in either direction, or a ``SequenceMatcher`` ratio of at
    least ``thresh``.

    Strings that normalise to "" (empty or punctuation-only) never match.
    Without this guard the containment test is trivially true, because the
    empty string is a substring of every string, so a single empty candidate
    would mark every sentence as matched.
    """
    s_norm = normalize(s)
    if not s_norm:
        return False
    for c in candidates:
        c_norm = normalize(c)
        if not c_norm:
            # Skip candidates with no comparable content.
            continue
        if s_norm in c_norm or c_norm in s_norm:
            return True
        if SequenceMatcher(None, s_norm, c_norm).ratio() >= thresh:
            return True
    return False

In [7]:
# Query the model for every abstract and mark the sentences it selected.
results = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Querying GPT"):
    pmid     = row["PMID"]
    abstract = row["Abstract"]
    # Blank CSV cells are read as NaN (a float); treat them as empty text so a
    # single missing abstract cannot crash the sentence splitter mid-run.
    text = abstract if isinstance(abstract, str) else ""
    gpt_out, gpt_count = query_gpt(text) if text.strip() else ("", 0)

    # Strip leading bullet/dash markers, then drop lines that contained nothing
    # else: an empty candidate is a substring of every sentence.
    gpt_lines = [
        cand
        for cand in (L.lstrip("-–— ").strip() for L in gpt_out.split("\n"))
        if cand
    ]

    tagged_lines = []
    tag_count = 0
    for sent in split_sentences(text):
        if close_match(sent, gpt_lines):
            tagged_lines.append(f"[GPT] {sent}")
            # Count tags as they are applied; counting "[GPT]" in the joined
            # text would also count the marker if an abstract contained it.
            tag_count += 1
        else:
            tagged_lines.append(sent)

    tagged_abstract = " ".join(tagged_lines)

    results.append({
        "PMID": pmid,
        "Abstract": abstract,
        "GPT_Result_Sentences": gpt_out,
        "GPT_Count": gpt_count,
        "Tagged_Abstract": tagged_abstract,
        "Tagged_Count": tag_count
    })
    # Fixed delay between rows (presumably to stay under API rate limits).
    time.sleep(1.2)

Querying GPT: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 693/693 [27:49<00:00,  2.41s/it]


In [8]:
# Persist the per-abstract annotations and print a rough agreement figure.
df_out = pd.DataFrame(results)
df_out.to_csv("annotated_results.csv", index=False)
print("Saved annotated_results.csv")

print(df_out[["PMID","GPT_Count","Tagged_Count"]].head())

# Ratio of tagged sentences to sentences returned by the model, per abstract.
# There are no gold labels here, so this is an agreement rate, not true
# precision; it can exceed 1 when one returned line matches several sentences.
# Rows where the model returned nothing are divided by 1 instead of 0.
avg_prec = (df_out["Tagged_Count"] / df_out["GPT_Count"].replace(0,1)).mean()
# Same ratio restricted to abstracts where the model returned at least one
# sentence. The previous `.replace(1,1)` was a no-op, so zero counts produced
# inf/NaN. The result is NaN if no abstract has a returned sentence.
has_gpt = df_out["GPT_Count"] > 0
avg_recall = (
    df_out.loc[has_gpt, "Tagged_Count"] / df_out.loc[has_gpt, "GPT_Count"]
).mean()
print(f"Approx match rate: Precision-like = {avg_prec:.3f}")

Saved annotated_results.csv
       PMID  GPT_Count  Tagged_Count
0   8057077          6             6
1  27168519          6             5
2  20203436          3             3
3  27353385          6             6
4  34657444          1             1
Approx match rate: Precision-like = 1.001


In [10]:
# Export only the PMID and the tagged text, under the column name
# "Targeted_Abstract".
df_targeted = df_out[["PMID", "Tagged_Abstract"]].rename(
    columns={"Tagged_Abstract": "Targeted_Abstract"}
)

# Use one variable for the path so the log message cannot drift from the file
# actually written (the message previously omitted the "s" in "abstracts").
targeted_path = "targeted_abstracts_gpt.csv"
df_targeted.to_csv(targeted_path, index=False)
print(f"Wrote {len(df_targeted)} rows to {targeted_path}")

Wrote 693 rows to targeted_abstract_gpt.csv
