In [4]:
# config.py
# Configuration and API keys (replace placeholders with actual keys/emails)
SEMANTIC_SCHOLAR_API_KEY = "19tQFoyv7w5xBQNMsUA7C5lwNqEni5g3GKkP8Pkj"  # e.g., 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' | https://www.semanticscholar.org/product/api/tutorial?utm_campaign=API%20transaction&utm_medium=email&_hsenc=p2ANqtz--KbD5dVfVRom22kVjKkL-55Ikb73h1Nze5JYW6_8OGfj15Pf_Z7OjRXzHnO2BntuA89mE6jdPyHOEzQnaYDLInKFPGxw&_hsmi=329822401&utm_content=329822401&utm_source=hs_automation
OPENAI_API_KEY = "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"  # e.g., 'sk-...'
UNPAYWALL_EMAIL = "yunruilu@caltech.edu"  # Email for Unpaywall API
NCBI_EMAIL = "yunruilu@caltech.edu"  # Email for NCBI Entrez (required by NCBI)
NCBI_API_KEY = '903b8602ced7c96ae73650c3ff78350a100'  # Optional: NCBI API Key for higher rate limits, or None | https://support.nlm.nih.gov/kbArticle/?pn=KA-05317

# Model and other options
OPENAI_MODEL = "gpt-5"  # Use 'gpt-4' for best results; 'gpt-3.5-turbo' if lower cost is desired
INSTITUTIONAL_ACCESS = True  # True if running on a network with institutional access to paywalled PDFs

# Search query and settings
SEARCH_QUERY = '("spatial transcriptomics" OR Visium OR MERFISH OR seqFISH OR CosMX OR Xenium)'
SEARCH_QUERY = 'spatial transcriptomics'
# FIELDS_OF_STUDY = "Biology"  # Restrict search to biology-related papers
FIELDS_OF_STUDY = None
# SEARCH_LIMIT = 100  # max results per API call (Semantic Scholar allows up to 100)
# SINCE_DAYS = 7  # default to search for papers in the last 7 days (for weekly run)


In [1]:
import pandas as pd

# Create an empty DataFrame with the specified columns
columns = [
    'title',
    'year', 
    'venue',
    'paper_id',
    'doi',
    'publication_date',
    'oa_pdf_url',
    'abstract',
    'tldr',
    'reference'
]

df = pd.DataFrame(columns=columns)

# Save to the specified path
output_path = '/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv'
df.to_csv(output_path, index=False)

print(f"Created empty DataFrame with columns: {list(df.columns)}")
print(f"Saved to: {output_path}")



Created empty DataFrame with columns: ['title', 'year', 'venue', 'paper_id', 'doi', 'publication_date', 'oa_pdf_url', 'abstract', 'tldr', 'reference']
Saved to: /resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv


In [2]:
# search_papers_bulk.py
import requests, datetime, time, config
from tqdm import tqdm

def _get_with_backoff(url, params, headers, max_retries=5, timeout=30):
    delay = 1.0
    for _ in range(max_retries):
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code in (429,) or 500 <= r.status_code < 600:
            time.sleep(delay); delay = min(delay * 2, 30); continue
        return r
    return r

def search_new_papers_bulk(since_days=365):
    since_date = datetime.date.today() - datetime.timedelta(days=since_days)
    url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

    headers = {}
    if config.SEMANTIC_SCHOLAR_API_KEY:
        headers["x-api-key"] = config.SEMANTIC_SCHOLAR_API_KEY

    params = {
        # "query": config.SEARCH_QUERY,
        "query": 'spatial transcriptomics',                       # simpler query string
        "fields": "title,year,venue,paperId,externalIds,openAccessPdf,publicationDate",
        "publicationDateOrYear": f"{since_date.isoformat()}:{datetime.date.today().isoformat()}",
        "sort": "publicationDate:desc",
        "limit": 1000,
        "fieldsOfStudy": config.FIELDS_OF_STUDY,
    }

    results, token = [], None
    while True:
        p = params.copy()
        if token: p["token"] = token
        resp = _get_with_backoff(url, p, headers)
        if resp.status_code != 200:
            print("API", resp.status_code, resp.text[:500]); break

        data = resp.json()
        for paper in data.get("data", []):
            pub = paper.get("publicationDate")
            doi = (paper.get("externalIds") or {}).get("DOI")
            oa  = (paper.get("openAccessPdf") or {}).get("url")
            abstract = paper.get("abstract")
            tldr = paper.get("tldr")
            if doi:
                results.append({
                    "title": paper.get("title",""),
                    "year": paper.get("year"),
                    "venue": paper.get("venue",""),
                    "paper_id": paper.get("paperId"),
                    "doi": doi,
                    "publication_date": pub,
                    "oa_pdf_url": oa,
                    "abstract": abstract,
                    "tldr": tldr,
                })
        token = data.get("token")
        if not token: break
    return results


In [3]:
result = search_new_papers_bulk(since_days = 10)
print('len(result)', len(result))
print(result[0])

len(result) 37
{'title': 'Spatial transcriptomics of intraductal carcinoma of the prostate.', 'year': 2025, 'venue': 'Histopathology', 'paper_id': '9673ebf76c2e0eb9d6b742f4f649f0982e5e6c82', 'doi': '10.1111/his.15551', 'publication_date': '2025-09-18', 'oa_pdf_url': '', 'abstract': None, 'tldr': None}


In [4]:
def get_references(paper_id, fields="citedPaper.paperId,citedPaper.externalIds", max_per_page=1000):
    """
    Robustly fetch references for a paper, returning a list of citedPaper dicts.
    Handles 'data': null, pagination via 'next', and non-200 responses.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references"
    headers = {"x-api-key": config.SEMANTIC_SCHOLAR_API_KEY} if config.SEMANTIC_SCHOLAR_API_KEY else {}
    params = {"fields": fields, "limit": max_per_page, "offset": 0}
    out = []
    while True:
        r = _get_with_backoff(url, params, headers)
        if r.status_code != 200:
            # Surface API response text to help debugging auth/rate-limit/etc.
            raise RuntimeError(f"S2 references error {r.status_code}: {r.text[:300]}")
        try:
            data = r.json()
        except ValueError:
            raise RuntimeError("S2 references returned non-JSON response")

        items = data.get("data") or []  # <- key fix: 'null' -> []
        # Each item normally has {'citedPaper': {...}}; fall back defensively.
        for row in items:
            cp = (row or {}).get("citedPaper") or row or {}
            out.append(cp)

        nxt = data.get("next")
        if nxt is None:
            break
        params["offset"] = nxt
    return out

def get_cited_papers(paper_id: str) -> list[dict]:
    """
    Return a list of {"paper_id": <str>, "doi": <str|None>} for all papers
    that the given paper_id cites.
    """
    refs = get_references(
        paper_id,
        fields="citedPaper.paperId,citedPaper.externalIds",
        max_per_page=1000
    )

    out, seen = [], set()
    for cp in refs:  # cp is the cited paper dict
        pid = cp.get("paperId")
        doi = (cp.get("externalIds") or {}).get("DOI")
        key = (pid, (doi or "").lower())
        if pid and key not in seen:
            out.append({"paper_id": pid, "doi": (doi.lower() if doi else None)})
            seen.add(key)
    return out


In [5]:
for i, one_paper in tqdm(enumerate(result)):
    items = get_cited_papers(one_paper['paper_id'])
    result[i]['reference'] = items

0it [00:00, ?it/s]

37it [00:36,  1.02it/s]


In [13]:
print(result[3])

{'title': 'Systematic benchmarking of computational methods to identify spatially variable genes', 'year': 2025, 'venue': 'Genome Biology', 'paper_id': '22f81f296713100ec8164688d7bb7d98a5576510', 'doi': '10.1186/s13059-025-03731-2', 'publication_date': '2025-09-18', 'oa_pdf_url': '', 'abstract': None, 'tldr': None, 'reference': [{'paper_id': 'fcbea9ca7bcbe8283456e0913f15b35f4bfed8b9', 'doi': '10.21203/rs.3.rs-4181617/v1'}, {'paper_id': 'f0278e4159224afafa27697267b2f62245dc4cae', 'doi': '10.1038/s41588-024-01664-3'}, {'paper_id': '4845e9f3292db0e8fd7ea3d60a5914ef2280e50a', 'doi': '10.1186/s13059-023-03145-y'}, {'paper_id': 'ab0f9ebab25d53a9f4633de0045235b8f08ec548', 'doi': '10.1186/s13059-023-03045-1'}, {'paper_id': 'dfb56d7fc358f7d1900e1e90dd2b1da56c509c81', 'doi': '10.1101/2022.05.16.492124'}, {'paper_id': '7740e268273b187df4927d58bd2396516d89388e', 'doi': '10.1038/s41587-023-01772-1'}, {'paper_id': '6d0c930590c0c0327fe698a6e5667ee7800ed6f1', 'doi': '10.1101/2023.04.01.535228'}, {'pap

In [1]:
# fetch_pdf_pipeline.py
import os, re, io, time, urllib.parse, requests, fitz
from pathlib import Path
from typing import Optional, Union, Tuple
import sys
sys.path.append('/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection')
import config

# ---------- Small helpers ----------

def _safe_filename_from_doi(doi: str) -> str:
    # Make a safe filename from the DOI
    # e.g., 10.1038/s41586-019-1049-y -> 10.1038_s41586-019-1049-y.pdf
    return re.sub(r'[^A-Za-z0-9._-]+', '_', doi) + ".pdf"

def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> Optional[str]:
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception:
        return None
    try:
        parts = []
        for page in doc:
            parts.append(page.get_text())  # plain text extraction
        return "".join(parts)
    finally:
        doc.close()

def _download_ok(resp: requests.Response) -> bool:
    ctype = (resp.headers.get("Content-Type") or "").lower()
    return resp.status_code == 200 and (resp.content and ("pdf" in ctype or resp.content[:4] == b"%PDF"))

# ---------- Main function ----------

def fetch_pdf_and_text_by_doi(
    doi: str,
    save_dir: str = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF",
    oa_pdf_url: Optional[str] = None,
    openathens_prefix: str = "https://go.openathens.net/redirector/caltech.edu?url=",
    session: Optional[requests.Session] = None,
    timeout: int = 30,
) -> Tuple[Optional[str], Optional[str]]:
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    pdf_bytes = None

    # 1) Direct OA URL
    if oa_pdf_url:
        print(f"Trying method 1: Direct OA URL for DOI {doi}")
        try:
            r = requests.get(oa_pdf_url, timeout=timeout)
            if _download_ok(r):
                pdf_bytes = r.content
                print(f"Success at method 1: Direct OA URL for DOI {doi}")
            else:
                print(f"Failed method 1: Direct OA URL for DOI {doi}")
        except Exception:
            print(f"Failed method 1: Direct OA URL for DOI {doi}")

    # 2) Unpaywall
    if pdf_bytes is None and config.UNPAYWALL_EMAIL and "your_email" not in config.UNPAYWALL_EMAIL:
        print(f"Trying method 2: Unpaywall for DOI {doi}")
        try:
            upw_url = f"https://api.unpaywall.org/v2/{doi}?email={config.UNPAYWALL_EMAIL}"
            ur = requests.get(upw_url, timeout=timeout)
            if ur.status_code == 200:
                j = ur.json()
                pdf_url = (j.get("best_oa_location") or {}).get("url_for_pdf") or (j.get("best_oa_location") or {}).get("url")
                if not pdf_url:
                    for loc in j.get("oa_locations") or []:
                        pdf_url = loc.get("url_for_pdf") or loc.get("url")
                        if pdf_url: break
                if pdf_url:
                    pr = requests.get(pdf_url, timeout=timeout)
                    if _download_ok(pr):
                        pdf_bytes = pr.content
                        print(f"Success at method 2: Unpaywall for DOI {doi}")
                    else:
                        print(f"Failed method 2: Unpaywall for DOI {doi}")
                else:
                    print(f"Failed method 2: Unpaywall for DOI {doi} (no PDF URL found)")
            else:
                print(f"Failed method 2: Unpaywall for DOI {doi} (API error)")
        except Exception:
            print(f"Failed method 2: Unpaywall for DOI {doi}")

    # 3) DOI content negotiation
    if pdf_bytes is None:
        print(f"Trying method 3: DOI content negotiation for DOI {doi}")
        try:
            r = requests.get(f"https://doi.org/{doi}", headers={"Accept": "application/pdf"}, timeout=timeout, allow_redirects=True)
            if _download_ok(r):
                pdf_bytes = r.content
                print(f"Success at method 3: DOI content negotiation for DOI {doi}")
            else:
                print(f"Failed method 3: DOI content negotiation for DOI {doi}")
        except Exception:
            print(f"Failed method 3: DOI content negotiation for DOI {doi}")

    # 4) OpenAthens redirector
    if pdf_bytes is None and openathens_prefix:
        print(f"Trying method 4: OpenAthens redirector for DOI {doi}")
        try:
            sess = session or requests.Session()
            proxied_url = f"{openathens_prefix}{urllib.parse.quote('https://doi.org/' + doi, safe='')}"
            resp = sess.get(proxied_url, allow_redirects=True, timeout=timeout)
            ctype = resp.headers.get("Content-Type", "")

            if ctype.startswith("application/pdf"):
                # Got the PDF directly
                pdf_bytes = resp.content
                print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
            elif "html" in ctype:
                html = resp.text
                # Check if this is a login page or the article page by looking for clues
                if "openathens.net" in html.lower() or "login" in resp.url:
                    print(f"Failed method 4: OpenAthens redirector for DOI {doi} (authentication required)")
                else:
                    # Assume this is the article page HTML, try to find a PDF link
                    match = re.search(r'href="([^"]+\.pdf[^"]*)"', html)
                    if match:
                        pdf_link = match.group(1)
                        # Complete relative link if needed
                        if pdf_link.startswith("/"):
                            from urllib.parse import urljoin
                            pdf_link = urljoin(resp.url, pdf_link)
                        pdf_resp = sess.get(pdf_link, timeout=timeout)
                        if pdf_resp.headers.get("Content-Type","").startswith("application/pdf"):
                            pdf_bytes = pdf_resp.content
                            print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
                        else:
                            print(f"Failed method 4: OpenAthens redirector for DOI {doi} (PDF link didn't return PDF)")
                    else:
                        print(f"Failed method 4: OpenAthens redirector for DOI {doi} (no PDF link found on page)")
            else:
                print(f"Failed method 4: OpenAthens redirector for DOI {doi} (unexpected content type)")
        except Exception:
            print(f"Failed method 4: OpenAthens redirector for DOI {doi}")

    if pdf_bytes is None:
        print(f"All methods failed for DOI {doi}")
        return (None, None)

    # Save + extract
    pdf_name = _safe_filename_from_doi(doi)
    dest = os.path.join(save_dir, pdf_name)
    with open(dest, "wb") as f:
        f.write(pdf_bytes)
    text = _extract_text_from_pdf_bytes(pdf_bytes) or ""
    print(f"Successfully saved PDF and extracted text for DOI {doi}")
    return (dest, text)


In [2]:
print(result[2])

NameError: name 'result' is not defined

In [3]:
doi = '10.1038/s41592-025-02773-5'
path, full_text = fetch_pdf_and_text_by_doi(doi)
print(path)
print(len(full_text))
print(full_text[:1000])

Trying method 2: Unpaywall for DOI 10.1038/s41592-025-02773-5
Failed method 2: Unpaywall for DOI 10.1038/s41592-025-02773-5 (no PDF URL found)
Trying method 3: DOI content negotiation for DOI 10.1038/s41592-025-02773-5
Failed method 3: DOI content negotiation for DOI 10.1038/s41592-025-02773-5
Trying method 4: OpenAthens redirector for DOI 10.1038/s41592-025-02773-5
Success at method 4: OpenAthens redirector for DOI 10.1038/s41592-025-02773-5
Successfully saved PDF and extracted text for DOI 10.1038/s41592-025-02773-5
/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF/10.1038_s41592-025-02773-5.pdf
86249
Nature Methods | Volume 22 | September 2025 | 1846–1856
1846
nature methods
Article
https://doi.org/10.1038/s41592-025-02773-5
Cancer subclone detection based on DNA 
copy number in single-cell and spatial omic 
sequencing data
 
Chi-Yun Wu 
  1,2,7, Jiazhen Rong 
  1,2,7, Anuja Sathe 
  3, Paul R. Hess2,4, 
Billy T. Lau 
  3,5, Susan M. Grimes 
  3, Siji

In [5]:
import re
import json
import fitz  # PyMuPDF for PDF parsing
from openai import OpenAI
import sys
import os
sys.path.append('/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection')
import config

# Initialize OpenAI client with API key
client = OpenAI(api_key=config.OPENAI_API_KEY)

def extract_datasets_from_text(full_text: str = None, pdf_path: str = None, paper_title: str = None):
    """
    Extract detailed dataset information from a research paper given its text or PDF file path.
    
    Either `full_text` or `pdf_path` must be provided. If both are provided, `pdf_path` is prioritized.
    
    Returns:
        A list of dictionaries, each containing details about a dataset used in the paper:
        [
            {
                "data link": str,        # Direct URL or DOI link to the dataset if available
                "repository": str,       # Repository name (e.g., GEO, SRA, Zenodo) or 'Not available'
                "accession": str,        # Accession ID or DOI of the dataset (if applicable)
                "platform": str,         # Technology platform (e.g., Visium, Xenium, MERFISH, scRNA-seq, CODEX, etc.)
                "species": str,          # Organism species (if mentioned)
                "tissue": str,           # Tissue or sample type (if mentioned)
                "raw_data_available": bool, # True if raw data files are available, False otherwise
                "available": bool,       # True if the dataset is publicly available, False if restricted/not available
                "description": str       # Description of the dataset, including platform resolution and origin (generated by this study or from another source)
            },
            ...
        ]
    """
    # Validate input
    if pdf_path is None and full_text is None:
        print("Error: No input text or PDF path provided.")
        return []
    
    # Extract text from PDF if a path is provided
    text_content = ""
    if pdf_path is not None:
        try:
            # Open the PDF and extract all text
            doc = fitz.open(pdf_path)
            for page in doc:
                text_content += page.get_text()
            doc.close()
        except Exception as e:
            print(f"Failed to read PDF file {pdf_path}: {e}")
            return []
    else:
        text_content = full_text
    
    if not text_content:
        # If text extraction failed or resulted in empty content
        print("Error: No text content could be extracted from the input.")
        return []
    
    # Normalize whitespace and remove hyphenation line breaks for better parsing
    text_clean = text_content.replace("-\n", "").replace("\n", " ")
    # Exclude references section to avoid confusion with data DOIs or accessions in references
    text_upper = text_clean.upper()
    if "REFERENCES" in text_upper:
        text_body = text_clean[: text_upper.index("REFERENCES")]
    else:
        text_body = text_clean
    
    # Prepare the system and user messages for the GPT model
    system_msg = (
        "You are an expert assistant extracting dataset information from scientific papers. "
        "Identify all datasets mentioned in the paper and extract relevant details. "
        "Include the repository (or source) and accession/ID or DOI for each dataset, the data platform/technology used "
        "(e.g., 10x Genomics Visium, 10x Xenium, NanoString CosMX, MERFISH, seqFISH, CODEX, or single-cell RNA-seq if applicable), "
        "the species and tissue, whether raw data is available, whether the dataset is publicly available, and a brief description. "
        "whether the authors generated the data or reused the data from another source, and a brief description. "
        "In the description, mention the platform and its resolution (for example, if it's spatial transcriptomics with spot-based or single-cell resolution, or if it's non-spatial single-cell RNA-seq), "
        "and state whether the dataset was generated in this study or obtained from another source (citing the source or reference if mentioned)."
    )
    user_msg = (
        "Extract all datasets (particularly spatially-resolved omics datasets) mentioned in the following text. "
        "If the paper includes a single-cell RNA-seq dataset (which is non-spatial) for analysis, include it as well and denote it appropriately. "
        "Return ONLY a valid JSON array of objects, where each object has the keys: "
        "data link, repository, accession, platform, species, tissue, raw_data_available, available, original_data, description. "
        "If a dataset is not publicly available (e.g., available upon request or not provided), set repository to \"Not available\" and available to false. "
        "Provide no extra commentary or explanation, only the JSON.\n\n"
        f"Text:\n\"\"\"\n{text_body}\n\"\"\""
    )
    
    # Call the OpenAI API to get the dataset details in JSON format
    try:
        response = client.chat.completions.create(
            model="gpt-5",
            # model="gpt-4.1",
            # model=config.OPENAI_MODEL,  # e.g., 'gpt-4' or 'gpt-3.5-turbo'
            messages=[{"role": "system", "content": system_msg},
                      {"role": "user",  "content": user_msg}],
            # temperature=0
        )
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return []
    
    # The model's answer (should be JSON or contain JSON)
    content = response.choices[0].message.content.strip()
    
    # Helper to parse the JSON from the model's response
    def _extract_json(text: str):
        # Attempt direct JSON parse
        try:
            return json.loads(text), None
        except Exception:
            pass
        # Check for JSON in a markdown code block
        match = re.search(r"```(?:json)?\s*([\s\S]+?)```", text)
        if match:
            try:
                return json.loads(match.group(1)), None
            except Exception as e:
                last_err = e
        else:
            last_err = None
        # Fallback: find first JSON object/array in the text
        match = re.search(r"(\{.*?\}|\[.*?\])", text, flags=re.DOTALL)
        if match:
            try:
                return json.loads(match.group(1)), None
            except Exception as e:
                last_err = e
        return None, last_err
    
    # Parse the JSON content from the model's output
    datasets, err = _extract_json(content)
    if err or datasets is None:
        print(f"Failed to parse JSON from model output: {err or 'No JSON found'}")
        return []
    
    # Ensure the result is a list of dicts
    if isinstance(datasets, dict):
        datasets = [datasets]
    
    return datasets


In [6]:
datasets_info = extract_datasets_from_text(pdf_path="/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF/10.1038_s41551-022-00951-w.pdf")

for ds in datasets_info:
    print(json.dumps(ds, indent=2))

{
  "data link": "Not available",
  "repository": "Not available",
  "accession": "Not available",
  "platform": "Akoya Biosciences PhenoCycler (CODEX) multiplexed immunofluorescence (~40-plex), subcellular resolution",
  "species": "Human",
  "tissue": "Head and neck cancer tumour tissue (FFPE tumour microarray cores)",
  "raw_data_available": false,
  "available": false,
  "original_data": true,
  "description": "UPMC-HNC: 40-plex spatial proteomics (CODEX) imaging of FFPE head-and-neck cancer resections as TMA cores; subcellular resolution. 308 samples from 81 patients with clinical annotations. Dataset generated in this study; data available only upon reasonable request."
}
{
  "data link": "Not available",
  "repository": "Not available",
  "accession": "Not available",
  "platform": "Akoya Biosciences PhenoCycler (CODEX) multiplexed immunofluorescence (~40-plex), subcellular resolution",
  "species": "Human",
  "tissue": "Colorectal cancer tumour tissue (FFPE tumour microarray co

In [7]:
datasets_info

[{'data link': 'Not available',
  'repository': 'Not available',
  'accession': 'Not available',
  'platform': 'Akoya Biosciences PhenoCycler (CODEX) multiplexed immunofluorescence (~40-plex), subcellular resolution',
  'species': 'Human',
  'tissue': 'Head and neck cancer tumour tissue (FFPE tumour microarray cores)',
  'raw_data_available': False,
  'available': False,
  'original_data': True,
  'description': 'UPMC-HNC: 40-plex spatial proteomics (CODEX) imaging of FFPE head-and-neck cancer resections as TMA cores; subcellular resolution. 308 samples from 81 patients with clinical annotations. Dataset generated in this study; data available only upon reasonable request.'},
 {'data link': 'Not available',
  'repository': 'Not available',
  'accession': 'Not available',
  'platform': 'Akoya Biosciences PhenoCycler (CODEX) multiplexed immunofluorescence (~40-plex), subcellular resolution',
  'species': 'Human',
  'tissue': 'Colorectal cancer tumour tissue (FFPE tumour microarray cores)

In [41]:
# 1) If you have raw text already:
data_from_text = extract_datasets_from_text(full_text=full_text)
print(json.dumps(data_from_text, indent=2))


BadRequestError: Error code: 400 - {'error': {'message': "Invalid schema for response_format 'datasets': In context=('properties', 'datasets', 'items'), 'required' is required to be supplied and to be an array including every key in properties. Missing 'data_link'.", 'type': 'invalid_request_error', 'param': 'response_format', 'code': None}}

In [42]:

# 2) If you have a local PDF:
data_from_pdf = extract_datasets_from_pdf(pdf_path="/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF/10.1186_s13059-025-03731-2.pdf")
print(json.dumps(data_from_pdf, indent=2))

TypeError: create() got an unexpected keyword argument 'response_format'

In [66]:
extract_datasets_from_text(full_text)

[{'data link': 'https://nda.nih.gov/abcd',
  'repository': 'NIMH Data Archive (NDA)',
  'accession': 'ABCD',
  'platform': 'MRI (T1-weighted, diffusion MRI, resting-state fMRI)',
  'species': 'Homo sapiens',
  'tissue': 'Brain (whole cortex, children aged 9-10 years)',
  'raw_data_available': True,
  'available': True,
  'description': 'Spatially resolved multimodal neuroimaging dataset from the Adolescent Brain Cognitive Development (ABCD) study, including T1-weighted, diffusion, and resting-state functional MRI for 7,025 children. Used to construct individual-level structural and functional connectomes and gradients for spatial transcriptomics and structure-function coupling analyses.'},
 {'data link': 'https://db.humanconnectome.org',
  'repository': 'Human Connectome Project (HCP)',
  'accession': 'HCP Young Adult',
  'platform': 'MRI (T1-weighted, T2-weighted, diffusion MRI, resting-state fMRI)',
  'species': 'Homo sapiens',
  'tissue': 'Brain (whole cortex, adults aged 22-35 year

In [6]:
import pandas as pd

csv_path = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv"
df = pd.read_csv(csv_path)
new_papers_df = pd.DataFrame(result)
df_updated = pd.concat([df, new_papers_df], ignore_index=True)
df_updated = df_updated.drop_duplicates(subset=['paper_id'], keep='first')

df_updated.to_csv(csv_path, index=False)

print(f"Added {len(new_papers_df)} new papers to the CSV file")
print(f"Total papers in CSV after deduplication: {len(df_updated)}")
print(display(df_updated.head()))

Added 37 new papers to the CSV file
Total papers in CSV after deduplication: 37


Unnamed: 0,title,year,venue,paper_id,doi,publication_date,oa_pdf_url,abstract,tldr,reference
0,Spatial transcriptomics of intraductal carcino...,2025,Histopathology,9673ebf76c2e0eb9d6b742f4f649f0982e5e6c82,10.1111/his.15551,2025-09-18,,,,[]
1,Anatomic Predilection of IDH-Mutant Gliomas: A...,2025,medRxiv,3eb894538844ac916ecd4f91bf9b3d65f8e183cf,10.1101/2025.09.16.25333605,2025-09-18,,,,[]
2,Brain Functional-Structural Gradient Coupling ...,2025,medRxiv,377d977c7b456afd9b806d642bc46c324ab53681,10.1101/2025.09.16.25335918,2025-09-18,,,,[]
3,Systematic benchmarking of computational metho...,2025,Genome Biology,22f81f296713100ec8164688d7bb7d98a5576510,10.1186/s13059-025-03731-2,2025-09-18,,,,[{'paper_id': 'fcbea9ca7bcbe8283456e0913f15b35...
4,Sclerotic GVHD and Scleroderma Share Dysregula...,2025,Blood,f0104be4ab11d99e37c8195f0203b68647678fcd,10.1182/blood.2025029836,2025-09-17,,,,[]


None


In [2]:
# import argparse

# # Parse command line arguments
# parser = argparse.ArgumentParser(description='Search and analyze spatial genomics papers')
# parser.add_argument('--since_days', type=int, default=10, help='Number of days to search back')
# parser.add_argument('--print_details', type=str, default='True', help='Whether to print details (True/False)')
# parser.add_argument('--model', type=str, default='gpt-5', help='OpenAI model to use')
# parser.add_argument('--search_query', type=str, default='spatial transcriptomics', help='Search query', nargs='+')

# args = parser.parse_args()

# # Convert string to boolean for print_details
# print_details_input = args.print_details.lower() in ['true', '1', 'yes', 'on']
# since_days_input = args.since_days
# model_input = args.model
# search_query_input = ' '.join(args.search_query)

print_details_input = True
since_days_input = 10
model_input = 'gpt-5'
search_query_input = 'spatial transcriptomics'

SEMANTIC_SCHOLAR_API_KEY = "19tQFoyv7w5xBQNMsUA7C5lwNqEni5g3GKkP8Pkj"  # e.g., 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' | https://www.semanticscholar.org/product/api/tutorial?utm_campaign=API%20transaction&utm_medium=email&_hsenc=p2ANqtz--KbD5dVfVRom22kVjKkL-55Ikb73h1Nze5JYW6_8OGfj15Pf_Z7OjRXzHnO2BntuA89mE6jdPyHOEzQnaYDLInKFPGxw&_hsmi=329822401&utm_content=329822401&utm_source=hs_automation
OPENAI_API_KEY = "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"  # e.g., 'sk-...'
UNPAYWALL_EMAIL = "yunruilu@caltech.edu"  # Email for Unpaywall API
NCBI_EMAIL = "yunruilu@caltech.edu"  # Email for NCBI Entrez (required by NCBI)
NCBI_API_KEY = '903b8602ced7c96ae73650c3ff78350a100'  # Optional: NCBI API Key for higher rate limits, or None | https://support.nlm.nih.gov/kbArticle/?pn=KA-05317

# Model and other options
# OPENAI_MODEL = "gpt-5"  # Use 'gpt-4' for best results; 'gpt-3.5-turbo' if lower cost is desired
INSTITUTIONAL_ACCESS = True  # True if running on a network with institutional access to paywalled PDFs

# Search query and settings
# SEARCH_QUERY = '("spatial transcriptomics" OR Visium OR MERFISH OR seqFISH OR CosMX OR Xenium)'
SEARCH_QUERY = search_query_input
# FIELDS_OF_STUDY = "Biology"  # Restrict search to biology-related papers
FIELDS_OF_STUDY = None

# search_papers_bulk.py
import requests, datetime, time
from tqdm import tqdm
# fetch_pdf_pipeline.py
import os, re, io, time, urllib.parse, requests, fitz
from pathlib import Path
from typing import Optional, Union, Tuple
import re
import json
import fitz  # PyMuPDF for PDF parsing
from openai import OpenAI
import sys
import os
import pandas as pd

def _get_with_backoff(url, params, headers, max_retries=5, timeout=30):
    delay = 1.0
    for _ in range(max_retries):
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code in (429,) or 500 <= r.status_code < 600:
            time.sleep(delay); delay = min(delay * 2, 30); continue
        return r
    return r

def search_new_papers_bulk(since_days=365):
    since_date = datetime.date.today() - datetime.timedelta(days=since_days)
    url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

    headers = {}
    if SEMANTIC_SCHOLAR_API_KEY:
        headers["x-api-key"] = SEMANTIC_SCHOLAR_API_KEY

    params = {
        "query": SEARCH_QUERY,
        # "query": 'spatial transcriptomics',                       # simpler query string
        "fields": "title,year,venue,paperId,externalIds,openAccessPdf,publicationDate",
        "publicationDateOrYear": f"{since_date.isoformat()}:{datetime.date.today().isoformat()}",
        "sort": "publicationDate:desc",
        "limit": 1000,
        "fieldsOfStudy": FIELDS_OF_STUDY,
    }

    results, token = [], None
    while True:
        p = params.copy()
        if token: p["token"] = token
        resp = _get_with_backoff(url, p, headers)
        if resp.status_code != 200:
            print("API", resp.status_code, resp.text[:500]); break

        data = resp.json()
        for paper in data.get("data", []):
            pub = paper.get("publicationDate")
            doi = (paper.get("externalIds") or {}).get("DOI")
            oa  = (paper.get("openAccessPdf") or {}).get("url")
            abstract = paper.get("abstract")
            tldr = paper.get("tldr")
            if type(doi) == str and len(doi) > 0:
                results.append({
                    "title": paper.get("title",""),
                    "year": paper.get("year"),
                    "venue": paper.get("venue",""),
                    "paper_id": paper.get("paperId"),
                    "doi": doi,
                    "publication_date": pub,
                    "oa_pdf_url": oa,
                    "abstract": abstract,
                    "tldr": tldr,
                })
        token = data.get("token")
        if not token: break
    return results

def get_references(paper_id, fields="citedPaper.paperId,citedPaper.externalIds", max_per_page=1000):
    """
    Robustly fetch references for a paper, returning a list of citedPaper dicts.
    Handles 'data': null, pagination via 'next', and non-200 responses.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references"
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    params = {"fields": fields, "limit": max_per_page, "offset": 0}
    out = []
    while True:
        r = _get_with_backoff(url, params, headers)
        if r.status_code != 200:
            # Surface API response text to help debugging auth/rate-limit/etc.
            raise RuntimeError(f"S2 references error {r.status_code}: {r.text[:300]}")
        try:
            data = r.json()
        except ValueError:
            raise RuntimeError("S2 references returned non-JSON response")

        items = data.get("data") or []  # <- key fix: 'null' -> []
        # Each item normally has {'citedPaper': {...}}; fall back defensively.
        for row in items:
            cp = (row or {}).get("citedPaper") or row or {}
            out.append(cp)

        nxt = data.get("next")
        if nxt is None:
            break
        params["offset"] = nxt
    return out

def get_cited_papers(paper_id: str) -> list[dict]:
    """
    Return a list of {"paper_id": <str>, "doi": <str|None>} for all papers
    that the given paper_id cites.
    """
    refs = get_references(
        paper_id,
        fields="citedPaper.paperId,citedPaper.externalIds",
        max_per_page=1000
    )

    out, seen = [], set()
    for cp in refs:  # cp is the cited paper dict
        pid = cp.get("paperId")
        doi = (cp.get("externalIds") or {}).get("DOI")
        key = (pid, (doi or "").lower())
        if pid and key not in seen:
            out.append({"paper_id": pid, "doi": (doi.lower() if doi else None)})
            seen.add(key)
    return out

# ---------- Small helpers ----------

def _safe_filename_from_doi(doi: str) -> str:
    # Make a safe filename from the DOI
    # e.g., 10.1038/s41586-019-1049-y -> 10.1038_s41586-019-1049-y.pdf
    return re.sub(r'[^A-Za-z0-9._-]+', '_', doi) + ".pdf"

def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> Optional[str]:
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception:
        return None
    try:
        parts = []
        for page in doc:
            parts.append(page.get_text())  # plain text extraction
        return "".join(parts)
    finally:
        doc.close()

def _download_ok(resp: requests.Response) -> bool:
    ctype = (resp.headers.get("Content-Type") or "").lower()
    return resp.status_code == 200 and (resp.content and ("pdf" in ctype or resp.content[:4] == b"%PDF"))

# ---------- Main function ----------

def fetch_pdf_and_text_by_doi(
    doi: str,
    save_dir: str = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF",
    oa_pdf_url: Optional[str] = None,
    openathens_prefix: str = "https://go.openathens.net/redirector/caltech.edu?url=",
    session: Optional[requests.Session] = None,
    timeout: int = 30,
) -> Tuple[Optional[str], Optional[str]]:
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    pdf_bytes = None

    # 1) Direct OA URL
    if oa_pdf_url:
        print(f"Trying method 1: Direct OA URL for DOI {doi}")
        try:
            r = requests.get(oa_pdf_url, timeout=timeout)
            if _download_ok(r):
                pdf_bytes = r.content
                print(f"Success at method 1: Direct OA URL for DOI {doi}")
            else:
                print(f"Failed method 1: Direct OA URL for DOI {doi}")
        except Exception:
            print(f"Failed method 1: Direct OA URL for DOI {doi}")

    # 2) Unpaywall
    if pdf_bytes is None and UNPAYWALL_EMAIL and "your_email" not in UNPAYWALL_EMAIL:
        print(f"Trying method 2: Unpaywall for DOI {doi}")
        try:
            upw_url = f"https://api.unpaywall.org/v2/{doi}?email={UNPAYWALL_EMAIL}"
            ur = requests.get(upw_url, timeout=timeout)
            if ur.status_code == 200:
                j = ur.json()
                pdf_url = (j.get("best_oa_location") or {}).get("url_for_pdf") or (j.get("best_oa_location") or {}).get("url")
                if not pdf_url:
                    for loc in j.get("oa_locations") or []:
                        pdf_url = loc.get("url_for_pdf") or loc.get("url")
                        if pdf_url: break
                if pdf_url:
                    pr = requests.get(pdf_url, timeout=timeout)
                    if _download_ok(pr):
                        pdf_bytes = pr.content
                        print(f"Success at method 2: Unpaywall for DOI {doi}")
                    else:
                        print(f"Failed method 2: Unpaywall for DOI {doi}")
                else:
                    print(f"Failed method 2: Unpaywall for DOI {doi} (no PDF URL found)")
            else:
                print(f"Failed method 2: Unpaywall for DOI {doi} (API error)")
        except Exception:
            print(f"Failed method 2: Unpaywall for DOI {doi}")

    # 3) DOI content negotiation
    if pdf_bytes is None:
        print(f"Trying method 3: DOI content negotiation for DOI {doi}")
        try:
            r = requests.get(f"https://doi.org/{doi}", headers={"Accept": "application/pdf"}, timeout=timeout, allow_redirects=True)
            if _download_ok(r):
                pdf_bytes = r.content
                print(f"Success at method 3: DOI content negotiation for DOI {doi}")
            else:
                print(f"Failed method 3: DOI content negotiation for DOI {doi}")
        except Exception:
            print(f"Failed method 3: DOI content negotiation for DOI {doi}")

    # 4) OpenAthens redirector
    if pdf_bytes is None and openathens_prefix:
        print(f"Trying method 4: OpenAthens redirector for DOI {doi}")
        try:
            sess = session or requests.Session()
            proxied_url = f"{openathens_prefix}{urllib.parse.quote('https://doi.org/' + doi, safe='')}"
            resp = sess.get(proxied_url, allow_redirects=True, timeout=timeout)
            ctype = resp.headers.get("Content-Type", "")

            if ctype.startswith("application/pdf"):
                # Got the PDF directly
                pdf_bytes = resp.content
                print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
            elif "html" in ctype:
                html = resp.text
                # Check if this is a login page or the article page by looking for clues
                if "openathens.net" in html.lower() or "login" in resp.url:
                    print(f"Failed method 4: OpenAthens redirector for DOI {doi} (authentication required)")
                else:
                    # Assume this is the article page HTML, try to find a PDF link
                    match = re.search(r'href="([^"]+\.pdf[^"]*)"', html)
                    if match:
                        pdf_link = match.group(1)
                        # Complete relative link if needed
                        if pdf_link.startswith("/"):
                            from urllib.parse import urljoin
                            pdf_link = urljoin(resp.url, pdf_link)
                        pdf_resp = sess.get(pdf_link, timeout=timeout)
                        if pdf_resp.headers.get("Content-Type","").startswith("application/pdf"):
                            pdf_bytes = pdf_resp.content
                            print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
                        else:
                            print(f"Failed method 4: OpenAthens redirector for DOI {doi} (PDF link didn't return PDF)")
                    else:
                        print(f"Failed method 4: OpenAthens redirector for DOI {doi} (no PDF link found on page)")
            else:
                print(f"Failed method 4: OpenAthens redirector for DOI {doi} (unexpected content type)")
        except Exception:
            print(f"Failed method 4: OpenAthens redirector for DOI {doi}")

    if pdf_bytes is None:
        print(f"All methods failed for DOI {doi}")
        return (None, None)

    # Save + extract
    pdf_name = _safe_filename_from_doi(doi)
    dest = os.path.join(save_dir, pdf_name)
    with open(dest, "wb") as f:
        f.write(pdf_bytes)
    text = _extract_text_from_pdf_bytes(pdf_bytes) or ""
    print(f"Successfully saved PDF and extracted text for DOI {doi}")
    return (dest, text)

# Initialize OpenAI client with API key
client = OpenAI(api_key=OPENAI_API_KEY)

def extract_datasets_from_text(full_text: str = None, pdf_path: str = None, paper_title: str = None):
    """
    Extract detailed dataset information from a research paper given its text or PDF file path.
    
    Either `full_text` or `pdf_path` must be provided. If both are provided, `pdf_path` is prioritized.
    
    Returns:
        A list of dictionaries, each containing details about a dataset used in the paper:
        [
            {
                "data link": str,        # Direct URL or DOI link to the dataset if available
                "repository": str,       # Repository name (e.g., GEO, SRA, Zenodo) or 'Not available'
                "accession": str,        # Accession ID or DOI of the dataset (if applicable)
                "platform": str,         # Technology platform (e.g., Visium, Xenium, MERFISH, scRNA-seq, CODEX, etc.)
                "species": str,          # Organism species (if mentioned)
                "tissue": str,           # Tissue or sample type (if mentioned)
                "raw_data_available": bool, # True if raw data files are available, False otherwise
                "available": bool,       # True if the dataset is publicly available, False if restricted/not available
                "description": str       # Description of the dataset, including platform resolution and origin (generated by this study or from another source)
            },
            ...
        ]
    """
    # Validate input
    if pdf_path is None and full_text is None:
        print("Error: No input text or PDF path provided.")
        return []
    
    # Extract text from PDF if a path is provided
    text_content = ""
    if pdf_path is not None:
        try:
            # Open the PDF and extract all text
            doc = fitz.open(pdf_path)
            for page in doc:
                text_content += page.get_text()
            doc.close()
        except Exception as e:
            print(f"Failed to read PDF file {pdf_path}: {e}")
            return []
    else:
        text_content = full_text
    
    if not text_content:
        # If text extraction failed or resulted in empty content
        print("Error: No text content could be extracted from the input.")
        return []
    
    # Normalize whitespace and remove hyphenation line breaks for better parsing
    text_clean = text_content.replace("-\n", "").replace("\n", " ")
    # Exclude references section to avoid confusion with data DOIs or accessions in references
    text_upper = text_clean.upper()
    if "REFERENCES" in text_upper:
        text_body = text_clean[: text_upper.index("REFERENCES")]
    else:
        text_body = text_clean
    
    # Prepare the system and user messages for the GPT model
    system_msg = (
        "You are an expert assistant extracting dataset information from scientific papers. "
        "Identify all datasets mentioned in the paper and extract relevant details. "
        "Include the repository (or source) and accession/ID or DOI for each dataset, the data platform/technology used "
        "(e.g., 10x Genomics Visium, 10x Xenium, NanoString CosMX, MERFISH, seqFISH, CODEX, or single-cell RNA-seq if applicable), "
        "the species and tissue, whether raw data is available, whether the dataset is publicly available, and a brief description. "
        "whether the authors generated the data or reused the data from another source, and a brief description. "
        "In the description, mention the platform and its resolution (for example, if it's spatial transcriptomics with spot-based or single-cell resolution, or if it's non-spatial single-cell RNA-seq), "
        "and state whether the dataset was generated in this study or obtained from another source (citing the source or reference if mentioned)."
    )
    user_msg = (
        "Extract all datasets (particularly spatially-resolved omics datasets) mentioned in the following text. "
        "If the paper includes a single-cell RNA-seq dataset (which is non-spatial) for analysis, include it as well and denote it appropriately. "
        "Return ONLY a valid JSON array of objects, where each object has the keys: "
        "data link, repository, accession, platform, species, tissue, raw_data_available, available, original_data, description. "
        "If a dataset is not publicly available (e.g., available upon request or not provided), set repository to \"Not available\" and available to false. "
        "Provide no extra commentary or explanation, only the JSON.\n\n"
        f"Text:\n\"\"\"\n{text_body}\n\"\"\""
    )
    
    # Call the OpenAI API to get the dataset details in JSON format
    try:
        response = client.chat.completions.create(
            model=model_input,
            # model="gpt-4.1",
            # model=OPENAI_MODEL,  # e.g., 'gpt-4' or 'gpt-3.5-turbo'
            messages=[{"role": "system", "content": system_msg},
                      {"role": "user",  "content": user_msg}],
            # temperature=0
        )
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return []
    
    # The model's answer (should be JSON or contain JSON)
    content = response.choices[0].message.content.strip()
    
    # Helper to parse the JSON from the model's response
    def _extract_json(text: str):
        # Attempt direct JSON parse
        try:
            return json.loads(text), None
        except Exception:
            pass
        # Check for JSON in a markdown code block
        match = re.search(r"```(?:json)?\s*([\s\S]+?)```", text)
        if match:
            try:
                return json.loads(match.group(1)), None
            except Exception as e:
                last_err = e
        else:
            last_err = None
        # Fallback: find first JSON object/array in the text
        match = re.search(r"(\{.*?\}|\[.*?\])", text, flags=re.DOTALL)
        if match:
            try:
                return json.loads(match.group(1)), None
            except Exception as e:
                last_err = e
        return None, last_err
    
    # Parse the JSON content from the model's output
    datasets, err = _extract_json(content)
    if err or datasets is None:
        print(f"Failed to parse JSON from model output: {err or 'No JSON found'}")
        return []
    
    # Ensure the result is a list of dicts
    if isinstance(datasets, dict):
        datasets = [datasets]
    
    return datasets



result = search_new_papers_bulk(since_days = since_days_input)
print(f'Found {len(result)} new papers in the last {since_days_input} days since today: {datetime.date.today()}')

for i, one_paper in tqdm(enumerate(result)):
    print('--------------------------------')
    print(f"Processing paper {i+1} of {len(result)}")

    items = get_cited_papers(one_paper['paper_id'])
    result[i]['reference'] = items

    #############################################################################

    def fetch_pdf_and_text_by_doi(
        doi: str,
        save_dir: str = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PDF",
        oa_pdf_url: Optional[str] = None,
        openathens_prefix: str = "https://go.openathens.net/redirector/caltech.edu?url=",
        session: Optional[requests.Session] = None,
        timeout: int = 30,
    ) -> Tuple[Optional[str], Optional[str]]:
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        pdf_bytes = None

        # 1) Direct OA URL
        if oa_pdf_url:
            if print_details_input:
                print(f"Trying method 1: Direct OA URL for DOI {doi}")
            try:
                r = requests.get(oa_pdf_url, timeout=timeout)
                if _download_ok(r):
                    pdf_bytes = r.content
                    if print_details_input:
                        print(f"Success at method 1: Direct OA URL for DOI {doi}")
                else:
                    if print_details_input:
                        print(f"Failed method 1: Direct OA URL for DOI {doi}")
            except Exception:
                if print_details_input:
                    print(f"Failed method 1: Direct OA URL for DOI {doi}")

        # 2) Unpaywall
        if pdf_bytes is None and UNPAYWALL_EMAIL and "your_email" not in UNPAYWALL_EMAIL:
            if print_details_input:
                print(f"Trying method 2: Unpaywall for DOI {doi}")
            try:
                upw_url = f"https://api.unpaywall.org/v2/{doi}?email={UNPAYWALL_EMAIL}"
                ur = requests.get(upw_url, timeout=timeout)
                if ur.status_code == 200:
                    j = ur.json()
                    pdf_url = (j.get("best_oa_location") or {}).get("url_for_pdf") or (j.get("best_oa_location") or {}).get("url")
                    if not pdf_url:
                        for loc in j.get("oa_locations") or []:
                            pdf_url = loc.get("url_for_pdf") or loc.get("url")
                            if pdf_url: break
                    if pdf_url:
                        pr = requests.get(pdf_url, timeout=timeout)
                        if _download_ok(pr):
                            pdf_bytes = pr.content
                            if print_details_input:
                                print(f"Success at method 2: Unpaywall for DOI {doi}")
                        else:
                            if print_details_input:
                                print(f"Failed method 2: Unpaywall for DOI {doi}")
                    else:
                        if print_details_input:
                            print(f"Failed method 2: Unpaywall for DOI {doi} (no PDF URL found)")
                else:
                    if print_details_input:
                        print(f"Failed method 2: Unpaywall for DOI {doi} (API error)")
            except Exception:
                if print_details_input:
                    print(f"Failed method 2: Unpaywall for DOI {doi}")

        # 3) DOI content negotiation
        if pdf_bytes is None:
            if print_details_input:
                print(f"Trying method 3: DOI content negotiation for DOI {doi}")
            try:
                r = requests.get(f"https://doi.org/{doi}", headers={"Accept": "application/pdf"}, timeout=timeout, allow_redirects=True)
                if _download_ok(r):
                    pdf_bytes = r.content
                    if print_details_input:
                        print(f"Success at method 3: DOI content negotiation for DOI {doi}")
                else:
                    if print_details_input:
                        print(f"Failed method 3: DOI content negotiation for DOI {doi}")
            except Exception:
                if print_details_input:
                    print(f"Failed method 3: DOI content negotiation for DOI {doi}")

        # 4) OpenAthens redirector
        if pdf_bytes is None and openathens_prefix:
            if print_details_input:
                print(f"Trying method 4: OpenAthens redirector for DOI {doi}")
            try:
                sess = session or requests.Session()
                proxied_url = f"{openathens_prefix}{urllib.parse.quote('https://doi.org/' + doi, safe='')}"
                resp = sess.get(proxied_url, allow_redirects=True, timeout=timeout)
                ctype = resp.headers.get("Content-Type", "")

                if ctype.startswith("application/pdf"):
                    # Got the PDF directly
                    pdf_bytes = resp.content
                    if print_details_input:
                        print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
                elif "html" in ctype:
                    html = resp.text
                    # Check if this is a login page or the article page by looking for clues
                    if "openathens.net" in html.lower() or "login" in resp.url:
                        if print_details_input:
                            print(f"Failed method 4: OpenAthens redirector for DOI {doi} (authentication required)")
                    else:
                        # Assume this is the article page HTML, try to find a PDF link
                        match = re.search(r'href="([^"]+\.pdf[^"]*)"', html)
                        if match:
                            pdf_link = match.group(1)
                            # Complete relative link if needed
                            if pdf_link.startswith("/"):
                                from urllib.parse import urljoin
                                pdf_link = urljoin(resp.url, pdf_link)
                            pdf_resp = sess.get(pdf_link, timeout=timeout)
                            if pdf_resp.headers.get("Content-Type","").startswith("application/pdf"):
                                pdf_bytes = pdf_resp.content
                                if print_details_input:
                                    print(f"Success at method 4: OpenAthens redirector for DOI {doi}")
                            else:
                                if print_details_input:
                                    print(f"Failed method 4: OpenAthens redirector for DOI {doi} (PDF link didn't return PDF)")
                        else:
                            if print_details_input:
                                print(f"Failed method 4: OpenAthens redirector for DOI {doi} (no PDF link found on page)")
                else:
                    if print_details_input:
                        print(f"Failed method 4: OpenAthens redirector for DOI {doi} (unexpected content type)")
            except Exception:
                if print_details_input:
                    print(f"Failed method 4: OpenAthens redirector for DOI {doi}")

        if pdf_bytes is None:
            print(f"All methods failed for DOI {doi}")
            return (None, None)

    #############################################################################
    path, full_text = fetch_pdf_and_text_by_doi(doi = one_paper['doi'])
    if full_text:
        data_info = extract_datasets_from_text(full_text = full_text)
        if data_info:
            if print_details_input:
                print(f"Successfully extracted datasets information")
                for ds in data_info:
                    print(json.dumps(ds, indent=2))
            result[i]['Datasets_info'] = data_info
        else:
            result[i]['Datasets_info'] = None
    else:
        result[i]['Datasets_info'] = None

csv_path = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv"
df = pd.read_csv(csv_path)
new_papers_df = pd.DataFrame(result)
df_updated = pd.concat([df, new_papers_df], ignore_index=True)
df_updated = df_updated.drop_duplicates(subset=['paper_id'], keep='first')

df_updated.to_csv(csv_path, index=False)

print(f"Added {len(new_papers_df)} new papers to the CSV file")
print(f"Total papers in CSV after deduplication: {len(df_updated)}")
# print(display(df_updated.head()))

Found 37 new papers in the last 10 days since today: 2025-09-20


0it [00:00, ?it/s]

--------------------------------
Processing paper 1 of 37
Trying method 2: Unpaywall for DOI 10.1111/his.15551
Failed method 2: Unpaywall for DOI 10.1111/his.15551 (API error)
Trying method 3: DOI content negotiation for DOI 10.1111/his.15551
Failed method 3: DOI content negotiation for DOI 10.1111/his.15551
Trying method 4: OpenAthens redirector for DOI 10.1111/his.15551


1it [00:00,  1.70it/s]

Failed method 4: OpenAthens redirector for DOI 10.1111/his.15551 (no PDF link found on page)
All methods failed for DOI 10.1111/his.15551
--------------------------------
Processing paper 2 of 37
Trying method 2: Unpaywall for DOI 10.1101/2025.09.16.25333605
Failed method 2: Unpaywall for DOI 10.1101/2025.09.16.25333605 (API error)
Trying method 3: DOI content negotiation for DOI 10.1101/2025.09.16.25333605
Failed method 3: DOI content negotiation for DOI 10.1101/2025.09.16.25333605
Trying method 4: OpenAthens redirector for DOI 10.1101/2025.09.16.25333605


1it [00:11, 11.30s/it]

Success at method 4: OpenAthens redirector for DOI 10.1101/2025.09.16.25333605





TypeError: cannot unpack non-iterable NoneType object

In [4]:
path, full_text = fetch_pdf_and_text_by_doi(doi = '10.1101/2025.09.16.25333605')

Trying method 2: Unpaywall for DOI 10.1101/2025.09.16.25333605
Failed method 2: Unpaywall for DOI 10.1101/2025.09.16.25333605 (API error)
Trying method 3: DOI content negotiation for DOI 10.1101/2025.09.16.25333605
Failed method 3: DOI content negotiation for DOI 10.1101/2025.09.16.25333605
Trying method 4: OpenAthens redirector for DOI 10.1101/2025.09.16.25333605
Success at method 4: OpenAthens redirector for DOI 10.1101/2025.09.16.25333605


TypeError: cannot unpack non-iterable NoneType object

In [7]:
USE_OPENATHENS = bool(int(os.getenv("USE_OPENATHENS", "1")))
USE_OPENATHENS

True

In [8]:
from http.cookiejar import LWPCookieJar

In [9]:
sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0"})
cookie_path = os.path.expanduser("~/.oa_cookies.lwp")
sess.cookies = LWPCookieJar(cookie_path)
try: sess.cookies.load(ignore_discard=True, ignore_expires=True)
except FileNotFoundError: pass

In [13]:
import requests
print("IP:", requests.get("https://api.ipify.org").text)

IP: 131.215.148.41


In [5]:
import pandas as pd
csv_path = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/temp.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,year,venue,paper_id,doi,publication_date,oa_pdf_url,abstract,reference,datasets_info,index_paper
0,Cancer-associated fibroblasts drive lung adeno...,2025,Oncogene,13c0faec4d04ddee86bb924ee79c08f0dc4ced64,10.1038/s41388-025-03569-9,2025-09-19,,,[],,0
1,Mouse-Specific Single cell cytokine activity p...,2025,PLoS Computational Biology,0d3941790d9380f36a851362e4ab7e9d644104dd,10.1371/journal.pcbi.1013475,2025-09-19,,,[],,1
2,Spatial transcriptomics of intraductal carcino...,2025,Histopathology,9673ebf76c2e0eb9d6b742f4f649f0982e5e6c82,10.1111/his.15551,2025-09-18,,,[],,2
3,Anatomic Predilection of IDH-Mutant Gliomas: A...,2025,medRxiv,3eb894538844ac916ecd4f91bf9b3d65f8e183cf,10.1101/2025.09.16.25333605,2025-09-18,,,[],,3
4,Brain Functional-Structural Gradient Coupling ...,2025,medRxiv,377d977c7b456afd9b806d642bc46c324ab53681,10.1101/2025.09.16.25335918,2025-09-18,,,[],,4


In [7]:
df.loc[df['index_paper'] == 0, 'doi'].iloc[0]

'10.1038/s41388-025-03569-9'

In [8]:
import pandas as pd

csv_path = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/temp/temp.csv"
temp_df = pd.read_csv(csv_path)
print(display(temp_df.head()))
paper_df = pd.read_csv("/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv")
print(display(paper_df.head()))

Unnamed: 0,title,year,venue,paper_id,doi,publication_date,oa_pdf_url,abstract,reference,datasets_info,index_paper,Datasets_info
0,Cancer-associated fibroblasts drive lung adeno...,2025,Oncogene,13c0faec4d04ddee86bb924ee79c08f0dc4ced64,10.1038/s41388-025-03569-9,2025-09-19,,,[],,0,"[{""data link"": ""https://www.ncbi.nlm.nih.gov/g..."
1,Mouse-Specific Single cell cytokine activity p...,2025,PLoS Computational Biology,0d3941790d9380f36a851362e4ab7e9d644104dd,10.1371/journal.pcbi.1013475,2025-09-19,,,[],,1,
2,Spatial transcriptomics of intraductal carcino...,2025,Histopathology,9673ebf76c2e0eb9d6b742f4f649f0982e5e6c82,10.1111/his.15551,2025-09-18,,,[],,2,
3,Anatomic Predilection of IDH-Mutant Gliomas: A...,2025,medRxiv,3eb894538844ac916ecd4f91bf9b3d65f8e183cf,10.1101/2025.09.16.25333605,2025-09-18,,,[],,3,"[{""data link"": ""Not available"", ""repository"": ..."
4,Brain Functional-Structural Gradient Coupling ...,2025,medRxiv,377d977c7b456afd9b806d642bc46c324ab53681,10.1101/2025.09.16.25335918,2025-09-18,,,[],,4,"[{""data link"": ""https://nda.nih.gov/abcd"", ""re..."


None


Unnamed: 0,title,year,venue,paper_id,doi,publication_date,oa_pdf_url,abstract,tldr,reference,datasets_info


None


In [None]:
# Drop the index_paper column from temp_df
temp_df = temp_df.drop(columns=['index_paper'], errors='ignore')

# Append temp_df to paper_df, ensuring columns match
paper_df = pd.concat([paper_df, temp_df], ignore_index=True, sort=False)

# # Display the combined dataframe
# print("Combined dataframe:")
# display(paper_df.tail())
# Remove duplicates based on paper_id, keeping the first row where Datasets_info is not None
# First, sort by paper_id and put non-null Datasets_info rows first
paper_df_sorted = paper_df.sort_values(
    by=['paper_id', 'Datasets_info'], 
    key=lambda x: x.isnull() if x.name == 'Datasets_info' else x,
    na_position='last'
)

# Drop duplicates keeping the first occurrence (which will have non-null Datasets_info if available)
paper_df = paper_df_sorted.drop_duplicates(subset=['paper_id'], keep='first')

# Reset index
paper_df = paper_df.reset_index(drop=True)

# print("After removing duplicates:")
# display(paper_df.tail())

paper_df.to_csv("/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/Papers.csv", index=False)
print('Papers.csv updated')


Combined dataframe:


Unnamed: 0,title,year,venue,paper_id,doi,publication_date,oa_pdf_url,abstract,tldr,reference,datasets_info,Datasets_info
30,Extracellular matrix-myCAF signatures correlat...,2025,Clinical Cancer Research,ad25d367f982d21a0fae2317f0a87a9c79d57bcd,10.1158/1078-0432.CCR-25-1098,2025-09-11,,BACKGROUND\nImmune checkpoint inhibitors (ICI)...,,[],,
31,Spatiomolecular mapping reveals anatomical org...,2025,bioRxiv,9a98aaf5505c988d063d10aaeef79ce8d83c81e9,10.1101/2025.09.10.675374,2025-09-11,,Abstract The nucleus accumbens (NAc) is a key ...,,[],,
32,S3R: Spatially Smooth and Sparse Regression Re...,2025,bioRxiv,4c8193af35112ddec17374a450d4d9f71fdc898b,10.1101/2025.09.06.674629,2025-09-11,,Spatial transcriptomics (ST) data demands mode...,,[{'paper_id': '386d4820b21aed3495f0921667564f8...,,
33,Gut microbiota shape gene regulatory networks ...,2025,Insect Science,49e6fcee048f716117a2e28478d99c3730300552,10.1111/1744-7917.70157,2025-09-11,,Honeybees are key pollinators of flowering pla...,,[],,
34,JCHAIN: A Prognostic Marker Based on Pan-Cance...,2025,Genes,4326be73fcb6332128a7cae7d678faa6a3a4870c,10.3390/genes16091070,2025-09-11,,Background/Objectives: The JCHAIN (immunoglobu...,,[],,
