In [1]:
# config.py
# Configuration and API keys (replace placeholders with actual keys/emails)
SEMANTIC_SCHOLAR_API_KEY = "19tQFoyv7w5xBQNMsUA7C5lwNqEni5g3GKkP8Pkj"  # e.g., 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' | https://www.semanticscholar.org/product/api/tutorial?utm_campaign=API%20transaction&utm_medium=email&_hsenc=p2ANqtz--KbD5dVfVRom22kVjKkL-55Ikb73h1Nze5JYW6_8OGfj15Pf_Z7OjRXzHnO2BntuA89mE6jdPyHOEzQnaYDLInKFPGxw&_hsmi=329822401&utm_content=329822401&utm_source=hs_automation
OPENAI_API_KEY = "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"  # e.g., 'sk-...'
UNPAYWALL_EMAIL = "yunruilu@caltech.edu"  # Email for Unpaywall API
NCBI_EMAIL = "yunruilu@caltech.edu"  # Email for NCBI Entrez (required by NCBI)
NCBI_API_KEY = '903b8602ced7c96ae73650c3ff78350a100'  # Optional: NCBI API Key for higher rate limits, or None | https://support.nlm.nih.gov/kbArticle/?pn=KA-05317

# Model and other options
OPENAI_MODEL = "gpt-5"  # Use 'gpt-4' for best results; 'gpt-3.5-turbo' if lower cost is desired
INSTITUTIONAL_ACCESS = True  # True if running on a network with institutional access to paywalled PDFs

# Search query and settings
SEARCH_QUERY = '("spatial transcriptomics" OR Visium OR MERFISH OR seqFISH OR CosMX OR Xenium)'
# SEARCH_QUERY = '("spatial transcriptomics")'
FIELDS_OF_STUDY = "Biology"  # Restrict search to biology-related papers
FIELDS_OF_STUDY = None
SEARCH_LIMIT = 100  # max results per API call (Semantic Scholar allows up to 100)
SINCE_DAYS = 7  # default to search for papers in the last 7 days (for weekly run)


In [18]:
# search_papers.py
import requests
import datetime
import time
import config

def _get_with_backoff(url, params, headers, max_retries=5, timeout=30):
    """HTTP GET with simple exponential backoff for 429/5xx responses."""
    delay_seconds = 1.0
    last_response = None
    for _ in range(max_retries):
        resp = requests.get(url, params=params, headers=headers, timeout=timeout)
        last_response = resp
        if resp.status_code == 429 or (500 <= resp.status_code < 600):
            time.sleep(delay_seconds)
            delay_seconds = min(delay_seconds * 2, 30)
            continue
        return resp
    return last_response

def search_new_papers(since_days=None):
    """
    Search for recent papers mentioning spatial transcriptomics (and specific methods)
    using the Semantic Scholar API. Returns a list of paper metadata dicts for papers published
    within the last `since_days` days.
    """
    if since_days is None:
        since_days = config.SINCE_DAYS
    today = datetime.date.today()
    since_date = today - datetime.timedelta(days=since_days)
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    query = config.SEARCH_QUERY
    fields = "title,year,venue,externalIds,openAccessPdf,publicationDate"
    headers = {}
    if config.SEMANTIC_SCHOLAR_API_KEY:
        headers["x-api-key"] = config.SEMANTIC_SCHOLAR_API_KEY
    params = {
        # "query": query,
        "query": 'spatial transcriptomics',    
        "fields": fields,
        "limit": config.SEARCH_LIMIT,
        "offset": 0,
        "year": f"{since_date.year}-",
    }
    if config.FIELDS_OF_STUDY:
        params["fieldsOfStudy"] = config.FIELDS_OF_STUDY

    results = []
    while True:
        try:
            resp = _get_with_backoff(base_url, params, headers, timeout=30)
        except Exception as e:
            print(f"Request error: {e}")
            break
        if resp.status_code != 200:
            print(f"API {resp.status_code}: {resp.text[:500]}")
            break
        data = resp.json()
        papers = data.get("data", [])
        if not papers:
            break

        kept_this_page = 0
        for paper in papers:
            pub_date_str = paper.get("publicationDate")
            paper_date = None
            if pub_date_str:
                try:
                    date_part = (pub_date_str or "").split("T")[0]
                    paper_date = datetime.date.fromisoformat(date_part)
                except Exception:
                    paper_date = None
            if paper_date is None:
                year = paper.get("year")
                if year:
                    paper_date = datetime.date(year, 12, 31)
            if paper_date and paper_date < since_date:
                continue

            oa_url = None
            if isinstance(paper.get("openAccessPdf"), dict):
                oa_url = paper["openAccessPdf"].get("url")
            doi = paper.get("doi") or (paper.get("externalIds") or {}).get("DOI")
            results.append({
                "title": paper.get("title", ""),
                "year": paper.get("year"),
                "venue": paper.get("venue", ""),
                "doi": doi,
                "publication_date": pub_date_str,
                "oa_pdf_url": oa_url,
            })
            kept_this_page += 1

        if len(papers) < params["limit"]:
            break
        params["offset"] += params["limit"]
    return results


In [19]:
test_result = search_new_papers(since_days=10)

In [20]:
len(test_result)

10

In [22]:
test_result[1]

{'title': 'Integrating Tissue Microarray to GeoMx® Digital Spatial Profiler\xa0: Spatial Transcriptomics Assay with Bioinformatics Analysis.',
 'year': 2025,
 'venue': 'Methods in molecular biology',
 'doi': '10.1007/978-1-0716-4276-4_9',
 'publication_date': None,
 'oa_pdf_url': ''}

In [15]:
# search_papers_bulk.py
import requests, datetime, time, config

def _get_with_backoff(url, params, headers, max_retries=5, timeout=30):
    delay = 1.0
    for _ in range(max_retries):
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code in (429,) or 500 <= r.status_code < 600:
            time.sleep(delay); delay = min(delay * 2, 30); continue
        return r
    return r

def search_new_papers_bulk(since_days=365):
    since_date = datetime.date.today() - datetime.timedelta(days=since_days)
    url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

    headers = {}
    if config.SEMANTIC_SCHOLAR_API_KEY:
        headers["x-api-key"] = config.SEMANTIC_SCHOLAR_API_KEY

    params = {
        # "query": config.SEARCH_QUERY,
        "query": 'spatial transcriptomics',                       # simpler query string
        "fields": "title,year,venue,externalIds,openAccessPdf,publicationDate",
        # Either use year floor or an exact date floor:
        # "year": f"{since_date.year}-",
        # "publicationDateOrYear": f"{since_date.isoformat()}-",
        "publicationDate": f"{since_date.isoformat()}-",
        "sort": "publicationDate:desc",
        "limit": 1000,
        "fieldsOfStudy": 'Biology',
    }
    # if config.FIELDS_OF_STUDY:
    #     params["fieldsOfStudy"] = config.FIELDS_OF_STUDY

    results, token = [], None
    while True:
        p = params.copy()
        if token: p["token"] = token
        resp = _get_with_backoff(url, p, headers)
        if resp.status_code != 200:
            print("API", resp.status_code, resp.text[:500]); break

        data = resp.json()
        for paper in data.get("data", []):
            pub = paper.get("publicationDate")
            doi = (paper.get("externalIds") or {}).get("DOI")
            oa  = (paper.get("openAccessPdf") or {}).get("url")
            results.append({
                "title": paper.get("title",""),
                "year": paper.get("year"),
                "venue": paper.get("venue",""),
                "doi": doi,
                "publication_date": pub,
                "oa_pdf_url": oa,
            })
        token = data.get("token")
        if not token: break
    return results


In [16]:
result = search_new_papers_bulk(since_days = 10)
print(len(result))
# result = search_new_papers_bulk(since_days = 100)
# print(len(result))
# result = search_new_papers_bulk(since_days = 300)
# print(len(result))

8752


In [17]:
result[0]

{'title': 'High resolution single-cell transcriptomics towards precision profiling across multi-omics and spatial dimensions',
 'year': 2025,
 'venue': 'Trends in Analytical Chemistry (TrAC)',
 'doi': '10.1016/j.trac.2025.118418',
 'publication_date': '2025-11-01',
 'oa_pdf_url': ''}

In [24]:
# fetch_paper.py
import requests
import fitz  # PyMuPDF for PDF parsing
import config

def fetch_full_text(doi, oa_pdf_url=None):
    """
    Fetch the full-text PDF of a paper given its DOI. Tries open-access sources (Semantic Scholar link or Unpaywall),
    then (if enabled) uses institutional access via direct DOI resolution. Returns the extracted text, or None if failed.
    """
    pdf_bytes = None
    # 1. Try direct open-access PDF URL (if provided by Semantic Scholar)
    if oa_pdf_url:
        try:
            resp = requests.get(oa_pdf_url, timeout=30)
            if resp.status_code == 200 and resp.content:
                pdf_bytes = resp.content
        except Exception as e:
            print(f"Error fetching open-access PDF: {e}")
    # 2. Try Unpaywall API to find an open-access PDF
    if pdf_bytes is None:
        if config.UNPAYWALL_EMAIL and "your_email" not in config.UNPAYWALL_EMAIL:
            try:
                upw_url = f"https://api.unpaywall.org/v2/{doi}?email={config.UNPAYWALL_EMAIL}"
                ur = requests.get(upw_url, timeout=20)
            except Exception as e:
                print(f"Error contacting Unpaywall: {e}")
                ur = None
            if ur and ur.status_code == 200:
                data = ur.json()
                pdf_url = None
                if data.get("best_oa_location"):
                    loc = data["best_oa_location"]
                    pdf_url = loc.get("url_for_pdf") or loc.get("url")
                if not pdf_url and data.get("oa_locations"):
                    for loc in data["oa_locations"]:
                        pdf_url = loc.get("url_for_pdf") or loc.get("url")
                        if pdf_url:
                            print('pdf url')
                            print(pdf_url)
                            break
                if pdf_url:
                    try:
                        pr = requests.get(pdf_url, timeout=30)
                        if pr.status_code == 200 and pr.content:
                            pdf_bytes = pr.content
                    except Exception as e:
                        print(f"Error fetching PDF from Unpaywall link: {e}")
        else:
            print("Unpaywall email not set or invalid, skipping Unpaywall step.")
    # 3. Try institutional access via DOI (if enabled and still no PDF)
    if pdf_bytes is None and config.INSTITUTIONAL_ACCESS:
        try:
            resp = requests.get(f"https://doi.org/{doi}", headers={"Accept": "application/pdf"}, timeout=30)
            if resp.status_code == 200 and resp.content:
                # Check content type to confirm PDF
                content_type = resp.headers.get("Content-Type", "").lower()
                if "pdf" in content_type or resp.content[:4] == b"%PDF":
                    pdf_bytes = resp.content
        except Exception as e:
            print(f"Attempt to fetch PDF via institutional access failed for {doi}: {e}")
    if pdf_bytes is None:
        print(f"Could not retrieve full text for DOI {doi}")
        return None
    # Extract text from PDF
    try:
        doc = fitz.open(stream=pdf_bytes, filetype='pdf')
    except Exception as e:
        print(f"Failed to open PDF for {doi}: {e}")
        return None
    text = ""
    try:
        for page in doc:
            text += page.get_text()
    finally:
        doc.close()
    return text

In [25]:
full_text = fetch_full_text(doi,)
full_text

Starting fetch_full_text for DOI: 10.1038/s41467-024-48700-8
No open-access PDF URL provided
Trying Unpaywall API...
Contacting Unpaywall at: https://api.unpaywall.org/v2/10.1038/s41467-024-48700-8?email=yunruilu@caltech.edu
Unpaywall API responded successfully
Found best_oa_location: https://www.nature.com/articles/s41467-024-48700-8.pdf
Attempting to download PDF from: https://www.nature.com/articles/s41467-024-48700-8.pdf
Successfully downloaded PDF from Unpaywall
Extracting text from PDF...
PDF opened successfully, 17 pages
First page extracted 3872 characters
Total text extracted: 111136 characters


'Article\nhttps://doi.org/10.1038/s41467-024-48700-8\nSingle-cell and spatial transcriptomics\nanalysis of non-small cell lung cancer\nMarco De Zuani\n1,2,3,4,11, Haoliang Xue\n1,2,3,4,11, Jun Sung Park\n1,2,5,\nStefan C. Dentro5,6, Zaira Seferbekova\n5, Julien Tessier7, Sandra Curras-Alonso8,\nAngela Hadjipanayis7, Emmanouil I. Athanasiadis\n2,9, Moritz Gerstung2,5,6,\nOmer Bayraktar\n1,2 & Ana Cvejic\n1,2,3,10\nLung cancer is the second most frequently diagnosed cancer and the leading\ncause of cancer-related mortality worldwide. Tumour ecosystems feature\ndiverse immune cell types. Myeloid cells, in particular, are prevalent and have a\nwell-established role in promoting the disease. In our study, we proﬁle\napproximately 900,000 cells from 25 treatment-naive patients with adeno-\ncarcinoma and squamous-cell carcinoma by single-cell and spatial tran-\nscriptomics. We note an inverse relationship between anti-inﬂammatory\nmacrophages and NK cells/T cells, and with reduced NK cell cyt

# from local

In [39]:
# fetch_paper.py
import requests
import fitz  # PyMuPDF
from pathlib import Path
import config

def _extract_text_from_pdf_bytes(pdf_bytes, password=None):
    print("Extracting text from PDF...")
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        print(f"PDF opened successfully, {len(doc)} pages")
    except Exception as e:
        print(f"Failed to open PDF: {e}")
        return None

    try:
        if doc.needs_pass:
            if not password:
                print("PDF is password-protected and no password was provided.")
                doc.close()
                return None
            if not doc.authenticate(password):
                print("Incorrect password for PDF.")
                doc.close()
                return None

        text = []
        for page_num, page in enumerate(doc):
            page_text = page.get_text()  # or page.get_text('text')
            text.append(page_text)
            if page_num == 0:
                print(f"First page extracted {len(page_text)} characters")
        text = "".join(text)
        print(f"Total text extracted: {len(text)} characters")
        return text
    finally:
        doc.close()

def fetch_full_text(doi, oa_pdf_url=None, local_pdf_path=None, password=None):
    """
    Fetch the full-text of a paper as text.
    If `local_pdf_path` is provided, read the PDF from disk and extract text.
    Otherwise, try open-access sources (Semantic Scholar link or Unpaywall),
    then (if enabled) institutional access via DOI resolution.
    """
    print(f"Starting fetch_full_text for DOI: {doi}")
    pdf_bytes = None

    # 0) Local file (short-circuit)
    if local_pdf_path:
        p = Path(local_pdf_path).expanduser()
        print(f"Reading local PDF: {p}")
        try:
            pdf_bytes = p.read_bytes()
            print("Successfully read local PDF into memory")
        except Exception as e:
            print(f"Error reading local PDF: {e}")

    # 1) Direct open-access URL (if provided) — only if we don't already have local bytes
    if pdf_bytes is None and oa_pdf_url:
        print(f"Trying open-access PDF URL: {oa_pdf_url}")
        try:
            resp = requests.get(oa_pdf_url, timeout=30)
            if resp.status_code == 200 and resp.content:
                pdf_bytes = resp.content
                print("Successfully fetched PDF from open-access URL")
            else:
                print(f"Failed to fetch from open-access URL: status {resp.status_code}")
        except Exception as e:
            print(f"Error fetching open-access PDF: {e}")
    elif pdf_bytes is None:
        print("No open-access PDF URL provided")

    # 2) Unpaywall (if still no PDF)
    if pdf_bytes is None:
        print("Trying Unpaywall API...")
        if config.UNPAYWALL_EMAIL and "your_email" not in config.UNPAYWALL_EMAIL:
            try:
                upw_url = f"https://api.unpaywall.org/v2/{doi}?email={config.UNPAYWALL_EMAIL}"
                print(f"Contacting Unpaywall at: {upw_url}")
                ur = requests.get(upw_url, timeout=20)
            except Exception as e:
                print(f"Error contacting Unpaywall: {e}")
                ur = None
            if ur and ur.status_code == 200:
                print("Unpaywall API responded successfully")
                data = ur.json()
                pdf_url = None
                if data.get("best_oa_location"):
                    loc = data["best_oa_location"]
                    pdf_url = loc.get("url_for_pdf") or loc.get("url")
                    print(f"Found best_oa_location: {pdf_url}")
                if not pdf_url and data.get("oa_locations"):
                    print("Checking oa_locations...")
                    for loc in data["oa_locations"]:
                        pdf_url = loc.get("url_for_pdf") or loc.get("url")
                        if pdf_url:
                            print(f"Found PDF URL: {pdf_url}")
                            break
                if pdf_url:
                    pdf_url = 'https://www.nature.com/articles/s41586-019-1049-y.pdf'
                    print(f"Attempting to download PDF from: {pdf_url}")
                    try:
                        pr = requests.get(pdf_url, timeout=30)
                        if pr.status_code == 200 and pr.content:
                            pdf_bytes = pr.content
                            print("Successfully downloaded PDF from Unpaywall")
                        else:
                            print(f"Failed to download PDF: status {pr.status_code}")
                    except Exception as e:
                        print(f"Error fetching PDF from Unpaywall link: {e}")
                else:
                    print("No PDF URL found in Unpaywall response")
            elif ur:
                print(f"Unpaywall API error: status {ur.status_code}")
            else:
                print("Failed to contact Unpaywall API")
        else:
            print("Unpaywall email not set or invalid, skipping Unpaywall step.")

    # 3) Institutional access via DOI (if enabled and still no PDF)
    if pdf_bytes is None and config.INSTITUTIONAL_ACCESS:
        print("Trying institutional access via DOI...")
        try:
            resp = requests.get(f"https://doi.org/{doi}", headers={"Accept": "application/pdf"}, timeout=30)
            print(f"DOI resolution response: status {resp.status_code}")
            if resp.status_code == 200 and resp.content:
                content_type = resp.headers.get("Content-Type", "").lower()
                print(f"Content-Type: {content_type}")
                if "pdf" in content_type or resp.content[:4] == b"%PDF":
                    pdf_bytes = resp.content
                    print("Successfully fetched PDF via institutional access")
                else:
                    print("Response is not a PDF")
            else:
                print("No content received from DOI resolution")
        except Exception as e:
            print(f"Attempt to fetch PDF via institutional access failed for {doi}: {e}")
    elif pdf_bytes is None:
        print("Institutional access is disabled or not configured")

    if pdf_bytes is None:
        print(f"Could not retrieve full text for DOI {doi}")
        return None

    # Extract text
    return _extract_text_from_pdf_bytes(pdf_bytes, password=password)

def fetch_full_text_from_file(local_pdf_path, password=None):
    """
    Convenience wrapper when you only have a local PDF and no DOI lookup is needed.
    """
    # Use a pseudo-DOI for consistent logging
    pseudo_doi = f"local:{Path(local_pdf_path).name}"
    return fetch_full_text(pseudo_doi, local_pdf_path=local_pdf_path, password=password)


In [40]:
# full_text = fetch_full_text_from_file("/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/PIIS0092867425005720.pdf")
# full_text
full_text = fetch_full_text(doi = doi)
full_text

Starting fetch_full_text for DOI: 10.1038/s41467-024-48700-8
No open-access PDF URL provided
Trying Unpaywall API...
Contacting Unpaywall at: https://api.unpaywall.org/v2/10.1038/s41467-024-48700-8?email=yunruilu@caltech.edu
Unpaywall API responded successfully
Found best_oa_location: https://www.nature.com/articles/s41467-024-48700-8.pdf
Attempting to download PDF from: https://www.nature.com/articles/s41586-019-1049-y.pdf
Successfully downloaded PDF from Unpaywall
Extracting text from PDF...
PDF opened successfully, 24 pages
First page extracted 7304 characters
Total text extracted: 88150 characters


"LetteR\nhttps://doi.org/10.1038/s41586-019-1049-y\nTranscriptome-scale super-resolved imaging in \ntissues by RNA seqFISH+\nChee-Huat Linus Eng1, Michael Lawson2, Qian Zhu3, Ruben Dries3, Noushin Koulena2, Yodai Takei2, Jina Yun2,  \nChristopher Cronin2, Christoph Karp2, Guo-Cheng Yuan3 & Long Cai2*\nImaging the transcriptome in situ with high accuracy has been a \nmajor challenge in single-cell biology, which is particularly hindered \nby the limits of optical resolution and the density of transcripts in \nsingle cells1–5. Here we demonstrate an evolution of sequential \nfluorescence in situ hybridization (seqFISH+). We show that \nseqFISH+ can image mRNAs for 10,000 genes in single cells—with \nhigh accuracy and sub-diffraction-limit resolution—in the cortex, \nsubventricular zone and olfactory bulb of mouse brain, using a \nstandard confocal microscope. The transcriptome-level profiling \nof seqFISH+ allows unbiased identification of cell classes and \ntheir spatial organization in

In [None]:
# parse_paper.py
import re
import json
import config
from openai import OpenAI

client = OpenAI(api_key=config.OPENAI_API_KEY)

def find_data_section(full_text):
    """
    Identify the 'Data Availability' section or relevant dataset mentions in the paper text.
    Returns a string containing the extracted snippet that likely contains dataset information.
    """
    if not full_text:
        return ""
    # Normalize text for searching (join hyphenated words and lines)
    text_clean = full_text.replace("-\n", "").replace("\n", " ")
    # Look for a "Data Availability" section
    match = re.search(r'data availability', text_clean, flags=re.IGNORECASE)
    if match:
        start_idx = match.start()
        # Determine end of that section by looking for the next section heading or conclusion
        possible_ends = ["code availability", "materials availability", "acknowledg", 
                         "references", "supplementary", "author contributions", "ethics"]
        end_idx = len(text_clean)
        for term in possible_ends:
            m2 = re.search(term, text_clean[start_idx+1:], flags=re.IGNORECASE)
            if m2:
                candidate_end = start_idx + 1 + m2.start()
                end_idx = min(end_idx, candidate_end)
        snippet = text_clean[start_idx:end_idx]
        return snippet.strip()
    # If no explicit section, search for dataset accessions or repository indicators in text
    text_upper = text_clean.upper()
    # Exclude reference section if present to avoid picking up DOIs from references
    if "REFERENCES" in text_upper:
        text_body = text_clean[: text_upper.index("REFERENCES")]
    else:
        text_body = text_clean
    patterns = [
        r'\bGSE\d+', r'\bGSM\d+', r'\bSRR\d+', r'\bSRP\d+', r'\bPRJNA\d+', r'\bPRJEB\d+',
        r'10\.5281/zenodo\.\d+', r'zenodo', r'figshare', r'\bE-MTAB-\d+', r'\bEGAD\d+', r'\bEGAS\d+', r'doi:10\.'
    ]
    matches = []
    for pattern in patterns:
        for m in re.finditer(pattern, text_body, flags=re.IGNORECASE):
            matches.append((m.start(), m.end()))
    if not matches:
        return ""
    # Merge overlapping matches to get contiguous context blocks
    matches.sort(key=lambda x: x[0])
    merged = []
    for (s, e) in matches:
        if not merged or s > merged[-1][1] + 50:
            merged.append([s, e])
        else:
            merged[-1][1] = max(merged[-1][1], e)
    segments = []
    for (s, e) in merged:
        seg_start = max(0, s - 100)
        seg_end = min(len(text_body), e + 100)
        segments.append(text_body[seg_start:seg_end])
    snippet = "\n---\n".join(segments)
    return snippet.strip()

# def extract_datasets_from_text(full_text, paper_title=None):
#     snippet = find_data_section(full_text)
#     if not snippet:
#         text_clean = full_text.replace("-\n", "").replace("\n", " ")
#         snippet = text_clean[-10000:]
#     if not snippet:
#         return []

#     system_msg = (
#         "You are an expert assistant extracting dataset information from scientific papers. "
#         "Only include datasets that are spatially resolved transcriptomics data (with spatial information)."
#     )
#     user_msg = (
#         "Extract all spatial transcriptomics dataset details from the following text. "
#         "Return ONLY a JSON array of objects with keys: repository, accession, platform, species, tissue, "
#         "raw_data_available, available, description. If data is not public (e.g., 'upon request'), "
#         "set repository='Not available' and available=false.\n\nText:\n\"\"\"\n" + snippet + "\n\"\"\""
#     )

#     try:
#         resp = client.chat.completions.create(
#             model=config.OPENAI_MODEL,  # e.g., "gpt-4o-mini"
#             messages=[{"role": "system", "content": system_msg},
#                       {"role": "user", "content": user_msg}],
#             temperature=0,
#             # Enforce JSON array shape via structured outputs
#             response_format={
#                 "type": "json_object",
#             }
#         )
#         content = resp.choices[0].message.content
#     except Exception as e:
#         print(f"OpenAI API error: {e}")
#         return []

#     try:
#         datasets = json.loads(content)
#     except Exception:
#         # Fallback if the model returned something unexpected
#         m = re.search(r'\[.*\]', content, flags=re.DOTALL)
#         datasets = json.loads(m.group(0)) if m else []

#     if isinstance(datasets, dict):
#         datasets = [datasets]
#     return datasets

def _extract_json(text: str):
    import json, re
    # 1) direct parse
    try:
        return json.loads(text), None
    except Exception:
        pass
    # 2) fenced code blocks ```json ... ``` or ``` ... ```
    m = re.search(r"```(?:json)?\s*(.+?)```", text, flags=re.DOTALL|re.IGNORECASE)
    if m:
        candidate = m.group(1).strip()
        try:
            return json.loads(candidate), None
        except Exception as e:
            last_err = e
    else:
        last_err = None
    # 3) first top-level array/object fallback
    m = re.search(r"(\{.*\}|\[.*\])", text, flags=re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1)), None
        except Exception as e:
            last_err = e
    return None, last_err

def extract_datasets_from_text(full_text, paper_title=None):
    # snippet = find_data_section(full_text)
    # if not snippet:
    #     text_clean = full_text.replace("-\n", "").replace("\n", " ")
    #     snippet = text_clean[-10000:]
    snippet = full_text

    system_msg = (
        "You are an expert assistant extracting dataset information from scientific papers. "
        "Only include datasets that are spatially resolved transcriptomics data (with spatial information)."
    )
    user_msg = (
        # "Extract all spatial transcriptomics dataset details from the following text. If it provides the data link and the detailed informaton point to the data storage in the link, explore the data link, and provide the detailed information."
        # "Return ONLY a valid JSON array of objects with keys: detailed data information "
        "Extract all spatial transcriptomics dataset details from the following text."
        "Return ONLY a valid JSON array of objects with keys: data link, repository, accession, platform, species, tissue, "
        "raw_data_available, available, description. If data is not public (e.g., 'upon request'), "
        "set repository='Not available' and available=false. No commentary—JSON only.\n\n"
        f"Text:\n\"\"\"\n{snippet}\n\"\"\""
    )

    try:
        resp = client.chat.completions.create(
            model='gpt-4.1',  # "gpt-4"
            messages=[{"role": "system", "content": system_msg},
                      {"role": "user", "content": user_msg}],
            temperature=0
        )
        content = resp.choices[0].message.content.strip()
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return []

    data, err = _extract_json(content)
    if err or data is None:
        print(f"Failed to parse JSON from model output: {err}")
        return []
    if isinstance(data, dict):
        data = [data]
    return data


In [9]:
extract_datasets_from_text(full_text)

[{'detailed data information': 'The scRNA-seq and Visium datasets generated in this study are publicly available at BioStudies with accession numbers E-MTAB-13526 and E-MTAB-13530, respectively.'}]

In [11]:
extract_datasets_from_text(full_text)

[{'data link': 'https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526',
  'repository': 'BioStudies',
  'accession': 'E-MTAB-13526',
  'platform': '10x Genomics Visium',
  'species': 'Homo sapiens',
  'tissue': 'Lung',
  'raw_data_available': True,
  'available': True,
  'description': 'Spatial transcriptomics data from lung tissue resections of 25 treatment-naive patients with LUAD or LUSC and two healthy deceased donors.'},
 {'data link': 'https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13530',
  'repository': 'BioStudies',
  'accession': 'E-MTAB-13530',
  'platform': '10x Genomics Visium',
  'species': 'Homo sapiens',
  'tissue': 'Lung',
  'raw_data_available': True,
  'available': True,
  'description': 'Spatial transcriptomics data from fresh frozen tumour and background tissue sections from eight patients using the 10x Genomics Visium platform.'}]

In [38]:
# perturb_multimodal
extract_datasets_from_text(full_text)

[{'data link': 'https://huggingface.co/datasets/xingjiepan/PerturbMulti/tree/main',
  'repository': 'Hugging Face',
  'accession': '',
  'platform': 'RCA-MERFISH (MERFISH with rolling circle amplification)',
  'species': 'mouse',
  'tissue': 'liver',
  'raw_data_available': True,
  'available': True,
  'description': 'Spatial RCA-MERFISH imaging of mouse liver sections measuring 209 endogenous mRNAs and 456 perturbation barcode RNAs, with parallel multiplexed protein/abundant-RNA imaging (14 proteins, 4 abundant RNAs); includes cell segmentation and processed spatial molecule/cell matrices and raw images.'}]

In [43]:
# seqfish+
extract_datasets_from_text(full_text)

[{'data link': 'https://github.com/CaiGroup/seqFISH-PLUS',
  'repository': 'GitHub',
  'accession': None,
  'platform': 'seqFISH+',
  'species': 'Mus musculus',
  'tissue': 'brain (cortex, subventricular zone, olfactory bulb)',
  'raw_data_available': True,
  'available': True,
  'description': 'Spatial transcriptomics dataset generated using seqFISH+ profiling 10,000 genes in single cells from mouse brain regions (cortex, subventricular zone, olfactory bulb), with subcellular spatial resolution.'}]

In [49]:
from openai import OpenAI
import json

client = OpenAI(api_key=config.OPENAI_API_KEY)

pdf_path = "/resnick/groups/mthomson/yunruilu/Github_repo/spatial-genomics-llm-collection/s41586-019-1049-y.pdf"
with open(pdf_path, "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

prompt = (
    "Extract the full text from the PDF file I've provided, without omitting any content. "
    "Extract all spatial transcriptomics dataset details from the following text. "
    "Return ONLY a valid JSON array of objects with keys: "
    "data link, repository, accession, platform, species, tissue, raw_data_available, available, description. "
    "If data is not public (e.g., 'upon request'), set repository='Not available' and available=false. "
    "No commentary—JSON only."
)

resp = client.responses.create(
    model="gpt-4.1",
    input=[{
        "role": "user",
        "content": [
            {"type": "input_text", "text": prompt},
            {"type": "input_file", "file_id": uploaded.id}
        ]
    }],
    temperature=0
)

raw = resp.output_text
try:
    datasets = json.loads(raw)
except json.JSONDecodeError:
    fix = client.responses.create(
        model="gpt-4.1",
        input=f"Return ONLY valid JSON (no commentary). Fix this:\n\n{raw}",
        temperature=0
    )
    datasets = json.loads(fix.output_text)

print(datasets)

BadRequestError: Error code: 400 - {'error': {'message': 'Total file size exceeds the limit of 32MB', 'type': 'invalid_request_error', 'param': 'file_id', 'code': 'file_above_max_size'}}