In [19]:
# config.py
# Configuration and API keys (replace placeholders with actual keys/emails)
SEMANTIC_SCHOLAR_API_KEY = "19tQFoyv7w5xBQNMsUA7C5lwNqEni5g3GKkP8Pkj"  # e.g., 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' | https://www.semanticscholar.org/product/api/tutorial?utm_campaign=API%20transaction&utm_medium=email&_hsenc=p2ANqtz--KbD5dVfVRom22kVjKkL-55Ikb73h1Nze5JYW6_8OGfj15Pf_Z7OjRXzHnO2BntuA89mE6jdPyHOEzQnaYDLInKFPGxw&_hsmi=329822401&utm_content=329822401&utm_source=hs_automation
OPENAI_API_KEY = "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"  # e.g., 'sk-...'
UNPAYWALL_EMAIL = "yunruilu@caltech.edu"  # Email for Unpaywall API
NCBI_EMAIL = "yunruilu@caltech.edu"  # Email for NCBI Entrez (required by NCBI)
NCBI_API_KEY = '903b8602ced7c96ae73650c3ff78350a100'  # Optional: NCBI API Key for higher rate limits, or None | https://support.nlm.nih.gov/kbArticle/?pn=KA-05317

# Model and other options
OPENAI_MODEL = "gpt-5"  # Use 'gpt-4' for best results; 'gpt-3.5-turbo' if lower cost is desired
INSTITUTIONAL_ACCESS = True  # True if running on a network with institutional access to paywalled PDFs

# Search query and settings
SEARCH_QUERY = '("spatial transcriptomics" OR Visium OR MERFISH OR seqFISH OR CosMX OR Xenium)'
# SEARCH_QUERY = '("spatial transcriptomics")'
FIELDS_OF_STUDY = "Biology"  # Restrict search to biology-related papers
FIELDS_OF_STUDY = None
SEARCH_LIMIT = 100  # max results per API call (Semantic Scholar allows up to 100)
SINCE_DAYS = 7  # default to search for papers in the last 7 days (for weekly run)


In [20]:
import requests
import datetime
import time
import config

def _get_with_backoff(url, params, headers, max_retries=5, timeout=30):
    delay = 1.0
    for _ in range(max_retries):
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code in (429,) or 500 <= r.status_code < 600:
            time.sleep(delay); delay = min(delay * 2, 30); continue
        return r
    return r

def search_new_papers_bulk(since_days=365):
    since_date = datetime.date.today() - datetime.timedelta(days=since_days)
    url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

    headers = {}
    if config.SEMANTIC_SCHOLAR_API_KEY:
        headers["x-api-key"] = config.SEMANTIC_SCHOLAR_API_KEY

    params = {
        "query": config.SEARCH_QUERY,  # keep queries consistent across functions
        "fields": "title,year,venue,externalIds,openAccessPdf,publicationDate",
        "publicationDateOrYear": f"{since_date.isoformat()}-",
        "sort": "publicationDate:desc",
        "limit": 1000,
        "fieldsOfStudy": config.FIELDS_OF_STUDY,
    }

    results, token = [], None
    while True:
        p = params.copy()
        if token: p["token"] = token
        resp = _get_with_backoff(url, p, headers)
        if resp.status_code != 200:
            print("API", resp.status_code, resp.text[:500]); break

        data = resp.json()
        for paper in data.get("data", []):
            pub = paper.get("publicationDate")
            paper_date = None
            if pub:
                try:
                    paper_date = datetime.date.fromisoformat(pub.split("T")[0])
                except Exception:
                    pass
            if paper_date and paper_date < since_date:
                continue  # client-side guard

            doi = (paper.get("externalIds") or {}).get("DOI")
            oa  = (paper.get("openAccessPdf") or {}).get("url")
            results.append({
                "title": paper.get("title",""),
                "year": paper.get("year"),
                "venue": paper.get("venue",""),
                "doi": doi,
                "publication_date": pub,
                "oa_pdf_url": oa,
            })
        token = data.get("token")
        if not token: break
    return results

In [10]:
test_result = search_new_papers_bulk(since_days=100)
print(len(test_result))

0


In [21]:
from urllib.parse import quote

doi = "10.1038/s41551-022-00951-w"
doi_url = f"https://doi.org/{doi}"
proxy_prefix = "https://go.openathens.net/redirector/caltech.edu?url="
proxied = proxy_prefix + quote(doi_url, safe='')
print(proxied)

https://go.openathens.net/redirector/caltech.edu?url=https%3A%2F%2Fdoi.org%2F10.1038%2Fs41551-022-00951-w


In [22]:
import requests
session = requests.Session()

In [23]:
proxied_url = "https://go.openathens.net/redirector/caltech.edu?url=https://doi.org/10.1038/s41551-022-00951-w"
response = session.get(proxied_url, allow_redirects=True)
print(response.status_code, response.headers.get('Content-Type'))

200 text/html;charset=utf-8


In [3]:
import requests, urllib.parse, re

# doi = "10.1038/s41551-022-00951-w"
# doi = "10.1038/s41592-025-02770-8"
# doi = "10.1007/978-1-0716-4276-4_9"
doi = '10.1101/2025.09.16.25335918'
proxied_url = f"https://go.openathens.net/redirector/caltech.edu?url={urllib.parse.quote('https://doi.org/' + doi, safe='')}"

session = requests.Session()
resp = session.get(proxied_url, allow_redirects=True)
ctype = resp.headers.get("Content-Type", "")

In [4]:
if ctype.startswith("application/pdf"):
    # Got the PDF directly
    with open("article.pdf", "wb") as f:
        f.write(resp.content)
    print("PDF downloaded directly.")
elif "html" in ctype:
    html = resp.text
    # Check if this is a login page or the article page by looking for clues
    if "openathens.net" in html.lower() or "login" in resp.url:
        print("Authentication required – no active session. You would need to log in via OpenAthens first.")
        # (Here you would trigger a Selenium login or use stored cookies)
    else:
        # Assume this is the article page HTML, try to find a PDF link
        match = re.search(r'href="([^"]+\.pdf[^"]*)"', html)
        if match:
            pdf_link = match.group(1)
            # Complete relative link if needed
            if pdf_link.startswith("/"):
                from urllib.parse import urljoin
                pdf_link = urljoin(resp.url, pdf_link)
            pdf_resp = session.get(pdf_link)
            if pdf_resp.headers.get("Content-Type","").startswith("application/pdf"):
                with open("article.pdf", "wb") as f:
                    f.write(pdf_resp.content)
                print("PDF downloaded after fetching article page.")
            else:
                print("Failed to retrieve PDF – may need additional steps (e.g., login or different parsing).")
        else:
            print("No direct PDF link found on page – manual analysis might be required.")

PDF downloaded after fetching article page.


In [5]:
ctype

'text/html; charset="UTF-8"'

In [6]:
resp

<Response [200]>