In [None]:
import requests
import re
import pandas as pd
from datetime import datetime, timedelta
from time import sleep

# Utility: strip JATS/HTML tags
def strip_html_tags(text):
    return re.sub("<.*?>", "", text) if text else None

# Journal ISSNs
JOURNALS = {
    "Nature": ["0028-0836", "1476-4687"],
    "Science": [],  # Not reliably indexed in Crossref
    "Cell": ["1934-5909"],
    "eLife": ["2050-084X"],
    "PLoS Biology": ["1545-7885"],
    "PNAS": ["2752-6542"],
    "Nature Methods": ["1548-7091", "1548-7105"],
    "Molecular Systems Biology": ["1744-4292"],
    "Nature Genetics": ["1061-4036", "1546-1718"],
    "Nature Structural & Molecular Biology": ["1545-9993", "1545-9985"],
    "Nature Chemical Biology": ["1552-4450", "1552-4469"],
    "PLoS Genetics": ["1553-7390", "1553-7404"],
    "PLoS Computational Biology": ["1553-734X", "1553-7358"],
    "Science Signaling": ["1945-0877", "1937-9145"],
    "Cell Reports": ["2211-1247"],
    "Molecular Cell": ["1097-2765"]
}

HEADERS = {
    "Accept": "application/json"
}
# Function 1: Fetch articles for given months back (default 2 months)
def fetch_articles_recent(months_back=2, rows=10):
    from_date = (datetime.now() - timedelta(days=months_back*30)).strftime("%Y-%m-%d")
    records = []
    
    for journal_name, issns in JOURNALS.items():
        for issn in issns:
            url = f"https://api.crossref.org/journals/{issn}/works"
            params = {
                "filter": f"from-pub-date:{from_date}",
                "rows": rows,
                "sort": "published",
                "order": "desc",
                "mailto": "your-email@example.com"
            }
            try:
                resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
                resp.raise_for_status()
                articles = resp.json()["message"]["items"]
                for article in articles:
                    title = article.get("title", [""])[0]
                    abstract_raw = article.get("abstract", "")
                    abstract = strip_html_tags(abstract_raw)
                    date_parts = article.get("published-print", article.get("published-online", {})).get("date-parts", [[None]])
                    date_str = "-".join(map(str, date_parts[0])) if date_parts[0][0] else None
                    records.append({
                        "title": title,
                        "abstract": abstract,
                        "journal": journal_name,
                        "date_published": date_str
                    })
            except Exception as e:
                print(f"Error fetching {journal_name} / {issn}: {e}")
            sleep(1)  # Be polite to Crossref API
    return records

# Function 2: Convert to DataFrame
def articles_to_dataframe(records):
    return pd.DataFrame(records)

# Example usage:
records = fetch_articles_recent(months_back=2)
df = articles_to_dataframe(records)
df.head()


Error fetching Nature Structural & Molecular Biology / 1545-9993: 500 Server Error: Server Error for url: https://api.crossref.org/journals/1545-9993/works?filter=from-pub-date%3A2025-05-26&rows=10&sort=published&order=desc&mailto=your-email%40example.com
Error fetching Nature Structural & Molecular Biology / 1545-9985: 500 Server Error: Server Error for url: https://api.crossref.org/journals/1545-9985/works?filter=from-pub-date%3A2025-05-26&rows=10&sort=published&order=desc&mailto=your-email%40example.com
Error fetching PLoS Computational Biology / 1553-734X: 500 Server Error: Server Error for url: https://api.crossref.org/journals/1553-734X/works?filter=from-pub-date%3A2025-05-26&rows=10&sort=published&order=desc&mailto=your-email%40example.com


In [5]:
df

Unnamed: 0,title,abstract,journal,date_published
0,Why we need mandatory safeguards for emotional...,,Nature,2025-7-3
1,Protect Iran’s scientists from attacks,,Nature,2025-7-3
2,Human embryo research: how to move towards a 2...,,Nature,2025-7-3
3,It should be free to apply to graduate school,,Nature,2025-7-3
4,Global pandemic agreement needs sustained pres...,,Nature,2025-7-3
...,...,...,...,...
205,Emerging mechanisms underlying formaldehyde to...,,Molecular Cell,2025-6
206,Synaptic activity causes minute-scale changes ...,,Molecular Cell,2025-6
207,Non-canonical PRC1.1 licenses transcriptional ...,,Molecular Cell,2025-6
208,An NADH-controlled gatekeeper of ATP synthase,,Molecular Cell,2025-6


In [None]:
# Updated fetch_articles_recent with pagination
def fetch_articles_recent_all(months_back=2):
    from_date = (datetime.now() - timedelta(days=months_back * 30)).strftime("%Y-%m-%d")
    records = []
    rows = 1000  # Max allowed by Crossref
    for journal_name, issns in JOURNALS.items():
        for issn in issns:
            offset = 0
            while True:
                url = f"https://api.crossref.org/journals/{issn}/works"
                params = {
                    "filter": f"from-pub-date:{from_date}",
                    "rows": rows,
                    "offset": offset,
                    "sort": "published",
                    "order": "desc",
                    "mailto": "your-email@example.com"
                }
                try:
                    resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
                    resp.raise_for_status()
                    items = resp.json()["message"]["items"]
                    if not items:
                        break
                    for article in items:
                        title = article.get("title", [""])[0]
                        abstract_raw = article.get("abstract", "")
                        abstract = strip_html_tags(abstract_raw)
                        date_parts = article.get("published-print", article.get("published-online", {})).get("date-parts", [[None]])
                        date_str = "-".join(map(str, date_parts[0])) if date_parts[0][0] else None
                        records.append({
                            "title": title,
                            "abstract": abstract,
                            "journal": journal_name,
                            "date_published": date_str
                        })
                    offset += rows
                    sleep(1)  # Rate limit
                except Exception as e:
                    print(f"Error fetching {journal_name} / {issn}: {e}")
                    break
    return records

# Also update the weekly version
def fetch_articles_this_week_all():
    return fetch_articles_recent_all(months_back=0.25)  # Roughly 1 week

# Example usage
recent_all_records = fetch_articles_recent_all(months_back=1)
recent_all_df = articles_to_dataframe(recent_all_records)
recent_all_df.head()


In [10]:
recent_all_df.to_csv("2023-2025_papers.csv", index=False)