Dataset link

https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_datasets.html

PubMed link

https://pubmed.ncbi.nlm.nih.gov/

In [1]:
# Notebook-wide settings
EMAIL = "test@example.com"  # contact address included in the NCBI ID-converter request
BATCH_SLEEP = 1.5  # pause (seconds) after each PMC fetch, to avoid getting blocked
OUTPUT_CSV = "methods_sections.csv"  # CSV that extracted methods headers are appended to

In [2]:
import cellxgene_census

# Open the pinned census release only for as long as the datasets table is being
# read; the context manager closes the handle afterwards instead of leaking it.
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    census_datasets = (
        census["census_info"]["datasets"]
        .read()
        .concat()
        .to_pandas()
        # for convenience, indexing on the soma_joinid which links this to other census data.
        .set_index("soma_joinid")
    )

census_datasets.head()

Unnamed: 0_level_0,citation,collection_id,collection_name,collection_doi,collection_doi_label,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,4eb29386-de81-452f-b3c0-e00844e8c7fd,f76861bb-becb-4eb7-82fc-782dc96ccc7f,Spatial transcriptomics in mouse: Puck_191112_05,4eb29386-de81-452f-b3c0-e00844e8c7fd.h5ad,10888
1,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,78d59e4a-82eb-4a61-a1dc-da974d7ea54b,7d7ec1b6-6e3f-4aaa-9442-4b22f3424396,Spatial transcriptomics in mouse: Puck_191112_08,78d59e4a-82eb-4a61-a1dc-da974d7ea54b.h5ad,10250
2,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,add5eb84-5fc9-4f01-982e-a346dd42ee82,de54aed8-4f73-48f6-9229-418a840e2d82,Spatial transcriptomics in mouse: Puck_191109_20,add5eb84-5fc9-4f01-982e-a346dd42ee82.h5ad,12906
3,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,b020294c-ab82-4547-b5a7-63d8ffa575ed,abe4fce1-0859-4a56-ad1e-734d79f0e6c8,Spatial transcriptomics in mouse: Puck_191112_13,b020294c-ab82-4547-b5a7-63d8ffa575ed.h5ad,15161
4,Publication: https://doi.org/10.1038/s41591-02...,a96133de-e951-4e2d-ace6-59db8b3bfb1d,HTAN/HTAPP Broad - Spatio-molecular dissection...,10.1038/s41591-024-03215-z,Klughammer et al. (2024) Nat Med,d7476ae2-e320-4703-8304-da5c42627e71,863fc5e4-bd4a-4681-9c3d-0ee7ef54e327,HTAPP-330-SMP-1082 scRNA-seq,d7476ae2-e320-4703-8304-da5c42627e71.h5ad,565


In [3]:
import re
# Collect (soma_joinid, DOI) pairs for every dataset whose citation names a publication.
# The dot in "doi.org" is escaped so it only matches a literal dot.
DOI_PATTERN = re.compile(r'Publication:\s*https?://doi\.org/(\S+)')

doi_links = []
# Iterate over the actual index labels rather than range(len(...)): the frame is
# indexed by soma_joinid, which is not guaranteed to be exactly 0..n-1.
for joinid, citation in census_datasets["citation"].items():
    # A missing citation is not a string; skip it instead of crashing the regex search.
    if not isinstance(citation, str):
        continue
    match = DOI_PATTERN.search(citation)
    if match:
        doi_links.append((joinid, match.group(1)))

print(f"Found {len(doi_links)} publication DOIs.")
doi_links[:10]

Found 1558 publication DOIs.


[(0, '10.1016/j.isci.2022.104097'),
 (1, '10.1016/j.isci.2022.104097'),
 (2, '10.1016/j.isci.2022.104097'),
 (3, '10.1016/j.isci.2022.104097'),
 (4, '10.1038/s41591-024-03215-z'),
 (5, '10.1016/j.isci.2022.104097'),
 (6, '10.1002/hep4.1854'),
 (7, '10.1038/s41586-024-07944-6'),
 (8, '10.1016/j.isci.2022.104097'),
 (9, '10.1126/sciimmunol.abe6291')]

In [4]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import csv

# Constants
OUTPUT_CSV = "methods_sections.csv"
EMAIL = "test@example.com"
CSV_FIELDS = ["soma_joinid", "doi", "title", "method_headers"]

# Make sure the output file starts with a header row. A file that exists but is
# empty (zero bytes) is treated like a missing one; otherwise the later
# pd.read_csv(OUTPUT_CSV) would fail on a file without columns.
if not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0:
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()

In [5]:
def get_pmcid_from_doi(doi: str) -> str | None:
    """Look up the PMCID for a DOI through the NCBI ID-converter service.

    Args:
        doi: The DOI to convert, e.g. "10.1016/j.isci.2022.104097".

    Returns:
        The value of the ``pmcid`` attribute on the first ``<record>`` element of
        the response, or None when the response has no ``<record>`` element or the
        record carries no ``pmcid`` attribute.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP error status.
    """
    resp = requests.get(
        "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
        # Passing the query as params lets requests URL-encode the DOI and e-mail.
        params={"tool": "test", "email": EMAIL, "ids": doi},
        timeout=30,  # never hang the whole run on one stalled request
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "xml")
    record = soup.find("record")
    if record is None:
        # No <record> in the response: report "no PMCID" rather than raising AttributeError.
        return None
    return record.get("pmcid")

def fetch_pmc_xml(pmcid: str) -> BeautifulSoup:
    """Download one article record from the PMC OAI endpoint and parse it as XML.

    Args:
        pmcid: PMC identifier. A leading "PMC" prefix is stripped when present;
            an identifier without the prefix is used unchanged.

    Returns:
        The response body parsed with BeautifulSoup's "xml" parser.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP error status.
    """
    # Only drop the first three characters when they really are the "PMC" prefix.
    numeric_id = pmcid[3:] if pmcid.upper().startswith("PMC") else pmcid
    xml_url = f"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:{numeric_id}&metadataPrefix=pmc"
    try:
        resp = requests.get(xml_url, timeout=30)
        resp.raise_for_status()
    finally:
        # Pause after every attempt, successful or not, to keep the request rate down.
        time.sleep(BATCH_SLEEP)
    return BeautifulSoup(resp.content, "xml")

def extract_article_title(xml_soup: BeautifulSoup) -> str | None:
    """Return the article title found in the parsed XML, or None if there is none.

    Two element names are tried in order and the first hit wins:
    ``ArticleTitle`` (PubMed format), then ``article-title`` (PMC full-text format).
    The returned text has surrounding whitespace stripped.
    """
    for tag_name in ("ArticleTitle", "article-title"):
        node = xml_soup.find(tag_name)
        if node:
            return node.get_text(strip=True)
    return None

def extract_methods_headers_only(xml_soup: BeautifulSoup) -> list[str]:
    """Return the sub-section titles of the article's Methods section.

    A top-level ``<sec>`` under ``<body>`` counts as a Methods section when its
    title contains "method" (case-insensitive).

    Args:
        xml_soup: Parsed article XML.

    Returns:
        Titles of the direct child ``<sec>`` elements of the Methods section, in
        document order. An empty list when the document has no ``<body>``, or
        when there is not exactly one matching Methods section.
    """
    body = xml_soup.find("body")
    if body is None:
        # No <body> element in the parsed document: nothing to extract.
        return []

    methods_secs = []
    for sec in body.find_all("sec", recursive=False):
        sec_title = sec.find("title")
        if sec_title and "method" in sec_title.get_text(strip=True).lower():
            methods_secs.append(sec)

    # Zero or several candidates are ambiguous, so report nothing.
    if len(methods_secs) != 1:
        return []

    headers = []
    for sub_sec in methods_secs[0].find_all("sec", recursive=False):
        title = sub_sec.find("title")
        if title:
            headers.append(title.get_text(strip=True))
    return headers

def append_headers_to_csv(path: str, records: list[dict], write_header: bool = False):
    """Append records to a CSV file, one row per record.

    Args:
        path: CSV file to append to (created if it does not exist).
        records: Dicts with the keys "soma_joinid", "doi", "title" and
            "method_headers". The dicts are not modified.
        write_header: When True, write the column header row before the records.

    Each row stores "method_headers" as a JSON string and "title" stripped, with
    line breaks replaced by spaces. A None title is written as an empty string.
    """
    with open(path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["soma_joinid", "doi", "title", "method_headers"])
        if write_header:
            writer.writeheader()
        for record in records:
            # Work on a copy so the caller's dict keeps its original list and title.
            row = dict(record)
            row["method_headers"] = json.dumps(row["method_headers"], ensure_ascii=False)
            row["title"] = (row["title"] or "").strip().replace("\n", " ").replace("\r", " ")  # clean up line breaks
            writer.writerow(row)

In [6]:
# Resume support: (soma_joinid, doi) pairs already in the output CSV are skipped.
written_df = pd.read_csv(OUTPUT_CSV)
written_set = set(zip(written_df["soma_joinid"], written_df["doi"]))

MAX_ROWS = 100  # how many doi_links entries to process; set to None to process all

# Many datasets cite the same publication, so each DOI's outcome is remembered and
# the remote services are queried at most once per DOI within a run.
# Values are (title, headers, skip_reason); skip_reason is None on success.
doi_results = {}


def _lookup_doi(doi):
    """Fetch title and methods headers for one DOI.

    Returns a (title, headers, skip_reason) tuple. skip_reason is None on success,
    otherwise it is the message explaining why the DOI yields no CSV row.
    Exceptions from the fetch/parse helpers propagate to the caller.
    """
    pmcid = get_pmcid_from_doi(doi)
    if not pmcid:
        return None, [], f"No PMCID for DOI {doi}"

    xml_soup = fetch_pmc_xml(pmcid)

    title = extract_article_title(xml_soup)
    if not title:
        return None, [], "No title found."

    headers = extract_methods_headers_only(xml_soup)
    if not headers:
        return title, [], "No valid Methods section found."

    return title, headers, None


# Main processing loop (adjust MAX_ROWS above to change how many rows are handled)
for idx, doi in doi_links[:MAX_ROWS]:
    if (idx, doi) in written_set:
        print(f"Row {idx} - Already processed.")
        continue

    try:
        # Only completed lookups are cached; a lookup that raised is retried
        # the next time the same DOI comes up.
        if doi not in doi_results:
            doi_results[doi] = _lookup_doi(doi)
        title, headers, skip_reason = doi_results[doi]

        if skip_reason:
            print(f"Row {idx} - {skip_reason}")
            continue

        record = [{
            "soma_joinid": idx,
            "doi": doi,
            "title": title,
            "method_headers": headers
        }]

        append_headers_to_csv(OUTPUT_CSV, record, write_header=False)
        print(f"Row {idx} - Wrote {len(headers)} headers, for '{title[:50]}...'")
    except Exception as e:
        # Best effort: report the failure for this row and keep going.
        print(f"Row {idx} - Error: {e}")


Row 0 - Already processed.
Row 1 - Already processed.
Row 2 - Already processed.
Row 3 - Already processed.
Row 4 - Already processed.
Row 5 - Already processed.
Row 6 - Already processed.
Row 7 - Already processed.
Row 8 - Already processed.
Row 9 - No PMCID for DOI 10.1126/sciimmunol.abe6291
Row 10 - Already processed.
Row 11 - Already processed.
Row 12 - Already processed.
Row 13 - Already processed.
Row 14 - Already processed.
Row 15 - Already processed.
Row 16 - Already processed.
Row 17 - Already processed.
Row 18 - Already processed.
Row 19 - Already processed.
Row 20 - Already processed.
Row 21 - No title found.
Row 22 - Already processed.
Row 23 - Already processed.
Row 24 - Already processed.
Row 25 - Already processed.
Row 26 - No title found.
Row 27 - Already processed.
Row 28 - Already processed.
Row 29 - Already processed.
Row 30 - No title found.
Row 31 - No PMCID for DOI 10.1016/j.celrep.2019.12.082
Row 32 - Already processed.
Row 33 - Already processed.
Row 34 - Alread

In [7]:
import pandas as pd

# Load the accumulated rows with every column read as text, keep the first row
# seen for each DOI, preview the result and save it to its own CSV.
_columns = ["soma_joinid", "doi", "title", "method_headers"]

df = pd.read_csv(OUTPUT_CSV, dtype=dict.fromkeys(_columns, str))
unique_df = df.drop_duplicates(subset="doi", keep="first")

_preview = unique_df[_columns].head()
print(_preview)

unique_df.to_csv("unique_method_sections.csv", index=False, encoding="utf-8")

   soma_joinid                         doi  \
0            0  10.1016/j.isci.2022.104097   
4            4  10.1038/s41591-024-03215-z   
6            6           10.1002/hep4.1854   
7            7  10.1038/s41586-024-07944-6   
12          13  10.1038/s41588-023-01435-6   

                                                title  \
0   High-resolution Slide-seqV2 spatial transcript...   
4   A multi-modal single-cell and spatial expressi...   
6   Single‐Cell, Single‐Nucleus, and Spatial RNA S...   
7   A spatial human thymus cell atlas mapped to a ...   
12  Spatiotemporal transcriptomic maps of whole mo...   

                                       method_headers  
0   ["Key resources table", "Resource availability...  
4   ["Ethics statement", "Sample acquisition, hand...  
6   ["Preparation of Fresh Tissue Homogenates and ...  
7   ["Data generation by institute", "Sample proce...  
12  ["Animal work and embryo preparation (WT and K...  
