Dataset link

https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_datasets.html

PubMed link

https://pubmed.ncbi.nlm.nih.gov/

In [41]:
import cellxgene_census

# Census release to read.
CENSUS_VERSION = "2025-01-30"

# Open the Census as a context manager so the handle is closed as soon as the
# datasets table has been copied into a pandas DataFrame. After this block
# `census` is closed; reopen it if further Census queries are needed.
with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
    census_datasets = (
        census["census_info"]["datasets"]
        .read()
        .concat()
        .to_pandas()
        # for convenience, indexing on the soma_joinid which links this to other census data.
        .set_index("soma_joinid")
    )

census_datasets.head()

Unnamed: 0_level_0,citation,collection_id,collection_name,collection_doi,collection_doi_label,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,4eb29386-de81-452f-b3c0-e00844e8c7fd,f76861bb-becb-4eb7-82fc-782dc96ccc7f,Spatial transcriptomics in mouse: Puck_191112_05,4eb29386-de81-452f-b3c0-e00844e8c7fd.h5ad,10888
1,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,78d59e4a-82eb-4a61-a1dc-da974d7ea54b,7d7ec1b6-6e3f-4aaa-9442-4b22f3424396,Spatial transcriptomics in mouse: Puck_191112_08,78d59e4a-82eb-4a61-a1dc-da974d7ea54b.h5ad,10250
2,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,add5eb84-5fc9-4f01-982e-a346dd42ee82,de54aed8-4f73-48f6-9229-418a840e2d82,Spatial transcriptomics in mouse: Puck_191109_20,add5eb84-5fc9-4f01-982e-a346dd42ee82.h5ad,12906
3,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,b020294c-ab82-4547-b5a7-63d8ffa575ed,abe4fce1-0859-4a56-ad1e-734d79f0e6c8,Spatial transcriptomics in mouse: Puck_191112_13,b020294c-ab82-4547-b5a7-63d8ffa575ed.h5ad,15161
4,Publication: https://doi.org/10.1038/s41591-02...,a96133de-e951-4e2d-ace6-59db8b3bfb1d,HTAN/HTAPP Broad - Spatio-molecular dissection...,10.1038/s41591-024-03215-z,Klughammer et al. (2024) Nat Med,d7476ae2-e320-4703-8304-da5c42627e71,863fc5e4-bd4a-4681-9c3d-0ee7ef54e327,HTAPP-330-SMP-1082 scRNA-seq,d7476ae2-e320-4703-8304-da5c42627e71.h5ad,565


In [42]:
import re

import pandas as pd

# Captures the DOI from the "Publication: https://doi.org/<doi>" fragment of a
# citation string. The dot in "doi.org" is escaped so it only matches a literal dot.
DOI_PATTERN = re.compile(r'Publication:\s*https?://doi\.org/(\S+)')

# (soma_joinid, doi) for every dataset whose citation carries a publication DOI.
doi_links: list[tuple[int, str]] = []
# Iterate over the real index labels rather than range(len(...)): census_datasets
# is indexed by soma_joinid, and nothing guarantees those labels are 0..n-1.
for joinid, citation in census_datasets["citation"].items():
    if not isinstance(citation, str):
        # Missing citation (None/NaN): nothing to search.
        continue
    match = DOI_PATTERN.search(citation)
    if match:
        doi_links.append((joinid, match.group(1)))

print(f"Found {len(doi_links)} publication DOIs.")

# Many datasets share one publication; keep only the first soma_joinid per DOI.
duplicate_doi_series = pd.Series(dict(doi_links))
doi_series = duplicate_doi_series[~duplicate_doi_series.duplicated(keep='first')]
print(f"Found {len(doi_series)} unique publication DOIs.")
doi_series.head(10)

Found 1558 publication DOIs.
Found 236 unique publication DOIs.


0     10.1016/j.isci.2022.104097
4     10.1038/s41591-024-03215-z
6              10.1002/hep4.1854
7     10.1038/s41586-024-07944-6
9     10.1126/sciimmunol.abe6291
13    10.1038/s41588-023-01435-6
18    10.1038/s41593-020-00764-7
21     10.1101/2024.09.10.612293
22    10.1038/s41467-022-29450-x
24    10.1038/s41590-021-01059-0
dtype: object

In [43]:
import os
import csv

# Constants
# CSV file that accumulates one row per successfully processed publication.
OUTPUT_CSV = "methods_sections.csv"
# Contact address sent along with the NCBI ID-converter requests. Set the
# NCBI_EMAIL environment variable to use a real address; the original
# placeholder remains the fallback.
EMAIL = os.environ.get("NCBI_EMAIL", "test@example.com")
# Column order of OUTPUT_CSV.
CSV_FIELDS = ["soma_joinid", "doi", "title", "method_headers"]

# Write the header when the file is missing OR exists but is empty; a
# zero-byte file would otherwise stay header-less and break pd.read_csv later.
if not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0:
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()

In [44]:
import requests
from bs4 import BeautifulSoup
import time
import json

def get_pmcid_from_doi(doi: str, timeout: float = 30.0) -> str | None:
    """Look up the PMCID for a DOI through the NCBI ID-converter endpoint.

    Args:
        doi: DOI to convert, e.g. "10.1016/j.isci.2022.104097".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The "pmcid" attribute of the first <record> element in the response,
        or None when the record has no such attribute or the (successful)
        response contains no <record> at all.

    Raises:
        requests.RequestException: on connection problems or timeouts, and on
            an HTTP error status when the response carries no <record>.

    No rate limiting is applied here; callers issuing many requests may want
    to pause between calls.
    """
    # Pass the query as params so requests URL-encodes the DOI; DOIs may
    # contain characters such as '#', '&' or '+' that corrupt a hand-built URL.
    resp = requests.get(
        "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
        params={"tool": "test", "email": EMAIL, "ids": doi},
        timeout=timeout,
    )
    soup = BeautifulSoup(resp.text, "xml")
    record = soup.find("record")
    if record is None:
        # No record to read: surface HTTP failures as errors instead of
        # silently reporting "no PMCID"; a clean response simply has none.
        resp.raise_for_status()
        return None
    return record.get("pmcid")

def fetch_pmc_xml(pmcid: str, timeout: float = 60.0) -> BeautifulSoup:
    """Download the PMC OAI "GetRecord" XML for an article and parse it.

    Args:
        pmcid: PMC identifier; a leading "PMC" prefix (any letter case) is
            stripped because the OAI identifier uses only the remaining part.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body parsed with BeautifulSoup's "xml" parser.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (so an error page is never parsed as an article).
    """
    # Only strip the prefix when it is actually there, instead of blindly
    # dropping the first three characters.
    numeric_id = pmcid[3:] if pmcid.upper().startswith("PMC") else pmcid
    xml_url = f"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:{numeric_id}&metadataPrefix=pmc"
    resp = requests.get(xml_url, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "xml")

def extract_article_title(xml_soup: BeautifulSoup) -> str | None:
    # PubMed format
    alt_title = xml_soup.find("ArticleTitle")
    if alt_title:
        return alt_title.get_text(strip=True)
    
    # PMC full-text format
    article_title = xml_soup.find("article-title")
    if article_title:
        return article_title.get_text(strip=True)

    return None

def extract_methods_headers_only(xml_soup: BeautifulSoup) -> list[str]:
    """Collect the sub-section titles of an article's Methods section.

    A Methods section is a <sec> that is a direct child of <body> and whose
    own <title> contains "method" (case-insensitive). When several match, a
    warning is printed and the first one is used.

    Args:
        xml_soup: Parsed article XML.

    Returns:
        The stripped title text of each direct child <sec> of the Methods
        section that has a title of its own, in document order. An empty list
        is returned when the document has no <body>, no Methods section, or
        no titled sub-sections.
    """
    body = xml_soup.find("body")
    if body is None:
        # Records without full text have no <body>; report "no headers"
        # instead of failing with an AttributeError.
        return []

    methods_sections = []
    for sec in body.find_all("sec", recursive=False):
        # Only the section's own title counts; a recursive search would pick
        # up the title of a nested subsection when the section itself has none.
        title = sec.find("title", recursive=False)
        if title is not None and "method" in title.get_text(strip=True).lower():
            methods_sections.append(sec)

    if not methods_sections:
        return []
    if len(methods_sections) > 1:
        print("Warning: Multiple methods sections found, using the first one.")

    headers = []
    for sub_sec in methods_sections[0].find_all("sec", recursive=False):
        title = sub_sec.find("title", recursive=False)
        if title is not None:
            headers.append(title.get_text(strip=True))
    return headers

def append_headers_to_csv(path: str, records: list[dict], write_header: bool = False) -> None:
    """Append records to the results CSV.

    Args:
        path: CSV file to append to (created if it does not exist).
        records: Dicts with the keys "soma_joinid", "doi", "title" and
            "method_headers". The dicts are NOT modified.
        write_header: When True, write the header row before the records.

    Each row is written with "method_headers" serialised as a JSON string and
    with "title" stripped and its line breaks replaced by spaces.
    """
    with open(path, "a", newline="", encoding="utf-8") as f:
        # Column order must match the header written when the file was created.
        writer = csv.DictWriter(f, fieldnames=["soma_joinid", "doi", "title", "method_headers"])
        if write_header:
            writer.writeheader()
        for row in records:
            # Build a fresh dict for output instead of overwriting the caller's
            # values; mutating in place would double-encode method_headers if
            # the same record were ever written twice.
            writer.writerow({
                **row,
                "method_headers": json.dumps(row["method_headers"], ensure_ascii=False),
                "title": row["title"].strip().replace("\n", " ").replace("\r", " "),  # clean up line breaks
            })

In [46]:
import pandas as pd

# (soma_joinid, doi) pairs already present in the CSV, so a re-run skips them.
written_df = pd.read_csv(OUTPUT_CSV)
written_set = set(zip(written_df["soma_joinid"], written_df["doi"]))

# Lists to track rows with missing info
no_pmcid_list = []
no_title_list = []
no_headers_list = []
# Rows whose lookup raised, kept as (soma_joinid, "ExceptionType: message") so
# failures can be inspected or retried after the run instead of living only
# in the printed log.
error_list = []

for idx, doi in doi_series.items():
    if (idx, doi) in written_set:
        print(f"Row {idx} - Already processed.")
        continue

    try:
        pmcid = get_pmcid_from_doi(doi)
        if not pmcid:
            print(f"Row {idx} - No PMCID for DOI {doi}")
            no_pmcid_list.append(idx)
            continue

        xml_soup = fetch_pmc_xml(pmcid)

        title = extract_article_title(xml_soup)
        if not title:
            print(f"Row {idx} - No title found.")
            no_title_list.append(idx)
            continue

        headers = extract_methods_headers_only(xml_soup)
        if not headers:
            print(f"Row {idx} - No valid Methods section found.")
            no_headers_list.append(idx)
            continue

        record = [{
            "soma_joinid": idx,
            "doi": doi,
            "title": title,
            "method_headers": headers
        }]

        # Rows are appended one at a time so progress survives an interrupted run.
        append_headers_to_csv(OUTPUT_CSV, record, write_header=False)
        print(f"Row {idx} - Wrote {len(headers)} headers, for '{title[:50]}...'")
    except Exception as e:
        # Best effort: one failing DOI must not abort the whole run, but the
        # failure is recorded (with its exception type) rather than only printed.
        error_list.append((idx, f"{type(e).__name__}: {e}"))
        print(f"Row {idx} - Error: {e}")


Row 0 - Wrote 5 headers, for 'High-resolution Slide-seqV2 spatial transcriptomic...'
Row 4 - Wrote 32 headers, for 'A multi-modal single-cell and spatial expression m...'
Row 6 - Wrote 8 headers, for 'Single‐Cell, Single‐Nucleus, and Spatial RNA Seque...'
Row 7 - Wrote 34 headers, for 'A spatial human thymus cell atlas mapped to a cont...'
Row 9 - No PMCID for DOI 10.1126/sciimmunol.abe6291
Row 13 - Wrote 17 headers, for 'Spatiotemporal transcriptomic maps of whole mouse ...'
Row 18 - Wrote 15 headers, for 'Molecular characterization of selectively vulnerab...'
Row 21 - No title found.
Row 22 - Wrote 20 headers, for 'Single-cell Atlas of common variable immunodeficie...'
Row 24 - Wrote 22 headers, for 'Single-cell proteo-genomic reference maps of the h...'
Row 26 - No title found.
Row 31 - No PMCID for DOI 10.1016/j.celrep.2019.12.082
Row 40 - No PMCID for DOI 10.1016/j.cell.2022.11.005
Row 51 - No PMCID for DOI 10.1016/j.jhep.2023.12.023
Row 74 - Wrote 5 headers, for 'Longitudinal pro

In [None]:
# Reload the saved results with explicit column dtypes and preview the first rows.
# NOTE(review): the earlier comment on this cell mentioned removing duplicates,
# but no de-duplication is performed here.

import pandas as pd

column_dtypes = {"soma_joinid": int, "doi": str, "title": str, "method_headers": str}
df = pd.read_csv(OUTPUT_CSV, dtype=column_dtypes)
preview = df[list(column_dtypes)].head()
print(preview)

   soma_joinid                         doi  \
0            0  10.1016/j.isci.2022.104097   
1            4  10.1038/s41591-024-03215-z   
2            6           10.1002/hep4.1854   
3            7  10.1038/s41586-024-07944-6   
4           13  10.1038/s41588-023-01435-6   

                                               title  \
0  High-resolution Slide-seqV2 spatial transcript...   
1  A multi-modal single-cell and spatial expressi...   
2  Single‐Cell, Single‐Nucleus, and Spatial RNA S...   
3  A spatial human thymus cell atlas mapped to a ...   
4  Spatiotemporal transcriptomic maps of whole mo...   

                                      method_headers  
0  ["Key resources table", "Resource availability...  
1  ["Ethics statement", "Sample acquisition, hand...  
2  ["Preparation of Fresh Tissue Homogenates and ...  
3  ["Data generation by institute", "Sample proce...  
4  ["Animal work and embryo preparation (WT and K...  


In [70]:
# Table of the datasets that could not be processed, with the reason for each.
# Depends on no_pmcid_list / no_title_list / no_headers_list filled in by the
# processing loop, so that cell must have been run first.
#
# Columns available on census_datasets:
#   citation, collection_id, collection_name, collection_doi,
#   collection_doi_label, dataset_id, dataset_version_id,
#   dataset_title, dataset_h5ad_path, dataset_total_cell_count

bad_indices = sorted(set(no_pmcid_list + no_title_list + no_headers_list))

# Create mapping from index to label. "no_method_headers" means the parsed XML
# had no top-level <sec> under <body> whose title contains "method" with titled
# direct sub-sections. If an index appeared in several lists, the later entry
# below would win.
issue_lists = {
    "no_pmcid": no_pmcid_list,
    "no_title": no_title_list,
    "no_method_headers": no_headers_list,
}
labels = {}
for issue_type, indices in issue_lists.items():
    for i in indices:
        labels[i] = issue_type

# Convert to a Series keyed by soma_joinid so it aligns with the dataset index
issue_labels = pd.Series(labels, name="issue_type")

# Join with the filtered dataset (copy so census_datasets itself is not modified)
df_filtered = census_datasets.loc[bad_indices].copy()
df_filtered["issue_type"] = issue_labels
df_filtered[["collection_doi", "dataset_title", "issue_type"]].head(20)

Unnamed: 0_level_0,collection_doi,dataset_title,issue_type
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,10.1126/sciimmunol.abe6291,Human tonsil nonlymphoid cells scRNA,no_pmcid
21,10.1101/2024.09.10.612293,slide-seqV2 analysis of aorta,no_title
26,10.1016/j.immuni.2022.09.002,Normal Spleen OT1 Puck_200727_02,no_title
31,10.1016/j.celrep.2019.12.082,Myeloid cells of human eye,no_pmcid
40,10.1016/j.cell.2022.11.005,PNS,no_pmcid
51,10.1016/j.jhep.2023.12.023,Stellate cells from human healthy donor liver ...,no_pmcid
75,10.1002/pros.24020,Urethral luminal epithelia are castration-inse...,no_title
97,10.1038/s41588-021-00911-1,A single-cell and spatially-resolved atlas of ...,no_title
106,10.1126/science.adf1226,ClassPlacodes,no_pmcid
114,10.1158/2159-8290.cd-22-0824,UMAP of Myeloid cells,no_pmcid
