# GEO 肿瘤单细胞数据：按步骤下载（metadata + 表达矩阵）

这个 Notebook 采用 **“宽进 → MINiML 二次判定 → 只下载矩阵文件”** 的策略，尽量减少漏掉肿瘤单细胞数据集。

你会得到：
- 候选 GSE 列表（来自 NCBI 官方 E-utilities / db=gds）
- 每个候选的 `GSE*_family.xml.tgz`（MINiML metadata）
- 本地判定为“像单细胞”的 GSE（带打分与命中关键词）
- 只下载 `suppl/` 里“矩阵类”文件（mtx/h5/h5ad/loom/rds/csv/tsv 等）

> 注意：RNA-seq 的 raw reads 通常在 SRA（FASTQ），本 Notebook 不下载 raw reads。


## 0. 安装依赖（只需一次）

在终端或 Notebook 里运行（如果你环境已经有，就跳过）：

In [None]:
# 如果你是在 Jupyter 里，也可以取消注释这行安装
# %pip install -q requests lxml tqdm

## 1. 基本配置：输出目录、NCBI 参数

- `OUT_DIR`：你希望保存到哪里
- `EMAIL`：建议填真实邮箱（NCBI 更友好）
- `NCBI_API_KEY`：可选（有的话速率限制更宽松）

In [3]:
import os, time, re, math
from pathlib import Path

OUT_DIR = Path.home() / "Projects" / "BigData" / "GEO_scData_tumor"
META_DIR = OUT_DIR / "metadata_miniml"
MATRIX_DIR = OUT_DIR / "matrices_suppl"

OUT_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)
MATRIX_DIR.mkdir(parents=True, exist_ok=True)

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
GEO_SERIES_BASE = "https://ftp.ncbi.nlm.nih.gov/geo/series"

TOOL = "geo_sc_notebook"
EMAIL = os.getenv("NCBI_EMAIL", "jiaxinhe@sjtu.edu.cn")   # 建议改成你真实邮箱
NCBI_API_KEY = os.getenv("NCBI_API_KEY", "")         # 可选：export NCBI_API_KEY=xxxx

print("OUT_DIR:", OUT_DIR)
print("EMAIL:", EMAIL)
print("API_KEY set?:", bool(NCBI_API_KEY))

OUT_DIR: /Users/jaxonhe/Projects/BigData/GEO_scData_tumor
EMAIL: jiaxinhe@sjtu.edu.cn
API_KEY set?: False


## 2. 定义“宽进”的检索式（肿瘤 + 单细胞）

你可以自行修改 `QUERY_EXTRA` 来加特定肿瘤类型（比如 glioblastoma、AML、NSCLC 等）。

这里我们限定：
- `gse[ETYP]`：只要 GSE（Series）
- 关键词：肿瘤 + 单细胞（尽量宽，减少漏）

In [None]:
TUMOR_PART = "(cancer OR tumor OR tumour OR neoplasm OR carcinoma OR leukemia OR lymphoma OR melanoma OR glioma OR sarcoma OR metastasis)"
SC_PART = '("single cell" OR scRNA-seq OR snRNA-seq OR "single-nucleus" OR 10x OR Chromium OR Drop-seq OR Smart-seq)'

QUERY_EXTRA = ""   # 例如: 'glioblastoma OR AML OR "non small cell"'

QUERY = f"{TUMOR_PART} AND {SC_PART} AND gse[ETYP]"
if QUERY_EXTRA.strip():
    QUERY = f"({QUERY}) AND ({QUERY_EXTRA})"

print("Entrez query:", QUERY)

Entrez query: (cancer OR tumor OR tumour OR neoplasm OR carcinoma OR leukemia OR lymphoma OR melanoma OR glioma OR sarcoma OR metastasis) AND ("single cell" OR scRNA-seq OR snRNA-seq OR "single-nucleus" OR 10x OR Chromium OR Drop-seq OR Smart-seq) AND gse[ETYP]


In [9]:
# 改变策略，先筛选所有的单细胞数据集，然后在结果里找肿瘤/衰老/血管/神经科学
SC_PART = (
  '("single cell" OR "single-cell" OR scRNA-seq OR snRNA-seq OR "single nucleus" OR "single-nucleus" '
  'OR 10x OR "10x Genomics" OR Chromium OR droplet OR UMI OR barcode OR barcodes '
  'OR Drop-seq OR Smart-seq OR SmartSeq OR inDrops OR "Seq-Well" OR "CEL-Seq" '
  'OR CITE-seq OR "Multiome" OR "scATAC" OR "single-cell ATAC")'
)

QUERY = f"{SC_PART} AND gse[ETYP]"
print("Entrez query:", QUERY)

Entrez query: ("single cell" OR "single-cell" OR scRNA-seq OR snRNA-seq OR "single nucleus" OR "single-nucleus" OR 10x OR "10x Genomics" OR Chromium OR droplet OR UMI OR barcode OR barcodes OR Drop-seq OR Smart-seq OR SmartSeq OR inDrops OR "Seq-Well" OR "CEL-Seq" OR CITE-seq OR "Multiome" OR "scATAC" OR "single-cell ATAC") AND gse[ETYP]


## 3. 用 NCBI E-utilities 搜索（ESearch → UID 列表）

这一步只拿到“命中记录的 UID”。下一步用 ESummary 才能拿到 GSE 号。

In [10]:
import requests
from lxml import etree
from tqdm import tqdm

session = requests.Session()

def eutils_get(endpoint: str, params: dict, timeout=60, max_retries=6):
    """NCBI E-utilities GET（带 tool/email/api_key + 重试）"""
    params = dict(params)
    params["tool"] = TOOL
    params["email"] = EMAIL
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    url = f"{EUTILS}/{endpoint}"
    backoff = 1.8
    for i in range(max_retries):
        r = session.get(url, params=params, timeout=timeout)
        if r.status_code == 200:
            return r.text
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(backoff ** i)
            continue
        r.raise_for_status()
    raise RuntimeError(f"Failed after retries: {url}")

def polite_sleep():
    time.sleep(0.12 if NCBI_API_KEY else 0.35)

def esearch_gds(term: str, retmax=50000):
    xml = eutils_get("esearch.fcgi", {
        "db": "gds",
        "term": term,
        "retmax": str(retmax),
        "retmode": "xml"
    })
    root = etree.fromstring(xml.encode("utf-8"))
    return root.xpath("//IdList/Id/text()")

uids = esearch_gds(QUERY, retmax=50000)
print("UID hits:", len(uids))
uids[:10]

UID hits: 43015


['200315372',
 '200308970',
 '200303866',
 '200300758',
 '200285948',
 '200285521',
 '200293983',
 '200288573',
 '200315445',
 '200308557']

## 4. 用 ESummary 把 UID 转成 GSE 列表（Accession + Title）

我们会：
- 分批（batch）拉取
- 只保留 Accession 以 `GSE` 开头的记录
- 保存为 `candidates_gse.tsv`

In [None]:
def esummary_gds(uids, batch=50):
    rows = []
    for i in tqdm(range(0, len(uids), batch), desc="ESummary batches"):
        chunk = uids[i:i+batch]
        xml = eutils_get("esummary.fcgi", {
            "db": "gds",
            "id": ",".join(chunk),
            "retmode": "xml"
        }, timeout=90)
        root = etree.fromstring(xml.encode("utf-8"))
        for doc in root.xpath("//DocSum"):
            acc = "".join(doc.xpath("./Item[@Name='Accession']/text()")).strip()
            title = "".join(doc.xpath("./Item[@Name='title']/text()")).strip()
            if acc.startswith("GSE"):
                rows.append((acc, title))
        polite_sleep()
    return rows

rows = esummary_gds(uids)
# 去重
seen = set()
gse_list = []
for acc, title in rows:
    if acc not in seen:
        seen.add(acc)
        gse_list.append((acc, title))

# 只运行一次，如果有candidate_gse.csv就不需要再运行
print("Candidate GSE:", len(gse_list))
gse_list[:5]

ESummary batches: 100%|██████████| 861/861 [16:58<00:00,  1.18s/it]

Candidate GSE: 43015





[('GSE315372',
  'Plasminogen lipid cargo drives macrophage TLR2 activation independent of protease activity'),
 ('GSE308970',
  'Multi-tissue transcriptomic aging atlas reveals predictive aging biomarkers in the killifish'),
 ('GSE303866',
  '35 somites tail single cell RNAseq in control and homocysteine treated chicken embryo'),
 ('GSE300758',
  'Clariom D Pico gene array of microdissected lipid droplet (LD)+ and LD- human GBM areas.'),
 ('GSE285948',
  'TRF2 downregulation acts as an upstream driver of heart failure')]

In [None]:
# 保存候选列表
cand_path = OUT_DIR / "candidates_gse.tsv"
with cand_path.open("w", encoding="utf-8") as f:
    f.write("GSE\tTitle\n")
    for acc, title in gse_list:
        f.write(f"{acc}\t{title}\n")

print("Saved:", cand_path)

Saved: /Users/jaxonhe/Projects/BigData/GEO_scData_tumor/candidates_gse.tsv


In [4]:
from pathlib import Path

cand_path = OUT_DIR / "candidates_gse.tsv"

gse_list = []  # 统一用这个变量，后续代码不需要改

if cand_path.exists() and cand_path.stat().st_size > 0:
    # 直接读取已有候选列表（第二次/以后运行走这里）
    with cand_path.open("r", encoding="utf-8") as f:
        next(f)  # 跳过表头
        for line in f:
            parts = line.rstrip("\n").split("\t", 1)
            if len(parts) == 2:
                acc, title = parts
            else:
                acc, title = parts[0], ""
            if acc.startswith("GSE"):
                gse_list.append((acc, title))
    print(f"Loaded candidates from file: {cand_path}")
    print("Candidate GSE:", len(gse_list))
else:
    # 文件不存在/为空，就重新跑 ESearch + ESummary 生成它
    print("candidates_gse.tsv not found, running ESearch/ESummary ...")
    uids = esearch_gds(QUERY, retmax=50000)
    rows = esummary_gds(uids, batch=200)

    seen = set()
    for acc, title in rows:
        if acc.startswith("GSE") and acc not in seen:
            seen.add(acc)
            gse_list.append((acc, title))

    with cand_path.open("w", encoding="utf-8") as f:
        f.write("GSE\tTitle\n")
        for acc, title in gse_list:
            f.write(f"{acc}\t{title}\n")

    print(f"Saved candidates to: {cand_path}")
    print("Candidate GSE:", len(gse_list))

Loaded candidates from file: /Users/jaxonhe/Projects/BigData/GEO_scData_tumor/candidates_gse.tsv
Candidate GSE: 43015


## 5. 生成每个 GSE 的 MINiML / suppl URL

GEO 的 series 目录使用“千位分桶”规则：
- `GSE20329` → `.../series/GSE20nnn/GSE20329/...`

我们先构造：
- `MINiML family`：metadata
- `suppl/`：矩阵文件通常在这里
- （可选）`SeriesMatrix`：单细胞不一定有，但也能试试

In [11]:
def bucket_for_gse(acc: str) -> str:
    n = int(acc[3:])
    return f"GSE{n//1000}nnn"

def gse_root(acc: str) -> str:
    return f"{GEO_SERIES_BASE}/{bucket_for_gse(acc)}/{acc}"

def gse_urls(acc: str) -> dict:
    root = gse_root(acc)
    return {
        "miniml_family": f"{root}/miniml/{acc}_family.xml.tgz",
        "suppl_dir": f"{root}/suppl/",
        "series_matrix": f"{root}/matrix/{acc}_series_matrix.txt.gz",
        "raw_tar": f"{root}/suppl/{acc}_RAW.tar",
    }

# 看看一个例子
example = gse_list[0][0]
example, gse_urls(example)

('GSE315372',
 {'miniml_family': 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE315nnn/GSE315372/miniml/GSE315372_family.xml.tgz',
  'suppl_dir': 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE315nnn/GSE315372/suppl/',
  'series_matrix': 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE315nnn/GSE315372/matrix/GSE315372_series_matrix.txt.gz',
  'raw_tar': 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE315nnn/GSE315372/suppl/GSE315372_RAW.tar'})

## 6A. 用候选列表做肿瘤/衰老/血管/神经筛选

In [5]:
# 从候选 gse_list（含 Title）先筛肿瘤：速度快，先把 40k 压到更小
TUMOR_WORDS = [
    "cancer","tumor","tumour","neoplasm","carcinoma","malignan","metasta",
    "leukemia","lymphoma","melanoma","glioma","sarcoma","myeloma",
    "adenocarcinoma","oncology","pdac","hcc","nsclc","gbm"
]
# 常见癌种/组织名（减少漏）
TUMOR_TYPES = [
    "breast","lung","colon","colorectal","gastric","stomach","pancrea",
    "liver","hepat","kidney","renal","bladder","prostate","ovarian","cervic",
    "thyroid","brain","glioblast","melanoma"
]

def has_tumor_signal(title: str) -> bool:
    t = (title or "").lower()
    return any(w in t for w in (TUMOR_WORDS + TUMOR_TYPES))

tumor_candidates = [(acc, title) for acc, title in gse_list if has_tumor_signal(title)]
print("Tumor candidates (by Title):", len(tumor_candidates))
tumor_candidates[:10]

Tumor candidates (by Title): 13325


[('GSE300758',
  'Clariom D Pico gene array of microdissected lipid droplet (LD)+ and LD- human GBM areas.'),
 ('GSE273146',
  'Patient-derived liver organoids recapitulate epithelial heterogeneity and enable precision disease modelling of alcohol-associated liver disease'),
 ('GSE314072',
  'Functionally heterogeneous intratumoral CD4+CD8+ double positive T cells can give rise to single positive T cells [scRNA-seq + scTCR-seq]'),
 ('GSE314071',
  'Functionally heterogeneous intratumoral CD4+CD8+ double positive T cells can give rise to single positive T cells [bulkRNA-seq]'),
 ('GSE311064',
  'Cellular senescence in human liver under normal aging and cancer'),
 ('GSE310392',
  'Cellular senescence in human liver under normal aging and cancer [Xenium]'),
 ('GSE304005',
  'CBTi-seq profiling spatially resolved single-cell transcriptomics in mouse brain functional areas'),
 ('GSE303934',
  'Dissecting transcriptional heterogeneity in sinonasal adenocarcinoma by single-cell RNA sequencing

In [6]:
# 保存癌症候选列表
out = OUT_DIR / "tumor_candidates_by_title.tsv"
with out.open("w", encoding="utf-8") as f:
    f.write("GSE\tTitle\n")
    for acc, title in tumor_candidates:
        f.write(f"{acc}\t{title}\n")
print("Saved:", out)

Saved: /Users/jaxonhe/Projects/BigData/GEO_scData_tumor/tumor_candidates_by_title.tsv


## 6B. 下载 MINiML（metadata）并做“单细胞打分”

### 为什么要打分？
因为 GEO 的标注不统一，单靠检索式容易漏。我们下载 MINiML 后在本地扫描一些“强单细胞”关键词。

你可以调 `MIN_SCORE`：
- 更低：更不容易漏，但会混入更多 bulk，需要你再人工扫一眼
- 更高：更干净，但可能漏一些写得含糊的项目

In [7]:
from pathlib import Path
from tqdm import tqdm
import re

# ① 如果你 6A 的结果保存在文件里（推荐），就从文件读
tumor_path = OUT_DIR / "tumor_candidates_by_title.tsv"  # 你的 6A 输出文件名
tumor_gse = []

with tumor_path.open("r", encoding="utf-8") as f:
    next(f)  # skip header
    for line in f:
        acc = line.split("\t", 1)[0].strip()
        if acc.startswith("GSE"):
            tumor_gse.append(acc)

tumor_gse = sorted(set(tumor_gse))
print("6A tumor candidates:", len(tumor_gse))
tumor_gse[:10]

6A tumor candidates: 13325


['GSE100033',
 'GSE100050',
 'GSE100107',
 'GSE100108',
 'GSE100189',
 'GSE100198',
 'GSE100223',
 'GSE100313',
 'GSE100337',
 'GSE100361']

In [9]:
import tarfile

SC_KEYWORDS = [
    "single cell", "single-cell", "scrna", "scrna-seq", "snrna", "snrna-seq",
    "single nucleus", "single-nucleus", "10x", "chromium", "drop-seq", "smart-seq",
    "umi", "cell barcode", "barcodes", "droplet", "microwell"
]

MIN_SCORE = 3   # 推荐 2（更保守不漏）；如果太多误报可调到 3

def download_with_resume(url: str, out_path: Path, timeout=180):
    """HTTP Range 断点续传；返回 OK/404/FAIL"""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    for attempt in range(6):
        try:
            headers = {}
            mode = "wb"
            existing = out_path.stat().st_size if out_path.exists() else 0
            if existing > 0:
                headers["Range"] = f"bytes={existing}-"
                mode = "ab"

            with session.get(url, stream=True, timeout=timeout, headers=headers) as r:
                if r.status_code == 404:
                    return "404"
                r.raise_for_status()
                # Range 不被支持时，服务器可能返回 200，重新从头下更安全
                if "Range" in headers and r.status_code == 200:
                    mode = "wb"
                with out_path.open(mode, "ab" if mode=="ab" else "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        if chunk:
                            f.write(chunk)
            return "OK"
        except Exception:
            time.sleep(1.8 ** attempt)
    return "FAIL"

def extract_xml_from_tgz(tgz_path: Path):
    """从 .xml.tgz 中取出第一个 .xml 文件内容（bytes）"""
    try:
        with tarfile.open(tgz_path, "r:gz") as tf:
            for m in tf.getmembers():
                if m.isfile() and m.name.lower().endswith(".xml"):
                    f = tf.extractfile(m)
                    if f:
                        return f.read()
    except Exception:
        return None
    return None

def xml_to_text_lower(xml_bytes: bytes) -> str:
    try:
        root = etree.fromstring(xml_bytes)
        text = " ".join(root.xpath("//text()"))
        return re.sub(r"\s+", " ", text).lower()
    except Exception:
        return ""

def sc_score(text_lc: str):
    score = 0
    matched = []
    for kw in SC_KEYWORDS:
        if kw in text_lc:
            matched.append(kw)
            if kw in ("single cell", "single-cell", "scrna-seq", "snrna-seq"):
                score += 3
            elif kw in ("umi", "cell barcode", "barcodes", "droplet"):
                score += 2
            else:
                score += 1
    return score, matched

In [12]:
# 为了先跑通，建议先在小样本上测试，比如前 50 个
TEST_N = min(50, len(gse_list))
to_check = [acc for acc, _ in gse_list[:TEST_N]]

accepted = []
rejected = []

for acc in tqdm(to_check, desc="MINiML download+score"):
    urls = gse_urls(acc)
    tgz_path = META_DIR / acc / f"{acc}_family.xml.tgz"
    st = download_with_resume(urls["miniml_family"], tgz_path)
    if st != "OK":
        rejected.append((acc, -1, st))
        continue

    xml_bytes = extract_xml_from_tgz(tgz_path)
    if not xml_bytes:
        rejected.append((acc, -1, "no_xml"))
        continue

    text_lc = xml_to_text_lower(xml_bytes)
    score, matched = sc_score(text_lc)

    if score >= MIN_SCORE:
        accepted.append((acc, score, matched))
    else:
        rejected.append((acc, score, "low_score"))

len(accepted), len(rejected), accepted[:3]

MINiML download+score: 100%|██████████| 50/50 [34:24<00:00, 41.29s/it]


(0, 50, [])

把测试跑通后，你可以把 `TEST_N` 改成全量（`len(gse_list)`），并把结果写到文件。

In [None]:
# 保存测试阶段的结果
acc_path = OUT_DIR / "accepted_singlecell_TEST.tsv"
rej_path = OUT_DIR / "rejected_TEST.tsv"

with acc_path.open("w", encoding="utf-8") as f:
    f.write("GSE\tScore\tMatchedKeywords\n")
    for acc, score, matched in sorted(accepted, key=lambda x: (-x[1], x[0])):
        f.write(f"{acc}\t{score}\t{';'.join(matched)}\n")

with rej_path.open("w", encoding="utf-8") as f:
    f.write("GSE\tScore\tReason\n")
    for acc, score, reason in rejected:
        f.write(f"{acc}\t{score}\t{reason}\n")

print("Saved:", acc_path)
print("Saved:", rej_path)

## 7. 对“判定为单细胞”的 GSE：只下载 suppl 里矩阵类文件

这里我们做两件事：
1) 访问 `.../suppl/` 的目录 listing（HTML）拿到文件名
2) 用白名单规则只下载矩阵文件（mtx/h5/h5ad/loom/rds/csv/tsv/txt 等）

> 这一步不会下载 SRA raw reads。

In [None]:
from lxml import html as lxml_html

MATRIX_PATTERNS = [
    r"(matrix\.mtx(\.gz)?)$",
    r"(barcodes\.tsv(\.gz)?)$",
    r"(features\.tsv(\.gz)?)$",
    r"(genes\.tsv(\.gz)?)$",
    r"(\.h5ad)$",
    r"(\.loom)$",
    r"(\.h5)$",
    r"(\.rds)$",
    r"(\.rda)$",
    r"(count|counts|umi|expression|matrix).*?\.(csv|tsv|txt)(\.gz)?$",
]

def list_suppl_files(suppl_url: str):
    r = session.get(suppl_url, timeout=60)
    if r.status_code == 404:
        return []
    r.raise_for_status()
    doc = lxml_html.fromstring(r.text)
    hrefs = doc.xpath("//a/@href")
    files = []
    for h in hrefs:
        h = h.strip()
        if not h or h.endswith("/") or h in (".", "..", "../"):
            continue
        files.append(h)
    # 去重
    out, seen = [], set()
    for f in files:
        if f not in seen:
            seen.add(f)
            out.append(f)
    return out

def is_matrix_file(fn: str) -> bool:
    fn_l = fn.lower()
    return any(re.search(p, fn_l) for p in MATRIX_PATTERNS)

In [None]:
# 先对一个 GSE 试试（取第一个 accepted）
if not accepted:
    raise RuntimeError("No accepted GSE in test run; lower MIN_SCORE or increase TEST_N.")

acc0 = accepted[0][0]
urls0 = gse_urls(acc0)
files0 = list_suppl_files(urls0["suppl_dir"])

print("Example accepted GSE:", acc0)
print("Suppl files (first 30):")
print(files0[:30])

matrix_like = [f for f in files0 if is_matrix_file(f)]
print("\nMatrix-like files:")
print(matrix_like[:30])

如果 `matrix_like` 是空的，常见原因：
- 作者把矩阵打进了 `GSE*_RAW.tar`
- 或者矩阵在更深的子目录/其它链接（GEO listing 不一定直接列出）

你可以选择：
- 只下载 `GSE*_RAW.tar`，然后在本地解包提取 10x 三件套
- 或者把白名单放宽（例如下载所有 `.gz` 然后再筛）

In [None]:
# 下载该 GSE 的 matrix-like 文件（示例：只下载这个 acc0）
jobs = []
for fn in matrix_like:
    url = urls0["suppl_dir"] + fn
    out = MATRIX_DIR / acc0 / fn
    jobs.append((url, out))

print("Jobs:", len(jobs))
jobs[:3]

In [None]:
# 执行下载（示例：单线程，便于你观察）
ok = fail = skip404 = 0
for url, out in tqdm(jobs, desc="download matrix-like files"):
    st = download_with_resume(url, out, timeout=240)
    if st == "OK":
        ok += 1
    elif st == "404":
        skip404 += 1
    else:
        fail += 1

print("OK:", ok, "404:", skip404, "FAIL:", fail)
print("Saved under:", MATRIX_DIR / acc0)

## 8. 扩展到批量：把 accepted 列表都跑一遍

把下面 cell 里的 `accepted` 换成你全量跑出来的 accepted（不是 TEST）。

建议：
- 并发下载可以用 `ThreadPoolExecutor`（文件多时更快）
- 但别开太大并发（4~8 够了）

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_jobs_for_gse(acc: str):
    urls = gse_urls(acc)
    try:
        files = list_suppl_files(urls["suppl_dir"])
    except Exception:
        files = []
    matrix_like = [f for f in files if is_matrix_file(f)]
    jobs = [(urls["suppl_dir"] + fn, MATRIX_DIR / acc / fn) for fn in matrix_like]
    return jobs

# 这里用测试阶段的 accepted 演示
all_jobs = []
for acc, score, matched in accepted:
    all_jobs.extend(download_jobs_for_gse(acc))

print("Total matrix-like download jobs:", len(all_jobs))

# 并发下载
workers = 4
ok = fail = skip404 = 0
with ThreadPoolExecutor(max_workers=workers) as ex:
    futs = [ex.submit(download_with_resume, url, out, 240) for url, out in all_jobs]
    for fut in tqdm(as_completed(futs), total=len(futs), desc="FILES"):
        st = fut.result()
        if st == "OK":
            ok += 1
        elif st == "404":
            skip404 += 1
        else:
            fail += 1

print("OK:", ok, "404:", skip404, "FAIL:", fail)
print("Matrices saved under:", MATRIX_DIR)

## 9. 下一步（你跑通后再做）

- 把 `TEST_N` 改成全量：`TEST_N = len(gse_list)`
- 如果漏得多：降低 `MIN_SCORE` 或增加 `SC_KEYWORDS`
- 如果误报多：提高 `MIN_SCORE` 或加入更强信号（比如 UMI/barcode 的权重）
- 如果很多 GSE 的矩阵只在 `RAW.tar`：我们可以加一个 cell **自动下载 RAW.tar 并只解包 10x 三件套/对象文件**（避免解出一堆无关文件）
