Single-cell data from 10x scRNA-seq ([Yao et al. 2021](https://doi.org/10.1016/j.cell.2021.04.021))

In [1]:
import math
import urllib.request
from pathlib import Path

import anndata as ad
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csc_matrix, hstack

In [5]:
# Root directory for everything this notebook reads and writes.
data_path = Path("data")

# Directory containing the Yao et al. 10x count and annotation files.
yao_path = data_path.joinpath(
    "yao/Analysis_Zeng_Hippocampus_10X/data/10x_v2/mouse/processed/YaoHippo2020"
)

# Local destination of the spreadsheet downloaded further down.
de_genes_yao_file = Path(data_path, "yao_2021", "DE_genes_yao2021.xlsx")

In [None]:
# Download the supplementary spreadsheet (mmc4) to `de_genes_yao_file`.
# The download is skipped when the file is already present so that
# "Restart & Run All" does not hit the network on every run.
de_genes_yao_file.parent.mkdir(exist_ok=True, parents=True)

url = "https://ars.els-cdn.com/content/image/1-s2.0-S0092867421005018-mmc4.xlsx"

if not de_genes_yao_file.exists():
    # Download to a temporary name first and rename afterwards, so an
    # interrupted transfer never leaves a truncated file at the final path.
    partial_file = de_genes_yao_file.with_name(de_genes_yao_file.name + ".part")
    urllib.request.urlretrieve(url, partial_file)
    partial_file.replace(de_genes_yao_file)

In [7]:
def get_anndata(path, n_chunks=20):
    """Load a count matrix stored in an HDF5 file into an AnnData object.

    The file must contain a ``data`` group with the datasets ``samples``,
    ``gene`` and ``counts``. ``counts`` is read in slices along its second
    axis; each slice is converted to a sparse matrix before the next one is
    read, and the slices are stacked back together and transposed.

    Parameters
    ----------
    path : str or path-like
        Location of the HDF5 file.
    n_chunks : int, default 20
        Maximum number of slices ``counts`` is split into along its second
        axis. Fewer slices are read when ``counts`` has fewer columns than
        ``n_chunks``.

    Returns
    -------
    anndata.AnnData
        ``X`` is the transposed sparse count matrix, ``obs`` is indexed by the
        ``samples`` dataset and ``var`` by the ``gene`` dataset.

    Raises
    ------
    ValueError
        If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError(f"n_chunks must be a positive integer, got {n_chunks!r}")

    with h5py.File(path, "r") as f:
        data = f["data"]
        samples = pd.DataFrame(index=pd.Index(data["samples"][:]).astype(str))
        genes = pd.DataFrame(index=pd.Index(data["gene"][:]).astype(str))

        stored_counts = data["counts"]
        n_cols = stored_counts.shape[1]
        # At least one column per slice, so the range step below is never zero.
        chunksize = max(1, math.ceil(n_cols / n_chunks))

        # Step over real column offsets instead of `range(n_chunks)`: this
        # avoids reading empty trailing slices when n_cols < n_chunks.
        # `max(n_cols, 1)` keeps a single (empty) slice for a matrix without
        # columns so that hstack still receives a block.
        counts = hstack(
            [
                csc_matrix(stored_counts[:, start : start + chunksize])
                for start in range(0, max(n_cols, 1), chunksize)
            ]
        ).transpose()

    return ad.AnnData(X=counts, obs=samples, var=genes)

In [8]:
# Build the AnnData object from the raw count file.
adata = get_anndata(yao_path / "CTX_Hip_counts_10x.h5")

# Read only the label columns needed downstream, keyed by sample name.
anno_columns = ["sample_name", "supertype_label", "subclass_label"]
anno = pd.read_csv(yao_path / "CTX_Hip_anno_10x.csv.tar", usecols=anno_columns)
anno = anno.set_index("sample_name")
anno = anno.astype({"subclass_label": "category"})

# Attach the labels to the cells by matching on the obs index.
adata.obs = adata.obs.join(anno)

In [9]:
# Write the full annotated AnnData object to disk as an .h5ad file.
adata.write_h5ad(data_path / "Yao.h5ad")

In [11]:
# Number of cells per subclass label (value_counts sorts by count, descending).
adata.obs["subclass_label"].value_counts()

subclass_label
L4/5 IT CTX      275960
L6 CT CTX        148752
L2/3 IT CTX      127961
L6 IT CTX         78297
DG                58948
L5 IT CTX         58190
L2/3 IT PPP       46476
Sst               45467
Vip               43684
Lamp5             42144
L5 NP CTX         31703
Pvalb             30461
Car3              22457
L6b/CT ENT        20269
L5 PT CTX         17260
CA1-ProS          15897
L6b CTX           15050
Sncg              13877
L3 IT ENT         13362
Oligo              8987
L2/3 IT ENTl       6433
CT SUB             5769
L5 IT TPE-ENT      5749
L4 RSP-ACA         4593
L2  IT ENTl        4568
SUB-ProS           4187
Astro              3899
L2/3 IT RHP        3096
NP PPP             2612
L2 IT ENTm         2225
Sst Chodl          1961
NP SUB             1870
CA3                1675
L6 IT ENTl         1211
L5 PPP             1183
Endo                960
Micro-PVM           955
IG-FC               328
SMC-Peri            288
CR                  277
VLMC                152
M

In [12]:
# Cap every subclass at `target_cells` cells; smaller subclasses are kept whole.
target_cells = 5_000

labels = adata.obs["subclass_label"]

subsampled = []
for ct in labels.cat.categories:
    cells = adata[labels == ct]
    if cells.n_obs > target_cells:
        # In-place subsampling with a fixed seed keeps the draw reproducible.
        sc.pp.subsample(cells, n_obs=target_cells, random_state=1)
    subsampled.append(cells)

# Stitch the per-subclass pieces back into a single object.
adata = ad.concat(subsampled)

In [13]:
# Display the summary (dimensions and obs columns) of the current AnnData object.
adata

AnnData object with n_obs × n_vars = 151060 × 31053
    obs: 'supertype_label', 'subclass_label'

In [14]:
# Write the current (subsampled) AnnData object to disk as an .h5ad file.
adata.write_h5ad(data_path / "Yao_subsampled.h5ad")

In [8]:
# Drop genes whose total count over all remaining cells is zero.
# Summing a sparse matrix along an axis returns a 2-D (1 x n_vars) np.matrix,
# so it is flattened into a plain 1-D boolean mask before indexing.
expressed_genes = np.asarray(adata.X.sum(axis=0)).ravel() != 0

# .copy() turns the subset into a standalone object instead of a view, so the
# in-place transforms that follow do not have to copy it implicitly.
adata = adata[:, expressed_genes].copy()

# Gene signatures

In [10]:
from utils import celltype_signatures

In [11]:
# Scale each cell's counts so they sum to 10,000 (modifies adata.X in place).
sc.pp.normalize_total(adata, target_sum=1e4)

In [12]:
# Per-subclass signatures from the current values in `adata`, saved as TSV.
signatures = celltype_signatures(adata, celltype_col="subclass_label")
signatures.to_csv("yao_brain_signatures.tsv", sep="\t")

In [None]:
# Apply log(1 + x) to adata.X in place.
# NOTE: not idempotent — re-running this cell log-transforms the data a second time.
sc.pp.log1p(adata)

In [None]:
# Per-subclass signatures from the log-transformed values, saved as TSV.
signatures_log = celltype_signatures(adata, celltype_col="subclass_label")
signatures_log.to_csv("yao_brain_signatures_log.tsv", sep="\t")