In [1]:
from pathlib import Path

import anndata as ad
import pandas as pd
import scanpy as sc

In [2]:
# Folder containing the ABC atlas inputs read by this notebook.
data_path = Path("data", "ABC_atlas")

# Excel workbook from which the DE gene list is read.
de_genes_file = data_path.joinpath("DE_genes.xlsx")

In [3]:
# Region-of-interest metadata table, read with pandas' default CSV settings.
region_annotation = pd.read_csv(
    data_path.joinpath("region_of_interest_metadata.csv")
)

In [4]:
# Gene table; the first CSV column becomes the index.
genes = pd.read_csv(
    data_path.joinpath("gene.csv"),
    index_col=0,
)

In [5]:
# Read the "DE_gene_list" sheet without a header row, name its column "gene",
# and keep that column as a Series.
de_genes = pd.read_excel(
    de_genes_file,
    sheet_name="DE_gene_list",
    header=None,
    names=["gene"],
)["gene"]

In [6]:
# Per-cell annotation: only the cell label (used as the index) and the
# class / subclass columns are read, both as categoricals.
# Because `usecols` already restricts the read to these columns, no "color"
# column can be present and no further column filtering is required.
cell_metadata = pd.read_csv(
    data_path / "cell_metadata_with_cluster_annotation.csv",
    usecols=["cell_label", "class", "subclass"],
    index_col="cell_label",
    dtype={"class": "category", "subclass": "category"},
    low_memory=False,
)

In [7]:
# Collect the 10Xv2 expression files in sorted order. Path.glob does not
# guarantee an ordering, and the seeded subsampling performed later only
# selects the same cells on every machine if the concatenation order is stable.
h5ad_dir = data_path / "h5ad_files"
h5ad_files = sorted(h5ad_dir.glob("*10Xv2*.h5ad"))
if not h5ad_files:
    # Fail early with a clear message instead of an opaque concat error.
    raise FileNotFoundError(f"no '*10Xv2*.h5ad' files found in {h5ad_dir}")

adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files])

# Attach class / subclass labels, matched on the obs index (cell label).
adata.obs = adata.obs.join(cell_metadata[["class", "subclass"]])

# Keep only cells that have a subclass annotation.
adata = adata[adata.obs["subclass"].notna()]

In [8]:
# Cap every subclass at `target_cells` cells; smaller subclasses are kept whole.
target_cells = 500

subclass_labels = adata.obs["subclass"]
subsampled = []
for ct in subclass_labels.cat.categories:
    subset = adata[subclass_labels == ct]
    if subset.n_obs > target_cells:
        # in-place random draw with a fixed seed
        sc.pp.subsample(subset, n_obs=target_cells, random_state=1)
    subsampled.append(subset)

adata = ad.concat(subsampled)

In [9]:
# Save the subsampled object next to the other atlas files.
adata.write_h5ad(
    data_path.joinpath("ABC_subsampled_subclass.h5ad")
)

In [19]:
# keep only genes whose expression summed over all cells is non-zero
gene_totals = adata.X.sum(axis=0)
adata = adata[:, gene_totals != 0]

In [20]:
# Re-index the genes by symbol: look up "gene_symbol" in `genes` for each var
# entry, move the previous index into a regular column, then make the symbol
# the new index.
adata.var = (
    adata.var
    .join(genes["gene_symbol"])
    .reset_index()
    .set_index("gene_symbol")
)

# Gene signatures

In [21]:
# Normalise each cell's total to the target below (adata is modified in place).
normalization_target = 1e4
sc.pp.normalize_total(adata, target_sum=normalization_target)

In [22]:
# Keep only the genes that appear in the DE gene list.
# .copy() turns the subset into an independent AnnData rather than a view, so
# the in-place log1p applied later does not have to convert a view implicitly
# (the cause of the `view_to_actual` warning in the recorded output).
adata = adata[:, adata.var_names.isin(de_genes)].copy()

In [27]:
# Display the AnnData summary (dimensions and obs / var / uns fields).
# NOTE(review): the stored output lists uns['log1p'] and the execution count is
# out of sequence, so this cell appears to have last run after the log1p step;
# re-run the notebook top to bottom to refresh it.
adata

AnnData object with n_obs × n_vars = 106480 × 8454
    obs: 'cell_barcode', 'library_label', 'anatomical_division_label', 'class', 'subclass'
    var: 'gene_identifier'
    uns: 'log1p'

In [23]:
from utils import celltype_signatures

In [24]:
# Per-subclass signatures from the current (not yet log-transformed) values,
# written as a tab-separated file in the working directory.
signatures_subclass = celltype_signatures(adata, celltype_col="subclass")
signatures_subclass.to_csv("abc_brain_signatures_subclass.tsv", sep="\t")

In [25]:
# Apply scanpy's log1p transform to adata in place; the signatures computed
# after this point are therefore on the log scale.
sc.pp.log1p(adata)

  view_to_actual(adata)
  utils.warn_names_duplicates("var")


In [26]:
# Same export as before, now on the log-transformed values.
signatures_subclass_log = celltype_signatures(adata, celltype_col="subclass")
signatures_subclass_log.to_csv("abc_brain_signatures_subclass_log.tsv", sep="\t")