Download the data using the links in `BrainAtlas_links.txt`.

DE genes are taken from the [Allen whole mouse brain atlas paper](https://www.nature.com/articles/s41586-023-06812-z) and from the [ABC Atlas WMB taxonomy description](https://alleninstitute.github.io/abc_atlas_access/descriptions/WMB-taxonomy.html).

In [1]:
import urllib.request
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Root folder of the ABC atlas files read throughout this notebook.
data_path = Path("data").joinpath("ABC_atlas")

# Both DE-gene spreadsheets sit in a nested "ABC_atlas" folder below data_path.
# NOTE(review): the repeated "ABC_atlas" path component may be unintentional,
# but the download cell uses these same paths - confirm before changing.
de_genes_file = data_path.joinpath("ABC_atlas", "DE_genes.xlsx")
de_genes_atlas_file = data_path.joinpath("ABC_atlas", "DE_genes_atlas.xlsx")

In [4]:
# Export the "DE_gene_list" sheet as a plain text file, one gene per line
# (no header, no index). Requires de_genes_file to exist on disk; the
# download cell below fetches it.
de_gene_list = pd.read_excel(
    de_genes_file,
    sheet_name="DE_gene_list",
    header=None,
    names=["gene"],
)
de_gene_list["gene"].to_csv(Path(".") / "8k_genes.txt", header=False, index=False)

In [None]:
# Make sure the target folder exists before downloading into it.
de_genes_file.parent.mkdir(parents=True, exist_ok=True)

# (source URL, local destination) pairs, fetched in this order:
#   1. supplementary spreadsheet of the Nature article
#   2. WMB-taxonomy metadata spreadsheet from the Allen Brain Cell Atlas bucket
downloads = (
    (
        "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-023-06812-z/MediaObjects/41586_2023_6812_MOESM6_ESM.xlsx",
        de_genes_file,
    ),
    (
        "https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/metadata/WMB-taxonomy/20231215/cl.df_CCN202307220.xlsx",
        de_genes_atlas_file,
    ),
)

for url, destination in downloads:
    _ = urllib.request.urlretrieve(url, destination)

In [3]:
# Region-of-interest metadata table stored alongside the other atlas files.
region_annotation = pd.read_csv(data_path.joinpath("region_of_interest_metadata.csv"))

In [4]:
# Gene table indexed by its first column; its "gene_symbol" column is used
# further down to rename adata.var.
genes = pd.read_csv(data_path.joinpath("gene.csv"), index_col=0)

In [5]:
# Per-cell annotation table, indexed by cell label so it can be joined onto
# adata.obs. Only the class and subclass labels are loaded (as categoricals).
# usecols already restricts the table to exactly these columns, so no
# additional filtering of "*color*" columns is needed afterwards.
cell_metadata = pd.read_csv(
    data_path / "cell_metadata_with_cluster_annotation.csv",
    usecols=["cell_label", "class", "subclass"],
    index_col="cell_label",
    dtype={"class": "category", "subclass": "category"},
    low_memory=False,
)

In [8]:
# Collect the expression matrices to load. Path.glob() yields entries in an
# arbitrary, filesystem-dependent order; sorting fixes the cell order so the
# seeded subsampling further down selects the same cells on every machine.
h5ad_dir = data_path / "h5ad_files"
h5ad_files = sorted(h5ad_dir.glob("*.h5ad"))
if not h5ad_files:
    # Fail with a clear message instead of an opaque error from ad.concat([]).
    raise FileNotFoundError(f"No .h5ad files found in {h5ad_dir}")

# Stack the cells of all files into a single AnnData object.
adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files])

# Attach class / subclass labels by cell label (left join on the obs index).
adata.obs = adata.obs.join(cell_metadata[["class", "subclass"]])

# Keep only cells that received a class annotation.
adata = adata[adata.obs["class"].notna()]

In [11]:
# Limit every class to at most `target_cells` cells.
target_cells = 5_000

subsampled = []
for ct in adata.obs["class"].cat.categories:
    class_subset = adata[adata.obs["class"] == ct]
    # Classes at or below the limit are kept whole; larger ones are randomly
    # downsampled in place with a fixed seed.
    if class_subset.n_obs > target_cells:
        sc.pp.subsample(class_subset, n_obs=target_cells, random_state=1)
    subsampled.append(class_subset)

# Re-assemble the per-class pieces into one object.
adata = ad.concat(subsampled)

In [15]:
# Save the current adata object to disk next to the other atlas files.
subsampled_file = data_path / "ABC_subsampled.h5ad"
adata.write_h5ad(subsampled_file)

In [7]:
# drop empty genes
# Summing a scipy sparse matrix over axis 0 returns a (1, n_genes) np.matrix;
# flatten it so the mask is a plain 1-D boolean array however X is stored.
gene_totals = np.asarray(adata.X.sum(axis=0)).ravel()
# Copy so that adata is a standalone object rather than a view: the following
# cells modify obs, var and X in place.
adata = adata[:, gene_totals != 0].copy()

In [10]:
# Annotation equal to the class label for every cell, except that cells of
# this one subclass keep their subclass label instead.
pvt_subclass = "149 PVT-PT Ntrk1 Glut"

anno = adata.obs["class"].astype(str)
anno.loc[adata.obs["subclass"] == pvt_subclass] = pvt_subclass
anno = anno.astype("category")
anno.name = "manual_anno_pvt"

# Assign the column directly rather than via DataFrame.join: anno shares the
# obs index, so the result is the same, but assignment can be re-run safely
# (join raises once the column exists) and does not multiply rows if obs
# names happen to be duplicated.
adata.obs[anno.name] = anno

In [15]:
# Look up the gene symbol for every variable, then make the symbol the var
# index; the previous index is kept as an ordinary column by reset_index().
# NOTE(review): symbols may be missing or non-unique, which would give NaN or
# duplicated var names - confirm against gene.csv.
var_with_symbols = adata.var.join(genes["gene_symbol"])
adata.var = var_with_symbols.reset_index().set_index("gene_symbol")

# Gene signatures

In [12]:
# Normalise each cell to a common total count of 1e4. The result is not
# captured, so adata itself is updated and later cells see normalised values.
sc.pp.normalize_total(adata, target_sum=1e4)

In [13]:
from utils import celltype_signatures

In [13]:
# Signatures per "class" label, written as a tab-separated table. When the
# notebook is run top to bottom this uses the normalised, not yet
# log-transformed data.
class_signatures = celltype_signatures(adata, celltype_col="class")
class_signatures.to_csv("abc_brain_signatures.tsv", sep="\t")

In [16]:
# Signatures per "manual_anno_pvt" label, written as a tab-separated table.
# NOTE(review): the execution counter of this cell (16) is later than that of
# the log1p cell (14), so the last saved output may have been computed on
# log-transformed data; run top to bottom it uses non-log data - confirm
# which is intended.
pvt_signatures = celltype_signatures(adata, celltype_col="manual_anno_pvt")
pvt_signatures.to_csv("abc_brain_signatures_pvt.tsv", sep="\t")

In [14]:
# Log-transform adata; the result is not captured, so adata itself is updated
# and the cells below operate on log1p values.
# NOTE: re-running this cell applies the transform a second time.
sc.pp.log1p(adata)

In [15]:
# Signatures per "class" label again, this time after the log1p cell above,
# written to a separate tab-separated file.
log_class_signatures = celltype_signatures(adata, celltype_col="class")
log_class_signatures.to_csv("abc_brain_signatures_log.tsv", sep="\t")