In [None]:
import os
import anndata
import scvi
import numpy
import pickle
import pandas

from aavomics import database

In [None]:
# Folder with the reference datasets, and the Allen 10X v3 dataset folder inside it.
REFERENCE_DATABASES_DIRECTORY = os.path.join(database.DATA_PATH, "reference_databases")
ALLEN_DIRECTORY = os.path.join(REFERENCE_DATABASES_DIRECTORY, "20200331_Allen_Cortex_Hippocampus_10X_v3")

# Input .h5ad file that is loaded into `adata`.
ALLEN_DATA_FILE_PATH = os.path.join(ALLEN_DIRECTORY, "barcode_transcript_counts.h5ad")
# Output .h5ad file that receives the filtered subset.
ALLEN_FILTERED_DATA_FILE_PATH = os.path.join(ALLEN_DIRECTORY, "barcode_transcript_counts_filtered.h5ad")
# Output pickle that receives the cell_type_alias_label -> group dictionary.
CELL_TYPE_MAP_FILE_PATH = os.path.join(REFERENCE_DATABASES_DIRECTORY, "neuron_type_map_20210130.pkl")

# region_label values to leave out: a cell type seen only in these regions is
# skipped when the map is built, and cells from these regions are filtered out.
REGIONS_TO_EXCLUDE = ["PL;ILA;ORB", "AId;AIv", "PAR;POST;PRE;SUB;ProS", "HIP"]

# donor_sex_label values to keep when filtering cells.
GENDERS_TO_INCLUDE = ["M"]

In [None]:
# Load the Allen reference dataset from its .h5ad file. read_h5ad is the
# dedicated reader for this format; the generic anndata.read alias used
# previously is deprecated in newer anndata releases.
adata = anndata.read_h5ad(ALLEN_DATA_FILE_PATH)

In [None]:
# Group names in priority order. Each name is also the substring searched for
# in the alias label. The first match wins, so the combined layers ("L2/3", ...)
# must stay ahead of the single layers ("L2", ...): "L2" is a substring of "L2/3".
GROUP_KEYWORDS = (
    "L2/3", "L4/5", "L5/6",
    "L2", "L3", "L5", "L6",
    "Lamp5", "Pax6", "Pvalb", "Sncg", "Sst", "Vip",
)

# cell_type_alias_label -> group name. Labels that are skipped below, or that
# contain none of the keywords, get no entry.
cell_type_map = {}

for cell_type_alias_label in adata.obs["cell_type_alias_label"].unique():

    # Rows of obs that carry this alias label.
    alias_rows = adata.obs[adata.obs["cell_type_alias_label"] == cell_type_alias_label]
    regions = alias_rows["region_label"].unique()
    # The designation is read from the first matching row only.
    cell_type_designation_label = alias_rows["cell_type_designation_label"].iloc[0]

    # Only types whose designation mentions "neuron" (case-insensitive) are kept.
    if "neuron" not in cell_type_designation_label.lower():
        print("Skipping %s, not a neuron type" % (cell_type_alias_label))
        continue

    # Drop the type when every region it occurs in is on the exclusion list.
    all_excluded = all(region in REGIONS_TO_EXCLUDE for region in regions)
    if all_excluded:
        print("Skipping %s, in %s" % (cell_type_alias_label, ",".join(regions)))
        continue

    # First keyword contained in the label, or None when nothing matches.
    matched_group = next(
        (keyword for keyword in GROUP_KEYWORDS if keyword in cell_type_alias_label),
        None,
    )
    if matched_group is not None:
        cell_type_map[cell_type_alias_label] = matched_group

In [None]:
# Save the alias-label -> group dictionary as a pickle at CELL_TYPE_MAP_FILE_PATH.
with open(CELL_TYPE_MAP_FILE_PATH, mode="wb") as pickle_file:
    pickle.dump(cell_type_map, pickle_file)

In [None]:
obs_table = adata.obs

# True for cells whose region is not on the exclusion list.
region_mask = ~obs_table["region_label"].isin(REGIONS_TO_EXCLUDE)
# True for cells whose class_label differs from the literal string "nan".
# NOTE(review): a real missing value (NaN) would also pass this comparison;
# confirm how unlabeled cells are encoded in this file.
labeled_mask = obs_table["class_label"].ne("nan")
# True for cells from donors of an included sex.
gender_mask = obs_table["donor_sex_label"].isin(GENDERS_TO_INCLUDE)
# True for cells whose alias label is a key of cell_type_map.
cell_type_mask = obs_table["cell_type_alias_label"].isin(list(cell_type_map))

# A cell is kept only when it passes all four filters.
all_mask = region_mask & labeled_mask & gender_mask & cell_type_mask

In [None]:
# Keep only the cells selected by all_mask, as an independent copy.
filtered_adata = adata[all_mask].copy()

# astype() casts unsafely by default, so a value outside the uint16 range would
# wrap around silently and corrupted counts would be written to disk. Check the
# range first and fail loudly instead. The check is skipped for an empty
# matrix, where min()/max() are undefined.
uint16_limits = numpy.iinfo(numpy.uint16)
if 0 not in filtered_adata.X.shape:
    smallest_count = filtered_adata.X.min()
    largest_count = filtered_adata.X.max()
    if smallest_count < uint16_limits.min or largest_count > uint16_limits.max:
        raise ValueError(
            "Cannot store X as uint16: values span [%s, %s], allowed range is [%s, %s]"
            % (smallest_count, largest_count, uint16_limits.min, uint16_limits.max)
        )

# Store the matrix as 16-bit unsigned integers and write the filtered dataset.
filtered_adata.X = filtered_adata.X.astype(numpy.uint16)
filtered_adata.write(ALLEN_FILTERED_DATA_FILE_PATH)