##### Notebook for annotating the celltypes in the MERSCOPE data. This is a first-pass annotation that we use for crypt-villus axis calculations. We do a more precise celltyping later on.

In [4]:
import scanpy as sc
import numpy as np
from tqdm.notebook import tqdm
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scvi
import glob
import ucell


#### Read in the human combined adata

In [2]:
# Directory the integrated object is read from; the annotated object is
# written back into the same directory at the end of the notebook.
output_folder = r'/mnt/sata1/Analysis_Alex/human_r1/analysis/cleaned'

# Load the concatenated / integrated AnnData object.
integrated_path = os.path.join(output_folder, 'concatenated_integrated.h5ad')
adata = sc.read(integrated_path)



##### Subcluster the data

In [None]:

def get_celltype(celltype, ad_sp, resolution=1.2):
    """Re-cluster the cells of a single Leiden cluster on their own.

    Parameters
    ----------
    celltype
        Label in ``ad_sp.obs['leiden']`` selecting the cells to subcluster.
    ad_sp
        AnnData object providing a ``leiden`` column in ``.obs`` and the
        ``X_scVI`` representation used to build the neighbour graph.
    resolution
        Leiden resolution used for the subclustering (default 1.2, the value
        that was previously hard-coded).

    Returns
    -------
    A standalone AnnData object restricted to the selected cells. Its
    ``.obs['leiden']`` holds the new subcluster labels and a UMAP has been
    computed on it. ``ad_sp`` itself is not modified.
    """
    # Subsetting yields a view; the scanpy calls below write into the object,
    # so take an explicit copy up front instead of relying on the implicit
    # view-to-copy conversion (and its warning).
    ctype = ad_sp[ad_sp.obs.leiden.isin([celltype])].copy()
    sc.pp.neighbors(ctype, use_rep='X_scVI')
    sc.tl.leiden(ctype, resolution=resolution)
    sc.tl.umap(ctype)
    return ctype
def reunite_with_ad(ad_sp, subset_ad, celltype):
    """Copy subcluster labels from ``subset_ad`` back into ``ad_sp``.

    Parameters
    ----------
    ad_sp
        Object whose ``.obs`` has ``leiden`` and ``Sub_leiden`` columns.
    subset_ad
        Object whose ``.obs['leiden']`` holds the new labels; its ``.obs``
        index must contain every cell of ``ad_sp`` whose ``leiden`` equals
        ``celltype`` (a ``KeyError`` is raised otherwise).
    celltype
        The ``leiden`` label whose cells receive the new labels.

    Returns
    -------
    ``ad_sp`` (modified in place): ``Sub_leiden`` is replaced for the cells of
    ``celltype`` and left unchanged for all other cells.
    """
    # Cells that belong to the cluster that was just subclustered.
    in_cluster = (ad_sp.obs['leiden'] == celltype).values

    # Work on a plain object array so arbitrary new labels can be stored even
    # when the column is still categorical.
    merged = ad_sp.obs['Sub_leiden'].astype(object).values.copy()

    # Look the new labels up by cell id so the result does not depend on the
    # row order of ``subset_ad``; one aligned lookup replaces the per-cell one.
    cell_ids = ad_sp.obs.index[in_cluster]
    merged[in_cluster] = subset_ad.obs.loc[cell_ids, 'leiden'].astype(object).values

    ad_sp.obs['Sub_leiden'] = merged
    return ad_sp

# Start from the parent labels; each pass below overwrites the entries of one
# parent cluster with its subcluster labels.
adata.obs['Sub_leiden'] = adata.obs['leiden']

parent_clusters = np.unique(adata.obs['leiden'].values)
for leiden_to_subset in tqdm(parent_clusters):
    # Subcluster the cells of this parent cluster on their own.
    mac = get_celltype(leiden_to_subset, adata)
    # Name subclusters "<parent>_<sub>" so labels stay unique across parents.
    mac.obs['leiden'] = ['_'.join((leiden_to_subset, sub)) for sub in mac.obs.leiden]
    # Write the new labels back into the full object.
    adata = reunite_with_ad(adata, mac, leiden_to_subset)

In [5]:
# Keep every obs column whose name does not contain 'UCell'.
non_ucell_columns = [col for col in adata.obs.columns if 'UCell' not in col]
adata.obs = adata.obs[non_ucell_columns]

##### Excel sheet giving a first pass at annotating clusters.

In [142]:
# Annotation table; its first CSV column becomes the index, which is looked up
# by Sub_leiden label in the next cell.
celltypes = pd.read_csv('/mnt/sata1/Analysis_Alex/human_r1/annotated_human_xenium_gut - Sheet1.csv', index_col=0)

In [144]:
# Map every cell's Sub_leiden label to the annotation stored in the first data
# column of `celltypes`.
# The column is selected by position with .iloc: the previous
# `celltypes.loc[i][0]` used an integer key positionally on a label-indexed
# Series, which pandas has deprecated.
# Building the lookup once also avoids a DataFrame row lookup per cell.
# NOTE(review): assumes the index of `celltypes` has no duplicate labels — confirm.
subtype_by_cluster = dict(zip(celltypes.index, celltypes.iloc[:, 0]))

# Plain indexing (not .get) so an unannotated label still raises KeyError.
subtypes = [subtype_by_cluster[label] for label in adata.obs['Sub_leiden']]

  subtypes.append(celltypes.loc[i][0])


In [146]:
# Store as categorical: the cells below iterate over `.cat.categories` of this column.
adata.obs['Subtype'] = pd.Categorical(subtypes)

##### Adding the path to the final rep1 mouse Xenium object to get a general celltype hierarchy

In [147]:
# Reference object that already carries Class / Type / Immunocentric_Type /
# Subtype columns in .obs.
types = sc.read('/mnt/sata1/Analysis_Alex/timecourse_final/analysis/cleaned/final_celltyped_and_axes.h5ad')
reference_obs = types.obs

# Co-occurrence counts between annotation levels of the reference; a non-zero
# cell means that row label and column label were assigned to the same cells.
types_ = pd.crosstab(index=reference_obs['Type'], columns=reference_obs['Subtype'])
class_ = pd.crosstab(index=reference_obs['Class'], columns=reference_obs['Type'])
immuno_ = pd.crosstab(index=reference_obs['Immunocentric_Type'], columns=reference_obs['Subtype'])

  utils.warn_names_duplicates("obs")


#### Adjust nomenclature and hierarchy and assign first pass at celltypes

In [153]:

# Subtypes whose Type is fixed by hand instead of being read from the reference.
type_overrides = {
    'CD4 T-Cell': 'CD4 T-Cell',
    'CD8 AA+ T-Cell': 'CD8 T-Cell',
    'CD8 AB+ T-Cell': 'CD8 T-Cell',
    'Enterocyte': 'Epithelial_Absorptive',
    'MAST': 'MAST',
    'Gamma Delta T-Cell': 'T-Cell',
    'Other DC': 'DC',
    'Plasma Cell': 'Plasma Cell',
    'Unknown': 'Unknown',
}

type_dictionary = {}
for i in adata.obs['Subtype'].cat.categories:
    if i in type_overrides:
        type_dictionary[i] = type_overrides[i]
        continue
    # Otherwise take the first Type (in crosstab row order) that co-occurs
    # with this Subtype in the reference object.
    matching_rows = np.where(types_[i].values > 0)[0]
    type_dictionary[i] = types_.index.values[matching_rows][0]

# Per-cell Type, looked up from each cell's Subtype.
all_types = [type_dictionary.get(k) for k in adata.obs['Subtype'].values]
adata.obs['Type'] = all_types

In [154]:
# Subtypes whose Immunocentric_Type is fixed by hand instead of being read
# from the reference.
itype_overrides = {
    'CD4 T-Cell': 'CD4 T-Cell',
    'CD8 AA+ T-Cell': 'CD8 T-Cell',
    'CD8 AB+ T-Cell': 'CD8 T-Cell',
    'Enterocyte': 'Enterocyte',
    'MAST': 'MAST',
    'Gamma Delta T-Cell': 'Gamma Delta T-Cell',
    'Other DC': 'DC',
    'Plasma Cell': 'Plasma Cell',
    'Unknown': 'Unknown',
}

itype_dictionary = {}
for i in adata.obs['Subtype'].cat.categories:
    if i in itype_overrides:
        itype_dictionary[i] = itype_overrides[i]
        continue
    # Otherwise take the first Immunocentric_Type (in crosstab row order) that
    # co-occurs with this Subtype in the reference object.
    matching_rows = np.where(immuno_[i].values > 0)[0]
    itype_dictionary[i] = immuno_.index.values[matching_rows][0]

# Per-cell Immunocentric_Type, looked up from each cell's Subtype.
all_itypes = [itype_dictionary.get(k) for k in adata.obs['Subtype'].values]
adata.obs['Immunocentric_Type'] = all_itypes

In [155]:
# Types whose Class is fixed by hand instead of being read from the reference.
# NOTE(review): the Type cell above maps the 'Enterocyte' subtype to
# 'Epithelial_Absorptive', so the 'Enterocyte' key here may never match — confirm.
class_overrides = {
    'CD4 T-Cell': 'Immune',
    'CD8 T-Cell': 'Immune',
    'MAST': 'Immune',
    'Enterocyte': 'Epithelial',
    'T-Cell': 'Immune',
    'DC': 'Immune',
    'Plasma Cell': 'Immune',
    'Unknown': 'Unknown',
}

class_dictionary = {}
for i in np.unique(adata.obs['Type'].values):
    if i in class_overrides:
        class_dictionary[i] = class_overrides[i]
        continue
    # Otherwise take the first Class (in crosstab row order) that co-occurs
    # with this Type in the reference object.
    matching_rows = np.where(class_[i].values > 0)[0]
    class_dictionary[i] = class_.index.values[matching_rows][0]

# Per-cell Class, looked up from each cell's Type.
all_classes = [class_dictionary.get(k) for k in adata.obs['Type'].values]
adata.obs['Class'] = all_classes

In [157]:
# Save the object, now carrying the Sub_leiden, Subtype, Type,
# Immunocentric_Type and Class columns, into output_folder.
adata.write(os.path.join(output_folder, 'celltyped_do_not_touch.h5ad'))