# Cell type assignment

In this notebook, we use a manual annotation approach, based on marker gene expression, to validate the cell type predictions of the automated annotation model. 

In [52]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

import scrabbitpy


### Load data

In [66]:
# Read the two inputs used below: the rabbit/mouse ortholog table (TSV)
# and the rabbit dataset stored as an .h5ad file.
orthologs = pd.read_csv(
    "../data-in/orthologs/rabbit_mouse_orthologs.tsv",
    sep="\t",
)
r_data = sc.read_h5ad(
    filename="../data-in/cell_type_annotation/rabbit_corrected_clustered.h5ad"
)


In [50]:
# Build {column name: [non-missing values]} from the marker table in one
# expression, so `m_markers` is only ever bound to the final dictionary.
m_markers = {
    column_name: column_values.dropna().to_list()
    for column_name, column_values in pd.read_csv(
        "../data-in/cell_type_annotation/celltype_markers.tsv", sep="\t"
    ).items()
}


### Cluster data

Cell type labels are assigned to clusters identified in the high-dimensional gene expression space. Here we perform Leiden clustering at various resolutions to identify both coarse and fine-grained populations of cells.

For the sake of brevity, here we have used precomputed clusterings. See `scripts/cluster.py` to run the full clustering.  

In [None]:
# Load clusters from file
# NOTE(review): assumes the first column of clusters.tsv holds the cell IDs
# used in r_data.obs.index — confirm against scripts/cluster.py output.
# Labels are read as strings because the region subsetting below compares them
# with string labels such as '4'; read_csv would otherwise parse them as integers.
clusters = pd.read_csv(
    "../data-in/cell_type_annotation/clusters.tsv",
    sep="\t",
    index_col=0,
    dtype=str,
)
clusters.index = clusters.index.astype(str)
clusters = clusters.astype("category")

# A join on non-matching indexes silently produces all-NaN columns, so fail loudly instead.
if not clusters.index.isin(r_data.obs.index).any():
    raise ValueError(
        "No cell IDs in clusters.tsv match r_data.obs.index; check the index column of the file."
    )

# Drop any previously joined copies of these columns so the cell can be re-run safely.
r_data.obs = r_data.obs.drop(columns=clusters.columns, errors="ignore").join(clusters)


### Divide dataset

To simplify the annotation process, the dataset is initially divided into broad regions to be annotated independently. 

For each broad region, we plot the automated annotation predictions, along with clusterings of different resolutions. In addition to clustering the entire dataset, Leiden clustering is also performed within each region. These results are compared to UMAP plots of marker gene expression of known cell types as well as differentially expressed genes computationally identified in the annotated mouse dataset.



In [None]:
# Split the dataset into broad regions using the coarse clustering (leiden_res1).
# The loaded object is `r_data`; the name `rabbit` is never defined in this notebook.
# Labels are compared as strings so the subsetting also works if they were stored as integers.
leiden_res1_labels = r_data.obs["leiden_res1"].astype(str)

# NOTE(review): cluster '22' is listed under both `mesoderm` and `exe_mesoderm` — confirm this overlap is intended.
blood = r_data[leiden_res1_labels.isin(['4','19','20','8'])]
mesoderm = r_data[leiden_res1_labels.isin(['17','2','12','22'])]
neural = r_data[leiden_res1_labels.isin(['3','7'])]
misc = r_data[leiden_res1_labels.isin(['6','23','21','18'])]
exe_ectoderm = r_data[leiden_res1_labels.isin(['9','0','16'])]
exe_mesoderm = r_data[leiden_res1_labels.isin(['5','14','10','1','22'])]
exe_endoderm = r_data[leiden_res1_labels.isin(['13','15','11'])]



In [79]:
# TODO: create a figure directory for each region
# TODO: identify the mouse cell types likely to be present in each region


In [74]:
def createAnnotationDirs(base_path, clusters):
    """
    Create the directory tree that the annotation plots are written to.

    Under `base_path` the following directories are created (existing ones are left untouched):
        marker_expression/
        cluster_fractions/<clustering>/
        prediction_fractions/<clustering>/

    base_path: Root output directory (str or Path), with or without a trailing slash.
    clusters: Iterable of clustering names; one sub-directory per name is created under
              both `cluster_fractions` and `prediction_fractions`.

    e.g.
    createAnnotationDirs(base_path = "../figs/celltype_annotation/annotation_pipeline/r_mesoderm/",
                     clusters = ["leiden_res8","mesoderm_leiden_res2","mesoderm_leiden_res5"])

    """
    # Path joining (instead of string concatenation) makes the trailing slash optional.
    base_dir = Path(base_path)

    # Top-level directories: marker expression plots, cluster fraction plots and
    # prediction fraction plots. `prediction_fractions` is created here so it
    # exists even when `clusters` is empty.
    for subdir in ("marker_expression", "cluster_fractions", "prediction_fractions"):
        (base_dir / subdir).mkdir(parents=True, exist_ok=True)

    # One sub-directory per clustering for both kinds of fraction plot
    for cluster in clusters:
        (base_dir / "cluster_fractions" / cluster).mkdir(parents=True, exist_ok=True)
        (base_dir / "prediction_fractions" / cluster).mkdir(parents=True, exist_ok=True)
        
    

In [None]:
def plotMarkerExpression(adata, markers):
    """
    Placeholder: no plotting is implemented yet, so calling this does nothing.

    adata: Scanpy object.
    markers: Dictionary whose keys are the cell types/obs groups of interest and whose
             items are marker gene names matching values in adata.var.index.
    """
    return None
    
    

In [137]:
def computeObsFraction(adata, obsA="leiden_res8",obsB="singler"):
    """
    Computes the fraction of observations B within groups of observation A.

    adata: Object exposing an `.obs` DataFrame that contains the columns `obsA` and `obsB`.
    obsA: Name of the grouping column in `adata.obs` (e.g. a clustering).
    obsB: Name of the column whose composition is summarised within each `obsA` group.

    Returns a DataFrame with one row per (obsA, obsB) combination that actually occurs,
    ordered by obsA then obsB, with the columns:
        <obsA>, <obsB>,
        <obsA>_ncells : number of cells in the obsA group (cells with a missing obsB are not counted),
        <obsB>_count  : number of cells in the (obsA, obsB) combination,
        <obsB>_frac   : <obsB>_count / <obsA>_ncells.
    """
    ncells_col = obsA + "_ncells"
    count_col = obsB + "_count"
    frac_col = obsB + "_frac"

    obs = adata.obs[[obsA, obsB]]

    # observed=True keeps only combinations present in the data. With categorical
    # columns the default would otherwise emit the full cartesian product of
    # categories, i.e. many zero-count rows.
    counts = (
        obs.groupby([obsA, obsB], observed=True)
        .size()
        .rename(count_col)
        .reset_index()
    )

    # Group size is the sum of the per-combination counts within each obsA group
    counts[ncells_col] = counts.groupby(obsA, observed=True)[count_col].transform("sum")
    counts[frac_col] = counts[count_col] / counts[ncells_col]

    # The former sort_values call discarded its result, so rows were effectively
    # returned in groupby (ascending key) order; that order is kept here.
    return counts[[obsA, obsB, ncells_col, count_col, frac_col]]

In [None]:
def plotObsFraction():
    """
    Placeholder for a single-group fraction plot; not implemented yet.

    Raises NotImplementedError so that an accidental call fails explicitly
    instead of silently doing nothing.
    """
    raise NotImplementedError("plotObsFraction is not implemented yet")

In [None]:
def plotMultiObsFraction(data, obsA, obsB, ignore_small_groups = True, export_dir=None, obsB_colours=None, min_cells=10):
    """
    Plot, for every group of observation A, a horizontal bar chart of the fraction of
    observation B within that group, and save one PDF per group.

    data: Either a fraction table as returned by computeObsFraction (a DataFrame with the
          columns <obsA>, <obsB>, <obsA>_ncells and <obsB>_frac), or an object with `.obs`
          (e.g. an AnnData) from which the table is computed.
    obsA: Name of the grouping observation (one figure per group).
    obsB: Name of the observation whose fractions are drawn as bars.
    ignore_small_groups: If True, obsA groups with fewer than `min_cells` cells are skipped.
    export_dir: Directory the PDFs are written to (created if missing). If None or "",
                nothing is plotted.
    obsB_colours: Optional mapping from obsB label to a matplotlib colour.
    min_cells: Size threshold used when `ignore_small_groups` is True.
               NOTE(review): the default of 10 is a placeholder — confirm the intended threshold.

    Returns the (filtered) fraction table that was used for plotting.
    """
    # Anything that is not already a fraction table is treated as an AnnData-like object
    if not isinstance(data, pd.DataFrame):
        data = computeObsFraction(data, obsA = obsA, obsB = obsB)

    ncells_col = obsA + "_ncells"
    frac_col = obsB + "_frac"

    # Combinations without any cells carry no information and would only add empty bars
    df = data[data[frac_col] > 0]

    # Don't plot fractions for observations with very few cells
    if ignore_small_groups:
        df = df[df[ncells_col] >= min_cells]

    # TODO: group obsB labels with very few cells as 'Other'

    if export_dir:
        out_dir = Path(export_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        for group in df[obsA].unique():
            df_plot = df[df[obsA] == group]

            # Build the colour list per group; the mapping itself must stay intact
            # for the following iterations.
            bar_colours = None
            if obsB_colours is not None:
                bar_colours = [obsB_colours[label] for label in df_plot[obsB]]

            fig, ax = plt.subplots()
            ax.barh(df_plot[obsB].astype(str), df_plot[frac_col], color=bar_colours)

            # Positional access: after filtering, the row labels no longer start at 0
            n_cells = int(df_plot[ncells_col].iloc[0])
            ax.set_title(obsA + ": " + str(group) + " (" + str(n_cells) + " cells)")
            ax.set_xlabel("Fraction of cells")
            ax.set_ylabel(obsB)

            # Labels containing "/" would otherwise be interpreted as sub-directories
            file_label = str(group).replace("/", "_")
            fig.savefig(out_dir / ("cluster" + file_label + ".pdf"))

            # Close each figure so a long loop does not accumulate open figures
            plt.close(fig)

    return df
    
    
    

In [None]:
def runAnnotationPlots(adata, clusters, model_predictions, celltypes, markers, export_dir):
    """
    Produce the full set of annotation plots for one region and save them under `export_dir`.

    adata: Scanpy (AnnData) object of the region to plot.
    clusters: Name, or list of names, of the clustering columns in adata.obs.
    model_predictions: Name of the adata.obs column holding the automated predictions.
    celltypes: Cell types whose marker genes should be plotted; each must be a key of `markers`.
    markers: Dictionary mapping cell type to a list of marker gene names.
    export_dir: Root output directory for this region.

    Files written:
        <export_dir>/umap_clusters.pdf                     UMAPs of predictions and clusterings
        <export_dir>/marker_expression/                    one marker-gene UMAP per cell type
        <export_dir>/cluster_fractions/<clustering>/       predictions within each cluster
        <export_dir>/prediction_fractions/<clustering>/    clusters within each prediction
    """
    if isinstance(clusters, str):
        clusters = [clusters]
    clusters = list(clusters)

    export_root = Path(export_dir)

    # A trailing slash is passed explicitly because the directory helper may
    # build its paths by string concatenation.
    createAnnotationDirs(base_path = str(export_root) + "/", clusters = clusters)

    # sc.settings.figdir is global state; restore it afterwards so later cells
    # do not unexpectedly save into this region's directories.
    previous_figdir = sc.settings.figdir
    try:
        # Plot UMAPs of model predictions and clusterings.
        # `color` must be a flat list of obs keys, so the clusterings are unpacked.
        sc.settings.figdir = export_root
        sc.pl.umap(adata, color=[model_predictions, *clusters],
                   save="_clusters.pdf")

        # Plot UMAPs of literature marker genes
        sc.settings.figdir = export_root / "marker_expression"
        for celltype in celltypes:
            # Cell type names containing "/" would otherwise be read as sub-directories
            file_label = str(celltype).replace("/", "_")
            sc.pl.umap(adata, color=markers[celltype],
                       save="_" + file_label + "_markers.pdf")
    finally:
        sc.settings.figdir = previous_figdir

    for cluster in clusters:
        # Bar charts of the predictions found within each cluster
        plotMultiObsFraction(adata, obsA=cluster, obsB=model_predictions,
                             export_dir=str(export_root / "cluster_fractions" / cluster) + "/")

        # Bar charts of the clusters found within each prediction
        plotMultiObsFraction(adata, obsA=model_predictions, obsB=cluster,
                             export_dir=str(export_root / "prediction_fractions" / cluster) + "/")


### Mesoderm

In [None]:
# Create directory for mesoderm region
mesoderm_fig_dir = "../figs/celltype_annotation/annotation_pipeline/r_mesoderm/"
Path(mesoderm_fig_dir).mkdir(parents=True, exist_ok=True)

# `mesoderm` is the region subset created in the "Divide dataset" section.
# runAnnotationPlots takes the marker dictionary as `markers` and also requires `celltypes`.
# TODO: restrict `celltypes` to the cell types expected in the mesoderm region;
# for now the markers of every cell type in the marker table are plotted.
runAnnotationPlots(mesoderm,
                   clusters = ["leiden_res8","mesoderm_leiden_res2","mesoderm_leiden_res5"],
                   model_predictions = "singler",
                   celltypes = list(m_markers),
                   markers = m_markers,
                   export_dir = mesoderm_fig_dir)