<!--  -->
# Annotation (scANVI)
Adapted from Michael Sterr

2024-05-28


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import h5py
import scipy.sparse as sparse
import anndata as ad
import gc
import scipy.stats as stats
import torch

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors as mcolors
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scanpy as sc
import scanpy.external as sce
import scvi
import muon as mu
from muon import atac as ac # Import a module with ATAC-seq-related functions

import scvelo as scv
import cellrank as cr

## setup matplotlib

In [None]:
# Settings

## Directory
# Project root. The trailing slash matters: the paths below are built by plain string concatenation.
base_dir = '/mnt/hdd/Notebooks/Gut_project/'
sc.settings.figdir = base_dir + 'Figures'
sc.settings.cachedir = base_dir + 'Cache'

## Scanpy settings
sc.settings.verbosity = 3
# Print scanpy's header and version information so the notebook output records the environment it ran in.
sc.logging.print_header()
sc.logging.print_versions()

import warnings
# Suppresses every Python warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [None]:
%run utils.ipynb

In [None]:
# load_RdOrYl_cmap_settings is not defined in this file - presumably provided by utils.ipynb (run in the previous cell); TODO confirm.
mymap = load_RdOrYl_cmap_settings(transparent=False)

# Setup R

In [None]:
#R
# rpy2 / anndata2ri provide the Python <-> R bridge used by the %%R cells and by run_DElegate_findDE.
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
# setup_R is not defined in this file (presumably from utils.ipynb); it receives the path of an R library
# directory - TODO confirm what exactly it configures.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

.libPaths()

In [None]:
%%R
# Parallelization
# Register 20 workers with each of the parallel backends loaded below (BiocParallel, future, doParallel).
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

library(future)
plan("multicore", workers = 20)
# NOTE(review): 64 * 1024^2 bytes is 64 MiB, whereas run_DElegate_findDE further down sets the same
# option with max_memory * 1024^3 (GiB) - confirm whether 1024^3 was intended here.
options(future.globals.maxSize = 64 * 1024^2)
# Calling plan() without arguments shows the currently active plan.
plan()

library(doParallel)
registerDoParallel(20)

# Record the R version and the loaded package versions in the notebook output.
sessionInfo()

# DElegate

In [None]:
def get_diff_exprs_DElegate(
    adata=None,
    groupby=None,  # groups/conditions to test (e.g. stage, genotype, ...)
    groups_restrict=None,  # obs column holding the cell types / clusters to restrict to
    restrict_to=None,  # category of `groups_restrict` the test is restricted to, e.g. 'Beta'
    layer='raw_counts',
    method="edger",
    filter_ambient_genes=False,
    rank_genes_groups_key=None,  # rank_genes_groups key with markers for `groups_restrict`
    get_marker=False,  # run rank_genes_groups to identify markers
    min_gene_score=0,  # min score a cluster marker needs to count as cluster-specific
    min_cluster_size=100,
    min_frac_cells=0.05,
    sample_key=None,  # key for samples/replicates
    plot=True,
    return_results='dict',  # or 'top_table'
    n_core=64,
    max_memory=4,
):
    """Run a two-group DElegate differential expression test on (a subset of) `adata`.

    The first two categories of ``adata.obs[groupby]`` are compared against each
    other via ``run_DElegate_findDE``. Optionally the cells are first restricted
    to one category (`restrict_to`) of another obs column (`groups_restrict`).

    Parameters
    ----------
    adata
        AnnData object; it is copied, the input is not modified.
    groupby
        Categorical obs column whose first two categories are compared. The
        colours are read from ``adata.uns[groupby + '_colors']``.
    groups_restrict, restrict_to
        Obs column and category used to subset the cells. `restrict_to`
        requires `groups_restrict`.
    layer
        Layer name forwarded to ``run_DElegate_findDE``.
    method
        DElegate method; forwarded to ``run_DElegate_findDE`` and recorded in
        the results.
    filter_ambient_genes
        Remove genes flagged in ``adata.var['is_ambient']``. When marker
        information is available (`rank_genes_groups_key` or `get_marker`) and
        `restrict_to` is set, ambient genes that are markers of `restrict_to`
        with a score above `min_gene_score` are kept.
    min_cluster_size
        Minimum number of `restrict_to` cells required.
    min_frac_cells
        Genes must be expressed in at least this fraction of the tested cells.
    sample_key
        Obs column with the replicate/sample labels.
    plot
        Forwarded to ``DElegate_to_results``.
    return_results
        ``'dict'`` returns the results dictionary, ``'top_table'`` the table
        returned by ``run_DElegate_findDE``.
    n_core, max_memory
        Forwarded to ``run_DElegate_findDE``.

    Raises
    ------
    ValueError
        For an unknown `return_results`, a missing `groupby`, `restrict_to`
        without `groups_restrict`, a `restrict_to` group smaller than
        `min_cluster_size`, or fewer than two `groupby` categories.
    """
    # validate cheap arguments before copying the (potentially large) object
    if return_results not in ('dict', 'top_table'):
        raise ValueError("return_results must be 'dict' or 'top_table', got " + repr(return_results) + '.')
    if groupby is None:
        raise ValueError('groupby must name a categorical column of adata.obs.')
    if restrict_to is not None and groups_restrict is None:
        raise ValueError('restrict_to requires groups_restrict to be set.')

    # copy adata
    adata_temp = adata.copy()

    # create results dict and add parameters
    results = dict()
    results['method'] = 'DElegate_pseudobulk_' + method
    results['groupby'] = groupby
    results['groupby_categories'] = []
    results['groups_restrict'] = groups_restrict
    if groups_restrict is not None:
        results['groups_restrict_categories'] = list(adata_temp.obs[groups_restrict].cat.categories)
    # '' (not None) marks "no restriction" in the stored results
    results['restrict_to'] = restrict_to if restrict_to is not None else ''
    results['layer'] = layer
    results['min_cluster_size'] = min_cluster_size
    results['min_frac_cells'] = min_frac_cells

    # check if the cluster of interest (restrict_to) has enough cells
    if restrict_to is not None:
        n_cells_group = adata_temp.obs[groups_restrict].value_counts().get(restrict_to, 0)
        if n_cells_group < min_cluster_size:
            raise ValueError('Group has less than ' + str(min_cluster_size) + ' cells.')

    # compute markers for groups_restrict if requested and no key was provided
    if rank_genes_groups_key is None and get_marker:
        sc.tl.rank_genes_groups(adata_temp, groupby=groups_restrict)
        rank_genes_groups_key = 'rank_genes_groups'

    # subset adata to the group provided in restrict_to
    if restrict_to is None:
        adata_temp_test = adata_temp.copy()
    else:
        adata_temp_test = adata_temp[adata_temp.obs[groups_restrict].isin([restrict_to])].copy()

    groupby_categories = list(adata_temp_test.obs[groupby].cat.categories)
    if len(groupby_categories) < 2:
        raise ValueError('groupby column ' + repr(groupby) + ' needs at least two categories to compare.')
    if len(groupby_categories) > 2:
        print('\nMore than two categories in ' + repr(groupby) + '; comparing only '
              + str(groupby_categories[0]) + ' vs ' + str(groupby_categories[1]) + '.')
    results['groupby_categories'] = groupby_categories

    groupby_colors = list(adata_temp_test.uns[groupby + '_colors'])
    results['groupby_colors'] = groupby_colors

    # filter genes expressed in few cells
    # NOTE(review): sc.pp.filter_genes works on .X, not on `layer` - confirm .X is the intended matrix.
    sc.pp.filter_genes(adata_temp_test, min_cells=adata_temp_test.shape[0]*min_frac_cells)

    # filter ambient genes
    if filter_ambient_genes:
        ambi_genes = list(adata_temp.var_names[(adata_temp.var['is_ambient'] == True).values])
        if rank_genes_groups_key is None or restrict_to is None:
            # no marker information for a specific group: drop every ambient gene
            ambi_genes_remove = ambi_genes
            ambi_genes_kept = None
        else:
            ranked = adata_temp.uns[rank_genes_groups_key]
            marker_genes = set(ranked['names'][restrict_to][ranked['scores'][restrict_to] > min_gene_score])
            ambi_genes_remove = [gene for gene in ambi_genes if gene not in marker_genes]
            ambi_genes_kept = [gene for gene in ambi_genes if gene in marker_genes]

        adata_temp_test = adata_temp_test[:, ~adata_temp_test.var_names.isin(ambi_genes_remove)]
        print('\nRemoving ambient genes from analysis: ', ambi_genes_remove)
        results['ambient_genes_removed'] = ambi_genes_remove
        if ambi_genes_kept is not None:
            print('\nKeeping group-specific ambient genes: ', ambi_genes_kept, '\n')
            results['ambient_genes_kept'] = ambi_genes_kept

    results['background_genes'] = list(adata_temp_test.var_names)

    results['n_genes'] = adata_temp_test.shape[1]
    results['n_cells'] = adata_temp_test.shape[0]

    # run DElegate
    print('\nRunning DElegate...')
    top_table = run_DElegate_findDE(adata_temp_test,
                                    layer=layer,
                                    group_column=groupby,
                                    replicate_column=sample_key,
                                    compare=[groupby_categories[0], groupby_categories[1]],
                                    method=method,
                                    order_results=True,
                                    verbosity=1,
                                    n_core=n_core,
                                    max_memory=max_memory)

    if return_results == 'dict':
        # convert results
        print('\nConverting results...')
        results = DElegate_to_results(top_table,
                                      results_dict=results,
                                      ident_1=groupby_categories[0],
                                      ident_2=groupby_categories[1],
                                      ident_1_color=groupby_colors[0],
                                      ident_2_color=groupby_colors[1],
                                      plot=plot,
                                      plot_logfc_limit=10,
                                      log_pvals_adj_limit=300,
                                      z_logfc_cut_off=0.5,
                                      z_pval_cut_off=0.25)

    # release the copies before returning
    del adata_temp
    del adata_temp_test
    gc.collect()

    if return_results == 'dict':
        return results
    return top_table
    
    




##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################


    
    
    
def _DElegate_table(names, logfc, logexprs, pvals_adj, log_pvals_adj, logfc_limit, sign=1):
    """Build one per-gene table sorted by fold change; ``sign=-1`` flips the direction."""
    table = pd.DataFrame(data={'names': names,
                               'logfc': sign * logfc,
                               'logexprs': logexprs,
                               'pvals_adj': pvals_adj,
                               'log_pvals_adj': log_pvals_adj,
                               'logfc_limit': sign * logfc_limit})
    # fold change is the primary key, adjusted p-value only breaks ties
    return table.sort_values(by=['logfc', 'pvals_adj'], ascending=True)


def _zscore_cut_off(values, z_cut_off, ndigits, fallback):
    """Return the smallest value whose z-score exceeds `z_cut_off`, rounded; `fallback` if there is none."""
    try:
        return round(min(values[stats.zscore(values) > z_cut_off]), ndigits)
    except (ValueError, TypeError):
        # e.g. no value above the z-score threshold (empty selection) or constant input
        return fallback


def _DElegate_histogram(values, xlabel, title, cut_offs, annotations=()):
    """Draw a histogram/KDE of `values` with vertical lines at `cut_offs`."""
    with rc_context({'figure.figsize': (8, 2)}):
        # NOTE: distplot is deprecated in seaborn >= 0.11 (histplot/displot are the replacements).
        sb.distplot(values, kde=True, bins=100).set_xlabel(xlabel)
        for cut_off in cut_offs:
            plt.axvline(cut_off, 0, 1)
        for text, xy, ha in annotations:
            plt.annotate(text, xy=xy, xycoords='axes fraction', va="top", ha=ha)
        plt.title(label=title, fontweight='bold')
        plt.show()


def _DElegate_scatter(table, passes, x, y, title, xlabel, ylabel,
                      xlim=None, ylim=None, annotations=(), vlines=(), hlines=()):
    """Scatter all genes in grey and overlay the genes selected by `passes` in their 'color'."""
    fig, ax = plt.subplots(1, figsize=(7, 6))

    # limits are fixed before drawing
    if ylim is not None:
        ax.set_ylim(ylim)
    if xlim is not None:
        ax.set_xlim(xlim)

    # all genes: black outline layer with a smaller grey dot on top
    sb.scatterplot(y=y, x=x, color='#000000', s=20, linewidth=0, data=table, ax=ax)
    sb.scatterplot(y=y, x=x, color='#cccccc', s=10, linewidth=0, data=table, ax=ax)

    # selected genes: white base so the semi-transparent group colour is not mixed with grey
    hits = table.loc[passes]
    sb.scatterplot(y=hits[y], x=hits[x], color='#ffffff', s=10, alpha=1, linewidth=0, ax=ax)
    sb.scatterplot(y=hits[y], x=hits[x], c=hits['color'], s=10, alpha=0.5, linewidth=0, ax=ax)

    for text, xy, va, ha in annotations:
        ax.annotate(text, xy=xy, xycoords='axes fraction', va=va, ha=ha)
    for position in vlines:
        ax.axvline(position, 0, 1, color='#666666', lw=1).set_linestyle("--")
    for position in hlines:
        ax.axhline(position, 0, 1, color='#666666', lw=1).set_linestyle("--")

    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    plt.show()


def _plot_DElegate_results(table, ident_1, ident_2, ident_1_color, ident_2_color,
                           logfc_cut_off, pval_cut_off):
    """Draw the two threshold histograms and the three scatter plots; adds a 'color' column to `table`."""
    abs_logfc = abs(table['logfc'])

    # fold-change distribution
    n_diff_logfc = int((abs_logfc > logfc_cut_off).sum())
    n_up_logfc = int((table['logfc'] > logfc_cut_off).sum())
    n_down_logfc = int((table['logfc'] < -logfc_cut_off).sum())
    _DElegate_histogram(
        table['logfc'],
        xlabel='$log_2$ Fold Change',
        title='$log_2$ Fold Change (' + str(n_diff_logfc) + ' genes passing threshold of ' + str(logfc_cut_off) + ')',
        cut_offs=(logfc_cut_off, -logfc_cut_off),
        annotations=(('Down-regulated\n' + str(n_down_logfc), (0.02, 0.92), "left"),
                     ('Up-regulated\n' + str(n_up_logfc), (0.98, 0.92), "right")))

    # adjusted p-value distribution
    n_diff_pval = int((abs(table['log_pvals_adj']) > pval_cut_off).sum())
    _DElegate_histogram(
        table['log_pvals_adj'],
        xlabel='$-log_{10}$ Adjusted p-Value',
        title='$-log_{10}$ Adjusted p-Value (' + str(n_diff_pval) + ' genes passing threshold of ' + str(pval_cut_off) + ')',
        cut_offs=(pval_cut_off,))

    # colour genes passing both thresholds by direction
    max_pval = 10**-pval_cut_off
    passes = (abs_logfc >= logfc_cut_off) & (table['pvals_adj'] <= max_pval)
    table['color'] = '#000000'
    table.loc[passes & (table['logfc'] > 0), 'color'] = ident_1_color
    table.loc[passes & (table['logfc'] < 0), 'color'] = ident_2_color

    # counts shown in the plots (strict inequalities on the clipped -log10 p-values)
    significant = abs(table['log_pvals_adj']) > pval_cut_off
    n_diff = int((significant & (abs_logfc > logfc_cut_off)).sum())
    n_up = int((significant & (table['logfc'] > logfc_cut_off)).sum())
    n_down = int((significant & (table['logfc'] < -logfc_cut_off)).sum())

    title = ('Combined p-value & fold change threshold\n(' + str(n_diff) + ' genes passing thresholds of '
             + str(logfc_cut_off) + ' and ' + str(pval_cut_off) + ')')

    # axis ranges, extended a little to leave room for the annotations
    pval_lim = (-1, table['log_pvals_adj'].max() * 1.13)
    logfc_lim = (table['logfc_limit'].min() * 1.1, table['logfc_limit'].max() * 1.1)

    # volcano plot: fold change vs. significance
    _DElegate_scatter(
        table, passes, x='logfc_limit', y='log_pvals_adj', title=title,
        xlabel='$log_2$ Fold Change', ylabel='$-log_{10}$ Adjusted p-Value',
        xlim=logfc_lim, ylim=pval_lim,
        annotations=(('Down-regulated\n' + str(n_down), (0.02, 0.98), "top", "left"),
                     ('Up-regulated\n' + str(n_up), (0.98, 0.98), "top", "right"),
                     (str(ident_2), (0.02, 0.02), "bottom", "left"),
                     (str(ident_1), (0.98, 0.02), "bottom", "right")),
        vlines=(logfc_cut_off, -logfc_cut_off),
        hlines=(pval_cut_off,))

    # expression vs. significance
    _DElegate_scatter(
        table, passes, x='logexprs', y='log_pvals_adj', title=title,
        xlabel='$log_2$ Expression', ylabel='$-log_{10}$ Adjusted p-Value',
        ylim=pval_lim,
        annotations=(('Up-regulated\n' + str(n_up) + '\nDown-regulated\n' + str(n_down), (0.98, 0.98), "top", "right"),),
        hlines=(pval_cut_off,))

    # expression vs. fold change
    _DElegate_scatter(
        table, passes, x='logexprs', y='logfc_limit', title=title,
        xlabel='$log_2$ Expression', ylabel='$log_2$ Fold Change',
        ylim=logfc_lim,
        annotations=((str(ident_1), (0.02, 0.98), "top", "left"),
                     ('Up-regulated\n' + str(n_up), (0.98, 0.98), "top", "right"),
                     (str(ident_2), (0.02, 0.02), "bottom", "left"),
                     ('Down-regulated\n' + str(n_down), (0.98, 0.02), "bottom", "right")),
        hlines=(logfc_cut_off, -logfc_cut_off))


def DElegate_to_results(results_table,
                     results_dict=None,
                     ident_1=None,
                     ident_2=None,
                     ident_1_color='#1f77b4',
                     ident_2_color='#ff7f0e',
                     plot=True,
                     plot_logfc_limit = 10,
                     log_pvals_adj_limit = 300,
                     z_logfc_cut_off=0.5,
                     z_pval_cut_off=0.25
):
    """Convert a DElegate result table into the results dictionary and optionally plot it.

    Parameters
    ----------
    results_table
        Table with the columns 'feature', 'log_fc', 'ave_expr' and 'padj'.
    results_dict
        Dictionary the results are written into (modified in place and
        returned). A new dictionary is created when omitted.
    ident_1, ident_2
        Keys under which the per-gene tables are stored. The `ident_2` table
        has the sign of the fold changes flipped.
    ident_1_color, ident_2_color
        Colours for genes with positive / negative fold change in the plots.
    plot
        Draw the threshold histograms and scatter plots. Only then a 'color'
        column is added to the `ident_1` table.
    plot_logfc_limit
        Fold changes are clipped to +/- this value in the 'logfc_limit' column.
    log_pvals_adj_limit
        Upper bound for the -log10 adjusted p-values ('log_pvals_adj' column).
    z_logfc_cut_off, z_pval_cut_off
        z-score thresholds used to derive the fold-change and p-value cut-offs.

    Returns
    -------
    dict
        `results_dict` extended by ``ident_1``, ``ident_2``, 'logfc_cut_off'
        and 'pval_cut_off' (the latter on the -log10 scale, at least
        ``-log10(0.05)``).
    """
    # a fresh dict per call; a shared default would leak entries between calls
    results = {} if results_dict is None else results_dict

    names = list(results_table['feature'])
    logfc = np.array(results_table['log_fc'], dtype='float64')
    logexprs = np.array(results_table['ave_expr'], dtype='float64')
    pvals_adj = np.array(results_table['padj'], dtype='float64')

    # cap -log10(p) (p == 0 gives inf) and the fold changes used for plotting
    log_pvals_adj = -np.log10(pvals_adj)
    log_pvals_adj[log_pvals_adj > log_pvals_adj_limit] = log_pvals_adj_limit
    logfc_limit = logfc.copy()
    logfc_limit[logfc_limit > plot_logfc_limit] = plot_logfc_limit
    logfc_limit[logfc_limit < -plot_logfc_limit] = -plot_logfc_limit

    results[ident_1] = _DElegate_table(names, logfc, logexprs, pvals_adj, log_pvals_adj, logfc_limit)
    results[ident_2] = _DElegate_table(names, logfc, logexprs, pvals_adj, log_pvals_adj, logfc_limit, sign=-1)

    # find cut-offs; fall back to fixed values when none can be derived (e.g. all p-values == 1)
    logfc_cut_off = _zscore_cut_off(abs(results[ident_1]['logfc']), z_logfc_cut_off, 1, 0.5)
    pval_cut_off = _zscore_cut_off(results[ident_1]['log_pvals_adj'], z_pval_cut_off, 0, -np.log10(0.05))

    # never be more lenient than an adjusted p-value of 0.05
    if pval_cut_off < -np.log10(0.05):
        pval_cut_off = -np.log10(0.05)

    results['logfc_cut_off'] = logfc_cut_off
    results['pval_cut_off'] = pval_cut_off

    if plot:
        _plot_DElegate_results(results[ident_1], ident_1, ident_2, ident_1_color, ident_2_color,
                               logfc_cut_off, pval_cut_off)

    return results
    




##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################


def dot_plot_DElegate(
    adata,
    results_dict = None,
    keys = None,
    layer = 'sct_logcounts',
    cmap='RdBu_r'
):
    """Dot plot of selected differentially expressed genes, coloured by log2 fold change.

    Parameters
    ----------
    adata
        AnnData object the expression values are taken from (copied, not modified).
    results_dict
        Results dictionary as returned by ``get_diff_exprs_DElegate`` /
        ``DElegate_to_results``. Read keys: 'groupby', 'groupby_categories',
        the table stored under the first category, 'logfc_cut_off',
        'pval_cut_off' and, if present, 'restrict_to' / 'groups_restrict'.
    keys
        Gene names to show. Genes that do not pass both cut-offs are dropped.
        Genes with negative fold change are moved in front of the others
        (caller order is kept within each group) so the group brackets match
        the plotted columns.
    layer
        Layer copied into ``.X`` for plotting; ``None`` keeps ``.X``.
    cmap
        Colormap for the fold changes.

    Returns
    -------
    None
        Nothing is plotted when no requested gene passes the cut-offs.
    """
    categories = results_dict['groupby_categories']
    results = results_dict[categories[0]]

    # filter keys: keep requested genes that pass both cut-offs
    is_de = (abs(results['logfc']) >= results_dict['logfc_cut_off']) & (results['log_pvals_adj'] >= results_dict['pval_cut_off'])
    logfc_by_gene = dict(zip(results['names'][is_de], results['logfc'][is_de]))
    requested = [] if keys is None else [key for key in keys if key in logfc_by_gene]
    if not requested:
        print('No requested gene passes the fold change and p-value cut-offs - nothing to plot.')
        return

    # negative fold changes first, so each bracket spans exactly one direction
    down = [key for key in requested if logfc_by_gene[key] < 0]
    up = [key for key in requested if not logfc_by_gene[key] < 0]
    keys = down + up

    ## var group pos (only for non-empty groups)
    var_group_positions = []
    var_group_labels = []
    if down:
        var_group_positions.append((0, len(down) - 1))
        var_group_labels.append(categories[1])
    if up:
        var_group_positions.append((len(down), len(keys) - 1))
        var_group_labels.append(categories[0])

    ## colors: one row per group, fold change sign flipped for the second group
    index = pd.Index(categories, name='groupby')
    color_df = pd.DataFrame([results['logfc'].to_numpy(), -results['logfc'].to_numpy()],
                            index=index,
                            columns=results['names'].to_numpy())
    color_df = color_df.loc[:, keys]
    limit = abs(color_df).max().max()

    # copy adata
    adata_temp = adata.copy()

    # set selected layer to .X
    if layer is not None:
        adata_temp.X = adata_temp.layers[layer].copy()

    # subset adata to the group provided in restrict_to ('' and None both mean "no restriction")
    restrict_to = results_dict.get('restrict_to')
    groups_restrict = results_dict.get('groups_restrict')

    if groups_restrict is None or restrict_to is None or restrict_to == '':
        adata_temp_test = adata_temp
    else:
        adata_temp_test = adata_temp[adata_temp.obs[groups_restrict].isin([restrict_to])].copy()

    # keep only the genes that were tested
    adata_temp_test = adata_temp_test[:, results['names']]

    ## plot
    sc.pl.DotPlot(adata_temp_test,
                  var_names=keys,
                  groupby=results_dict['groupby'],
                  dot_color_df=color_df,
                  var_group_positions=var_group_positions,
                  var_group_labels=var_group_labels,
                  vmin=-limit,
                  vmax=limit,
                  cmap=cmap).style(color_on='square',
                                   dot_edge_lw=1,
                                   grid=True,
                                   dot_edge_color=None).legend(colorbar_title='log$_2$ Fold Change').show()

    del adata_temp
    del adata_temp_test
    del results
    gc.collect()


##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################


def run_DElegate_findDE(adata, 
                        layer = None, 
                        group_column = None, 
                        replicate_column = None, 
                        compare = "each_vs_rest", 
                        method = "edger", 
                        order_results = True, 
                        verbosity = 1, 
                        n_core = 64, 
                        max_memory = 4):
    '''
    Run DElegate's ``findDE`` in R (through rpy2) on the counts of ``adata``.

    The count matrix and ``adata.obs`` are copied into the R global environment,
    a Seurat object is created from them and ``findDE`` is called on it.

    adata: AnnData object that holds the counts to test.
    layer: layer in ``adata.layers`` to use as counts. Default = None -> use .X
    group_column: column of ``adata.obs`` that defines the groups. None is passed
        to R as NULL. Spaces and hyphens in its values are replaced by
        underscores in the Seurat metadata before testing.
    replicate_column: column of ``adata.obs`` that identifies replicates; it is
        sanitised like ``group_column`` and forwarded to ``findDE``. None is
        passed to R as NULL. (It used to be accepted but silently ignored.)
    compare: comparison specification (see below). A Python list is flattened
        with ``unlist`` on the R side.
    method, order_results, verbosity: forwarded unchanged to ``findDE``.
    n_core: number of workers for BiocParallel and future.
    max_memory: ``future.globals.maxSize`` in GiB.

    Returns the ``de_res`` object produced by ``findDE`` after conversion with
    the pandas2ri converter.

    There are multiple ways the group comparisons can be specified based on the compare parameter. 
    The default, 'each_vs_rest', does multiple comparisons, one per group vs all remaining cells. 
    'all_vs_all', also does multiple comparisons, covering all group pairs. 
    If compare is set to a length two character vector, e.g. c('T-cells', 'B-cells'), one comparison between those two groups is done. 
    To put multiple groups on either side of a single comparison, use a list of length two. E.g. compare = list(c('cluster1', 'cluster5'), c('cluster3')).

    NOTE(review): because group labels are rewritten (' ' and '-' -> '_'), group
    names given in ``compare`` presumably have to use the underscore form - confirm.
    NOTE(review): transferring the matrix and ``adata.obs`` relies on an rpy2
    conversion being active (presumably configured by ``setup_R``) - confirm.

    Only the R objects created here are removed from the R global environment
    afterwards (also when an error occurs); other R variables are left untouched.
    '''
    
    import gc

    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri

    # Every name this function creates in the R global environment.
    r_names = ('n_core', 'max_memory', 'group_column', 'replicate_column',
               'compare', 'method', 'order_results', 'verbosity',
               'counts', 'meta_data', 'obs_names', 'var_names',
               'seurat', 'de_res')
    cleanup_code = 'rm(list = intersect(ls(), c({})))\ngc()'.format(
        ', '.join('"{}"'.format(name) for name in r_names))

    print('DE analysis with DElegate:')
    try:
        # load packages
        ro.globalenv['n_core'] = n_core
        ro.globalenv['max_memory'] = max_memory
        ro.r('''
        # Packages
        library(DElegate)
        library(Seurat)

        # Parallelization
        library(BiocParallel)
        register(MulticoreParam(n_core, progressbar = TRUE))

        library(future)
        plan("multicore", workers = n_core)
        options(future.globals.maxSize = max_memory * 1024^3)
        plan()
        ''')

        # transfer data & parameters
        # Optional columns are always defined on the R side (NULL when not
        # given) so the findDE call below never hits an undefined variable.
        if group_column is None:
            ro.r('group_column <- NULL')
        else:
            ro.globalenv['group_column'] = group_column
        if replicate_column is None:
            ro.r('replicate_column <- NULL')
        else:
            ro.globalenv['replicate_column'] = replicate_column

        ro.globalenv['compare'] = compare
        if isinstance(compare, list):
            ro.r('''
            compare <- unlist(compare)
            ''')

        ro.globalenv['method'] = method
        ro.globalenv['order_results'] = order_results
        ro.globalenv['verbosity'] = verbosity

        print('\tTransfer data...')
        if layer is None:
            print('\tUsing adata.X for differntial expression analysis...')
            counts = adata.X
        else:
            print('\tUsing layer \'', layer,'\' for differntial expression analysis...')
            counts = adata.layers[layer]
        # Transposed so that rows are var_names and columns are obs_names.
        ro.globalenv['counts'] = counts.T
        ro.globalenv['meta_data'] = adata.obs
        ro.globalenv['obs_names'] = adata.obs_names
        ro.globalenv['var_names'] = adata.var_names

        # generate seurat object
        ro.r('''
        rownames(counts) <- var_names
        colnames(counts) <- obs_names
        seurat <- CreateSeuratObject(counts = counts, meta.data = meta_data)
        ''')

        # run DElegate
        print('\tPerform differential gene expression analysis with method:', method,'...')

        # replace characters
        if group_column is not None:
            print('\tFixing characters in group_column:', group_column,'...')
            ro.r('''
            seurat@meta.data[group_column] <- gsub("[ -]", "_", get(group_column, seurat@meta.data))
            ''')

        if replicate_column is not None:
            print('\tFixing characters in replicate_column:', replicate_column,'...')
            ro.r('''
            seurat@meta.data[replicate_column] <- gsub("[ -]", "_", get(replicate_column, seurat@meta.data))
            ''')

        # run
        print('\tRunning', method,'...')
        ro.r('''
        de_res <- findDE(seurat,
                         meta_data = NULL,
                         group_column = group_column,
                         replicate_column = replicate_column,
                         compare = compare,
                         method = method,
                         order_results = order_results,
                         verbosity = verbosity)
        ''')

        # transfer data
        print('\tTransfer data...')

        # convert results
        print('\tConvert results...')
        results = ro.globalenv['de_res']

        with (ro.default_converter + pandas2ri.converter).context():
            results = ro.conversion.get_conversion().rpy2py(results)
    finally:
        # delete (only what this function created)
        print('\tClean up...')
        ro.r(cleanup_code)
        gc.collect()
    
    print('Done.')
    return results

def vulcano_plot_edger(results_dict=None, genes=None, annotate_top=True, n_top=10, title=None, min_logfc = 0.5, max_pval = 10**-2, group_order = (0,1), y_max_ext_factor=1.2, x_ext_factor=0.3, x_max_ext_factor=1.2, x_min_ext_factor=1.2, fig_size=(7,6), save=None):
    '''
    Draw a volcano plot (log2 fold change vs -log10 adjusted p-value) for one comparison.

    results_dict: dict with the keys 'groupby_categories' (list of group names),
        'restrict_to' (only used for the default title) and one result table per
        group name. The table of group ``group_order[0]`` is plotted; it has to
        provide the columns 'names', 'logfc', 'logfc_limit', 'pvals_adj',
        'log_pvals_adj' and 'color'.
    genes: additional gene names to label (only labelled when they pass the thresholds).
    annotate_top: if True, also label up to ``n_top`` genes from the first rows,
        the last rows, the highest 'log_pvals_adj' overall and the highest
        'log_pvals_adj' among genes with positive 'logfc'.
    n_top: number of genes taken from each of those selections.
    title: plot title. Default = None -> built from 'restrict_to' and the two group names.
    min_logfc, max_pval: thresholds for highlighting, labelling and the dashed guide lines.
    group_order: indices into 'groupby_categories'; first = right-hand group
        (positive fold change), second = left-hand group.
    y_max_ext_factor, x_max_ext_factor, x_min_ext_factor: factors applied to the
        data extremes to obtain the axis limits.
    x_ext_factor: has no effect (its result was always overwritten); kept for
        backward compatibility.
    fig_size: figure size passed to ``plt.subplots``.
    save: path passed to ``savefig``. Default = None -> figure is not saved.

    Side effect: 'log_pvals_adj' of the plotted table is capped at 300 in place.
    Names starting with 'ENSSSC' are never selected automatically nor labelled.
    '''
    categories = results_dict['groupby_categories']
    group_a = categories[group_order[0]]
    group_b = categories[group_order[1]]
    results = results_dict[group_a]

    # Counts shown in the corner annotations (strict inequalities, as before).
    below_pval = abs(results['pvals_adj']) < max_pval
    n_up = int((below_pval & (results['logfc'] > min_logfc)).sum())
    n_down = int((below_pval & (results['logfc'] < -min_logfc)).sum())

    def _first_named(names):
        # First n_top entries whose name does not start with 'ENSSSC'.
        return [gene for gene in names if not gene.startswith('ENSSSC')][0:n_top]

    # Candidate genes for labelling.
    candidates = [] if genes is None else list(genes)
    if annotate_top:
        by_pval = results.sort_values(by=['log_pvals_adj'], ascending=False)
        up_by_pval = results.loc[results['logfc'] > 0, :].sort_values(by=['log_pvals_adj'], ascending=False)
        candidates += _first_named(results['names'])
        candidates += _first_named(by_pval['names'])
        candidates += _first_named(results['names'][::-1])
        candidates += _first_named(up_by_pval['names'])
    # Order-preserving de-duplication keeps the label placement reproducible.
    candidates = list(dict.fromkeys(candidates))

    # Only genes passing both thresholds are highlighted and labelled.
    significant = (abs(results['logfc']) >= min_logfc) & (results['pvals_adj'] <= max_pval)
    up_names = set(results.loc[significant & (results['logfc'] > 0), 'names'])
    down_names = set(results.loc[significant & (results['logfc'] < 0), 'names'])
    genes_up = [gene for gene in candidates if gene in up_names]
    genes_down = [gene for gene in candidates if gene in down_names]

    # Cap extreme values; .loc avoids chained assignment, which is not
    # guaranteed to write through to the DataFrame.
    results.loc[results['log_pvals_adj'] > 300, 'log_pvals_adj'] = 300

    fig, ax = plt.subplots(1, figsize=fig_size)

    # Make x & y axis longer to make gene name plotting easier
    y_max = results['log_pvals_adj'].max() * y_max_ext_factor
    x_max = results['logfc_limit'].max() * x_max_ext_factor
    x_min = results['logfc_limit'].min() * x_min_ext_factor

    ax.set_ylim((-1,y_max))
    ax.set_xlim((x_min,x_max))

    # Scatter plot: all genes as grey dots with a black outline underneath
    sb.scatterplot(y='log_pvals_adj', x='logfc_limit',
                    color='#000000',s=20,
                    linewidth=0,
                    data=results, ax=ax)
    sb.scatterplot(y='log_pvals_adj', x='logfc_limit',
                    color='#cccccc',s=10,
                    linewidth=0,
                    data=results, ax=ax)

    # Significant genes: white base first, then the per-gene colour on top
    y = results.loc[significant, 'log_pvals_adj']
    x = results.loc[significant, 'logfc_limit']
    c = results.loc[significant, 'color']

    sb.scatterplot(y=y, x=x, color='#ffffff', s=10, alpha=1,
                    linewidth=0,
                    ax=ax)
    sb.scatterplot(y=y, x=x, c=c, s=10, alpha=0.5,
                    linewidth=0,
                    ax=ax)

    # annotation
    ax.annotate(group_b + '\n' + str(n_down) + ' genes', xy=(0.02, 0.98), xycoords='axes fraction',
                va="top", ha="left")
    ax.annotate(group_a + '\n' + str(n_up) + ' genes', xy=(0.98, 0.98), xycoords='axes fraction',
                va="top", ha="right")

    # Lines
    ax.axvline(min_logfc, color='#666666', lw=1, linestyle='--')
    ax.axvline(-min_logfc, color='#666666', lw=1, linestyle='--')
    ax.axhline(-np.log10(max_pval), color='#666666', lw=1, linestyle='--')

    # Labels
    x_lim = ax.get_xlim()
    for gene_set, direction, ha in [(genes_down, -1, 'right'), (genes_up, 1, 'left')]:
        labels = []
        for gene in gene_set:
            if gene.startswith('ENSSSC'):
                continue
            # First matching row; float() on a whole Series is deprecated.
            row = results.loc[results['names'] == gene].iloc[0]
            labels.append(ax.text(float(row['logfc_limit']), float(row['log_pvals_adj']), gene, color='#000000', fontsize=8))
        if not labels:
            continue
        # Temporarily restrict the x-range to this side of the plot so the
        # labels are arranged within their own half.
        outer_bound = x_lim[0] if direction < 0 else x_lim[1]
        ax.set_xlim(sorted([outer_bound, 0.5*direction]))
        adjust_text(labels, expand_points=(1.5,1.5), expand_text=(2,2), expand_objects=(2,2), force_text=(0.75, 0.5), force_points=(0.75, 1), force_objects=(1, 0.5), ha=ha, precision=0.00001, lim=5000, autoalign='y', arrowprops=dict(arrowstyle="-",  color='k',  lw=0.5), ax=ax)
    ax.set_xlim(x_lim)

    # title & axis labels
    if title is None:
        title = f"Differential Gene Expression in {results_dict['restrict_to']} Cells\n{group_a} vs {group_b}"
    ax.set_title(title)
    ax.set_ylabel('$-log_{10}$ Adjusted p-Value')
    ax.set_xlabel('$log_2$ Fold Change')

    if save is not None:
        fig.savefig(save)




##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################
##################################################################################################################################################################################



def run_DElegate_findMarkers(adata, 
                        layer = None, 
                        group_column = None, 
                        replicate_column = None, 
                        method = "edger", 
                        min_rate = 0.05,
                        min_fc = 1,
                        verbosity = 1, 
                        n_core = 64, 
                        max_memory = 4):
    '''
    Run DElegate's ``FindAllMarkers2`` in R (through rpy2) on the counts of ``adata``.

    The count matrix and ``adata.obs`` are copied into the R global environment,
    a Seurat object is created from them and ``FindAllMarkers2`` is called on it.

    adata: AnnData object that holds the counts to test.
    layer: layer in ``adata.layers`` to use as counts. Default = None -> use .X
    group_column: column of ``adata.obs`` that defines the groups. None is passed
        to R as NULL. Spaces and hyphens in its values are replaced by
        underscores in the Seurat metadata before testing, so group labels in
        the result use the underscore form.
    replicate_column: column of ``adata.obs`` that identifies replicates; it is
        sanitised like ``group_column`` and forwarded to ``FindAllMarkers2``.
        None is passed to R as NULL. (It used to be accepted but silently ignored.)
    method, min_rate, min_fc, verbosity: forwarded unchanged to ``FindAllMarkers2``.
    n_core: number of workers for BiocParallel and future.
    max_memory: ``future.globals.maxSize`` in GiB.

    Returns the ``de_res`` object produced by ``FindAllMarkers2`` after
    conversion with the pandas2ri converter.

    NOTE(review): transferring the matrix and ``adata.obs`` relies on an rpy2
    conversion being active (presumably configured by ``setup_R``) - confirm.

    Only the R objects created here are removed from the R global environment
    afterwards (also when an error occurs); other R variables are left untouched.
    '''
    
    import gc

    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri

    # Every name this function creates in the R global environment.
    r_names = ('n_core', 'max_memory', 'group_column', 'replicate_column',
               'method', 'min_rate', 'min_fc', 'verbosity',
               'counts', 'meta_data', 'obs_names', 'var_names',
               'seurat', 'de_res')
    cleanup_code = 'rm(list = intersect(ls(), c({})))\ngc()'.format(
        ', '.join('"{}"'.format(name) for name in r_names))

    print('DE analysis with DElegate:')
    try:
        # load packages
        ro.globalenv['n_core'] = n_core
        ro.globalenv['max_memory'] = max_memory
        ro.r('''
        # Packages
        library(DElegate)
        library(Seurat)

        # Parallelization
        library(BiocParallel)
        register(MulticoreParam(n_core, progressbar = TRUE))

        library(future)
        plan("multicore", workers = n_core)
        options(future.globals.maxSize = max_memory * 1024^3)
        plan()
        ''')

        # transfer data & parameters
        # Optional columns are always defined on the R side (NULL when not
        # given) so the FindAllMarkers2 call never hits an undefined variable.
        if group_column is None:
            ro.r('group_column <- NULL')
        else:
            ro.globalenv['group_column'] = group_column
        if replicate_column is None:
            ro.r('replicate_column <- NULL')
        else:
            ro.globalenv['replicate_column'] = replicate_column

        ro.globalenv['method'] = method
        ro.globalenv['min_rate'] = min_rate
        ro.globalenv['min_fc'] = min_fc
        ro.globalenv['verbosity'] = verbosity

        print('\tTransfer data...')
        if layer is None:
            print('\tUsing adata.X for differntial expression analysis...')
            counts = adata.X
        else:
            print('\tUsing layer \'', layer,'\' for differntial expression analysis...')
            counts = adata.layers[layer]
        # Transposed so that rows are var_names and columns are obs_names.
        ro.globalenv['counts'] = counts.T
        ro.globalenv['meta_data'] = adata.obs
        ro.globalenv['obs_names'] = adata.obs_names
        ro.globalenv['var_names'] = adata.var_names

        # generate seurat object
        ro.r('''
        rownames(counts) <- var_names
        colnames(counts) <- obs_names
        seurat <- CreateSeuratObject(counts = counts, meta.data = meta_data)
        ''')

        # run DElegate
        print('\tPerform differential gene expression analysis with method:', method,'...')

        # replace characters
        if group_column is not None:
            print('\tFixing characters in group_column:', group_column,'...')
            ro.r('''
            seurat@meta.data[group_column] <- gsub("[ -]", "_", get(group_column, seurat@meta.data))
            ''')

        if replicate_column is not None:
            print('\tFixing characters in replicate_column:', replicate_column,'...')
            ro.r('''
            seurat@meta.data[replicate_column] <- gsub("[ -]", "_", get(replicate_column, seurat@meta.data))
            ''')

        # run
        print('\tRunning', method,'...')
        ro.r('''
        de_res <- FindAllMarkers2(seurat,
                         meta_data = NULL,
                         group_column = group_column,
                         replicate_column = replicate_column,
                         method = method,
                         min_rate = min_rate,
                         min_fc = min_fc,
                         verbosity = verbosity)
        ''')

        # transfer data
        print('\tTransfer data...')

        # convert results
        print('\tConvert results...')
        results = ro.globalenv['de_res']

        with (ro.default_converter + pandas2ri.converter).context():
            results = ro.conversion.get_conversion().rpy2py(results)
    finally:
        # delete (only what this function created)
        print('\tClean up...')
        ro.r(cleanup_code)
        gc.collect()
    
    print('Done.')
    return results


# Read AnnData

In [None]:
adata = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_imputed.h5ad')

In [None]:
# Read only the "layers/raw_counts" element straight from the .h5ad (HDF5) file.
# Pattern taken from https://github.com/scverse/anndata/issues/436; other
# elements could be read the same way, e.g.
#   read_elem(h5_file["obs/celltype"]) or read_elem(h5_file["obsm/X_umap"]).
from anndata._io.specs import read_elem

with h5py.File(
    '/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_rmDoublets_integrated_all_imputed.h5ad',
    'r',
) as h5_file:
    raw_counts = read_elem(h5_file["layers/raw_counts"])
    

In [None]:
adata.obs['initial_cell_type'] = adata.obs['initial_cell_type'].replace('Doublets', 'NA')

In [None]:
adata

In [None]:
adata.layers['raw_counts']=raw_counts

In [None]:
del raw_counts
gc.collect()

In [None]:
adata.X = adata.layers['log_dca_counts']

# Embedding

In [None]:
sc.pp.neighbors(adata, use_rep="X_scANVI", n_pcs=50, n_neighbors=20)

In [None]:
sc.tl.paga(adata, groups='initial_cell_type')

In [None]:
sc.pl.paga(adata, save = 'Paga_scANVI-imputed.png', threshold=0.05, max_edge_width=3, min_edge_width=0.01, node_size_scale=3, fontsize=10, fontoutline=True)

In [None]:
# PAGA graph with scanpy's default plot settings. It is written to its own file
# so that it no longer overwrites the customised PAGA figure saved just above
# (both calls previously used 'Paga_scANVI-imputed.png').
sc.pl.paga(adata, save='Paga_scANVI-imputed_default.png')

In [None]:
adata.uns['paga']['pos'][:,0]

In [None]:

adata.uns['paga']['pos'][:,1]

In [None]:
# Manually fixed 2-D layout: x[i] / y[i] form one coordinate pair. They are
# combined into `init_pos` below and written to adata.uns['paga']['pos'].
x = [
    0.74707301,
    0.16060124,
    -1.06248722,
    -1.38000582,
    -1.80098163,
    0.3710921,
    2.25909226,
    -2.48950279,
    0.90816349,
    1.92492535,
    1.40230228,
]

y = [
    -0.58546953,
    -1.03604595,
    0.54833745,
    1.48434607,
    0.11088378,
    1.12446191,
    0.39605307,
    -1.01840312,
    -1.40682015,
    -1.41092023,
    -0.52664075,
]

In [None]:
sb.scatterplot(x=x, y=y)

In [None]:
init_pos = np.array([x, y]).T

In [None]:
adata.uns['paga']['pos'] = init_pos

In [None]:
gc.collect()

In [None]:
adata

In [None]:
# UMAP embedding with init_pos='paga' (uses the positions stored for the PAGA graph).
# NOTE(review): the original inline note read "0.18 mindist, 0.85 spread", but
# the call uses spread=0.55 - confirm which value is intended.
sc.tl.umap(
    adata,
    min_dist=0.18,
    spread=0.55,
    negative_sample_rate=1,
    gamma=0.25,
    init_pos='paga',
)

In [None]:
sc.pl.umap(adata, color=['sample','leiden','initial_cell_type','phase'], size=5, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=2, wspace =0.8, color_map=mymap, title = ['samples','leiden clustering','initial cell type','cell cycle'],legend_fontweight='light',legend_fontsize=10, save = 'Umap_scANVI_5_cluster_initial_CT_CC_imputed.png',)

In [None]:
## add metadata
metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')

# Remove the rows of the 'Multiome_ATAC_v1' kit, then every row whose condition
# is neither 'Ctr' nor 'Ctr/WT'.
atac_rows = metadata_df[metadata_df['kit'] == 'Multiome_ATAC_v1'].index
metadata_df.drop(atac_rows, inplace=True)
non_control_rows = metadata_df[~metadata_df['condition'].isin(['Ctr','Ctr/WT'])].index
metadata_df.drop(non_control_rows, inplace=True)

# Ensure folder name is the index in metadata for easier access
metadata_df.set_index('folder name', inplace=True)

# Columns that are not carried over into adata.obs.
unused_columns = [
    'Sample Pooling - confounded with Project?',
    'sample number Minas',
    'Read Length',
    'Internal ID',
    'SeqID',
    'date',
    "modality (confounded with 'sequencing'",
    'Project Name',
    'Link_id',
    'sample name',
    'Cell Count [cells/µl]',
    'Viable Cells [%]',
    'Lib. Concentration [ng/µl]',
    'Lib. Molarity [nM]',
    'Average Lib. Size [bp]',
    'cDNA Cycles',
    'Lib. Cycles',
    '10x Sample Index',
    'Sequencing Depth [reads/cell]',
    'exclusion, reason',
]
metadata_df.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Copy every metadata column into adata.obs: each cell's 'sample' value is used
# as the row label to look up in metadata_df (indexed by 'folder name').
for col in metadata_df.columns:
    try:
        adata.obs[col] = adata.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        # A sample label missing from the metadata index aborts the assignment
        # of this column only; the loop continues with the next column.
        print(f'no such key: {err} in col {col}')

In [None]:
del metadata_df
gc.collect()

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=['pretty name','sequencing','initial_cell_type','phase','enriched','enrichment proportion','line','sequencing machine'], size=7, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=2, wspace =0.6,legend_fontsize=10, color_map='tab20c', title=['samples','sequencing input','initial cell type','cell cycle','enrichment reporter','enrichment proportion','mouse line','sequencing machine'], save = 'Umap_scANVI_5_metadata_imputed.png')

# Clustering

In [None]:
sc.tl.leiden(adata, resolution=1, key_added='leiden_1')

In [None]:
sc.pl.umap(adata, color=['leiden_1'], size=7, add_outline=True, alpha=0.9, outline_width=(0.3, 0.0), title='leiden clusters with 1 resolution',color_map='tab20', save = 'Umap_scANVI_5_leiden_1.25_imputed_legend.png', legend_fontsize=8)

In [None]:
# 'leiden_1' clusters with the labels drawn on the data.
# The title used to say "1.25 resolution" although leiden_1 is computed with
# resolution=1 in this notebook.
# NOTE(review): the output file name still contains "1.25"; it is left unchanged
# in case the file is referenced elsewhere.
sc.pl.umap(
    adata,
    color=['leiden_1'],
    size=7,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    title='leiden clusters with 1 resolution',
    legend_fontsize=9,
    legend_fontweight='black',
    color_map='tab20c',
    legend_loc="on data",
    save='Umap_scANVI_5_leiden_1.25_imputed.png',
)

## Marker Genes

In [None]:
# Marker detection for the 'leiden_1' clusters with DElegate (method "edger")
# on the 'raw_counts' layer.
results = run_DElegate_findMarkers(
    adata,
    layer='raw_counts',
    group_column='leiden_1',
    replicate_column=None,
    method="edger",
    min_rate=0.05,
    min_fc=1,
    verbosity=1,
    n_core=20,
    max_memory=4,
)

In [None]:
# Plot the first 10 result rows ("feature") of every leiden_1 cluster on the UMAP.
# Iterating over the categories (instead of a set) gives a reproducible order.
for cluster in adata.obs['leiden_1'].cat.categories:
    print(cluster)
    # run_DElegate_findMarkers rewrites spaces AND hyphens in the group labels
    # to underscores (R: gsub("[ -]", "_", ...)), so mirror both replacements.
    group_label = cluster.replace(" ", "_").replace("-", "_")
    top_markers = list(results.loc[results['group1'] == group_label, "feature"][0:10])
    if not top_markers:
        # Nothing to plot for this cluster; skip instead of passing an empty list.
        print(f'no markers found for cluster {cluster}')
        continue
    sc.pl.umap(
        adata,
        color=top_markers,
        layer='log_dca_counts',
        size=10,
        add_outline=True,
        alpha=0.7,
        outline_width=(0.3, 0.0),
        ncols=5,
        show=False,
        save=f'cluster{cluster}_Delegate_markers_scANVI.png',
    )

In [None]:
# Marker gene panel used for the dendrogram, dot plot and UMAP cells below.
marker_genes = [
    'Lgr5', 'Olfm4', 'Dmbt1', 'Arg2', 'Sis',
    'Dclk1', 'Sox4', 'Pou2f3',
    'Muc2', 'Tff3', 'Dll1', 'Atoh1', 'Spdef',
    'Lyz1',
    'Neurog3', 'Neurod1', 'Arx', 'Pax4', 'Lmx1a', 'Reg4', 'Isl1',
    'Sst', 'Gcg', 'Cck', 'Gip', 'Ghrl',
]

In [None]:
sc.tl.dendrogram(adata, groupby='leiden_1', var_names=marker_genes, key_added='marker_gene_dendrogram')

In [None]:
# Dot plot of the marker genes per 'leiden_1' cluster.
# The dendrogram computed above is stored under 'marker_gene_dendrogram';
# dendrogram=True would instead look for scanpy's default key for this groupby,
# so the stored key is passed explicitly.
# NOTE(review): this cell reads layer 'sct_logcounts' while the other plots in
# this notebook use 'log_dca_counts' - confirm that this is intended.
sc.pl.dotplot(
    adata,
    dendrogram='marker_gene_dendrogram',
    var_names=marker_genes,
    groupby='leiden_1',
    standard_scale='var',
    cmap=mymap,
    use_raw=False,
    layer='sct_logcounts',
    categories_order=adata.uns['marker_gene_dendrogram']['categories_ordered'],
    save='Dotplot_marker_genes.png',
)
# Alternative styling kept from the original cell:
# .style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
sc.pl.umap(adata, color= marker_genes, layer='log_dca_counts',cmap = mymap, use_raw=False, size=7, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, save = 'Umap_scANVI_marker_genes_expr.png')

## Subclustering

### Tuft

In [None]:
# Settings consumed by the Tuft subclustering cells below.
leiden_in, leiden_out = 'leiden_1', 'leiden_sub1'  # source / target obs columns
restrict_to = ['11']                  # categories of `leiden_in` to re-cluster
resolution = 0.2
layer = 'log_dca_counts'              # layer used by the plots
genes = ['Dclk1', 'Sox4', 'Pou2f3']   # genes shown in the plots

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'phase'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=3, save = 'Umap_scANVI_subcluster_tuft_phase.png', title = ['leiden subcluster', 'cell cycle'])

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'pretty name'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=7, ncols=3,wspace=0.6, title=['leiden subcluster', 'samples'], save = 'Umap_scANVI_subcluster_tuft_sample.png')

In [None]:

sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save = 'violin_scANVI_subcluster_tuft.png')

### Goblet & Early Progenitors 1

In [None]:
# Settings consumed by the "Goblet & Early Progenitors 1" subclustering cells below.
leiden_in, leiden_out = 'leiden_sub1', 'leiden_2'  # source / target obs columns
restrict_to = ['8']                   # categories of `leiden_in` to re-cluster
resolution = 0.2
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Sox4', 'Atoh1', 'Foxa2', 'Dll1', 'Spdef',
    'Creb3l1', 'Klf4', 'Neurog3', 'Pou2f3', 'Top2a',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'pretty name'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4, title = ['leiden subclusters', 'sequencing input', 'cell cycle', 'samples'], save= 'Umap_GC_secr_prog_1.png')

In [None]:

sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_GC_secr_prog_1.png')

### Goblet & Early Prog. 2

In [None]:
# Settings consumed by the "Goblet & Early Prog. 2" subclustering cells below.
leiden_in, leiden_out = 'leiden_2', 'leiden_3'  # source / target obs columns
restrict_to = ['8,1']                 # categories of `leiden_in` to re-cluster
resolution = 0.4
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Sox4', 'Atoh1', 'Foxa2', 'Dll1', 'Spdef',
    'Creb3l1', 'Klf4', 'Neurog3', 'Pou2f3', 'Top2a',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, title = ['leiden subclusters', 'sequencing input', 'cell cycle', 'samples'], save= 'Umap_GC_secr_prog_2.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer)

###  EEC Prog

In [None]:
adata.obs['leiden_3'].cat.categories

In [None]:
# Settings consumed by the "EEC Prog" subclustering cells below.
leiden_in, leiden_out = 'leiden_3', 'leiden_4'  # source / target obs columns
restrict_to = ['12', '8,1,2']         # categories of `leiden_in` to re-cluster
resolution = 0.4
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Dll1', 'Sox4', 'Atoh1', 'Foxa2', 'Neurog3',
    'Neurod1', 'Fev', 'Arx', 'Pax4',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, title = ['leiden subclusters', 'sequencing input', 'cell cycle', 'samples'], save= 'Umap_EEC_prog.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_EEC_prog_1.png')

###  EECs Pep.

In [None]:
gc.collect()

In [None]:
adata.obs['leiden_4'].cat.categories

In [None]:
# Settings consumed by the "EECs Pep." subclustering cells below.
leiden_in, leiden_out = 'leiden_4', 'leiden_5'  # source / target obs columns
restrict_to = ['10', '16']            # categories of `leiden_in` to re-cluster
resolution = 0.5
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Dll1', 'Sox4', 'Foxa2', 'Neurog3', 'Neurod1', 'Fev',
    'Arx', 'Pax4', 'Tac1', 'Tph1', 'Cck', 'Ghrl',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_EEC_subs.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_EEC_subs.png')

###  EEC Pep. Prog

In [None]:
adata.obs['leiden_5'].cat.categories

In [None]:
# Settings consumed by the "EEC Pep. Prog" subclustering cells below.
leiden_in, leiden_out = 'leiden_5', 'leiden_6'  # source / target obs columns
restrict_to = ['12-8,1,2,2', '10-16,2']  # categories of `leiden_in` to re-cluster
resolution = 0.4
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Dll1', 'Sox4', 'Foxa2', 'Neurog3', 'Neurod1', 'Fev',
    'Arx', 'Pax4', 'Tac1', 'Tph1', 'Cck', 'Ghrl',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=0.9, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_EEC_P_prog.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_EEC_prog_p.png')

###  EECs Pep. 2

In [None]:
sc.pl.umap(adata, color=['leiden_1','leiden_6'], size=9, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), wspace =0.55)

###  ECs

In [None]:
sc.pl.umap(adata, color=['leiden_6'], size=10, legend_loc='on data', legend_fontsize=6, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0))

In [None]:
adata.obs['leiden_6'].cat.categories

In [None]:
# Settings consumed by the "ECs" subclustering cells below.
leiden_in, leiden_out = 'leiden_6', 'leiden_7'  # source / target obs columns
restrict_to = ['9']                   # categories of `leiden_in` to re-cluster
resolution = 0.4
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Dll1', 'Sox4', 'Foxa2', 'Neurog3', 'Neurod1', 'Fev',
    'Arx', 'Pax4', 'Tph1', 'Tac1', 'Reg4',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_EC_subs.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_EC.png')

### Goblet

In [None]:
with plt.rc_context({"figure.figsize": (10, 10)}):
    sc.pl.umap(adata, color=['leiden_7'], size=10, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), legend_loc='on data', legend_fontsize=8)

In [None]:
# Settings consumed by the "Goblet" subclustering cells below.
leiden_in, leiden_out = 'leiden_7', 'leiden_8'  # source / target obs columns
restrict_to = ['1']                   # categories of `leiden_in` to re-cluster
resolution = 0.3
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Sox4', 'Atoh1', 'Foxa2', 'Spdef', 'Creb3l1',
    'Klf4', 'Muc2', 'Tff3', 'Lyz1', 'Defa24',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_GC_subs.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer , save= 'violin_GC.png')

In [None]:
gc.collect()

###  ISCs

In [None]:
with rc_context({'figure.figsize': (8,4)}):
    sc.pl.violin(adata, use_raw=False, keys=['Lgr5','Olfm4','Dmbt1'], groupby='leiden_1', rotation=90, layer=layer)

In [None]:
# Settings consumed by the "ISCs" subclustering cells below.
leiden_in, leiden_out = 'leiden_8', 'leiden_9'  # source / target obs columns
restrict_to = ['2', '5', '4', '17', '19']  # categories of `leiden_in` to re-cluster
resolution = 0.5
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Lgr5', 'Olfm4', 'Dmbt1', 'Arg2', 'Sis',
    'Guca2a', 'Guca2b', 'Bmi1', 'Tert', 'Lrig1',
]  # Ca7

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_ISC_sub.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer,show = False, save= 'violin_ISC.png')

In [None]:
gc.collect()

In [None]:
adata.write('annoationNB10_until_TA_imputed.h5ad')

In [None]:
adata = sc.read_h5ad('annoationNB10_until_TA_imputed.h5ad')

###  TA & early Enterocytes

- TA (transit-amplifying) markers: Reg1a, Dmbt1, Ccl25, Tmem238, Slc12a2, Mlec, Adh1c, Mt-rnr2, Mt-rnr1, Mtatp6p1, Mki67, Cenpf, Hnrnpab, Trac, Wdr43
- Enterocyte: Gsta1, Adirf, Fabp6, Apoa4, Cdhr5, Khk, Amn, C3orf85, Selenop, Apoa1
- Enterocyte of SI (small intestine): Cyp3a5, Ptgr1, Mme, Slc13a2, Cldn15, Akr7a3, Ugt2b17, Ace, Cyp3a4, Fabp2
- Enterocyte + enterocyte of SI: Anpep, Aldob, Ces2, Rbp2, Atp1a1, Slc5a1, Prap1, Si, Fabp1, Mttp, Dgat1, Apob, Cbr1, Smim24, Fabp2
- SI_EC + TA: Pigr

In [None]:
with rc_context({'figure.figsize': (10,6)}):
    sc.pl.violin(adata, use_raw=False, keys=['Olfm4','Dmbt1','Arg2','Sis','Apoa1','Fabp1'],layer = 'log_dca_counts', groupby='leiden_1', rotation=90)

In [None]:
with plt.rc_context({"figure.figsize": (10, 10)}):
    sc.pl.umap(adata, color=['leiden_9'], size=4, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), legend_loc='on data', legend_fontsize=7)

In [None]:
adata.obs['leiden_9'].cat.categories

In [None]:
# Settings consumed by the "TA & early Enterocytes" subclustering cells below.
leiden_in, leiden_out = 'leiden_9', 'leiden_10'  # source / target obs columns
restrict_to = [                       # categories of `leiden_in` to re-cluster
    '2-5-4-17-19,1',
    '2-5-4-17-19,2',
    '2-5-4-17-19,4',
    '2-5-4-17-19,5',
    '2-5-4-17-19,6',
    '2-5-4-17-19,7',
]
resolution = 0.3
layer = 'log_dca_counts'              # layer used by the plots
genes = [
    'Lgr5', 'Olfm4', 'Dmbt1', 'Arg2', 'Sis', 'Apoa1', 'Fabp1',
    'Sox4', 'Atoh1', 'Tfrc', 'Hnf4a', 'Reg3b', 'Mki67', 'Rpl3',
]

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_TA_early_Enterocytes_sub.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_TA_early_Enterocytes.png')

In [None]:
gc.collect()

### Enterocytes 2

In [None]:
adata.obs['leiden_10'].cat.categories

In [None]:
leiden_in = 'leiden_10'
leiden_out = 'leiden_11'
restrict_to = ['6','15','20','3']
resolution = 0.3
layer = 'log_dca_counts'
genes = ['Lgr5','Olfm4','Dmbt1','Arg2','Sis','Apoa1','Fabp1','Sox4','Atoh1','Tfrc','Hnf4a','Reg3b','Mki67','Rpl3']

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_enterocytes_2_sub.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_Enterocytes_2.png')

In [None]:
gc.collect()

In [None]:
adata

### Paneth

In [None]:
leiden_in = 'leiden_11'
leiden_out = 'leiden_12'
restrict_to = ['0','7','13']
resolution = 0.2
layer = 'log_dca_counts'
genes = ['Lyz1','Lyz2','Defa24','Defa34','Bambi','Mmp7','Itln1', 'Acvr1c', 'Mmp7','Agr2','Sox9','Epcam','Nupr1', 'Dll4', 'Actg1', 'Tff3', 'Guca2a', 'Muc2', 'Sox9', 'Atoh1', 'Mptx1', 'Mptx2'] #Defa5, Defa6, Reg3a, Prss1, Reg1a, Lyz, Prss3, Lcn2

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=9, ncols=4, save= 'Umap_Paneth_sub.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_Paneth.png')

In [None]:
gc.collect()

In [None]:
adata.obs['leiden_12'].cat.categories

In [None]:
with plt.rc_context({"figure.figsize": (10, 10)}):
    sc.pl.umap(adata, color=['leiden_12'], size=10, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), legend_loc='on data', legend_fontsize=8)

### Paneth prog./like

In [None]:
leiden_in = 'leiden_12'
leiden_out = 'leiden_13'
restrict_to = ['1,3','14','0-7-13,2']
resolution = 0.2
layer = 'log_dca_counts'
genes = ['Lyz1','Defa24','Defa34','Bambi','Mmp7','Itln1', 'Acvr1c', 'Mmp7','Agr2','Sox9','Epcam','Nupr1', 'Dll4', 'Actg1', 'Tff3', 'Guca2a', 'Muc2', 'Lyz2', 'Sox9', 'Atoh1', 'Mptx1', 'Mptx2'] #Defa5, Defa6, Reg3a, Prss1, Reg1a, Lyz, Prss3, Lcn2

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4, save= 'Umap_Paneth_sub2.png')

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_Paneth2.png')

In [None]:
gc.collect()

### Paneth prog.

In [None]:
with rc_context({'figure.figsize': (10,14), 'font.family':'sans-serif'}):
    sc.pl.umap(adata, color=['leiden_13'], layer=layer, size=15, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=8, legend_loc='on data', title = ['leiden subclusters at state 13'], show = False, save= 'Umap_leiden_state_13.png')

In [None]:
adata.obs['leiden_13'].cat.categories

In [None]:
leiden_in = 'leiden_13'
leiden_out = 'leiden_14'
restrict_to = ['14']
resolution = 0.1
layer = 'log_dca_counts'
genes = ['Sox4','Atoh1','Foxa2','Dll1','Spdef','Muc2','Tff3','Creb3l1','Klf4','Defa24','Lyz1','Pou2f3','Top2a','Lyz1','Defa24','Defa34','Bambi','Mmp7','Itln1', 'Acvr1c', 'Mmp7','Agr2','Sox9','Epcam','Nupr1', 'Dll4','Ctse', 'Slc26a3', 'Golm1', 'Tff2', 'Muc1', 'Dmbt1']

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase', 'sample'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4, title = ['leiden subclusters', 'sequencing input', 'cell cycle', 'samples'], save= 'Umap_GC_PC_zwitters1.png', wspace = 0.65)

In [None]:
sc.pl.violin(adata[adata.obs[leiden_in].isin(restrict_to)], use_raw=False, keys=genes, groupby=leiden_out, rotation=90, layer=layer, save= 'violin_GC_PC_zwitters1.png')

### M cells

In [None]:
layer = 'log_dca_counts'
genes = ['Ctse', 'Slc26a3', 'Golm1', 'Tff2', 'Muc1', 'Dmbt1']

In [None]:
# Plot the genes listed under the "M cells" heading on the UMAP, three panels
# per row, and save the figure.
# Fix: a comma was missing between `ncols=3` and `show=True`, which made this
# cell a SyntaxError.
sc.pl.umap(
    adata,
    color=genes,
    layer=layer,
    size=15,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    legend_fontsize=8,
    ncols=3,
    show=True,
    cmap=mymap,
    save='Umap_M_cell_markers.png',
)

### save adata

In [None]:
adata.write('scANVI_imputed_subclustering_done.h5ad')

In [None]:
adata = sc.read_h5ad('scANVI_imputed_subclustering_done.h5ad')

In [None]:
with rc_context({'figure.figsize': (8,8)}):
    sc.pl.umap(adata, color='leiden_13', size=6, add_outline=True, alpha=1, outline_width=(0.3, 0.0), title='Final subclustering')#, legend_loc='on data')

In [None]:
gc.collect()

# Annotation

In [None]:
def rename_cluster(adata, df, cluster_key, indices, added_key):
    """Attach the label ``added_key`` to the given clusters.

    Every entry of ``indices`` is renamed to ``'<prefix>_<added_key>'``, where
    ``<prefix>`` is the text before the first ``'_'`` of the old name (so a
    label assigned by an earlier call is replaced). The renaming is applied to
    the categories of ``adata.obs[cluster_key]`` and to the index of ``df``.

    Returns:
        ``(adata, df)`` -- ``adata`` is updated in place, ``df`` is the
        renamed frame returned by ``DataFrame.rename``.
    """
    mapping = {}
    for old_name in indices:
        cluster_id = old_name.split('_')[0]
        mapping[old_name] = cluster_id + '_' + added_key

    renamed = adata.obs[cluster_key].cat.rename_categories(mapping)
    adata.obs[cluster_key] = renamed
    return adata, df.rename(index=mapping)

def annotate_cluster(adata, cluster_key, annotation_key):
    """Derive an annotation column from labeled cluster names.

    Category names of ``adata.obs[cluster_key]`` of the form
    ``'<id>_<label>'`` are mapped to ``<label>``; names without an underscore
    are kept unchanged. The result is stored as a categorical column in
    ``adata.obs[annotation_key]``.

    The name is split at the FIRST underscore only, so a label that itself
    contains underscores is kept whole (it used to be cut at its first
    internal underscore).

    Returns:
        ``adata`` (modified in place).
    """
    label_of = {}
    for name in adata.obs[cluster_key].cat.categories:
        _, sep, label = name.partition('_')
        # No separator means the cluster was never labeled: keep its name.
        label_of[name] = label if sep else name

    adata.obs[annotation_key] = adata.obs[cluster_key].map(label_of).astype('category')
    return adata

In [None]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
gc.collect()

## Marker Gene Expression

In [None]:
group_by = 'leiden_13'

In [None]:
# Renumber the clusters: every category name of adata.obs[group_by] is replaced
# by its position in the category list ('0', '1', ...).
old_names = adata.obs[group_by].cat.categories
renumbering = {old: str(pos) for pos, old in enumerate(old_names)}
adata.obs[group_by] = adata.obs[group_by].cat.rename_categories(renumbering)

In [None]:
sc.pl.umap(adata, color=group_by, size=5, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=mymap, legend_loc='on data', legend_fontsize=7)

In [None]:
marker_genes = ['Lgr5','Olfm4','Dmbt1','Arg2','Sis','Dclk1','Sox4','Pou2f3','Muc2','Dll1','Ccl25','Lyz1','Neurog3','Neurod1','Arx','Pax4','Spdef','Lmx1a','Reg4','Isl1','Sst','Gcg','Cck','Gip','Ghrl','Sct','Fev','Lbh', 'Rnase4','Ctse', 'Slc12a8','Reg1','Slc2a2','Ada', 'Golm1', 'Tff2', 'Muc1', 'Dmbt1']

In [None]:
sc.pl.umap(adata, color=['Reg1', 'Tma7', 'Gpx2', 'Ccl25', 'Reg3a', 'Slc7a8', 'Slc2a2','Apob','Tbk1', 'Pkib','Nts', 'Ada', 'Cdh1', 'Lct','Alpi', 'Lbr','Ndc1','Prkdc','Mcm5', 'Gfi1', 'Klf4', 'Hspd1', 'Slc4a4','Pck1','Spink4','Pfkfb2', 'Fabp5', 'Spink4', 'Rgs13', 'Cd44', 'Chga', 'Chgb', 'Stmn1'], layer='log_dca_counts', size=2, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, show = False, save = 'umapUmap_scANVI_5_markers_prox_dist_etc.png')

In [None]:
sc.pl.umap(adata, color=marker_genes, layer='log_dca_counts', size=2, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, cmap=mymap)

In [None]:
sc.pl.umap(adata, color=['Reg1', 'Tma7', 'Gpx2', 'Ccl25', 'Reg3a', 'Slc7a8', 'Slc2a2','Apob','Tbk1', 'Pkib','Nts', 'Ada', 'Cdh1', 'Lct','Alpi', 'Lbr','Ndc1','Prkdc','Mcm5', 'Gfi1', 'Klf4', 'Hspd1', 'Slc4a4','Pck1','Spink4','Pfkfb2', 'Fabp5', 'Spink4', 'Rgs13', 'Cd44', 'Chga', 'Chgb', 'Stmn1'], layer='log_dca_counts', size=2, add_outline=True, alpha=1, outline_width=(0.3, 0.0), cmap = mymap,ncols=4)

In [None]:
sc.pl.umap(adata, color=marker_genes, layer='log_dca_counts', size=7, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, show = False, save ='UMAP_markers_containing_M_cells_and_more.png')

In [None]:
gc.collect()

In [None]:
sc.tl.dendrogram(adata, groupby=group_by, var_names=marker_genes, key_added='marker_gene_dendrogram')
sc.pl.DotPlot(adata, var_names=marker_genes, groupby=group_by, standard_scale='var', cmap=mymap, layer='log_dca_counts', use_raw=False, categories_order=adata.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
sc.pl.dotplot(adata, var_names=marker_genes, dendrogram= 'marker_gene_dendrogram', groupby=group_by, standard_scale='var', cmap=mymap, layer='log_dca_counts', use_raw=False, show = False, save = 'Dotplot_markers_subclusters.png')

In [None]:
adata

In [None]:
gc.collect()

## create df

In [None]:
clusters_manual = ['9','16','23','34','41']

In [None]:
groupby = 'leiden_13'

# Expression matrix (adata.X) restricted to the marker genes, one row per cell.
# Index.isin replaces the deprecated np.in1d and the mask is computed only once.
marker_mask = adata.var_names.isin(marker_genes)
df = pd.DataFrame(
    data=adata[:, marker_mask].X.toarray(),
    index=adata.obs_names,
    columns=adata.var_names[marker_mask].values,
)

# Cluster label per cell. Cells of the manually annotated clusters are left as
# NaN so they do not contribute to any cluster mean.
# (Previously the mask was built from `group_by`, a variable defined in another
# cell; this cell now only relies on its own `groupby`.)
in_manual = adata.obs[groupby].isin(clusters_manual)
df[groupby] = pd.Series(adata.obs[groupby][~in_manual], index=df.index)

# Mean expression of every marker per cluster, one 'mean_<gene>' column each.
# The groupby is built once; apply(np.mean) is kept so the numbers are computed
# exactly as before.
grouped = df.groupby(by=groupby)
df_all = pd.DataFrame(
    {'mean_' + marker: grouped[marker].apply(np.mean) for marker in marker_genes}
)

# Clusters without any cell (e.g. the manually annotated ones) have NaN means.
df_all.dropna(inplace=True)

# for key in fate_probability_keys:
#     df_all['mean_'+key] = adata.obs.groupby(by=groupby)[key].apply(np.mean).values

# Displayed as cell output: means scaled to the per-gene maximum.
df_all/df_all.max(axis=0)

In [None]:
# Bin the per-cluster mean expression of every marker (scaled to the maximum
# over clusters) into 'low'/'high' or 'low'/'mid'/'high'.
# A marker not listed below uses the default single cutoff of 0.5.
# One cutoff  -> bins [-10, c, 10]      -> labels low / high
# Two cutoffs -> bins [-10, c1, c2, 10] -> labels low / mid / high
default_cutoffs = (0.5,)
lowhigh_cutoffs = {
    'Olfm4': (0.4, 0.6),
    'Sis': (0.4, 0.6),
    'Arg2': (0.35, 0.6),
    'Neurog3': (0.55,),
    'Ghrl': (0.2, 0.9),
    'Cck': (0.2, 0.9),
    'Sox4': (0.4, 0.75),
    'Dclk1': (0.4, 0.75),
    'Muc2': (0.3, 0.6),
    'Pou2f3': (0.3, 0.5),
    'Golm1': (0.5, 0.7),
    'Rnase4': (0.5, 0.7),
    'Muc1': (0.15,),
    'Lyz1': (0.4, 0.7),
    'Tff2': (0.8,),
    'Sct': (0.2, 0.5),
    'Dmbt1': (0.2, 0.5),
}

for marker in marker_genes:
    cutoffs = lowhigh_cutoffs.get(marker, default_cutoffs)
    labels = ['low', 'high'] if len(cutoffs) == 1 else ['low', 'mid', 'high']
    scaled = df_all['mean_' + marker] / df_all['mean_' + marker].max()
    df_all[marker + '_lowhigh'] = pd.cut(scaled, bins=[-10, *cutoffs, 10], labels=labels)

# Show only the binned columns. Selecting by suffix instead of by position:
# marker_genes lists 'Dmbt1' twice, so slicing at len(marker_genes) skipped the
# first '_lowhigh' column.
df_all.loc[:, df_all.columns.str.endswith('_lowhigh')]

## Clustering-Based Annotation with EEC Subtypes

In [None]:
gc.collect()

In [None]:
# Overwrite 'leiden_13' with the contents of the obs column 'leiden_13_save'
# (re-running the annotation cell below renames the 'leiden_13' categories).
# NOTE(review): 'leiden_13_save' is not created in the visible part of this
# notebook -- confirm it exists and holds the renumbered cluster labels.
adata.obs['leiden_13'] = adata.obs['leiden_13_save']

In [None]:
clusters_manual_dict = {'9':'unknown0',
                        '41':'Tuft prog. 2',
                        '16':'TA',
                        '34':'EC 2',
                        '23':'early Enterocyte'}

In [None]:
annotation_key = 'cell_type_annotation_lv1'


In [None]:
cluster_key = 'leiden_13'

# Rule-based annotation. Every rename_cluster call in this cell selects the
# clusters (rows of df_all) whose '<gene>_lowhigh' bins satisfy the condition
# and renames them to '<cluster id>_<label>' in adata.obs[cluster_key] and in
# df_all.index.
# rename_cluster keeps only the text before the first '_' and appends the new
# label, so a cluster matched by several rules ends up with the label of the
# LAST rule that matches it: the order of the calls matters.
# NOTE(review): every cluster matched by the 'Goblet-Paneth-like cells' rule
# also matches the 'Paneth' rule right below (same conditions without Muc2),
# so it is relabeled 'Paneth' -- confirm this is intended.
adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Lyz1_lowhigh']=='high')& 
                                          (df_all['Muc2_lowhigh']=='high') & (df_all['Lbh_lowhigh']=='high')& 
                                          (df_all['Reg4_lowhigh']=='high')].index, 'Goblet-Paneth-like cells')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Lyz1_lowhigh']=='high') & 
                                                                  (df_all['Lbh_lowhigh']=='high') & 
                                                                    (df_all['Reg4_lowhigh']=='high')].index, 'Paneth')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Lyz1_lowhigh']=='high')& 
                                          ((df_all['Olfm4_lowhigh']!='low')| (df_all['Dmbt1_lowhigh']!='low'))& (df_all['Pou2f3_lowhigh']=='low') ].index, 'Paneth prog.')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[df_all['Lgr5_lowhigh']=='high'].index, 'ISC')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Dll1_lowhigh']=='high') & (df_all['Spdef_lowhigh']!='high')].index, 'Goblet/EEC prog. (early)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Lmx1a_lowhigh']=='high') & 
                                          (df_all['Neurog3_lowhigh']=='low') & 
                                          (df_all['Reg4_lowhigh']=='low')].index, 'EC (immature)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[((df_all['Lmx1a_lowhigh']=='high') & 
                                          (df_all['Reg4_lowhigh']=='high'))|((df_all['Neurod1_lowhigh']=='high') & 
                                                                             (df_all['Neurog3_lowhigh']=='low') & 
                                          (df_all['Pax4_lowhigh']=='low')&(df_all['Reg4_lowhigh']=='high'))].index, 'EC (mature)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Neurog3_lowhigh']=='high') & 
                                          #(df_all['Neurod1_lowhigh']=='high') & 
                                          (df_all['Arx_lowhigh']=='low') & 
                                          (df_all['Pax4_lowhigh']=='low')].index, 'EEC prog. (mid)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Neurog3_lowhigh']=='high') & 
                                          #(df_all['Neurod1_lowhigh']=='high') & 
                                          (df_all['Arx_lowhigh']=='low') & 
                                          (df_all['Lmx1a_lowhigh']=='high')].index, 'EC prog. (late)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Neurog3_lowhigh']=='high') & 
                                          (df_all['Neurod1_lowhigh']=='high') & 
                                          (df_all['Arx_lowhigh']=='low') & 
                                          (df_all['Pax4_lowhigh']=='high')& 
                                          (df_all['Ghrl_lowhigh']=='low')].index, 'EC prog. (late)')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[((df_all['Neurog3_lowhigh']=='high')|(df_all['Neurod1_lowhigh']=='high')) & 
                                          ((df_all['Arx_lowhigh']=='high')|
                                          (df_all['Pax4_lowhigh']=='high'))& 
                                          (df_all['Ghrl_lowhigh']!='low')].index, 'EEC prog. (late/Peptide)')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Neurog3_lowhigh']=='low') & (df_all['Sct_lowhigh']!='low') & 
                                          (df_all['Neurod1_lowhigh']=='high') & 
                                          (df_all['Isl1_lowhigh']=='high')].index, 'EEC (Peptide/immature)')


# adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Neurog3_lowhigh']=='low') & 
#                                           (df_all['Isl1_lowhigh']=='high')].index, 'Other Endocrine')


# adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Sct_lowhigh']=='high')].index, 'S-cell (Sct+)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Sst_lowhigh']=='high') & 
                                                                  ((df_all['Neurod1_lowhigh']=='high')|(df_all['Isl1_lowhigh']=='high'))].index, 'D-cell (Sst+)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Cck_lowhigh']=='high')].index, 'L/I-cell (Glp1+/Cck+)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Gip_lowhigh']=='high')].index, 'K-cell (Gip+)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Gcg_lowhigh']=='high') &
                                                                  (df_all['Cck_lowhigh']=='low')].index, 'L-cell (Glp1+)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Ghrl_lowhigh']=='high')].index, 'X-cell (Ghrl+)')


#adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['fp_EEC_lowhigh']=='low') & 
#                                          (df_all['fp_Goblet_lowhigh']=='high')].index, 'Goblet prog. (early)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Muc2_lowhigh']!='low') & 
                                          (df_all['Lyz1_lowhigh']=='low') & (df_all['Dll1_lowhigh']!='low')& (df_all['Spdef_lowhigh']=='high')].index, 'Goblet prog. (late)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Muc2_lowhigh']=='high') & 
                                          (df_all['Lyz1_lowhigh']!='high') ].index, 'Goblet') #& (df_all['Dll1_lowhigh']!='high')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Muc2_lowhigh']=='high') & (df_all['Dll1_lowhigh']=='low') 
                                                                  & (df_all['Golm1_lowhigh']=='low') ].index, 'Goblet2')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Pou2f3_lowhigh']=='high') & 
                                          (df_all['Sox4_lowhigh']=='high')].index, 'Tuft prog. (early)')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Pou2f3_lowhigh']!='low') & 
                                          (df_all['Sox4_lowhigh']!='low')].index, 'Tuft prog.')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Dclk1_lowhigh']=='mid') & 
                                          (df_all['Sox4_lowhigh']=='mid')].index, 'Tuft prog. (late)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Dclk1_lowhigh']=='high') & 
                                          (df_all['Sox4_lowhigh']=='low')].index, 'Tuft')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Olfm4_lowhigh']=='high') & 
                                          (df_all['Dmbt1_lowhigh']=='high')].index, 'TA')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Arg2_lowhigh']!='low') & (df_all['Olfm4_lowhigh']=='high') &
                                          (df_all['Dmbt1_lowhigh']!='low')].index, 'Enterocyte prog. (TA)')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Ada_lowhigh']=='high') & 
                                                                  (df_all['Ccl25_lowhigh']=='high') & 
                                                                  (df_all['Olfm4_lowhigh']!='low') &
                                          (df_all['Dmbt1_lowhigh']=='high')].index, 'TA (prox.))')

#adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Arg2_lowhigh']=='high') & 
#                                          (df_all['Dmbt1_lowhigh']=='high')].index, 'Enterocyte prog. (TA)')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Arg2_lowhigh']=='high') & 
                                          (df_all['Sis_lowhigh']=='mid')].index, 'early Enterocyte') 

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Olfm4_lowhigh']=='mid') & 
                                          (df_all['Sis_lowhigh']!='low')& (df_all['Dmbt1_lowhigh']=='high')].index, 'Enterocyte2')


adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Arg2_lowhigh']=='high') & 
                                          (df_all['Sis_lowhigh']=='high')].index, 'Enterocyte')

adata, df_all = rename_cluster(adata, df_all, cluster_key, df_all[(df_all['Tff2_lowhigh']=='high') & 
                                          (df_all['Muc1_lowhigh']=='high')& 
                                          (df_all['Dmbt1_lowhigh']!='low')& 
                                          (df_all['Ctse_lowhigh']=='high')].index, 'M cells')

In [None]:
adata = annotate_cluster(adata, cluster_key, annotation_key)

In [None]:
adata.obs[annotation_key].value_counts()

In [None]:
# Overwrite the rule-based annotation of the manually curated clusters with
# the labels from clusters_manual_dict.
# The column is converted to plain strings first so that labels which are not
# existing categories can be assigned.
adata.obs[annotation_key] = adata.obs[annotation_key].astype(str)
for cluster in clusters_manual:
    # One .loc assignment instead of chained indexing (obs[col][mask] = ...):
    # the chained form may write to a temporary copy and is a silent no-op
    # under pandas copy-on-write.
    adata.obs.loc[adata.obs['leiden_13'] == cluster, annotation_key] = clusters_manual_dict[cluster]

In [None]:
with rc_context({'figure.figsize': (10,14)}):
    sc.pl.umap(adata, color=[annotation_key], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), cmap = 'tab20c', legend_fontsize=10, legend_fontweight='heavy')

In [None]:
cat_list = set(adata.obs[annotation_key].values.tolist())


In [None]:
cat_list

In [None]:
adata.obs[annotation_key] = adata.obs[annotation_key].cat.reorder_categories(['ISC', 'D-cell (Sst+)',
 'EC (immature)', 'EC (mature)','EC 2','EEC (Peptide/immature)', 'EC prog. (late)', 'EEC prog. (late/Peptide)', 'EEC prog. (mid)',
 'Enterocyte', 'early Enterocyte', 'Goblet', 'Goblet prog. (late)', 'Goblet/EEC prog. (early)', 'K-cell (Gip+)', 'L/I-cell (Glp1+/Cck+)', 'Paneth', 'Paneth prog.',
 'Tuft', 'Tuft prog.', 'Tuft prog. 2', 'X-cell (Ghrl+)', 'TA', 'TA (prox.))',
 'unknown0'])

In [None]:
adata.obs[annotation_key].cat.categories

In [None]:
# One color per category of adata.obs[annotation_key]. The trailing comment on
# each entry names the category at that position in the reorder_categories
# call earlier in this section; entries that are commented out are not used.
# NOTE(review): the list is positional -- keep it in sync with the category
# order whenever categories are added, removed or reordered.
adata.uns[annotation_key + '_colors'] = [    
    '#d0d0d0',  # ISC
    '#6a51a3',  # D-cell (Sst+)
    '#725dae',  # EC (immature)
    '#594495',  # EC (mature)
    '#cb181d',  # EC 2
    '#6ec8c1',  # EEC (Peptide/immature)
    '#afa3d5',  # EC prog. (late)
    '#69a9cf',  # EEC prog. (late/Peptide)
    '#9bc5df',  # EEC prog. (mid)
    '#761925',  # Enterocyte
    '#fc9272',  # early Enterocyte
    #'#fb6a4a',  # (unused) early Enterocyte
    #'#fcbba1',  # (unused) early Enterocyte prog. (TA)
    #'#cb181d',  # (unused) Enterocyte2
    '#ec7014',  # Goblet
    '#fec44f',  # Goblet prog. (late)
    #'#addd8e',  # (unused) Goblet-Paneth-like cells
    '#fdd49e',  # Goblet/EEC prog. (early)
    #'#fdae6b',  # (unused) Goblet2
    '#74a9cf',  # K-cell (Gip+)
    '#d0d1e6',  # L/I-cell (Glp1+/Cck+)
    #'#ffeda0',  # (unused) M cells
    '#238b45',  # Paneth
    '#fb6a4a',  # Paneth prog.
    #'#41ae76',  # (unused) Paneth2
    '#ce1256',  # Tuft
    '#e7298a',  # Tuft prog.
    '#df65b0',  # Tuft prog. 2
    '#368cbf',  # X-cell (Ghrl+)
    '#fee0d2',  # TA
    '#ccece6',  # TA (prox.))
    '#ac9470',  # unknown0
    ]

In [None]:
with rc_context({'figure.figsize': (10,14)}):
    sc.pl.umap(adata, color=[annotation_key], size=8, add_outline=True, alpha=1, outline_width=(0.3, 0.0), layer='log_dca_counts', wspace=1, save = 'umap_lv1_cell_type_annotation.png')

In [None]:
# UMAP restricted to the enteroendocrine-related annotations, colored by the
# annotation and by the cluster column it was derived from.
# Labels that are not present in the current annotation simply match no cell.
# (The list previously contained several duplicates; isin ignores them.)
eec_lineage_labels = [
    'Goblet/EEC prog. (early)',
    'D-cell (Sst+)',
    'EC (immature)',
    'EC (mature)',
    'EC 2',
    'EEC (Peptide/immature)',
    'EC prog. (late)',
    'EEC prog. (late/Peptide)',
    'EEC prog. (mid)',
    'EEC (peptide)',
    'EEC prog. (late/EC)',
    'I-cell (Cck+)',
    'K-cell (Gip+)',
    'L-cell (Glp1+)',
    'X-cell (Ghrl+)',
    'L/I-cell (Glp1+/Cck+)',
    'Other Endocrine',
]
sc.pl.umap(
    adata[adata.obs[annotation_key].isin(eec_lineage_labels)],
    color=[annotation_key, cluster_key],
    size=7,
    wspace=0.5,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    # The second title is derived from cluster_key: it used to read
    # 'leiden 14 subclusters' although the plotted column is cluster_key
    # ('leiden_13' in this notebook).
    title=[' fine grained cell type', f'{cluster_key} subclusters'],
)

# Save AnnData

In [None]:
adata.obsm['X_umap_scANVI'] = adata.obsm['X_umap'].copy()

In [None]:
adata.write('adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated.h5ad')

In [None]:
adata = sc.read_h5ad('adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated.h5ad')

In [None]:
gc.collect()

## Run DElegate again

In [None]:
results = run_DElegate_findMarkers(adata, 
                        layer = 'raw_counts', 
                        group_column = 'cell_type_annotation_lv1', 
                        replicate_column = None, 
                        method = "edger", 
                        min_rate = 0.05,
                        min_fc = 1,
                        verbosity = 1, 
                        n_core = 20, 
                        max_memory = 4)

In [None]:
results["group1"].value_counts()

In [None]:
len(set(adata.obs['cell_type_annotation_lv1']))

In [None]:
import os

# For every annotated cell type, plot the first 10 DElegate marker genes on the
# UMAP and save the figure; cell types whose figure already exists are skipped.
FIG_PREFIX = '/mnt/hdd/Notebooks/Gut_project/Figures/umapcluster'
FIG_SUFFIX = '_Delegate_markers_scANVI_sub_round2.png'
umap_kwargs = dict(layer='log_dca_counts', size=10, add_outline=True, alpha=0.7,
                   outline_width=(0.3, 0.0), ncols=5, show=False)

for cluster_orig in set(adata.obs['cell_type_annotation_lv1']):
    print(cluster_orig)
    if os.path.exists(f'{FIG_PREFIX}{cluster_orig}{FIG_SUFFIX}'):
        continue

    # Name with spaces and hyphens replaced by '_'; it is used both to look up
    # the markers in results['group1'] and as the file name.
    cluster = str(cluster_orig).replace(" ", "_").replace("-", '_')
    if os.path.exists(f'{FIG_PREFIX}{cluster}{FIG_SUFFIX}'):
        continue
    print(cluster)

    top_markers = list(results.loc[results['group1'] == cluster, "feature"][0:10])
    try:
        sc.pl.umap(adata, color=top_markers, save=f'cluster{cluster}{FIG_SUFFIX}', **umap_kwargs)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt.
        # Retry with a file name that contains neither '.' nor '/'.
        cluster2 = cluster.replace('.', '').replace('/', '_')
        if os.path.exists(f'{FIG_PREFIX}{cluster2}{FIG_SUFFIX}'):
            continue
        print(cluster2)
        sc.pl.umap(adata, color=top_markers, save=f'cluster{cluster2}{FIG_SUFFIX}', **umap_kwargs)


In [None]:
adata.obs['leiden_14'].value_counts()

### Relabel unknown and Goblet-Paneth-like cells

In [None]:
# Relabel every cell annotated 'unknown0' as 'Paneth'; all other labels are
# passed through unchanged.
adata.obs['cell_type_annotation_lv1'] = adata.obs['cell_type_annotation_lv1'].apply(
    lambda x: 'Paneth' if x == 'unknown0' else x
)

In [None]:
sc.tl.leiden(adata)

In [None]:
sc.pl.umap(adata,color='leiden', size=10,legend_loc='on data')

In [None]:
leiden_in = 'leiden'
leiden_out = 'leiden_2'
restrict_to = ['0','1','13','18','14','7']
resolution = 0.1
layer = 'log_dca_counts'
genes = ['Sox9','Atoh1','Foxa2','Dll1','Spdef','Muc2','Lyz1','Hmgb2']

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase'] + genes, layer=layer, size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4)

In [None]:
adata.obs['leiden_2'].value_counts()

In [None]:
leiden_in = 'leiden_2'
leiden_out = 'leiden_3'
restrict_to = ['0-1-13-18-14-7,0','0-1-13-18-14-7,1']
resolution = 0.4
genes = ['Sox9','Atoh1','Foxa2','Dll1','Spdef','Muc2','Lyz1','Hmgb2']

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase'] + genes, layer=layer, size=3, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4)

In [None]:
adata.obs['leiden_3'].value_counts()

In [None]:
leiden_in = 'leiden_3'
leiden_out = 'leiden_4'
restrict_to = ['0-1-13-18-14-7,0-0-1-13-18-14-7,1,3']
resolution = 0.2
genes = ['Sox9','Atoh1','Foxa2','Dll1','Spdef','Muc2','Lyz1','Hmgb2']

In [None]:
sc.tl.leiden(adata, restrict_to=(leiden_in, restrict_to), resolution=resolution, key_added=leiden_out)

In [None]:
sc.pl.umap(adata[adata.obs[leiden_in].isin(restrict_to)], color=[leiden_out, 'sequencing', 'phase'] + genes, layer=layer, size=7, add_outline=True, alpha=1, outline_width=(0.3, 0.0), legend_fontsize=6, ncols=4)

In [None]:
clusters_manual=['0-1-13-18-14-7,2','0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,3','0-1-13-18-14-7,0-0-1-13-18-14-7,1,4','0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,0','0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,2','0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,1']

In [None]:
clusters_manual_dict = {'0-1-13-18-14-7,2':'Paneth prog.',
                        '0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,0':'Paneth prog.',
                        '0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,3':'Goblet-Paneth-like(cycling)',
                        '0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,2':'Paneth',
                        '0-1-13-18-14-7,0-0-1-13-18-14-7,1,3,1':'Paneth',
                        '0-1-13-18-14-7,0-0-1-13-18-14-7,1,4':'Goblet-Paneth-like'}

In [None]:
adata.obs['cell_type_annotation_lv1'] = adata.obs['cell_type_annotation_lv1'].astype(str)

In [None]:
adata.obs['cell_type_annotation_lv1'].value_counts()

In [None]:
# Assign the manually chosen labels to the cells of the listed 'leiden_4'
# sub-clusters.
for cluster in clusters_manual:
    # One .loc assignment instead of chained indexing (obs[col][mask] = ...):
    # the chained form may write to a temporary copy and is a silent no-op
    # under pandas copy-on-write.
    adata.obs.loc[adata.obs['leiden_4'] == cluster, 'cell_type_annotation_lv1'] = clusters_manual_dict[cluster]

In [None]:
adata.uns['cell_type_annotation_lv1' + '_colors'] =[
    '#d0d0d0',  # ISC
 '#eebcbc',  # TA
 '#fee0d2',  # TA prox
 '#c67a84',  # early Enterocyte
 '#bb4353',  # Enterocyte
 '#eca4d0',  # Tuft prog.
 '#df65b0',  # Tuft prog. 2
 '#e7298a',  # Tuft
 '#f9e1f4',  # Goblet/EEC prog.
 '#d9edf7',  # EEC prog
 '#85c6e6',  # EEC prog. (late/Peptide)
 '#46a8d9',  # EEC (peptide/immature)
 '#339a98',  # X-cell (Ghrl+)
 '#368cbf',  # K-cell (Gip+)
 '#5a72dd',  # L/I-cell (Glp1+/Cck+)
 '#243dae',  # D-cell (Sst+)
 '#d0d1e6',  # EC prog.
 '#aa9dce',  # EC (imm.)
 '#594495',  # EC (mature)
 '#725dae',  # EC 2
 '#fec44f',  # Goblet prog.
 '#dd894e',  # Goblet
 '#cedf76',   #Goblet-Paneth-like 
 '#7BB98F',   #Goblet-Paneth-like (cycling) 
 '#d5f4c5',  # Paneth prog.
 '#238b45',  # Paneth
]

In [None]:
annotation_key = 'cell_type_annotation_lv1'

In [None]:
adata.obs[annotation_key] = adata.obs[annotation_key].astype('category')

In [None]:
adata.obs[annotation_key] = adata.obs[annotation_key].cat.reorder_categories(['ISC', 'TA', 'TA (prox.))', 'early Enterocyte', 'Enterocyte', 
'Tuft prog.', 'Tuft prog. 2', 'Tuft', 
'Goblet/EEC prog. (early)', 'EEC prog. (mid)', 'EEC prog. (late/Peptide)', 'EEC (Peptide/immature)', 'X-cell (Ghrl+)',  'K-cell (Gip+)', 'L/I-cell (Glp1+/Cck+)', 'D-cell (Sst+)',
'EC prog. (late)', 'EC (immature)', 'EC (mature)','EC 2', 
 'Goblet prog. (late)', 'Goblet', 'Goblet-Paneth-like', 'Goblet-Paneth-like(cycling)', 'Paneth prog.', 'Paneth'])#, 'unknown0' ])

In [None]:
sc.pl.umap(adata,color=['cell_type_annotation_lv1'],size=5)

In [None]:
with rc_context({'figure.figsize':(7,8)}):  
    sc.pl.umap(adata,color=['cell_type_annotation_lv1'],title= 'Fine cell type annotation level 1',size=4, legend_fontsize=11, save= 'healthy_UMAP_adjusted_cellt_type_anno_1.png')

## correct metadata

In [None]:
def read_excel_metadata(path, ix_col=None):
    """Load a metadata spreadsheet into a DataFrame.

    Parameters
    ----------
    path
        Location of the Excel file, forwarded to ``pd.read_excel``.
    ix_col
        Forwarded as ``index_col``; ``None`` (the default) leaves the choice
        of index to ``pd.read_excel``.

    Returns
    -------
    The DataFrame produced by ``pd.read_excel``.
    """
    return pd.read_excel(path, index_col=ix_col)

In [None]:
import pandas as pd

In [None]:
## add metadata
# Bookkeeping columns of the sample sheet that are removed before the join.
unused_metadata_columns = [
    'Sample Pooling - confounded with Project?',
    'date',
    'Project Name',
    'Link_id',
    'sample name',
    'Cell Count [cells/µl]',
    'Viable Cells [%]',
    'Lib. Concentration [ng/µl]',
    'Lib. Molarity [nM]',
    'Average Lib. Size [bp]',
    'cDNA Cycles',
    'Lib. Cycles',
    '10x Sample Index',
    'Sequencing Depth [reads/cell]',
    'exclusion, reason',
]

metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')

# Rows whose kit is 'Multiome_ATAC_v1' are removed.
atac_rows = metadata_df[metadata_df['kit'] == 'Multiome_ATAC_v1'].index
# Optional extra filter, currently disabled:
# metadata_df.drop(metadata_df[metadata_df['condition'].isin(['Ctr','Ctr/WT'])].index, inplace=True)

# Index by 'folder name' so rows can be looked up by that value afterwards.
metadata_df = (
    metadata_df
    .drop(index=atac_rows)
    .set_index('folder name')
    .drop(columns=unused_metadata_columns)
)

In [None]:
# Copy every metadata column onto adata.obs, matching adata.obs['sample']
# against the index of metadata_df.
#
# Series.map performs the lookup for the whole column in one vectorised step
# instead of one Python-level `.at` call per observation and column.
# Samples without a metadata row are reported once (previously the first
# unknown sample raised a KeyError that skipped every column and printed the
# same message per column); their observations get NaN in the new columns.
sample_ids = adata.obs['sample']
unknown_samples = sorted(set(sample_ids.unique()) - set(metadata_df.index), key=str)
if unknown_samples:
    print(f'no metadata row for sample(s): {unknown_samples}')

for col in metadata_df.columns:
    adata.obs[col] = sample_ids.map(metadata_df[col])

In [None]:
# Remove the 'sample number Minas' column from adata.obs in place.
adata.obs.drop(columns=['sample number Minas'], inplace=True)

## Marker Gene Expression

In [None]:
# Display the object's summary as the cell output.
adata

In [None]:
# Read only the 'sct_logcounts' layer from the .h5ad file, without loading the
# rest of the object.
# NOTE(review): read_elem is imported from a private anndata module
# (anndata._io); check whether the installed version offers a public import.
from anndata._io.specs import read_elem
with h5py.File('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_rmDoublets_integrated_all.h5ad', 'r') as f:
    # Other elements can be read the same way, e.g. single obs columns
    # (see https://github.com/scverse/anndata/issues/436):
    #sample_column = f['obs/sample'][:]
    #n_counts_column = f['obs/n_counts'][:]
    #cell_types = read_elem(f["obs/celltype"])
    #umap = read_elem(f["obsm/X_umap"])
    # NOTE(review): `logsct` is not referenced by the cells that follow, which
    # load the whole file again with sc.read_h5ad — confirm it is still needed.
    logsct = read_elem(f["layers/sct_logcounts"])

In [None]:
# Load the complete object that holds the 'sct_logcounts' layer.
adata_sct = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_rmDoublets_integrated_all.h5ad')

In [None]:
# Variable (gene) names of adata as a plain list, in their current order.
genes = list(adata.var_names)

In [None]:
# Keep only the variables listed in `genes`, in that order; the selection is
# on the variable axis only, observations are left untouched.
adata_sct = adata_sct[:,genes]

In [None]:
# Copy the SCT log-counts into adata.
# adata_sct is indexed explicitly by adata's observation and variable names so
# the matrix is guaranteed to line up with adata row by row and column by
# column. A purely positional copy would silently misalign the values if the
# two objects hold the same number of cells in a different order; with name
# based indexing a cell or gene missing from adata_sct raises a KeyError.
adata.layers['sct_logcounts'] = adata_sct[adata.obs_names, adata.var_names].layers['sct_logcounts']

In [None]:
# Drop the reference to the second object and run a garbage-collection pass
# so its memory can be reclaimed.
del adata_sct
gc.collect()

In [None]:
# obs column used for colouring and grouping in the plots below.
group_by = 'cell_type_annotation_lv1'

In [None]:
# Remove the .raw attribute from adata.
del adata.raw

In [None]:
# UMAP coloured by `group_by`, outlined points, legend beside the plot.
with rc_context({'figure.figsize': (8, 8)}):
    sc.pl.umap(
        adata,
        color=group_by,
        color_map=mymap,
        size=12,
        alpha=1,
        add_outline=True,
        outline_width=(0.3, 0.0),
        legend_fontsize=10,
    )

In [None]:
# Same UMAP with smaller points and the legend drawn on the data.
with rc_context({'figure.figsize': (8, 10)}):
    sc.pl.umap(
        adata,
        color=group_by,
        layer='log_dca_counts',
        color_map=mymap,
        size=5,
        alpha=1,
        add_outline=True,
        outline_width=(0.3, 0.0),
        ncols=4,
        legend_loc='on data',
        legend_fontsize=10,
        legend_fontweight='heavy',
    )

In [None]:
# Persist the annotated object; the path is relative, so the file is written
# to the current working directory.
adata.write('adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated_updated.h5ad')

In [None]:
# Marker panel used for the UMAP and dot plots below. The order of the list
# is the order in which the genes are plotted.
marker_genes = [
    'Lgr5', 'Olfm4', 'Dmbt1', 'Arg2', 'Sis',
    'Dclk1', 'Sox4', 'Pou2f3', 'Muc2', 'Dll1',
    'Ccl25', 'Lyz1', 'Neurog3', 'Neurod1', 'Arx',
    'Pax4', 'Spdef', 'Lmx1a', 'Reg4', 'Isl1',
    'Sst', 'Gcg', 'Cck', 'Gip', 'Ghrl',
    'Sct', 'Fev', 'Lbh', 'Rnase4', 'Ctse',
    'Slc12a8', 'Reg1', 'Slc2a2', 'Ada', 'Golm1',
]

In [None]:
# One UMAP panel per gene (values from the 'log_dca_counts' layer), four
# panels per row, saved to the scanpy figure directory.
sc.pl.umap(
    adata,
    color=[
        'Reg1', 'Tma7', 'Gpx2', 'Ccl25',
        'Reg3a', 'Slc7a8', 'Slc2a2', 'Apob',
        'Tbk1', 'Pkib', 'Nts', 'Ada',
        'Cdh1', 'Lct', 'Alpi', 'Lbr',
        'Ndc1', 'Prkdc', 'Mcm5', 'Gfi1',
        'Klf4', 'Hspd1', 'Slc4a4', 'Pck1',
        'Spink4', 'Pfkfb2', 'Fabp5', 'Rgs13',
        'Cd44', 'Chga', 'Chgb', 'Stmn1',
    ],
    layer='log_dca_counts',
    size=2,
    alpha=1,
    add_outline=True,
    outline_width=(0.3, 0.0),
    ncols=4,
    save='umapUmap_scANVI_5_markers_prox_dist_etc_corr.png',
)

In [None]:
# Marker panel on the UMAP, five panels per row, custom colour map, saved.
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(
        adata,
        color=marker_genes,
        layer='log_dca_counts',
        cmap=mymap,
        size=5,
        alpha=1,
        add_outline=True,
        outline_width=(0.3, 0.0),
        ncols=5,
        save='umapUmap_scANVI_NB5_markers.png',
    )

In [None]:
# Additional gene panels on the UMAP (shown only, not saved).
sc.pl.umap(
    adata,
    color=[
        'Tma7', 'Gpx2', 'Cdh1', 'Hspd1', 'Slc4a4',
        'Tbk1', 'Reg3a', 'Slc7a8', 'Apob', 'Lct',
        'Alpi', 'Lbr', 'Ndc1', 'Prkdc', 'Mcm5',
        'Cd44', 'Stmn1', 'Gfi1', 'Klf4', 'Pck1',
        'Spink4', 'Pfkfb2', 'Fabp5', 'Rgs13', 'Chga',
        'Chgb', 'Pkib', 'Nts',
    ],
    layer='log_dca_counts',
    cmap=mymap,
    size=2,
    alpha=1,
    add_outline=True,
    outline_width=(0.3, 0.0),
    ncols=5,
)

In [None]:
# Marker panel written to file only (show=False suppresses inline display).
sc.pl.umap(
    adata,
    color=marker_genes,
    layer='log_dca_counts',
    size=7,
    alpha=1,
    add_outline=True,
    outline_width=(0.3, 0.0),
    ncols=5,
    show=False,
    save='UMAP_markers_containing_M_cells_and_more_corr.png',
)

In [None]:
# Run a garbage-collection pass after the plotting cells.
gc.collect()

In [None]:
# Build a dendrogram of the `group_by` categories from the marker genes and
# store it under adata.uns['marker_gene_dendrogram'].
sc.tl.dendrogram(
    adata,
    groupby=group_by,
    var_names=marker_genes,
    key_added='marker_gene_dendrogram',
)

# Dot plot of the marker genes with the groups listed in dendrogram order.
dendrogram_order = adata.uns['marker_gene_dendrogram']['categories_ordered']
marker_dotplot = sc.pl.DotPlot(
    adata,
    var_names=marker_genes,
    groupby=group_by,
    categories_order=dendrogram_order,
    layer='sct_logcounts',
    use_raw=False,
    standard_scale='var',
    cmap=mymap,
)
marker_dotplot.style(
    color_on='square',
    dot_edge_lw=1,
    dot_edge_color=None,
    dot_min=0.15,
    grid=True,
).show()

In [None]:
# Dot plot of the marker genes per `group_by` category with the stored
# dendrogram attached; written to file only.
sc.pl.dotplot(
    adata,
    var_names=marker_genes,
    groupby=group_by,
    dendrogram='marker_gene_dendrogram',
    layer='sct_logcounts',
    use_raw=False,
    standard_scale='var',
    cmap=mymap,
    show=False,
    save='Dotplot_markers_subclusters_logsct.png',
)

In [None]:
# Recompute the dendrogram for the 'leiden_13_save' grouping.
# It is stored under the same key as before ('marker_gene_dendrogram'), so
# any dendrogram previously kept under that key is replaced.
sc.tl.dendrogram(
    adata,
    groupby='leiden_13_save',
    var_names=marker_genes,
    key_added='marker_gene_dendrogram',
)

In [None]:
# Dot plot of the marker genes per 'leiden_13_save' cluster; file only.
sc.pl.dotplot(
    adata,
    var_names=marker_genes,
    groupby='leiden_13_save',
    dendrogram='marker_gene_dendrogram',
    layer='sct_logcounts',
    use_raw=False,
    standard_scale='var',
    cmap=mymap,
    show=False,
    save='Dotplot_markers_subclusters_logsct_leiden.png',
)

In [None]:
# Run a garbage-collection pass after the dot plots.
gc.collect()

### Recalculation: reload adata

In [None]:
# Reload the annotated object from disk; the file name matches the one passed
# to adata.write earlier in this notebook.
adata= sc.read_h5ad('adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated_updated.h5ad')

In [None]:
# Display the reloaded object's summary as the cell output.
adata