
# Embedding, Clustering & Annotation
Adapted from Michael Sterr

2024-06-13


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc # Free memory #gc.collect()
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scanpy as sc
import anndata as ad

In [None]:
import h5py
from anndata._io.specs import read_elem

In [None]:
# Settings

import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

## Directory layout (all locations are derived from base_dir)
base_dir = '/mnt/hdd/'
data_dir = 'data/Healthy/'
nb_dir = 'Notebooks/Gut_project/'

## Scanpy settings: output locations, verbosity, version report
sc.settings.figdir = f'{base_dir}{nb_dir}Figures'
sc.settings.cachedir = f'{base_dir}Cache'
sc.settings.verbosity = 3
sc.logging.print_versions()
session_info.show()

In [None]:
%run utils.ipynb

In [None]:
mymap = load_RdOrYl_cmap_settings()

# Setup R

In [None]:
#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

.libPaths()

In [None]:
%%R
# Parallelization
# Registers 20 workers with each of the three parallel back-ends loaded here
# (BiocParallel, future, doParallel) and prints the R session information.
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

library(future)
plan("multicore", workers = 20)
# NOTE(review): 64 * 1024^2 bytes is 64 MiB, which is lower than the future
# package's documented default of 500 MiB. Confirm whether 64 GiB
# (64 * 1024^3) was intended.
options(future.globals.maxSize = 64 * 1024^2)
# Calling plan() without arguments shows the currently active plan.
plan()

library(doParallel)
registerDoParallel(20)

sessionInfo()

# Load Data

In [None]:
adata = sc.read_h5ad('/mnt/hdd/Notebooks/Gut_project/Dbtl_detected_velocyto_scran_diseased_sct.h5ad')

In [None]:
adata

In [None]:
all_cc_genes, s_genes_regev, g2m_genes_regev, cc_genes_regev, cc_genes_macosko, s_genes_macosko, g2m_genes_macosko, m_genes_macosko, mg1_genes_macosko, g1s_genes_macosko = load_cell_cycle_genes(adata, genome ='mus_musculus')

# Cell Cycle Scoring

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes_regev, g2m_genes=g2m_genes_regev)

In [None]:
# Label cells in S or G2M phase as 'Cycling', all others as 'Non-Cycling'.
# A single vectorised assignment replaces the former chained assignment
# (adata.obs[col][mask] = ...), which writes to a temporary object under pandas
# copy-on-write and mixed booleans with strings in one column.
is_cycling = adata.obs['phase'].isin(['G2M', 'S'])
adata.obs['proliferation'] = np.where(is_cycling, 'Cycling', 'Non-Cycling')

In [None]:
sc.pl.umap(adata, color=['S_score', 'G2M_score','phase','proliferation'], size=15, add_outline=True, alpha=1, outline_width=(0.3, 0.0), cmap = mymap)

# WNN

Using Scran and TF-IDF normalizations

In [None]:
adata.X = adata.layers['scran_counts'].copy()
sc.pp.highly_variable_genes(adata)

In [None]:
# Get HVGs and remove those that are cell cycle or ambient genes

## HVGs as currently flagged in adata.var['highly_variable']
hvgs = pd.Series(adata.var_names[adata.var['highly_variable'].to_numpy(dtype=bool)])
# Count flagged genes with .sum(): value_counts()[1] relied on integer lookup into
# a boolean index and fails when no gene is flagged.
print('\nHighly variable genes before filtering:', int(adata.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes (read from adata.var directly, no AnnData view needed)
ambient_genes = adata.var_names[(adata.var['is_ambient'] == True).to_numpy()]
hvambi = list(hvgs[hvgs.isin(ambient_genes)])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata.var.loc[hvcc,'highly_variable'] = False

# remove ambient genes
adata.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata.var['highly_variable'].sum()))

In [None]:
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pp.neighbors(adata)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata, color=['sample','final_doublets_cat','doublet_calls'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, wspace=1.05, cmap = mymap)

# Initial Clustering

In [None]:
# Leiden clustering over a sweep of resolutions; each result is written to
# adata.obs['leiden_r<resolution>'].
# Resolutions 0.1-0.6, 2.75 and 3 are currently disabled.
# 1 and 2 are written as ints so that the keys read 'leiden_r1' and 'leiden_r2'.
leiden_resolutions = [0.7, 0.8, 0.9, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]
for leiden_res in leiden_resolutions:
    sc.tl.leiden(adata, resolution=leiden_res, key_added=f'leiden_r{leiden_res}')

In [None]:
# Slim AnnData for transfer to R: the 'sct_logcounts' layer as X plus only the
# Leiden clustering columns in obs.
leiden_cols = ['leiden_r' + res for res in
               ('0.7', '0.8', '0.9', '1', '1.25', '1.5', '1.75', '2', '2.25', '2.5')]
adata_r = ad.AnnData(X=adata.layers['sct_logcounts'].copy())
adata_r.obs_names = adata.obs_names.copy()
adata_r.var_names = adata.var_names.copy()
adata_r.obs = adata.obs[leiden_cols].copy()
#adata_r.obs = adata.obs.loc[:,['leiden_r0.1','leiden_r0.2','leiden_r0.3','leiden_r0.4','leiden_r0.5','leiden_r0.6','leiden_r0.7','leiden_r0.8','leiden_r0.9','leiden_r1','leiden_r1.25','leiden_r1.5','leiden_r1.75','leiden_r2','leiden_r2.25','leiden_r2.5','leiden_r2.75','leiden_r3']].copy()

In [None]:
%%R
library(SingleCellExperiment)
library(clustree)

In [None]:
%%R -i adata_r

clustree(adata_r, prefix = 'leiden_r', exprs='X')

In [None]:
sc.pl.umap(adata, color=['leiden_r0.7','leiden_r0.8','leiden_r0.9','leiden_r1','leiden_r1.25','leiden_r1.5','leiden_r1.75','leiden_r2','leiden_r2.25','leiden_r2.5'], size=5, add_outline=True, alpha=1,wspace = 1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
adata.X = adata.layers['sct_logcounts'].copy()

In [None]:
sc.pl.umap(adata, color=['Neurog3','Tph1','Isl1','Pou2f3','Lgr5','Dmbt1','Hmgb2','Top2a','Defa24','Sis','Sox4','Spdef','Dll1','Lyz1', 'Cd52'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=mymap)

In [None]:
adata.obs['leiden']=adata.obs['leiden_r0.7'].copy()
adata.uns['leiden_colors'] = adata.uns['leiden_r0.7_colors'].copy()

In [None]:
adata

### clean up obs, var, uns

In [None]:
adata.obs = adata.obs.loc[:,['sample', 'n_counts', 'log_counts', 'n_counts_rank', 'n_genes', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac','is_paneth', 'final_doublets', 'final_doublets_cat', 'doublet_calls', 'cells_remain','batch','leiden', 'size_factors', 'S_score', 'G2M_score', 'phase', 'proliferation']]

In [None]:
gc.collect()

In [None]:
uns_to_drop = ['leiden_r0.7', 'leiden_r0.8', 'leiden_r0.9', 'leiden_r1', 'leiden_r1.25', 'leiden_r1.5', 'leiden_r1.75', 'leiden_r2', 'leiden_r2.25', 'leiden_r2.5', 'leiden_r0.7_colors', 'leiden_r0.8_colors', 'leiden_r0.9_colors', 'leiden_r1_colors', 'leiden_r1.25_colors', 'leiden_r1.5_colors', 'leiden_r1.75_colors', 'leiden_r2_colors', 'leiden_r2.25_colors', 'leiden_r2.5_colors']

In [None]:
# Remove the per-resolution clustering entries from adata.uns.
# pop(..., None) tolerates keys that are absent (for example colour entries of
# clusterings that were never plotted), so the cell can be re-run without a KeyError.
for uns_key in uns_to_drop:
    adata.uns.pop(uns_key, None)

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=['leiden','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=2, legend_loc='on data', cmap = mymap)

In [None]:
sc.pl.umap(adata, color=['sample','leiden','phase', 'Lgr5','final_doublets_cat','doublet_calls'], size=10, add_outline=True, alpha=1, wspace =0.9, outline_width=(0.3, 0.0), ncols=2, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden', ['5']), resolution=0.2, key_added='leiden_sub1')

In [None]:
sc.pl.umap(adata, color=['leiden_sub1','phase','final_doublets_cat', 'Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), wspace =0.65, ncols=2, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub1', ['5,2']), resolution=0.3, key_added='leiden_sub2')

In [None]:
sc.pl.umap(adata, color=['leiden_sub2','phase','final_doublets_cat','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub2', ['5,1']), resolution=0.3, key_added='leiden_sub3')

In [None]:
sc.pl.umap(adata, color=['leiden_sub3','Lgr5','phase','final_doublets_cat'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub3', ['5,0']), resolution=0.3, key_added='leiden_sub4')

In [None]:
sc.pl.umap(adata, color=['leiden_sub4','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub4', ['5,4']), resolution=0.4, key_added='leiden_sub5')

In [None]:
sc.pl.umap(adata, color=['leiden_sub5','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub5', ['2']), resolution=0.3, key_added='leiden_sub6')

In [None]:
sc.pl.umap(adata, color=['leiden_sub6','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3, wspace =0.7, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub6', ['0']), resolution=0.3, key_added='leiden_sub7')

In [None]:
sc.pl.umap(adata, color=['leiden_sub7','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub7', ['1']), resolution=0.4, key_added='leiden_sub8')

In [None]:
sc.pl.umap(adata, color=['leiden_sub8','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub8', ['12']), resolution=0.4, key_added='leiden_sub9')

In [None]:
sc.pl.umap(adata, color=['leiden_sub9','phase','Lgr5'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_sub9', ['19']), resolution=0.2, key_added='leiden_sub10')

In [None]:
sc.pl.umap(adata, color=['leiden_sub10','Isl1','Neurog3'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
adata.obs['leiden_wnn'] = adata.obs['leiden_sub10']

# Initial Annotation

In [None]:
if 'log1p' in adata.uns.keys():
    del adata.uns['log1p']

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden_wnn')

In [None]:
with rc_context({'figure.figsize': (8,4)}):
    sc.pl.rank_genes_groups(adata, n_genes=80, fontsize=5.5, ncols=2)

In [None]:
marker_genes = ['Neurog3','Tph1','Isl1','Pou2f3','Lgr5','Dmbt1','Hmgb2','Top2a','Defa24','Sis','Cd52','Muc2','Dll1','Lyz1', 'Epcam', 'Itln1']

In [None]:
adata.X =adata.layers['sct_logcounts'].copy()

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=marker_genes+['leiden_wnn'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=mymap)

In [None]:
sc.tl.dendrogram(adata, groupby='leiden_wnn', var_names=marker_genes, key_added='marker_gene_dendrogram')

In [None]:
sc.pl.DotPlot(adata, var_names=marker_genes, groupby='leiden_wnn', cmap=mymap, use_raw=False, categories_order=adata.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
sc.pl.dotplot(adata,dendrogram='marker_gene_dendrogram', var_names=marker_genes, groupby='leiden_wnn', cmap=mymap, use_raw=False)

In [None]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
groupby = 'leiden_wnn'
obs_keys = ['doublet_calls', 'S_score', 'G2M_score']

# Per-cell expression of the marker genes, with columns in `marker_genes` order.
marker_expr = adata[:, marker_genes].X
if hasattr(marker_expr, 'toarray'):  # sparse matrix -> dense array
    marker_expr = marker_expr.toarray()
df = pd.DataFrame(data=np.asarray(marker_expr),
                  index=adata.obs_names,
                  columns=list(marker_genes))
df[groupby] = adata.obs[groupby].values

# Per-cluster means of marker expression and of the selected obs columns.
# Both tables are indexed by cluster and joined on that index, so the columns
# cannot get out of step with each other (no positional `.values` assignment).
marker_means = df.groupby(groupby, observed=True)[list(marker_genes)].mean().add_prefix('mean_')
obs_means = adata.obs.groupby(groupby, observed=True)[obs_keys].mean().add_prefix('mean_')
df_all = marker_means.join(obs_means)

df_all

In [None]:
# Call every cluster 'low' or 'high' for each marker gene and obs summary.
# Marker genes are thresholded on the z-score of the per-cluster means;
# obs summaries are thresholded on the raw per-cluster mean.
# A value equal to the cut-off is called 'low' (pd.cut bins are right-closed).

# z-score cut-offs; markers not listed here use the default.
default_zscore_cutoff = 0.5
marker_zscore_cutoffs = {
    'Dmbt1': 0.3,
    'Lgr5': 0.3,
    'Epcam': 0.4,
    'Lyz1': 1,
    'Defa24': 1,
    'Cd52': 1,
    'Sis': 1.2,
    'Dll1': 1.5,
    'Neurog3': 1.75,
    'Tph1': 2.5,
}

# Cut-offs on the raw cluster mean; obs keys not listed here use the default.
default_obs_cutoff = 0.25
obs_cutoffs = {'doublet_calls': 4}


def _low_high(values, cutoff):
    """Return 'low' for values <= cutoff and 'high' for values above it.

    The outer bin edges are infinite so that extreme values are still
    classified instead of silently becoming NaN.
    """
    return pd.cut(values, bins=[-np.inf, cutoff, np.inf], labels=['low', 'high'])


for marker in marker_genes:
    cutoff = marker_zscore_cutoffs.get(marker, default_zscore_cutoff)
    df_all['lowhigh_'+marker] = _low_high(stats.zscore(df_all['mean_'+marker]), cutoff)

for key in obs_keys:
    cutoff = obs_cutoffs.get(key, default_obs_cutoff)
    df_all['lowhigh_'+key] = _low_high(df_all['mean_'+key], cutoff)

# Show only the low/high columns (the mean_* columns come first in df_all).
df_all.iloc[:,len(marker_genes + obs_keys):]

In [None]:
anno_key = 'initial_cell_type'
cluster_key = 'leiden_wnn'

# Start from the cluster labels and add the cell-type names as extra categories,
# so clusters that match no rule keep their cluster id.
adata.obs[anno_key] = adata.obs[cluster_key].cat.add_categories(['ISC', 'Early Prog.', 'Paneth prog.',
                                                                 'EE Prog.', 'EEC', 'EC',
                                                                 'Goblet', 'Paneth', 'Tuft',
                                                                 'TA', 'Enterocyte', 'Non-Epithelial','Doublets'])


def _is_high(marker):
    """Boolean mask over clusters (rows of df_all) called 'high' for `marker`."""
    return df_all['lowhigh_'+marker] == 'high'


def _is_low(marker):
    """Boolean mask over clusters (rows of df_all) called 'low' for `marker`."""
    return df_all['lowhigh_'+marker] == 'low'


# Rules are applied in order; a later rule overrides an earlier one for any
# cluster that matches both.
# NOTE(review): every cluster matching 'Paneth prog.' is also Lyz1-high and is
# therefore relabelled by the later 'Paneth' rule (or a rule after it), so
# 'Paneth prog.' never survives. Confirm the intended order.
annotation_rules = [
    ('Paneth prog.', _is_high('Lyz1') & _is_high('Itln1') & _is_high('Dmbt1') & _is_high('Hmgb2')),
    ('ISC', _is_high('Lgr5')),
    ('Early Prog.', _is_high('Dll1') & _is_low('Muc2')),
    ('EE Prog.', _is_high('Neurog3')),
    ('EEC', _is_high('Isl1') & _is_low('Neurog3')),
    ('EC', _is_high('Tph1') & _is_low('Cd52')),
    ('Goblet', _is_high('Muc2') & _is_low('Isl1')),
    ('Paneth', _is_high('Lyz1')),
    ('Tuft', _is_high('Pou2f3')),
    ('TA', (_is_high('Dmbt1') | _is_high('Top2a'))
           & _is_low('Muc2') & _is_low('Lgr5') & _is_low('Dll1') & _is_low('Pou2f3')),
    ('Enterocyte', _is_high('Sis') & _is_low('Isl1') & _is_low('Hmgb2')),
    # Immune etc
    ('Non-Epithelial', _is_high('Cd52') & _is_low('Epcam')),
    # Disabled doublet rule. The category added above is 'Doublets'; the old
    # commented-out code assigned 'Doublet', which is not a valid category.
    # ('Doublets', _is_high('doublet_calls') & _is_low('Top2a')),
]

for cell_type, cluster_mask in annotation_rules:
    matching_clusters = df_all.index[cluster_mask.to_numpy()]
    in_matching_cluster = adata.obs[cluster_key].isin(matching_clusters)
    # .loc writes into adata.obs itself; the former chained form
    # adata.obs[anno_key][mask] = ... assigns to a temporary under copy-on-write.
    adata.obs.loc[in_matching_cluster, anno_key] = cell_type

adata.obs[anno_key]= adata.obs[anno_key].cat.remove_unused_categories()

In [None]:
# Number of cells per annotated cell type (Series method; top-level pd.value_counts is deprecated).
adata.obs['initial_cell_type'].value_counts()

In [None]:
sc.pl.umap(adata, color=['initial_cell_type', 'final_doublets_cat'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0))

In [None]:
sc.pl.umap(adata, color=['initial_cell_type','sample'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0))

In [None]:
sc.pl.DotPlot(adata, var_names=marker_genes, groupby='initial_cell_type', cmap=mymap, use_raw=False).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
# Drop non-epithelial cells. .copy() turns the subset into a standalone AnnData;
# without it `adata` is a view of the unfiltered object, and later cells write
# to adata.obs, adata.X and adata.layers.
adata = adata[adata.obs['initial_cell_type']!='Non-Epithelial'].copy()

In [None]:
sc.pl.umap(adata, color=['initial_cell_type','sample'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0))

In [None]:
gc.collect()

In [None]:
adata.write('/mnt/hdd/data/Diseased/adata_markedDoublets_normalized_initialAnno_diseased_woimmune.h5ad')

### remove non-paneth

In [None]:
# Read `is_paneth` together with the obs index of the source file, giving a Series
# labelled by cell barcode. Assigning it to adata.obs then aligns by barcode rather
# than by position, which matters because `adata` has been filtered and may not
# contain the same cells in the same order. Barcodes missing from the source
# file end up as NaN.
with h5py.File("/mnt/hdd/data/Healthy/adata_markedDoublets_normalized.h5ad", "r") as f:
    is_paneth = read_elem(f["obs"])["is_paneth"]

In [None]:
is_paneth

In [None]:
adata

In [None]:
adata.obs['is_paneth'] = is_paneth

# Save

In [None]:
# Save
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_diseased_woimmune.h5ad')

## Normalize differently

Since SCRAN assumes that at least half of the genes in the data being normalized are not differentially expressed between subgroups of cells, we performed SCRAN normalization within clusters. To this end, we first performed total count normalization, by dividing each count by its cell’s total count and multiplying by 10,000. We then performed a log transformation using natural log and pseudocount 1. A PCA was subsequently performed. Using the first 50 principal components, a neighborhood graph was calculated with the number of neighbors set to k = 15. Data were subsequently clustered with Louvain clustering at a resolution of r = 0.5. SCRAN normalization was then performed on the raw counts, using the Louvain clusters as input clusters and with the minimum mean (library size adjusted) average count of genes to be used for normalization set to 0.1. The resulting size factors were used for normalization. For the final HLCA (and not the benchmarking subset), cells with abnormally low size factors (< 0.01) or abnormally high total counts after normalization (> $10 \times 10^5$) were removed from the data (267 cells in total). (Quoted from the Human Lung Cell Atlas publication, https://doi.org/10.1038/s41591-023-02327-2)

In [None]:
def checkAdata(adata):
    """Raise ``TypeError`` unless ``adata`` is an ``AnnData`` instance.

    ``isinstance`` is used instead of an exact type comparison so that
    subclasses of ``AnnData`` are accepted as well.
    """
    if not isinstance(adata, ad.AnnData):
        raise TypeError('Input is not a valid AnnData object')

In [None]:
def SCRAN_normalize(adata, min_mean = 0.1, n_pcs=50, counts_per_cell = 1e4, 
    louvain_r=0.5, ignore_R_warnings=False, log_transform=True): # from: Github https://github.com/LungCellAtlas/HLCA_reproducibility in scripts/scib_excerpts.py
    """Normalize ``adata.X`` with scran size factors computed within Louvain clusters.

    Adapted from scIB. ``adata.X`` is expected to hold raw counts on entry.

    Steps:
      1. Store ``adata.X`` in ``adata.layers['raw_counts']``.
      2. On a temporary copy: per-cell count normalization, log1p, PCA,
         neighbor graph and Louvain clustering.
      3. Call scran ``computeSumFactors`` in R on the raw counts, using the
         Louvain clusters as input clusters.
      4. Divide every cell (row) of ``adata.X`` by its size factor and
         optionally log1p-transform the result.

    Parameters
    ----------
    adata : AnnData
        Object to normalize; modified in place.
    min_mean : float
        Passed to scran ``computeSumFactors`` as ``min.mean``.
    n_pcs : int
        Number of principal components for the preliminary clustering.
    counts_per_cell : float
        Target counts per cell for the preliminary normalization.
    louvain_r : float
        Resolution of the preliminary Louvain clustering.
    ignore_R_warnings : bool
        If true, set the rpy2 callback logger to ERROR level.
    log_transform : bool
        If true, apply ``sc.pp.log1p`` after size-factor normalization.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obs['size_factors']``,
        ``layers['raw_counts']`` and ``layers['SCRAN_counts_log']`` added and
        ``X`` replaced by the normalized sparse matrix.

    Raises
    ------
    TypeError
        If ``adata`` is not an AnnData object (via ``checkAdata``).
    """
    # import R-related packages:
    import anndata2ri
    import rpy2.robjects as ro
    import rpy2.rinterface_lib.callbacks
    # `sparse` is used throughout this function, so import it here rather than
    # relying on another notebook cell to have put it into the namespace.
    from scipy import sparse

    if ignore_R_warnings:
        # Ignore R warning messages
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) 

    checkAdata(adata)
    
    # massive speedup when working with sparse matrix
    if not sparse.issparse(adata.X): # quick fix: HVG doesn't work on dense mtx
        adata.X = sparse.csr_matrix(adata.X)
    
    anndata2ri.activate()
    ro.r('library("scran")')
    
    # keep raw counts
    adata.layers["raw_counts"] = adata.X.copy()
    
    # Preliminary clustering for differentiated normalisation
    adata_pp = adata.copy()
    sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=counts_per_cell)
    sc.pp.log1p(adata_pp)
    sc.pp.pca(adata_pp, n_comps=n_pcs, svd_solver='arpack')
    sc.pp.neighbors(adata_pp)
    sc.tl.louvain(adata_pp, key_added='groups', resolution=louvain_r)
    
    # Hand the transposed raw counts and the cluster labels to R.
    ro.globalenv['data_mat'] = adata.X.T
    ro.globalenv['input_groups'] = adata_pp.obs['groups']
    size_factors = ro.r('sizeFactors(computeSumFactors(SingleCellExperiment('
                            'list(counts=data_mat)), clusters = input_groups,'
                            f' min.mean = {min_mean}))')
    del adata_pp
    
    # modify adata
    adata.obs['size_factors'] = size_factors
    # Scale each row by 1 / size factor through a sparse diagonal matrix. This
    # keeps X sparse; dividing the sparse matrix by a dense column vector
    # materialises the full dense matrix first.
    inv_size_factors = 1.0 / adata.obs['size_factors'].to_numpy(dtype=float)
    adata.X = sparse.csr_matrix(sparse.diags(inv_size_factors) @ adata.X)
    if log_transform:
        print("log1p-transforming data")
        sc.pp.log1p(adata)
    # Store the normalized (and, if requested, log-transformed) matrix as a layer.
    adata.layers['SCRAN_counts_log']= adata.X.copy()
    return adata

In [None]:
adata.X = adata.layers['raw_counts']
adata = SCRAN_normalize(adata)

In [None]:
sparsify_all_layers(adata)

In [None]:
# Save
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno.h5ad')

In [None]:
sc.pl.umap(adata, color=marker_genes+['leiden_wnn'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=mymap)

In [None]:
sc.pl.DotPlot(adata, var_names=marker_genes, groupby='initial_cell_type', cmap=mymap, use_raw=False).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()