<!--  -->
# Preprocessing - Joint Embedding & Doublet Removal with scVI
Adapted from Michael Sterr and Minas Schwager

2024-02-09 09:28:15 


# Setup

In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc # Free memory #gc.collect()
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scanpy as sc
import scvi
import anndata as ad

In [None]:
# Notebook-wide settings

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

## Directories (figure and cache folders are derived from base_dir)
base_dir = '/mnt/hdd/'
data_dir = 'data/Diseased/'
nb_dir = 'Notebooks/Gut_project/'
sc.settings.figdir = f'{base_dir}{nb_dir}Figures'
sc.settings.cachedir = f'{base_dir}Cache'

## Scanpy settings
sc.settings.verbosity = 3
# Optional environment reports, left disabled:
#sc.logging.print_versions()
#session_info.show()

In [None]:
# Run the helper notebook.
# NOTE(review): load_RdOrYl_cmap_settings, setup_R and read_excel_metadata are neither defined
# nor imported in this notebook; presumably they come from utils.ipynb — confirm.
%run utils.ipynb

In [None]:
# Colormap object passed as color_map/cmap to the UMAP plots below.
mymap = load_RdOrYl_cmap_settings()

# Setup R

In [None]:
# R bridge: import rpy2 / anndata2ri, then call setup_R with a hard-coded path.
# NOTE(review): setup_R is not defined in this notebook, so what it configures cannot be
# seen from here — confirm before changing the path.
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

# Show the library paths seen by the R session (sanity check of the R setup).
.libPaths()

# Load Data

In [None]:
adata = sc.read_h5ad('/mnt/hdd/data/Diseased/adata_markedDoublets_normalized_initialAnno_diseased_woimmune.h5ad')

In [None]:
sc.pl.umap(adata, color=['initial_cell_type','sample'], size=12, add_outline=True, alpha=1, outline_width=(0.3, 0.0))

In [None]:
# Restrict .obs to the columns listed here.
_obs_columns_to_keep = [
    'sample', 'n_counts', 'log_counts', 'n_counts_rank', 'n_genes', 'log_genes',
    'mt_frac', 'rp_frac', 'ambi_frac', 'final_doublets', 'final_doublets_cat',
    'doublet_calls', 'cells_remain', 'batch', 'leiden', 'initial_cell_type',
    'size_factors', 'is_paneth',
]
adata.obs = adata.obs[_obs_columns_to_keep].copy()

# Delete uns/obsm/varm/obsp/raw. Layers are NOT deleted in this cell.
del adata.uns, adata.obsm, adata.varm, adata.obsp, adata.raw
gc.collect()

In [None]:
## add metadata
metadata_df =read_excel_metadata(f'/mnt/hdd/data/metadata_mouse_gut.xlsx')
# Ensure folder name is the index in metadata for easier access
metadata_df.drop(metadata_df[metadata_df['kit'] == 'Multiome_ATAC_v1'].index, inplace=True)
metadata_df.drop(metadata_df[metadata_df['condition'].isin(['Ctr','Ctr/WT'])].index, inplace=True)
metadata_df.set_index('folder name', inplace=True)
metadata_df.drop(['Sample Pooling - confounded with Project?','date','Project Name','Link_id','sample name','Cell Count [cells/µl]','Viable Cells [%]','Lib. Concentration [ng/µl]','Lib. Molarity [nM]','Average Lib. Size [bp]','cDNA Cycles','Lib. Cycles','10x Sample Index','Sequencing Depth [reads/cell]','exclusion, reason'], axis=1, inplace=True)

In [None]:
metadata_df

In [None]:
# Function to update adata.obs with metadata using a lambda function
for col in metadata_df.columns:
    try:
        adata.obs[col] = adata.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        print(f'no such key: {err} in col {col}')

In [None]:
adata

minimize adata

In [None]:
# Remove the layers listed here; none of them is referenced later in this notebook.
for _layer_name in (
    'ambiguous',
    'sct_counts',
    'log_raw_counts',
    'matrix',
    'scran_counts',
    'sct_logcounts',
    'sct_scale_data',
    'spliced',
    'unspliced',
):
    del adata.layers[_layer_name]
gc.collect()


In [None]:
# NOTE(review): DataFrame.drop returns a new frame unless inplace=True, and the result is
# not assigned here, so adata.obs still contains all of these columns after this cell.
# Later cells plot 'leiden', 'condition', 'diet', 'line' and 'strain', so decide which
# columns are really obsolete before making this drop take effect.
adata.obs.drop(
    columns=[
        'cells_remain', 'batch', 'leiden', 'size_factors', 'Project', 'pretty name',
        'sequencing', 'condition', 'line', 'strain', 'enriched', 'enrichment proportion',
        'treatment', 'diet', 'tissue', 'structure', 'target cell number', 'Read Length',
        'Index Type', 'sequencing machine',
    ],
)
gc.collect()

## Batch correction

In [None]:
sc.tl.pca(adata, n_comps = 55)

In [None]:
n_hidden=512
n_latent=50
n_layers=2

batch_key = 'sample'
labels_key = 'initial_cell_type'

categorical_covariate_keys = ['kit']
continuous_covariate_keys = None

layer = 'raw_counts'

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer=layer, batch_key=batch_key, labels_key=labels_key, categorical_covariate_keys=categorical_covariate_keys, continuous_covariate_keys=continuous_covariate_keys)

In [None]:
model_scvi = scvi.model.SCVI(adata, n_hidden=n_hidden, n_layers=n_layers, n_latent=n_latent, gene_likelihood='nb', dispersion='gene-batch')
print(model_scvi)
#model_scvi.view_anndata_setup()

In [None]:
model_scvi.train(max_epochs=1000, early_stopping=True)

In [None]:
adata.obsm['X_scVI'] = model_scvi.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, use_rep='X_scVI')
sc.tl.umap(adata, min_dist = 0.2)

In [None]:
# Cluster the scVI neighbour graph at resolutions 2 and 3.
# The resolution-2 clustering is required here: 'leiden_2' is used by a UMAP panel and a
# composition plot further down, and it is not among the obs columns kept when .obs was
# cleaned up, so without this call those cells fail on a clean top-to-bottom run.
# NOTE(review): 'leiden' is not recomputed in this cell, so plots of 'leiden' show the
# labels that came with the loaded object — confirm that this is intended.
sc.tl.leiden(adata, resolution=2, key_added='leiden_2')
sc.tl.leiden(adata, resolution=3, key_added='leiden_3')

In [None]:
sc.pl.umap(adata, color= ['sample', 'leiden', 'doublet_calls'], size=20, color_map=mymap)

In [None]:
sc.pl.umap(adata, color= ['final_doublets_cat', 'leiden', 'leiden_2', 'leiden_3'], size=20, color_map=mymap)

## Filter doublet clusters

In [None]:
# Run a garbage-collection pass before the plotting section.
gc.collect()

In [None]:
# pegasus is used below only for pg.compo_plot.
import pegasus as pg

### Distribution in leiden clusters

In [None]:
pg.compo_plot(adata, 'leiden_2', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
pg.compo_plot(adata, 'leiden_3', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
pd.set_option('display.max_columns', None)

### Filter - create object without doublet clusters

In [None]:
adata[np.isin(adata.obs['leiden_3'], ['9', '27', '30'])].shape

In [None]:
adata = adata[np.isin(adata.obs['leiden_3'], ['9', '27', '30'], invert = True)].copy()
adata

### Recalculate UMAP

In [None]:
sc.pp.neighbors(adata, use_rep='X_scVI', metric='correlation')
sc.tl.umap(adata, min_dist = 0.2)

In [None]:
sc.tl.leiden(adata)
sc.tl.leiden(adata, resolution = 2, key_added = 'leiden_2')
sc.tl.leiden(adata, resolution = 3, key_added = 'leiden_3')

In [None]:
sc.pl.umap(adata, color= ['sample', 'leiden','leiden_2', 'leiden_3', 'doublet_calls'], size=20, color_map=mymap)

In [None]:
pg.compo_plot(adata, 'leiden', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
pg.compo_plot(adata, 'leiden_2', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
pg.compo_plot(adata, 'leiden_3', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

### Filter - create object without doublet clusters

In [None]:
# Keep every cell outside leiden_3 clusters '34' and '36' (IDs from the clustering above).
_in_removed_cluster = np.isin(adata.obs['leiden_3'], ['34', '36'])
adata = adata[~_in_removed_cluster].copy()
adata

### Recalculate UMAP

In [None]:
sc.pp.neighbors(adata, use_rep='X_scVI', metric='correlation')
sc.tl.umap(adata, min_dist = 0.2)

In [None]:
#sc.tl.leiden(adata)
#sc.tl.leiden(adata, resolution = 2, key_added = 'leiden_2')
sc.tl.leiden(adata, resolution = 3, key_added = 'leiden_3')

In [None]:
sc.pl.umap(adata, color= ['sample', 'leiden_3', 'doublet_calls'], size=20, color_map=mymap)

In [None]:
pg.compo_plot(adata, 'leiden_3', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
sc.tl.leiden(adata, restrict_to=('leiden_3', ['9']), resolution=0.7, key_added='leiden_sub1')

In [None]:
pg.compo_plot(adata, 'leiden_sub1', 'doublet_calls', style = 'frequency',
              sort_function=None, 
              palette=['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969'], dpi = 150)

In [None]:
adata = adata[np.isin(adata.obs['leiden_sub1'], ['9,0'], invert = True)].copy()
adata

### Filter - all with doublet calls above 3

In [None]:
adata.obs.doublet_calls.value_counts()

In [None]:
adata = adata[adata.obs['doublet_calls'] < 4].copy()
adata

In [None]:
adata.obs.drop(['sample number Minas'],axis=1,inplace=True)

In [None]:
adata.obs['sample'].value_counts()

In [None]:
sc.pl.umap(adata, color= ['initial_cell_type', 'kit', 'condition', 'diet', 'line', 'strain'], size=20, ncols=3,color_map=mymap)

In [None]:
adata.write('/mnt/hdd/data/Diseased/adata_markedDoublets_normalized_initialAnno_noimmune_scvi_wodblts.h5ad')

### investigate object

In [None]:
adata = sc.read_h5ad('/mnt/hdd/data/Diseased/adata_markedDoublets_normalized_initialAnno_noimmune_scvi_wodblts_imputed_subsetted.h5ad')

In [None]:
adata.obs['sample'].value_counts()

In [None]:
adata

In [None]:
sc.pl.umap(adata, color= ['initial_cell_type', 'kit', 'condition', 'diet', 'line', 'strain'], size=20, ncols=3,color_map=mymap)

In [None]:
sc.pp.neighbors(adata, use_rep='X_scVI', metric='correlation')
sc.tl.umap(adata, min_dist = 0.2)

In [None]:
sc.tl.leiden(adata)

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=['Ghrl','Sst','Gcg','Gip','Cck','Sct','Tac1','Tph1', 'Spdef','Reg4'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap)

In [None]:
sc.pl.umap(adata, color=['leiden','diet', 'condition','strain','pretty name'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)

### reintegrate

#### update metadata

In [None]:
## Sample metadata
metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')

# Remove the rows of the 'Multiome_ATAC_v1' kit.
# The control conditions ('Ctr', 'Ctr/WT') are kept in this pass.
_atac_rows = metadata_df.index[metadata_df['kit'] == 'Multiome_ATAC_v1']
metadata_df.drop(index=_atac_rows, inplace=True)

# Index by folder name so a row can be looked up by label.
metadata_df.set_index('folder name', inplace=True)

# Discard the columns listed below.
metadata_df.drop(
    columns=[
        'Sample Pooling - confounded with Project?',
        'date',
        'Project Name',
        'Link_id',
        'sample name',
        'Cell Count [cells/µl]',
        'Viable Cells [%]',
        'Lib. Concentration [ng/µl]',
        'Lib. Molarity [nM]',
        'Average Lib. Size [bp]',
        'cDNA Cycles',
        'Lib. Cycles',
        '10x Sample Index',
        'Sequencing Depth [reads/cell]',
        'exclusion, reason',
    ],
    inplace=True,
)

In [None]:
# Function to update adata.obs with metadata using a lambda function
for col in metadata_df.columns:
    try:
        adata.obs[col] = adata.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        print(f'no such key: {err} in col {col}')

In [None]:
adata.obs.drop(['sample number Minas'],axis=1,inplace=True)

In [None]:
sc.tl.pca(adata, n_comps = 55)

In [None]:
n_hidden=512
n_latent=50
n_layers=2

batch_key = 'sample'
labels_key = 'initial_cell_type'

categorical_covariate_keys = ['kit']
continuous_covariate_keys = None

layer = 'raw_counts'

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer=layer, batch_key=batch_key, labels_key=labels_key, categorical_covariate_keys=categorical_covariate_keys, continuous_covariate_keys=continuous_covariate_keys)

In [None]:
model_scvi = scvi.model.SCVI(adata, n_hidden=n_hidden, n_layers=n_layers, n_latent=n_latent, gene_likelihood='nb', dispersion='gene-batch')
print(model_scvi)
#model_scvi.view_anndata_setup()

In [None]:
model_scvi.train(max_epochs=1000, early_stopping=True)

In [None]:
adata.obsm['X_scVI_rm_Dblts'] = model_scvi.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, use_rep='X_scVI_rm_Dblts')
sc.tl.umap(adata, min_dist = 0.2)

In [None]:
sc.tl.leiden(adata)

In [None]:
sc.pl.umap(adata, color=['leiden','diet', 'condition','strain','pretty name'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)

In [None]:
adata.write('scvi_diseased_wo_dblts_save.h5ad')

In [None]:
model_scvi.save('scvi_diseased', overwrite=True, save_anndata=True)

### reload object

In [None]:
adata = sc.read_h5ad('scvi_diseased_wo_dblts_save.h5ad')

In [None]:
sc.pl.umap(adata, color=['leiden', 'condition','line','strain','pretty name'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9,frameon=True)

In [None]:
sc.pl.umap(adata, color=['Ghrl', 'condition'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)

In [None]:
sc.tl.paga(adata,groups='initial_cell_type')

In [None]:
sc.pl.paga(adata)

In [None]:
sc.tl.umap(adata,init_pos='paga')

In [None]:
sc.pl.umap(adata, color=['leiden','diet', 'condition','strain','pretty name'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)

In [None]:
sc.pl.umap(adata, color=['Ghrl', 'diet'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)

In [None]:
sc.pl.violin(adata,groupby='condition',keys='Ghrl',rotation=90)

In [None]:
sc.pl.violin(adata,groupby='condition',keys='Sst',rotation=90)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata, color=['leiden','diet', 'condition','strain','pretty name'],layer= 'log_dca_counts', size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, cmap = mymap, legend_fontsize=9)