
# Preprocessing - Imputation
Adapted from Michael Sterr

2024-06-24


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
#import muon as mu
#from muon import atac as ac # Import a module with ATAC-seq-related functions
import scanpy as sc
import scanpy.external as sce
import torch
#import scipy.stats as sci
from sklearn.metrics import mean_squared_error

In [None]:
# Settings

import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

## Directory
base_dir = '/mnt/hdd/'
data_dir = 'data/Healthy/'
nb_dir = 'Notebooks/Gut_project/'
# Figure output goes to <base_dir><nb_dir>Figures, the scanpy cache to <base_dir>Cache.
sc.settings.figdir = f'{base_dir}{nb_dir}Figures'
sc.settings.cachedir = f'{base_dir}Cache'

## Scanpy settings
sc.settings.verbosity = 3
# Print the package versions of the current environment into the notebook output.
sc.logging.print_versions()
session_info.show()

In [None]:
# Color maps
# Cubehelix palette returned as a matplotlib colormap (as_cmap=True);
# it is passed as `cmap` to the UMAP plots of the QC section.
ch_YlRd = sb.cubehelix_palette(
    n_colors=100,
    start=.7,
    rot=.25,
    gamma=0.6,
    hue=2,
    light=1,
    dark=0.05,
    as_cmap=True,
)

In [None]:
# Plot settings
# Global matplotlib / scanpy figure configuration used by every plot in this notebook.
%matplotlib inline

## Plotting parameters
rcParams['figure.figsize']=(6,6) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
# NOTE(review): set_figure_params(scanpy=True) applies scanpy's own rcParams and runs AFTER the
# figure.figsize assignment above — confirm the (6,6) size is not overridden by this call.
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

## Grid & Ticks
# grid.alpha = 0 makes grid lines fully transparent; tick marks are drawn on the bottom and left axes.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
# Sets the 'pdf.fonttype' rcParam to 42.
plt.rc('pdf', fonttype=42)

## Define new default settings
# Binds the name plt.rcParamsDefault to the live plt.rcParams object (no copy is made),
# so both names refer to the same object from here on.
# NOTE(review): verify that matplotlib's rcdefaults() actually consults this pyplot attribute.
plt.rcParamsDefault = plt.rcParams

# Setup R

In [None]:
%run utils.ipynb

In [None]:
#R
# Python <-> R bridge packages.
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
# setup_R is not defined in any visible cell — presumably provided by the `%run utils.ipynb`
# cell above; verify. The argument is an absolute, machine-specific R library path.
setup_R('/home/scanalysis/mnt/envs/scUV_scvelo/lib/R')

In [None]:
%%R

.libPaths()

In [None]:
%%R
# Parallelization
# Register 20 workers with each of the three parallel back-ends (BiocParallel, future, doParallel).
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

library(future)
plan("multicore", workers = 20)
# 64 * 1024^2 = 67,108,864, i.e. 64 MiB if the option is interpreted in bytes.
# NOTE(review): this looks low for single-cell objects (1024^3 would give 64 GiB) —
# confirm the intended limit against the `future` documentation.
options(future.globals.maxSize = 64 * 1024^2)
# Calling plan() without arguments shows the currently active plan.
plan()

library(doParallel)
registerDoParallel(20)

# Record the R session (R version and attached packages) in the notebook output.
sessionInfo()

# Load Data

## aData

In [None]:
# Read the AnnData object that all following cells operate on.
# NOTE(review): the path is relative and does not use base_dir / data_dir from the
# settings cell — confirm the working directory the notebook is expected to run from.
adata = sc.read_h5ad('adata_for_imputation.h5ad')

In [None]:
adata

#### Clean up adata and recompute neighbors, clustering and UMAP

In [None]:
gc.collect()

In [None]:
# Build the neighbor graph from the 'X_scANVI' representation, then run Leiden
# clustering and compute a first UMAP (min_dist=0.8). All three calls modify adata in place.
# A later cell calls sc.tl.umap again with different parameters.
sc.pp.neighbors(adata, use_rep='X_scANVI')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.8)

In [None]:
# UMAP coloured by sample and per-cell QC columns, laid out three panels per row.
sc.pl.umap(
    adata,
    color=['sample', 'n_counts', 'n_genes', 'mt_frac', 'rp_frac', 'doublet_calls'],
    size=10,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=.95,
    cmap=ch_YlRd,
)

In [None]:
gc.collect()

In [None]:
# Recompute the UMAP with min_dist=0.5 and spread=1.05; the plots below use this embedding.
sc.tl.umap(adata, min_dist=0.5, spread=1.05)

In [None]:
# Same QC overview as before, now on the recomputed UMAP and with the initial annotation added.
sc.pl.umap(
    adata,
    color=[
        'sample',
        'initial_cell_type',
        'n_counts',
        'n_genes',
        'mt_frac',
        'rp_frac',
        'doublet_calls',
    ],
    size=10,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=.98,
    cmap=ch_YlRd,
)

In [None]:
# Marker gene panel used for the UMAP plots and the imputation checks below.
marker_genes = [
    'Foxa2', 'Neurog3', 'Tph1', 'Isl1', 'Pou2f3',
    'Lgr5', 'Dmbt1', 'Hmgb2', 'Top2a', 'Defa24',
    'Gna11', 'Cd52', 'Muc2', 'Fcgbp', 'Lyz1',
]

In [None]:
# Metadata panels followed by one panel per marker gene.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'n_counts', 'phase'] + marker_genes,
    size=10,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=.98,
    cmap=ch_YlRd,
)

# DCA

In [None]:
# Make the 'sct_counts' layer the active matrix for the DCA run below.
# The copy keeps adata.X independent of the stored layer.
adata.X = adata.layers['sct_counts'].copy()

In [None]:
# Drop genes with fewer than 1 count in adata.X (in place) before running DCA.
sc.pp.filter_genes(adata, min_counts=1)

In [None]:
gc.collect()

In [None]:
# Run DCA on adata.X. With copy=True a new object is returned and `adata` is rebound to it.
adata = sce.pp.dca(
    adata,
    batch_size=32,
    epochs=300,
    log1p=True,
    normalize_per_cell=False,
    scale=False,
    activation='relu',
    ae_type='nb',
    batchnorm=True,
    hidden_size=(1024, 512, 1024),
    optimizer="RMSprop",
    verbose=True,
    copy=True,
)

In [None]:
# Keep the DCA result (currently in adata.X) as its own layer.
adata.layers['dca_counts'] = adata.X.copy()

In [None]:
# Store a log1p-transformed version of the DCA output in a separate layer.
# copy=True is essential: when handed a bare array, sc.pp.log1p transforms it in place
# and returns the same object, which would silently turn 'dca_counts' into log values
# as well (both layers would then reference one log-transformed array).
adata.layers['log_dca_counts'] = sc.pp.log1p(adata.layers['dca_counts'], copy=True)

In [None]:
# Marker gene panel (same genes as in the QC section), used for the imputation checks.
marker_genes = [
    'Foxa2', 'Neurog3', 'Tph1', 'Isl1', 'Pou2f3',
    'Lgr5', 'Dmbt1', 'Hmgb2', 'Top2a', 'Defa24',
    'Gna11', 'Cd52', 'Muc2', 'Fcgbp', 'Lyz1',
]

In [None]:
# Marker expression on the UMAP, read from the log-transformed DCA layer.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=4,
    layer='log_dca_counts',
)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Convert 'sct_logcounts' to a dense array so single gene columns can be sliced with [:, 0]
# in the correlation cells below.
# Guarded so the cell is idempotent: once the layer is dense, re-running is a no-op
# instead of failing because a numpy array has no .toarray().
if sci.sparse.issparse(adata.layers['sct_logcounts']):
    adata.layers['sct_logcounts'] = adata.layers['sct_logcounts'].toarray()

In [None]:
# Per-gene Pearson correlation between DCA-imputed and SCT log-counts for the marker genes.
testGenes = marker_genes

x = np.empty(len(testGenes))
for i, gene in enumerate(testGenes):
    gene_view = adata[:, gene]  # single-gene view; column 0 is that gene
    imputed = gene_view.layers['log_dca_counts'][:, 0]
    reference = gene_view.layers['sct_logcounts'][:, 0]
    # Off-diagonal entry of the 2x2 correlation matrix.
    x[i] = np.corrcoef(imputed, reference)[0][1]

x

In [None]:
np.mean(x)

In [None]:
# Mean squared error between the DCA log layer and SCT log-counts over the marker genes.
marker_view = adata[:, testGenes]
mean_squared_error(
    marker_view.layers['log_dca_counts'],
    marker_view.layers['sct_logcounts'],
)

In [None]:
# Take the 1000 genes with the highest 'means', keep those flagged 'highly_variable',
# and use the first 50 of them (still ordered by decreasing mean).
genes_by_mean = adata.var['means'].sort_values(ascending=False).index
top_expressed = adata.var.loc[genes_by_mean[0:1000], :]
hvg_mask = adata.var['highly_variable'] == True
topHVGs = top_expressed.loc[hvg_mask, :].iloc[0:50, :].index

In [None]:
# Per-gene Pearson correlation between DCA-imputed and SCT log-counts for the selected HVGs.
x = np.empty(len(topHVGs))
for i, gene in enumerate(topHVGs):
    gene_view = adata[:, gene]  # single-gene view; column 0 is that gene
    imputed = gene_view.layers['log_dca_counts'][:, 0]
    reference = gene_view.layers['sct_logcounts'][:, 0]
    # Off-diagonal entry of the 2x2 correlation matrix.
    x[i] = np.corrcoef(imputed, reference)[0][1]

x

In [None]:
np.mean(x)

In [None]:
# Mean squared error between the DCA log layer and SCT log-counts over the selected HVGs.
hvg_view = adata[:, topHVGs]
mean_squared_error(
    hvg_view.layers['log_dca_counts'],
    hvg_view.layers['sct_logcounts'],
)

In [None]:
# Violin plots of four marker genes per initial cell type, read from the DCA log layer.
# The figure size is changed only inside this context.
with rc_context(rc={'figure.figsize': (6, 4)}):
    sc.pl.violin(
        adata,
        keys=['Foxa2', 'Neurog3', 'Lgr5', 'Top2a'],
        groupby='initial_cell_type',
        layer='log_dca_counts',
        use_raw=False,
        rotation=90,
    )

In [None]:
gc.collect()

# MAGIC

In [None]:
# Make 'sct_logcounts' the active matrix for the MAGIC run below.
# The copy keeps the stored layer independent of adata.X.
adata.X =adata.layers['sct_logcounts'].copy()

In [None]:
# Run MAGIC on adata. No return value is captured; the next cell reads the result from adata.X.
# NOTE(review): no random_state is passed — confirm whether run-to-run reproducibility is required.
sce.pp.magic(adata, n_pca=100, knn=5, t=5)

In [None]:
# Keep the MAGIC result (currently in adata.X) as its own layer.
adata.layers['magic_counts'] = adata.X.copy()

In [None]:
# Marker expression on the UMAP, read from the MAGIC layer.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=4,
    layer='magic_counts',
)

In [None]:
# Per-gene Pearson correlation between MAGIC-imputed values and SCT log-counts for the marker genes.
testGenes = marker_genes

x = np.empty(len(testGenes))
for i, gene in enumerate(testGenes):
    gene_view = adata[:, gene]  # single-gene view; column 0 is that gene
    imputed = gene_view.layers['magic_counts'][:, 0]
    reference = gene_view.layers['sct_logcounts'][:, 0]
    # Off-diagonal entry of the 2x2 correlation matrix.
    x[i] = np.corrcoef(imputed, reference)[0][1]

x

In [None]:
np.mean(x)

In [None]:
# Mean squared error between the MAGIC layer and SCT log-counts over the marker genes.
marker_view = adata[:, testGenes]
mean_squared_error(
    marker_view.layers['magic_counts'],
    marker_view.layers['sct_logcounts'],
)

In [None]:
# Per-gene Pearson correlation between MAGIC-imputed values and SCT log-counts for the selected HVGs.
x = np.empty(len(topHVGs))
for i, gene in enumerate(topHVGs):
    gene_view = adata[:, gene]  # single-gene view; column 0 is that gene
    imputed = gene_view.layers['magic_counts'][:, 0]
    reference = gene_view.layers['sct_logcounts'][:, 0]
    # Off-diagonal entry of the 2x2 correlation matrix.
    x[i] = np.corrcoef(imputed, reference)[0][1]

x

In [None]:
np.mean(x)

In [None]:
# Mean squared error between the MAGIC layer and SCT log-counts over the selected HVGs.
hvg_view = adata[:, topHVGs]
mean_squared_error(
    hvg_view.layers['magic_counts'],
    hvg_view.layers['sct_logcounts'],
)

In [None]:
# Violin plots of four marker genes per initial cell type, read from the MAGIC layer.
# The figure size is changed only inside this context.
with rc_context(rc={'figure.figsize': (6, 4)}):
    sc.pl.violin(
        adata,
        keys=['Foxa2', 'Neurog3', 'Lgr5', 'Top2a'],
        groupby='initial_cell_type',
        layer='magic_counts',
        use_raw=False,
        rotation=90,
    )

# Save

In [None]:
# Make the log-transformed DCA layer the active matrix (adata.X) of the object that is saved below.
adata.X = adata.layers['log_dca_counts'].copy()

In [None]:
# sparsify_all_layers is not defined in any visible cell — presumably provided by utils.ipynb
# and, judging by its name, converts the layers to sparse storage before writing; verify.
sparsify_all_layers(adata)

In [None]:
adata

In [None]:
# Save
# NOTE(review): file_path and file_base_name are not assigned in any visible cell —
# presumably provided by utils.ipynb; confirm they exist on a fresh kernel.
# NOTE(review): the file name ends in '.h5mu' although an AnnData object is written via
# adata.write — confirm the extension is intended.
output_stem = f"{file_path}/{file_base_name}"
adata.write(f"{output_stem}_adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed.h5mu")