
# Preprocessing - Normalization
Adapted from Michael Sterr

2024-06-01


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import h5py
import scipy.sparse as sparse
import anndata as ad
import scipy.stats as stats
import gc


# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib import cm
import seaborn as sb
from matplotlib import colors
import matplotlib.colors as mcolors
from matplotlib.pyplot import rc_context
from plotnine import *
from adjustText import adjust_text

# Analysis
import scanpy as sc

#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

# Warnings
import warnings
warnings.filterwarnings('ignore') #(action='once') 

sc.logging.print_versions()

In [None]:
# Settings

## Directory
# Project root; the figure and cache folders are direct children of it.
base_dir = '/mnt/hdd/Notebooks/Gut_project/'
sc.settings.figdir = f'{base_dir}Figures'
sc.settings.cachedir = f'{base_dir}Cache'

## Scanpy settings
sc.settings.verbosity = 3

## Environment report: scanpy's version table first, then the full session info
sc.logging.print_versions()
session_info.show()

In [None]:
# Color maps
# `mymap` = 20 samples from a narrow band (0.7-0.8) of the reversed grey ramp,
# followed by 128 samples spanning the whole Reds ramp.
colors3 = plt.cm.Greys_r(np.linspace(0.7, 0.8, 20))
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colorsComb = np.vstack((colors3, colors2))
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

In [None]:
# Plot settings
%matplotlib inline

## Plotting parameters
rcParams['figure.figsize']=(5,5) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

## Font
'''rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Source Sans 3']'''

## Grid & Ticks
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
plt.rc('pdf', fonttype=42)

## Define new default settings
plt.rcParamsDefault = plt.rcParams

# Setup R

In [None]:
# Execute the helper notebook in this kernel. The helpers called later
# (setup_R, load_cell_cycle_genes, normalise_scran, sparsify_all_layers,
# qc_metrics, print_r_session) are not defined in this notebook --
# presumably they come from utils.ipynb (TODO confirm).
%run utils.ipynb

In [None]:
# Initialise the R side with the given R library directory.
# NOTE(review): presumably this also loads the rpy2 IPython extension that
# provides the %%R magic used below -- confirm in utils.ipynb.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

# Show the library search paths R is using.
.libPaths()

In [None]:
%%R
# Analysis packages
library(Signac)
library(Seurat)
library(SingleCellExperiment)
library(sctransform)

# Plotting packages
library(RColorBrewer)
library(ggplot2)
library(cowplot)

# Parallelization
# Three independent backends are configured, each with 20 workers.
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

library(future)
plan("multicore", workers = 20)
options(future.globals.maxSize = 64 * 1024 ^ 3) # 64 * 1024^3 bytes = 64 GiB; the earlier note said "for 32 Gb RAM?" -- TODO confirm the intended limit
# Print the active future plan.
plan()

library(doParallel)
registerDoParallel(20)
sessionInfo()

# Load Data

In [None]:
# Sample identifiers, in their original order.
samples = [
    '105_Gut_PF1',
    '106_Gut_PF2',
    '107_Gut_VSG3',
    '108_Gut_VSG5',
    '83_Gut_Sham_d7_1',
    '84_Gut_VSG_d7_1',
    '85_Gut_Sham_d7_2',
    '86_Gut_VSG_d7_2',
    'HFD_1',
    'HFD_2',
    'HFD_3',
    'MUC13635',
    'MUC13636',
    'MUC13643',
    'MUC13646',
    'MUC8397',
    'MUC8398',
    'MUC8400',
    'Mutant_1',
    'Mutant_2',
    'Mutant_3_FVR',
    'Mutant_4_FVR',
]

# Input locations
base_path = '/mnt/hdd/data'
base_path1 = '/mnt/hdd/data/Diseased/'
outs_path = '/count_matrices/'

# Output naming / location
out_base_name = 'Diseased'
out_path = '/mnt/hdd/data/Files'

## Adata

In [None]:
# Load the input AnnData object from {base_path}/{out_base_name}.
adata= sc.read_h5ad(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_diseased.h5ad')

In [None]:
# Make the cell names unique.
adata.obs_names_make_unique()

In [None]:
# Display a summary of the loaded object.
adata

In [None]:
# UMAP overview coloured by sample, QC covariates and doublet annotations.
# The UMAP coordinates are expected to be stored in the loaded object already.
sc.pl.umap(
    adata,
    color=[
        'sample',
        'n_counts',
        'log_counts',
        'n_genes',
        'log_genes',
        'mt_frac',
        'rp_frac',
        'final_doublets_cat',
        'doublet_calls',
    ],
    ncols=3,
    wspace=1.25,
    size=10,
    alpha=1,
    add_outline=True,
    outline_width=(0.3, 0.0),
)

## CC Genes

In [None]:
# Load the cell-cycle gene collections through the helper; the ten return
# values are unpacked in the order the helper returns them.
(
    all_cc_genes,
    s_genes_regev,
    g2m_genes_regev,
    cc_genes_regev,
    cc_genes_macosko,
    s_genes_macosko,
    g2m_genes_macosko,
    m_genes_macosko,
    mg1_genes_macosko,
    g1s_genes_macosko,
) = load_cell_cycle_genes(adata, genome ='mus_musculus')

# Normalization with Scran

In [None]:
# Keep a copy of the current X in the 'raw_counts' layer; it is restored
# into X before the SCT normalization further down.
adata.layers['raw_counts'] = adata.X.copy()

In [None]:
# Helper not defined in this notebook. The 'scran_counts' layer is read a
# few cells below, so presumably this call creates it (TODO confirm).
normalise_scran(adata)

In [None]:
gc.collect()

In [None]:
# Helper not defined in this notebook; by its name it converts the layers
# to sparse matrices (TODO confirm).
sparsify_all_layers(adata)

In [None]:
# Transposed scran layer. NOTE(review): `scran_mat` is not referenced again
# in the visible part of the notebook.
scran_mat = adata.layers['scran_counts'].T

In [None]:
# Flag highly variable genes with scanpy's defaults and plot the result.
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)

In [None]:
# Get HVGs and overlap with cell cycle & ambient genes

## HVGs as flagged by sc.pp.highly_variable_genes above (SCT has not been run at this point)
hvgs = pd.Series(adata.var_names[adata.var['highly_variable']])
# Count True entries with .sum(): `value_counts()[1]` relied on an integer key
# hitting a boolean index label and raises when no gene is flagged.
print('\nHighly variable genes before filtering:', int(adata.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes
# Read the names straight from .var instead of building an AnnData view;
# `== True` is kept so a non-boolean 'is_ambient' column behaves as before.
ambient_genes = adata.var_names[(adata.var['is_ambient'] == True).to_numpy()]
hvambi = list(hvgs[hvgs.isin(ambient_genes)])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata.var.loc[hvcc,'highly_variable'] = False

# remove ambient genes
adata.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata.var['highly_variable'].sum()))

In [None]:
# How many genes are flagged as ambient vs. not.
adata.var['is_ambient'].value_counts()

In [None]:
gc.collect()

In [None]:
# Calc umap
# PCA restricted to the genes still flagged 'highly_variable' after the
# filtering above, followed by the neighbour graph with default settings.
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pp.neighbors(adata)

In [None]:
# Leiden clustering (resolution 0.5) and UMAP embedding on that graph.
sc.tl.leiden(adata, resolution=0.5)
sc.tl.umap(adata)

In [None]:
genes = ['Lgr5','Sis','Pou2f3','Spdef','Defa24','Chga','Neurog3','Tph1','Isl1','Foxa2']

In [None]:
sc.pl.umap(adata, color=['sample','leiden','doublet_calls','final_doublets_cat'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3, wspace=1.1)

# Normalization with Seurat SCT 


#### save current state

In [None]:
# Checkpoint: write the scran-normalized object to disk.
adata.write(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_scran_diseased.h5ad')

In [None]:
# Drop the in-memory object before re-reading it from the checkpoint.
del adata

#### read adata again

In [None]:
adata= sc.read_h5ad(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_scran_diseased.h5ad')

In [None]:
# Number of cells per sample.
adata.obs['sample'].value_counts()

In [None]:
# Put the stored raw counts back into X; the Seurat object below is built from X.
adata.X = adata.layers['raw_counts'].copy()

In [None]:
adata.X = sci.sparse.csr_matrix(adata.X)  # make sure X is a CSR sparse matrix

In [None]:
# torch is imported here only to release cached CUDA memory; nothing else in
# the visible notebook uses it.
import torch
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Inputs for the R cell below: X is transposed so that its rows match
# var_names and its columns match obs_names (they are assigned as
# rownames/colnames there).
data_mat = adata.X.T
obs_names = adata.obs_names
var_names = adata.var_names

In [None]:
%%R -i var_names -i obs_names -i data_mat -o seurat
# Label the imported matrix: rows get var_names, columns get obs_names.
rownames(data_mat) <- var_names
colnames(data_mat) <- obs_names
# Build the Seurat object with both filters set to 0 (min.cells, min.features).
# NOTE(review): `-o seurat` also exports the object to Python; the visible
# notebook never uses a Python-side `seurat` -- confirm it is needed.
seurat <- CreateSeuratObject(counts = data_mat, project = "0", min.cells = 0, min.features = 0)
gc()

In [None]:
# Same clean-up as a few cells above (torch re-import, CUDA cache release,
# Python garbage collection), repeated before the SCTransform run.
import torch
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%R
# Same package / parallel setup as in the "Setup R" section, but registering
# 10 workers instead of 20.
library(Signac)
library(Seurat)
library(SingleCellExperiment)
library(sctransform)

library(RColorBrewer)
library(ggplot2)
library(cowplot)

# Parallelization
library(BiocParallel)
register(MulticoreParam(10, progressbar = TRUE))

library(future)
plan("multicore", workers = 10)
options(future.globals.maxSize = 64 * 1024 ^ 3) # 64 * 1024^3 bytes = 64 GiB; the earlier note said "for 32 Gb RAM?" -- TODO confirm the intended limit
# Print the active future plan.
plan()

library(doParallel)
registerDoParallel(10)
sessionInfo()

In [None]:
%%R
# Run SCTransform on the Seurat object built above.
# Per the original note, return.only.var.genes = FALSE and ncells = 2000 were
# chosen to save memory and prevent the run from breaking. That note also
# mentioned conserve_memory = TRUE, which is not passed in this call.
seurat <- SCTransform(
  seurat,
  verbose = TRUE,
  return.only.var.genes = FALSE,
  ncells = 2000,
  variable.features.n = NULL,
  vst.flavor = "v2"
)

#### save seurat

In [None]:
%%R
# Serialise the SCT-normalized Seurat object. The path is relative, so the
# file lands in R's current working directory.
saveRDS(seurat, "SCtransformseurat.rds")

#### read seurat

In [None]:
%%R
# Summary of the scale.data slot of the SCT assay. NOTE(review): despite the
# heading above, nothing is read back from disk here.
summary(seurat[["SCT"]]@scale.data)

In [None]:
%%R -o sce
# Add feature meta data (since Seurat v4 -> will be fixed?)
# Copy four SCT feature attributes into meta.features, add a logical
# 'variable' column marking the variable features, then prefix every column
# name with "sct.".
var <- c('detection_rate','gmean', 'variance', 'residual_variance')
seurat[["SCT"]]@meta.features <- SCTResults(seurat[["SCT"]], slot = "feature.attributes")[, var]
seurat[["SCT"]]@meta.features$variable <- FALSE
seurat[["SCT"]]@meta.features[VariableFeatures(seurat[["SCT"]] ), "variable"] <- TRUE
colnames(seurat[["SCT"]]@meta.features) <- paste0("sct.", colnames(seurat[["SCT"]]@meta.features) )

# Convert to SingleCellExperiment
# NOTE(review): `sce` is exported to Python here (-o sce) and again by the
# assay-renaming cell below; this first export looks redundant -- confirm.
sce <- as.SingleCellExperiment(seurat)

In [None]:
%%R
# Add feature meta data (since Seurat v4 -> will be fixed?)
# Attach the "sct."-prefixed feature table as the row data of sce.
rowData(sce) <- seurat[["SCT"]]@meta.features

In [None]:
%%R -o sce
# Rename and add layers
# Replace assays 1-3 of `sce` with the counts, data and scale.data slots of
# the SCT assay, then name them accordingly.
SummarizedExperiment::assay(sce, i = 1) <- seurat[["SCT"]]@counts
SummarizedExperiment::assay(sce, i = 2) <- seurat[["SCT"]]@data
# Fixed: the closing parenthesis of assay(...) was missing (syntax error).
SummarizedExperiment::assay(sce, i = 3) <- seurat[["SCT"]]@scale.data
#SummarizedExperiment::assay(sce, i = 4) <- seurat[["RNA"]]@counts
SummarizedExperiment::assayNames(sce) <- c("sct_counts", "sct_logcounts", "sct_scale_data")#, "raw_counts")
# Free everything except `sce`: `-o sce` is resolved after the cell body has
# run, so removing `sce` here would make the export to Python fail.
rm(list = setdiff(ls(), "sce"))
gc()

In [None]:
# add SCT data to adata
# Align the converted object to adata's gene order once and reuse that view
# for every layer and for the feature table.
sce_aligned = sce[:, adata.var_names]
sct_var_columns = ['sct.detection_rate', 'sct.gmean', 'sct.variance', 'sct.residual_variance', 'sct.variable']

adata.layers['sct_counts'] = sce_aligned.X.copy()
adata.layers['sct_logcounts'] = sce_aligned.layers['sct_logcounts'].copy()
adata.layers['sct_scale_data'] = sce_aligned.layers['sct_scale_data'].copy()
# Fixed: the feature table was indexed with the undefined name `adata_gex`
# (NameError); it has to use `adata` like the layers above.
adata.var[sct_var_columns] = sce_aligned.var[sct_var_columns].copy()
gc.collect()

In [None]:
     
# Set HVGs
# Use the SCT 'variable' flag (taken in adata's gene order) as the HVG mask.
print('\tSet HVGs...')
sct_variable_flags = sce[:, adata.var_names].var['sct.variable']
adata.var.loc[:, 'highly_variable'] = list(map(bool, sct_variable_flags))
#hvgs = pd.Series(adata.var['sct.variable'][adata.var['sct.variable'] > 0].index) # use HVGs from sct
#adata.var['highly_variable']= False
#adata.var.loc[hvgs,'highly_variable'] = True

# Filter genes: Min 20 cells - filters out 0 count genes
print('\tFilter genes...')
sc.pp.filter_genes(adata, min_cells=20)

gc.collect()


In [None]:
# Write the object carrying both scran and SCT layers.
adata.write(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_scran_sct_diseased.h5ad')

#### adata ran by Michi

In [None]:
# NOTE(review): this loads a different file ("..._michi.h5ad") than the one
# written just above, so everything below works on that object rather than
# on the result of the cells above -- confirm this is intended.
adata = sc.read_h5ad(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_scran_sct_diseased_michi.h5ad')

In [None]:
gc.collect()

In [None]:
# Use the SCT log-counts as X. No copy is made, so X and the layer refer to
# the same matrix object.
adata.X = adata.layers['sct_logcounts']

In [None]:
# Display a summary of the loaded object.
adata

In [None]:
# Get HVGs and overlap with cell cycle & ambient genes

## HVGs from SCT (the 'highly_variable' flag stored in the loaded object)
hvgs = pd.Series(adata.var_names[adata.var['highly_variable']])
# Count True entries with .sum(): `value_counts()[1]` relied on an integer key
# hitting a boolean index label and raises when no gene is flagged.
print('\nHighly variable genes before filtering:', int(adata.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes
# Read the names straight from .var instead of building an AnnData view;
# `== True` is kept so a non-boolean 'is_ambient' column behaves as before.
ambient_genes = adata.var_names[(adata.var['is_ambient'] == True).to_numpy()]
hvambi = list(hvgs[hvgs.isin(ambient_genes)])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata.var.loc[hvcc,'highly_variable'] = False

# remove ambient genes
adata.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata.var['highly_variable'].sum()))

In [None]:
# Calc umap
# Same sequence as for the scran data, now with the SCT log-counts in X:
# PCA on the remaining HVGs, neighbour graph, Leiden clustering
# (resolution 0.5), UMAP.
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pp.neighbors(adata)
sc.tl.leiden(adata, resolution=0.5)

sc.tl.umap(adata)

In [None]:
# SCT residual variance against 'sct.gmean' per gene, coloured by the HVG
# flag; linear y axis capped at 500.
fig, ax = plt.subplots()
ax.scatter(
    x=adata.var['sct.gmean'],
    y=adata.var['sct.residual_variance'],
    c=adata.var['highly_variable'],
    s=1,
    alpha=0.8,
)
#ax.set_yscale('log')
ax.set_ylim((0, 500))
ax.set_xscale('log')
ax.set_xlabel('Mean UMI Counts')
ax.set_ylabel('Residual Variance')

In [None]:
# Same scatter as in the previous cell, but with a logarithmic y axis and no
# y-limit.
fig, ax = plt.subplots()
ax.scatter(
    x=adata.var['sct.gmean'],
    y=adata.var['sct.residual_variance'],
    c=adata.var['highly_variable'],
    s=1,
    alpha=0.8,
)
ax.set_yscale('log')
#ax.set_ylim((0,500))
ax.set_xscale('log')
ax.set_xlabel('Mean UMI Counts')
ax.set_ylabel('Residual Variance')

In [None]:
qc_metrics(adata, ambient=False)

In [None]:
sc.pl.umap(adata, color=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac','sample','leiden'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=2,wspace=0.95)

In [None]:
sc.pl.umap(adata, color=['sample','leiden','doublet_calls','final_doublets_cat'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=2, wspace=0.95)

# Normalization Results

In [None]:
genes = ['Lgr5','Sis','Pou2f3','Spdef','Defa24','Chga','Neurog3','Tph1','Isl1','Foxa2']

In [None]:
sc.pl.umap(adata, color=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac','sample','leiden'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, wspace = 0.95)

In [None]:
sc.pl.umap(adata, color=['sample','leiden'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, wspace =1.05)

In [None]:
with rc_context({'figure.figsize':(8,8)}):
    sc.pl.violin(adata, use_raw=False, keys=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac'], groupby='leiden',rotation=90)

In [None]:
with rc_context({'figure.figsize':(8,8)}):
    sc.pl.violin(adata, use_raw=False, keys=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac'], groupby='sample', rotation=90)

In [None]:
with rc_context({'figure.figsize':(6,5)}):
    sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='raw_counts', rotation=90)

In [None]:
with rc_context({'figure.figsize':(8,12)}):
    sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_counts', rotation=90)

In [None]:
with rc_context({'figure.figsize':(6,8)}):
    sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_logcounts', rotation=90)

In [None]:
with rc_context({'figure.figsize':(6,8)}):
    sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='scran_counts', rotation=90)

In [None]:
with rc_context({'figure.figsize':(6,8)}):
    sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_scale_data', rotation=90)

In [None]:
# Per-gene comparison of the normalizations: three scatter panels per gene,
# points coloured by Leiden cluster with `mymap` (built in the "Color maps"
# cell of the Setup section; the identical re-definition here was removed).

def _layer_values(layer):
    """Return a single-gene layer slice as a flat 1-D array.

    Accepts scipy sparse matrices as well as dense arrays, so the comparison
    does not break when a layer has not been sparsified.
    """
    dense = layer.toarray() if sparse.issparse(layer) else np.asarray(layer)
    return dense.ravel()


# (x column, y column, x label, y label) of the three panels, left to right
comparison_panels = [
    ('sct', 'scran', 'SCT Normalized', 'SCRAN Normalized'),
    ('raw', 'sct', 'Raw', 'SCT Normalized'),
    ('raw', 'scran', 'Raw', 'SCRAN Normalized'),
]

# Integer cluster labels; they do not depend on the gene, so compute them once.
leiden_codes = adata.obs['leiden'].astype(int).to_numpy()

for gene in genes:
    gene_view = adata[:, gene]
    # NOTE(review): the 'log_raw_counts' layer is not created anywhere in this
    # notebook; it has to be present in the loaded object.
    df = pd.DataFrame({
        'sct_residuals': _layer_values(gene_view.layers['sct_scale_data']),
        'sct': _layer_values(gene_view.layers['sct_logcounts']),
        'scran': _layer_values(gene_view.layers['scran_counts']),
        'raw': _layer_values(gene_view.layers['log_raw_counts']),
        'leiden': leiden_codes,
    })
    # Draw the cells in cluster order. The former cast of 'leiden' to
    # 'category' was dropped: scatter() needs numeric colour values anyway.
    df = df.sort_values(by=['leiden'])

    fig, axs = plt.subplots(1, 3, constrained_layout=True, figsize=(10, 3))
    for ax, (x_col, y_col, x_label, y_label) in zip(axs, comparison_panels):
        ax.scatter(df[x_col], y=df[y_col], s=2, alpha=0.2, c=df['leiden'], cmap=mymap)

        # Remember the data-driven limits, draw the identity line across the
        # joint range, then restore the limits so the line does not rescale
        # the panel.
        x_limits = ax.get_xlim()
        y_limits = ax.get_ylim()
        diagonal = [np.min([x_limits, y_limits]), np.max([x_limits, y_limits])]
        # Colour and dash style are given as keywords only; the former 'k-'
        # format string defined the linestyle a second time.
        ax.plot(diagonal, diagonal, color='k', ls='--', lw=1, alpha=1, zorder=0)
        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        #ax.set_aspect('equal')

    # The gene name goes above the middle panel.
    axs[1].set_title(gene, fontweight='bold')

    plt.show()
    plt.close(fig)

# Save AData

In [None]:
# Display a summary of the final object.
adata

In [None]:
# NOTE(review): the file name says "diseased" but the target directory is
# .../Healthy/, and the path is hard-coded instead of using
# {base_path}/{out_base_name} like the other read/write calls -- confirm.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_diseased.h5ad')

# Session Info

In [None]:
# Helper not defined in this notebook; by its name it prints the R session
# information (TODO confirm).
print_r_session()