## Setup

In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import h5py
import scipy.sparse as sparse
import anndata as ad
import scipy.stats as stats
import gc
import os

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib import cm
from matplotlib import colors
from matplotlib.pyplot import rc_context
import seaborn as sb
#from plotnine import *
from adjustText import adjust_text
#import pegasus as pg

# Analysis
import scanpy as sc
import muon as mu
from muon import atac as ac
#import snapatac2 as snap
import pysam
# Preprocessing
import scrublet as scr

#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri


# Warnings
import warnings
warnings.filterwarnings('ignore') #(action='once') 
import session_info
session_info.show()

#sc.logging.print_versions()

In [None]:
# Colormap: 20 light-grey steps (Greys_r sampled between 0.7 and 0.8) stacked in front of
# 128 steps spanning the whole Reds map; used as color_map for the marker-gene UMAPs below.
colors3 = plt.cm.Greys_r(np.linspace(0.7, 0.8, 20))
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colorsComb = np.vstack((colors3, colors2))
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

In [None]:
# Execute the companion notebook so the names it defines become available in this kernel.
# NOTE(review): the helpers called below but not defined in this notebook (setup_R, read_h5mu_to_mudata,
# set_ambient_threshold, qc_metrics, prefilter_barcodes_mdata, get_umap_leiden, qc_filter_mdata)
# presumably come from utils.ipynb - confirm.
%run utils.ipynb

In [None]:
# Configure R for rpy2 using the R library directory of the scUV environment, then (re)load the
# rpy2 IPython extension that provides the %%R cell magic used below.
# NOTE(review): setup_R is not defined in this notebook (presumably utils.ipynb) - confirm what it configures.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')
#setup_R('/home/scanalysis/mnt/miniforge3/envs/LN/lib/R')
%reload_ext rpy2.ipython

In [None]:
%%R
# Show the library search paths of the embedded R session (sanity check after setup_R)
.libPaths()

In [None]:
%%R

# Parallelization
# The same worker count (64) is registered with each of the three R back-ends used here:
# BiocParallel, future and doParallel.
library(BiocParallel)
register(MulticoreParam(64, progressbar = TRUE))

library(future)
plan(multicore, workers = 64)
#options(future.globals.maxSize = 100 * 1024 ^ 3) # for 50 Gb RAM
# plan() without arguments reports the strategy that is now active
plan()

library(doParallel)
registerDoParallel(64)

# Record R and package versions of this session
sessionInfo()

## read data

In [None]:
# Root directory scanned for sample folders. The trailing slash matters: paths below are built
# by plain string concatenation (base_path + folder + outs_path).
base_path = '/mnt/hdd/data/Multiome/'
# Sub-directory appended after the sample folder name (leading slash, no trailing slash)
outs_path = '/outs'

In [None]:
# Keep scanpy's figure and cache directories next to the data
sc.settings.figdir = base_path + 'Figures'
sc.settings.cachedir = base_path + 'Cache'

## Scanpy settings
# Verbose logging (level 3)
sc.settings.verbosity = 3

In [None]:
# Get a list of folder names, sorted alphabetically
folder_names = sorted(
    entry for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

# Map every folder name to a generated alias "sample<N>"; numbering starts at 46
folder_variables = {
    folder_name: f"sample{index}"
    for index, folder_name in enumerate(folder_names, start=46)
}
folder_variables

In [None]:
# Expose every sample folder under its generated alias, e.g. sample46 = '<first folder name>'.
# folder_variables maps folder name -> alias, so items() yields (folder_name, variable_name);
# unpacking them the other way round created globals named after the folders instead.
for folder_name, variable_name in folder_variables.items():
    globals()[variable_name] = folder_name

#### samples definition

In [None]:
# Folders (keys of folder_variables) that are processed in this notebook
samples = [
    '597_NVF_Crypts_Rep1',
    '598_FVF_Crypts_Rep1',
    '599_FVF_Crypts_Rep2',
    '604_NVF_Crypts_Rep2',
    'FVF-low',
    'FVF-high',
]
#samples = ['FVF-low', 'FVF-high']


#already done multiple times
# Convert each sample's raw 10x matrix into an .h5mu file tagged with its sample alias
for sample in samples:
    path = f'{base_path}{sample}{outs_path}'
    print(f'Loading {path}')
    mudata = mu.read_10x_h5(f'{path}/raw_feature_bc_matrix.h5')
    mudata.obs['sample'] = folder_variables[sample]
    print(mudata.shape)
    mudata.var_names_make_unique()

    # Both modalities receive their own copy of the top-level obs table (incl. 'sample')
    mudata.mod['rna'].obs = mudata.obs.copy()
    mudata.mod['atac'].obs = mudata.obs.copy()

    # Save combined
    mu.write(f'{path}/{sample}_raw_feature_bc_matrix.h5mu', mudata)

    del mudata

## Dropletutils prep

In [None]:
%%R
# R packages loaded for the Droplet Utils cell below, which calls emptyDrops() and SingleCellExperiment()
library(scran)
library(RColorBrewer)
library(DropletUtils)

## Droplet Utils

In [None]:
# Load the raw .h5mu of every selected sample into mdata_list, keyed by folder name
mdata_list = {}
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/{folder_name}_raw_feature_bc_matrix.h5mu')
        mdata.var_names_make_unique()
        mdata.mod['rna'].obs = mdata.obs.copy()
        mdata.mod['atac'].obs = mdata.obs.copy()
        # The stored object is a copy; the same copy is also exposed as the global <alias>_mdata
        mdata_list[folder_name] = mdata.copy()
        globals()[f'{sample_name}_mdata'] = mdata_list[folder_name]
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Run emptyDrops in R on the RNA counts of every loaded sample, copy the results back into the
# AnnData (per-gene ambient value, per-barcode log-probability, ambient-gene flag) and save the
# MuData as DropUtils_matrix.h5mu. The barcodes called as cells (FDR <= 0.05) are collected in
# droplet_utils_save, one entry per sample.
droplet_utils_save = []
for key, mdata in mdata_list.items():
    # GEX
    gex = mdata.mod['rna']
    sc.pp.filter_genes(gex, min_cells=1) # for emptyDrops to work 
    # NOTE(review): sparse_mat and barcodes are never read again and genes is reassigned
    # further down - these three assignments look removable.
    sparse_mat = gex.X.T
    genes = gex.var_names
    barcodes = gex.obs_names
    # Hand the transposed matrix (rows labelled with genes in R) and its labels to R's global env.
    # NOTE(review): needs an rpy2 converter for sparse/pandas objects to be active - confirm where it is enabled.
    ro.globalenv['sparse_mat'] = gex.X.T
    ro.globalenv['genes'] = gex.var_names
    ro.globalenv['barcodes'] = gex.obs_names
    ro.r('''
    sce <- SingleCellExperiment(assays = list(counts = sparse_mat), colData=barcodes)
    rownames(sce) <- genes 
    ambient <- emptyDrops(counts(sce))
    is_cell <- ambient$FDR <= 0.05
    threshold_ambient <- 0.0005
    ambient_genes_values <- ambient@metadata$ambient
    ambient_genes <- names(ambient@metadata$ambient[ambient@metadata$ambient> threshold_ambient,])
    barcodes_filtered <- barcodes[which(is_cell)]
    cell_probs <- ambient$LogProb
    ''')
    # Pull the R results back: ambient value per gene, log-probability per barcode
    gex.var['ambient_genes_values'] = ro.globalenv['ambient_genes_values']
    gex.obs['log_cell_probs'] = ro.globalenv['cell_probs']
    barcodes_filtered = ro.globalenv['barcodes_filtered']
    ambient_genes = ro.globalenv['ambient_genes']
    ambient_genes = np.array(ambient_genes)
    genes = np.array(gex.var.index)
    # True for genes whose ambient value exceeded threshold_ambient (0.0005) in the R code above
    compare = np.isin(genes, ambient_genes)
    gex.var['is_ambient'] = compare
    droplet_utils_save.append(barcodes_filtered)
    #mdata.obs.drop('date',axis=1, inplace=True)
    mu.write(f'{base_path}{key}{outs_path}/DropUtils_matrix.h5mu',mdata)
    

In [None]:
# Inspect the samples after the emptyDrops step
mdata_list

## CellRanger

### read DU utils results if necessary

In [None]:
#load Droplet_utils results
# Rebuilds mdata_list from the DropUtils_matrix.h5mu files written by the Droplet Utils cell
mdata_list = {}
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/DropUtils_matrix.h5mu')
        mdata_list[folder_name] = mdata
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Inspect the reloaded samples
mdata_list

### read CR barcodes and save info into col 'cell_confirmed'

In [None]:
# Flag every barcode that is listed in Cell Ranger's filtered barcodes file
# (boolean column 'cell_confirmed') and save the result as CR_DU_matrix.h5mu
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        # Despite its name, adata is the object stored in mdata_list (written with mu.write below)
        adata = mdata_list[folder_name]
        all_barcodes = np.array(adata.obs.index)
        barcodes_Cell_Ranger = np.array(
            pd.read_csv(
                f'{base_path}{folder_name}{outs_path}/filtered_feature_bc_matrix/barcodes.tsv',
                names=['cells'],
            )['cells']
        )
        Cell_Ranger = np.isin(all_barcodes, barcodes_Cell_Ranger)
        adata.obs['cell_confirmed'] = Cell_Ranger
        mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_matrix.h5mu', adata)
    else:
        print(f'skipping {folder_name}...')
    

In [None]:
# Inspect the samples after adding 'cell_confirmed'
mdata_list

### read CR results if necessary

### plot ambient threshold

In [None]:
# Call set_ambient_threshold on the RNA modality of every loaded sample.
# NOTE(review): set_ambient_threshold is not defined in this notebook (presumably utils.ipynb);
# per the heading it plots the ambient threshold - confirm whether it also modifies adata.
for k, mdata in mdata_list.items():
    adata = mdata.mod['rna']
    set_ambient_threshold(adata)

## get QC covariates

In [None]:
# Inspect the loaded samples before computing QC covariates
mdata_list

# NOTE(review): the triple-quoted string below is a disabled loop kept as text. If it sits in the
# same cell as the expression above, the cell displays this string instead of mdata_list -
# consider deleting it.
'''for k, mdata in mdata_list.items():
    sample = mdata.obs['sample'][0]
    print(f'sample: {sample}')
    gex = mdata.mod['rna']
    #sc.pp.calculate_qc_metrics(gex, inplace=True, log1p=True)'''

### Run qc_metrics on all samples

In [None]:
# Quality control - calculate QC covariates #makes kernel die every time with sample FVF-neg?!. 
# For every selected sample: print a short summary, run qc_metrics on the RNA modality
# (helper not defined in this notebook) and save the MuData as CR_DU_QC_metrics.h5mu.
for k, mdata in mdata_list.items():
    if k not in samples:
        # Report the key of THIS loop; folder_name would be a stale value left by an earlier cell
        print(f'skipping {k}...')
        continue
    # First row by position: obs is indexed by barcode strings, so a plain [0] only works
    # through pandas' deprecated positional fallback
    sample = mdata.obs['sample'].iloc[0]
    print(f'sample: {sample}')
    # Number of barcodes confirmed by Cell Ranger
    print(sum(mdata.obs['cell_confirmed']))
    gex = mdata.mod['rna']
    # Shape is printed before and after to make any change made by qc_metrics visible
    print(gex.shape)
    qc_metrics(gex, ambient=True)
    print(gex.shape)
    mu.write(f'{base_path}{k}{outs_path}/CR_DU_QC_metrics.h5mu',mdata)

### Workaround to avoid kernel death

#### Read qc_metrics files again in case Kernel died

In [None]:
#read_mdatas
# Rebuild mdata_list from the CR_DU_QC_metrics.h5mu files (e.g. after a kernel restart)
mdata_list = {}
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_QC_metrics.h5mu')
        mdata.var_names_make_unique()
        # Keep a copy in the dict and expose the same copy as the global <alias>_mdata
        mdata_list[folder_name] = mdata.copy()
        globals()[f'{sample_name}_mdata'] = mdata_list[folder_name]
        # Clear memory
        del mdata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

#### Run sample outside of function to avoid kernel death

### Filter for confirmed cells

In [None]:
# Keep only the barcodes flagged in obs['cell_confirmed'], run prefilter_barcodes_mdata on the
# result (helper not defined in this notebook) and save it as CR_DU_bc_filtered.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        print(f'working on {folder_name}...')
    else:
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_QC_metrics.h5mu')
    mdata.var_names_make_unique()
    try:
        # Subset the whole MuData with the 'cell_confirmed' column
        ix = mdata.obs['cell_confirmed']
        mdata_f = mdata[ix,].copy()
    except Exception as e:
        # Fallback for ANY failure above: intersect the observations across modalities and
        # continue with all remaining barcodes; the error is only printed.
        # NOTE(review): in this branch the Cell Ranger filter is silently not applied.
        mu.pp.intersect_obs(mdata)
        mdata_f = mdata.copy()
        print(e)
    prefilter_barcodes_mdata(mdata_f)
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_filtered.h5mu',mdata_f)
    # Clear memory
    del mdata
    del mdata_f
    gc.collect()

## QC RNA

### UMAP

In [None]:
# Run get_umap_leiden on the RNA modality of each barcode-filtered sample (helper not defined
# in this notebook), plot the UMAP coloured by cluster and QC covariates, and save the result.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_filtered.h5mu')
    get_umap_leiden(mdata.mod['rna'])
    gex = mdata.mod['rna']  
    fig = sc.pl.umap(gex, color=['leiden','n_counts','log_counts','n_genes','log_genes','mt_frac','rp_frac', 'ambi_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, return_fig=True)
    # Use the legend title of the first panel ('leiden') to label the figure with the sample
    ax = fig.axes[0]
    ax.legend_.set_title(folder_name)
    plt.show()
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap.h5mu',mdata)
    # Clear memory
    del mdata
    gc.collect()

#### plot UMAPs

### MT

In [None]:
# Per sample: three scatter plots with mt_frac on the x axis and a histogram of mt_frac,
# used to choose the mitochondrial thresholds entered in mito_dict.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap.h5mu')
    gex = mdata.mod['rna']
    # Each title names the sample and the variable used for colouring
    ax = sc.pl.scatter(gex, 'mt_frac', 'n_genes', color='n_counts', show=False)
    ax.set_title(f"{folder_name}- n_counts")
    ax = sc.pl.scatter(gex, 'mt_frac', 'log_counts', color='n_genes', show=False)
    ax.set_title(f"{folder_name}- n_genes")
    ax = sc.pl.scatter(gex, 'mt_frac', 'rp_frac', color='n_genes', show=False)
    ax.set_title(f"{folder_name} - n_genes")
    plt.show()
    sb.histplot(gex.obs['mt_frac'], kde = True, bins=60)
    plt.title(label=f'mitochondrial fraction of {folder_name}', fontweight='bold')
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()

#### set thresholds - manual and per sample individually

In [None]:
# Manually chosen mitochondrial-fraction thresholds per sample folder: [min_mito, max_mito].
# The cells below read index 0 as the lower and index 1 as the upper bound.
mito_dict = {'597_NVF_Crypts_Rep1': [0,0.32],
 '598_FVF_Crypts_Rep1': [0,0.35],
 '599_FVF_Crypts_Rep2': [0,0.28],
 '604_NVF_Crypts_Rep2': [0,0.35],
 'FVF-high': [0,0.3],
 'FVF-low': [0,0.4],
 'FVF-neg': [0,0.4]}

In [None]:
# Per sample: KDE joint plots of mt_frac against n_genes and against log_counts, with the
# thresholds from mito_dict drawn as dashed vertical lines.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap.h5mu')
    max_mito = mito_dict[folder_name][1]
    min_mito = mito_dict[folder_name][0]
    gex = mdata.mod['rna']
    # The scatter layer is drawn fully transparent (alpha=0). The chain ends in
    # set_linestyle(), so p is None and is not used afterwards.
    p = sb.jointplot(x=gex.obs['mt_frac'], y=gex.obs['n_genes'], n_levels=15, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
        sb.scatterplot, alpha=0).ax_joint.vlines(x=[min_mito,max_mito], ymin=[0,0], ymax=[max(gex.obs['n_genes']),max(gex.obs['n_genes'])], color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f"{folder_name} - mt fraction by n_genes")
    plt.show()
    plt.close()

    p = sb.jointplot(x=gex.obs['mt_frac'], y=gex.obs['log_counts'], n_levels=15, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
        sb.scatterplot, alpha=0).ax_joint.vlines(x=[min_mito,max_mito], ymin=[0,0], ymax=[max(gex.obs['log_counts']),max(gex.obs['log_counts'])], color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f"{folder_name} - mt fraction by log_counts")
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()
    

In [None]:
# gex is left over from the last iteration of the loop above (RNA modality of the last sample)
gex

In [None]:
# Per sample: store the mitochondrial QC flag as string categories in obs['filter_mt_frac'],
# show it on the UMAP and on an mt_frac/n_genes scatter, and save as CR_DU_bc_f_umap_mt.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap.h5mu')
    gex = mdata.mod['rna']
    #filter mitos
    max_mito = mito_dict[folder_name][1]
    min_mito = mito_dict[folder_name][0]
    # Both bounds are strict, so a cell whose mt_frac equals min_mito (0 for all samples in
    # mito_dict) is flagged 'False'
    gex.obs['filter_mt_frac']=pd.Categorical(list(map(str,list((gex.obs['mt_frac'] < max_mito) & (gex.obs['mt_frac'] > min_mito)))))
    fig = sc.pl.umap(gex, color=['filter_mt_frac','mt_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), return_fig=True)
    ax = fig.axes[0]
    ax.legend_.set_title(folder_name)
    plt.show()
    plt.close()
    # Scatter plot with vertical lines
    try:
        with rc_context({'figure.figsize': (6, 3)}):
            sc.pl.scatter(gex, x='mt_frac', y='n_genes', color='filter_mt_frac', show=False)
            ymin, ymax = 0, max(gex.obs['n_genes'])
            plt.vlines([min_mito, max_mito], ymin=ymin, ymax=ymax, color="black", lw=0.5, linestyles="--")
            plt.suptitle(f"{folder_name} - mt fraction by n_genes")
            plt.show()
            plt.close()
    except Exception as e:
        # A failing plot must not prevent the file from being written below
        print(f'Error plotting scatter plot for {folder_name}: {e}')
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu',mdata)
    # Clear memory
    del mdata
    gc.collect()

#### set MT thresholds and visualize

In [None]:
# Build one boolean mask per sample (min_mito < mt_frac < max_mito, both strict) in mito_filters
# and show both thresholds on the mt_frac histogram.
mito_filters = {}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_mito = mito_dict[folder_name][0]
    max_mito = mito_dict[folder_name][1]
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(gex.obs['mt_frac'], kde = True, bins=60)
        plt.axvline(max_mito, 0, 1)
        plt.axvline(min_mito, 0, 1)
        plt.title(label=f'mitochondrial fraction in {folder_name}')
        plt.show()
        plt.close()
    # The mask is kept for the count and gene cells further down
    mito_filter = (gex.obs['mt_frac'] < max_mito) & (gex.obs['mt_frac'] > min_mito)
    mito_filters[folder_name] = mito_filter
    # Clear memory
    del mdata
    gc.collect()

### Counts

In [None]:
# Per sample: KDE joint plot of log_counts vs log_genes for the cells that pass the mito
# filter, followed by n_counts histograms (red: mito-filtered cells, default colour: all cells).
# Requires mito_filters from the "set MT thresholds" cell.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    # Subset the obs table once instead of slicing the whole AnnData for every plotted column
    obs_mito_ok = gex.obs[mito_filters[folder_name]]
    sb.jointplot(x=obs_mito_ok['log_counts'], y=obs_mito_ok['log_genes'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
        sb.scatterplot, alpha=0)
    # Title names the plotted axes (the previous text was copied from the mt-fraction plot)
    plt.suptitle(f"{folder_name} - log_counts by log_genes")
    plt.show()
    plt.close()
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(obs_mito_ok['n_counts'], color='red', kde=True)
        sb.histplot(gex.obs['n_counts'], kde=True)
        plt.title(label=f'n_counts in {folder_name}')
        plt.show()
        plt.close()
    # Clear memory
    del mdata
    gc.collect()


#### set thresholds

In [None]:
# Manually chosen n_counts thresholds per sample folder: [min_counts, max_counts].
# The cells below read index 0 as the lower and index 1 as the upper bound.
counts_dict = {'597_NVF_Crypts_Rep1': [1250,55000],
 '598_FVF_Crypts_Rep1': [1000,60000],
 '599_FVF_Crypts_Rep2': [2500,50000],
 '604_NVF_Crypts_Rep2': [2200,60000],
 'FVF-high': [1500,60000],
 'FVF-low': [1800,70000],
 'FVF-neg': [1250,35000]}

In [None]:
# n_counts window for the low-end histogram in the next cell (only the upper bound is used there).
# NOTE: x_lim is reassigned by later cells, so the next cell depends on execution order.
x_lim = [0,3000]

In [None]:
# Per sample: histogram of n_counts below x_lim[1] with the lower count threshold marked
# (red: cells that also pass the mito filter, default colour: all cells).
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_counts = counts_dict[folder_name][0]
    # max_counts is read for symmetry but not used in this cell
    max_counts = counts_dict[folder_name][1]
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(gex.obs['n_counts'][(gex.obs['n_counts']<x_lim[1]) & mito_filters[folder_name]], kde = True, bins=60, color = 'red')
        sb.histplot(gex.obs['n_counts'][(gex.obs['n_counts']<x_lim[1])], kde = True, bins=60)
        plt.title(label=f'{folder_name} n_counts within xlim')
        plt.axvline(min_counts, 0, 1)
        plt.show()
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# n_counts window for the high-end histogram in the next cell (both bounds are used there)
x_lim = [17000,100000]

#### set and visualize counts filter

In [None]:
# Build one boolean mask per sample (min_counts < n_counts < max_counts, both strict) in
# counts_filters and show the upper threshold on the n_counts histogram inside the x_lim window
# (red: cells that also pass the mito filter, default colour: all cells).
counts_filters={}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_counts = counts_dict[folder_name][0]
    max_counts = counts_dict[folder_name][1]
    counts_filters[folder_name] = (gex.obs['n_counts'] > min_counts) & (gex.obs['n_counts'] < max_counts)
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(gex.obs['n_counts'][(gex.obs['n_counts']>x_lim[0]) & (gex.obs['n_counts']<x_lim[1]) & mito_filters[folder_name]], kde=True, bins=60, color = 'red')
        sb.histplot(gex.obs['n_counts'][(gex.obs['n_counts']>x_lim[0]) & (gex.obs['n_counts']<x_lim[1])], kde = True, bins=60)
        plt.title(label=f'{folder_name} n_counts within xlim')
        plt.axvline(max_counts, 0, 1)
        plt.show()
    # Clear memory
    del mdata
    gc.collect()

#### visualize thresholds

In [None]:
# Per sample: n_counts/n_genes scatter (linear and log axes) coloured by mt_frac, with the
# count thresholds from counts_dict drawn as dashed vertical lines.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_counts = counts_dict[folder_name][0]
    max_counts = counts_dict[folder_name][1]
    # The chains end in set_linestyle(), so p1 and p2 are None and only the drawing matters
    p1= sc.pl.scatter(gex, 'n_counts', 'n_genes', color='mt_frac', show=False).vlines(x=[min_counts, max_counts], ymin=[0,0], ymax=[max(gex.obs['n_genes']),max(gex.obs['n_genes'])], color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f'{folder_name}')
    plt.show()
    plt.close()
    # Thresholds are moved onto the log axes with np.log
    # NOTE(review): assumes log_counts/log_genes are natural logs of n_counts/n_genes - confirm in qc_metrics
    p2 =sc.pl.scatter(gex, 'log_counts', 'log_genes', color='mt_frac', show=False).vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[np.log(min(gex.obs['n_genes'])),np.log(min(gex.obs['n_genes']))], ymax=[np.log(max(gex.obs['n_genes'])),np.log(max(gex.obs['n_genes']))], color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f'{folder_name} log')
    plt.show()
    # Clear memory
    del mdata
    gc.collect()

### Genes

In [None]:
# Per sample: KDE joint plot of log_counts vs log_genes for mito-filtered cells with the count
# thresholds marked, followed by n_genes histograms (red: mito + count filter, green: mito
# filter only, default colour: all cells). Requires mito_filters and counts_filters.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    # Numeric thresholds live in counts_dict; counts_filters holds the per-cell boolean mask,
    # so indexing it with [0]/[1] returned booleans and np.log() of those is meaningless
    min_counts = counts_dict[folder_name][0]
    max_counts = counts_dict[folder_name][1]
    obs_mito_ok = gex.obs[mito_filters[folder_name]]
    log_min_genes = np.log(min(gex.obs['n_genes']))
    log_max_genes = np.log(max(gex.obs['n_genes']))
    sb.jointplot(x=obs_mito_ok['log_counts'], y=obs_mito_ok['log_genes'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
        sb.scatterplot, alpha=0).ax_joint.vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[log_min_genes, log_min_genes], ymax=[log_max_genes, log_max_genes], color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f'{folder_name}')
    plt.show()
    plt.close()
    sb.histplot(gex.obs['n_genes'][mito_filters[folder_name] & counts_filters[folder_name]], kde=True, bins=60, color = 'red')
    sb.histplot(gex.obs['n_genes'][mito_filters[folder_name]], kde=True, bins=60, color = 'green')
    sb.histplot(gex.obs['n_genes'], kde=True, bins=60)
    plt.title(f'{folder_name}')
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()

#### set min_genes

In [None]:
# Manually chosen n_genes thresholds per sample folder: [min_genes, max_genes].
# Only index 0 (the lower bound) is applied below; index 1 is read but never used as a filter.
genes_dict = {'597_NVF_Crypts_Rep1': [850,0],
 '598_FVF_Crypts_Rep1': [450,0],
 '599_FVF_Crypts_Rep2': [1000,0],
 '604_NVF_Crypts_Rep2': [1250,0],
 'FVF-high': [1000,0],
 'FVF-low': [1000,0],
 'FVF-neg': [800,0]}

In [None]:
# n_genes window for the histogram in the next cell (reuses the name x_lim from the count cells)
x_lim = [0,2000]

#### set and plot genes filter

In [None]:
# Build one boolean mask per sample (n_genes > min_genes) in genes_filters and show the
# threshold on the n_genes histogram inside the x_lim window (red: mito + count filter,
# green: mito filter only, default colour: all cells).
genes_filters={}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_genes= genes_dict[folder_name][0]
    # max_genes is read but not used: only a lower bound is applied to n_genes
    max_genes = genes_dict[folder_name][1]
    genes_filters[folder_name] = (gex.obs['n_genes'] > min_genes)
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(gex.obs['n_genes'][(gex.obs['n_genes']>x_lim[0]) & (gex.obs['n_genes']<x_lim[1])& mito_filters[folder_name] & counts_filters[folder_name]],color= 'red', kde=True, bins=60)
        sb.histplot(gex.obs['n_genes'][(gex.obs['n_genes']>x_lim[0]) & (gex.obs['n_genes']<x_lim[1])& mito_filters[folder_name]],color= 'green', kde=True, bins=60)
        sb.histplot(gex.obs['n_genes'][(gex.obs['n_genes']>x_lim[0]) & (gex.obs['n_genes']<x_lim[1])], kde=True, bins=60)
        plt.axvline(min_genes, 0, 1)
        plt.title(f'{folder_name}')
        plt.show()
    # Clear memory
    del mdata
    gc.collect()

### set and visualize all filters

In [None]:
# Per sample: flag the cells that pass every RNA QC threshold (string categories 'True'/'False'
# in gex.obs['filtered_cells']), draw the thresholds on the count/gene scatter plots (linear
# and log axes) and save the result as CR_DU_bc_f_umap_mt_QCfiltered.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt.h5mu')
    gex = mdata.mod['rna']
    min_genes = genes_dict[folder_name][0]
    min_counts = counts_dict[folder_name][0]
    max_counts = counts_dict[folder_name][1]
    max_mito = mito_dict[folder_name][1]

    # The thresholds are applied directly to the obs columns rather than by combining the
    # cached mito/counts/genes masks: the author observed that combining those did not
    # behave like a logical AND. Only the upper mito bound is applied here.
    qc_pass = (
        (gex.obs['n_counts'] > min_counts)
        & (gex.obs['n_counts'] < max_counts)
        & (gex.obs['n_genes'] > min_genes)
        & (gex.obs['mt_frac'] < max_mito)
    )
    gex.obs['filtered_cells'] = pd.Categorical(list(map(str, list(qc_pass))))

    min_n_genes = min(gex.obs['n_genes'])
    max_n_genes = max(gex.obs['n_genes'])

    p = sc.pl.scatter(gex, 'n_counts', 'n_genes', color='mt_frac', show=False)
    p.vlines(x=[min_counts, max_counts], ymin=[0, 0], ymax=[max_n_genes, max_n_genes], color="black", lw=0.5).set_linestyle("--")
    # axhline takes xmin/xmax as axes fractions (0-1), not data values; the defaults already
    # span the full width, matching the log-scale plot below
    p.axhline(y=min_genes, color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f'{folder_name}')
    plt.show()

    p = sc.pl.scatter(gex, 'log_counts', 'log_genes', color='mt_frac', show=False)
    p.vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[np.log(min_n_genes), np.log(min_n_genes)], ymax=[np.log(max_n_genes), np.log(max_n_genes)], color="black", lw=0.5).set_linestyle("--")
    p.axhline(y=np.log(min_genes), color="black", lw=0.5).set_linestyle("--")
    plt.suptitle(f'{folder_name}')
    plt.show()
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu', mdata)
    # Clear memory
    del mdata
    gc.collect()

#### Plots for QC correction

In [None]:
# Per sample and QC covariate: violin plots of the covariate per leiden cluster (left panel)
# and per QC flag 'filtered_cells' (right panel).
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    gex = mdata.mod['rna']
    for col in ['log_cell_probs', 'log_counts', 'n_counts_rank', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac']:
        fig, axes = plt.subplots(1,2, figsize=(10, 4), gridspec_kw=dict(width_ratios=[2,1],wspace = 0.3))

        # Colour by the grouping variable on x. Using the continuous covariate itself as hue
        # (hue=col) opens one hue level per distinct value instead of one violin per group.
        # dodge=False keeps every violin centred on its x position.
        sb.violinplot(x='leiden', y=col, data=gex.obs, ax=axes[0], hue='leiden', dodge=False, palette="bright")
        sb.violinplot(x='filtered_cells', y=col, data=gex.obs, ax=axes[1], hue='filtered_cells', dodge=False, palette="colorblind")
        # The hue legend would only repeat the x tick labels
        for panel in axes:
            if panel.get_legend() is not None:
                panel.get_legend().remove()
        axes[1].set_ylabel(None)
        fig.suptitle(folder_name)
        plt.show()
        plt.close()
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Per sample: histogram of log_cell_probs (the LogProb values stored by the Droplet Utils
# cell) for all cells (blue) and for the cells with filtered_cells == 'True' (red).
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    # Reference value drawn as a vertical line only; it is not applied as a filter in this cell
    max_prob = -2000
    gex = mdata.mod['rna']
    sb.histplot(gex.obs['log_cell_probs'], kde=True, bins=60, color='Blue')
    sb.histplot(gex[gex.obs['filtered_cells']=='True'].obs['log_cell_probs'], kde=True, bins=60, color='Red')
    plt.axvline(max_prob, 0, 1)
    plt.title(label= f"Log Cell Probabilities\nFiltered Cell Barcodes of {folder_name}", fontweight='bold')
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Per-sample overview of the QC decision: covariate UMAPs, kept/removed cells on the UMAP,
# share of removed cells per leiden cluster, and two scatter plots coloured by the QC flag.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    gex = mdata.mod['rna'].copy()
    sc.pl.umap(gex, color=['log_counts','log_genes','mt_frac','rp_frac','ambi_frac','log_cell_probs'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3, title= folder_name)
    plt.show()
    plt.close()
    fig=sc.pl.umap(gex, color=['filtered_cells','leiden'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=6, return_fig=True)
    # Label the figure through the legend title of the first panel
    ax=fig.axes[0]
    ax.legend_.set_title(folder_name)
    plt.show()
    plt.close()

    ###############################################################################
    # Stacked bar chart: percentage of cells kept / filtered out per leiden cluster
    ###############################################################################

    with rc_context({'figure.figsize': (6, 4)}): #rcParams['figure.figsize']=(6,4)
        key = 'leiden'
        labels = list(gex.obs[key].cat.categories)
        keep_pct = []
        filter_pct = []
        width = 0.85       # the width of the bars: can also be len(x) sequence

        for label in labels:
            # Count the QC flag once per cluster; a flag value that never occurs counts as 0 %
            flag_counts = gex.obs['filtered_cells'][gex.obs[key]==label].value_counts()
            n_in_cluster = flag_counts.sum()
            keep_pct.append(flag_counts.get('True', 0) / n_in_cluster * 100)
            filter_pct.append(flag_counts.get('False', 0) / n_in_cluster * 100)
        fig, ax = plt.subplots()

        ax.bar(labels, filter_pct, width, label='Filter Out', edgecolor='0', linewidth=0.5)
        ax.bar(labels, keep_pct, width, bottom=filter_pct, label='Keep', edgecolor='0', linewidth=0.5)

        ax.set_ylabel('%')
        ax.set_title(f'Percentage of Filtered Cells in {folder_name}')
        ax.axes.set_xticklabels(labels=labels, rotation=90)
        ax.legend(bbox_to_anchor=(1, .5),loc='center left', edgecolor='1')

        plt.ylim([-2.5,100+2.5])
        plt.xlim([-1+0.25,len(labels)-0.25])

        plt.show()
        plt.close()

    #################################################################################
    # Scatter plots coloured by the QC flag
    #################################################################################

    # show=False keeps the figure open so that suptitle lands on the scatter plot; without it
    # scanpy displays the plot immediately and the title ends up on a new, empty figure
    p= sc.pl.scatter(gex, 'mt_frac', 'n_genes', color='filtered_cells', show=False)
    plt.suptitle(f'{folder_name}')
    plt.show()
    plt.close()
    p= sc.pl.scatter(gex, 'log_counts', 'log_genes', color='filtered_cells', show=False)
    plt.suptitle(f'{folder_name}')
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()

### recover Paneth cells

In [None]:
# Per sample: UMAP coloured by the genes Lyz1, Defa24, Mmp7 and Itln1, using the grey-to-red
# colormap mymap defined at the top of the notebook.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    print(f'folder_name: {folder_name}')
    gex = mdata.mod['rna']
    sc.pl.umap(gex, color=['Lyz1','Defa24','Mmp7','Itln1'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5, color_map = mymap)
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# NOTE(review): min_lyz / min_defa / min_itln are not used in this cell; the per-sample
# thresholds in paneth_th_dict are what is plotted here and applied in the next cell.
min_lyz = 30
min_defa = 80
min_itln = 30
# Per-sample expression thresholds in marker order [Lyz1, Defa24, Itln1]
# (the same index convention is used by the classification cell that sets 'is_paneth')
paneth_th_dict ={'597_NVF_Crypts_Rep1': [100,800,100],
 '598_FVF_Crypts_Rep1': [100,800,100],
 '599_FVF_Crypts_Rep2': [100,800,150],
 '604_NVF_Crypts_Rep2': [100,800,200],
 'FVF-high': [80,500,100],
 'FVF-low': [80,500,100]}
paneth_markers = ['Lyz1', 'Defa24', 'Itln1']

# Per sample and marker: log_counts vs marker expression (log y axis), coloured by the QC
# flag, with the marker's threshold drawn as a dashed horizontal line.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    print(f'folder_name: {folder_name}')
    adata = mdata.mod['rna']
    x_min = min(adata.obs['log_counts'])
    x_max = max(adata.obs['log_counts'])

    # zip pairs every marker with its own threshold; the Lyz1 plot previously showed the
    # Itln1 threshold (index 2) instead of the Lyz1 threshold (index 0)
    for marker, threshold in zip(paneth_markers, paneth_th_dict[folder_name]):
        p = sc.pl.scatter(adata, x='log_counts', y=marker, color='filtered_cells', show=False)
        p.hlines(y=[threshold], xmin=[x_min], xmax=[x_max], color="black", lw=0.5).set_linestyle("--")
        p.semilogy()
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Per sample: label a cell 'Paneth' when at least one of Lyz1, Defa24 or Itln1 exceeds its
# per-sample threshold from paneth_th_dict (order [Lyz1, Defa24, Itln1]), otherwise
# 'Non-Paneth'; store the label in obs['is_paneth'] and save the result.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered.h5mu')
    print(f'folder_name: {folder_name}')
    adata = mdata.mod['rna']
    # Find paneth cells
    marker_genes = ['Lyz1','Defa24','Itln1']
    # np.isin replaces the deprecated np.in1d and matches the rest of this notebook;
    # the mask is computed once and reused for the data and the column labels
    is_marker = np.isin(adata.var_names, marker_genes)
    df = pd.DataFrame(data = adata[:, is_marker].X.toarray(),
                    index = adata.obs_names,
                    columns = adata.var_names[is_marker].values)
    lyz_th, defa_th, itln_th = paneth_th_dict[folder_name]
    paneth_mask = (df['Itln1'] > itln_th) | (df['Defa24'] > defa_th) | (df['Lyz1'] > lyz_th)
    # df shares its row order with adata.obs, so the labels can be assigned positionally
    adata.obs['is_paneth'] = paneth_mask.map({True: 'Paneth', False: 'Non-Paneth'}).to_numpy()
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered_paneth.h5mu',mdata)
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Per sample: show Paneth label and QC flag on the UMAP, then rescue every Paneth cell by
# setting its QC flag to 'True', and write the file back in place.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered_paneth.h5mu')
    print(f'folder_name: {folder_name}')
    adata = mdata.mod['rna']
    fig =sc.pl.umap(adata, color = ['is_paneth','filtered_cells'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=6, return_fig=True)
    ax = fig.axes[0]
    ax.legend_.set_title(folder_name)
    plt.show()
    plt.close()
    # A categorical column only accepts values it already knows; add 'True' if no cell had
    # passed QC so the assignment below cannot fail
    qc_flag = adata.obs['filtered_cells']
    if isinstance(qc_flag.dtype, pd.CategoricalDtype) and 'True' not in qc_flag.cat.categories:
        adata.obs['filtered_cells'] = qc_flag.cat.add_categories('True')
    # Single .loc assignment instead of chained indexing (obs[col][mask] = ...), which may
    # write to a temporary copy and leave adata.obs unchanged
    adata.obs.loc[adata.obs['is_paneth'] == 'Paneth', 'filtered_cells'] = 'True'
    mu.write(f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered_paneth.h5mu',mdata)
    # Clear memory
    del mdata
    gc.collect()

### Filter data, redo UMAP and save

In [None]:
# Apply the stored GEX QC decision to each selected sample, drop the helper
# columns that are no longer needed, recompute UMAP/Leiden and save GEX_1_done.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(
        f'{base_path}{folder_name}{outs_path}/CR_DU_bc_f_umap_mt_QCfiltered_paneth.h5mu'
    )
    print(f'folder_name: {folder_name}')
    gex = mdata.mod['rna']
    # Cells flagged 'True' are the ones that passed QC.
    _keep_cells = gex.obs['filtered_cells']=='True'
    gex = qc_filter_mdata(mdata, gex, modality='rna', qc_filter=_keep_cells)
    # The QC bookkeeping columns are dropped once the filter has been applied.
    for _qc_column in ('filtered_cells', 'filter_mt_frac'):
        del gex.obs[_qc_column]
    get_umap_leiden(gex)
    mu.write(f'{base_path}{folder_name}{outs_path}/GEX_1_done.h5mu',mdata)
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

### plot results

In [None]:
# Display the per-cell metadata of the RNA modality.
# NOTE: `gex` is not defined in this cell; it is the object left behind by the
# last iteration of the filtering loop above (that loop deletes `mdata` but not
# `gex`), so this shows the last processed sample only.
gex.obs

In [None]:
# For each filtered sample: report the highest mitochondrial fraction among
# non-Paneth cells and colour the UMAP by the main QC covariates.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/GEX_1_done.h5mu')
    gex = mdata.mod['rna']
    _non_paneth = gex.obs['is_paneth']=='Non-Paneth'
    mt_max = gex.obs[_non_paneth]['mt_frac'].max()
    print(f'{folder_name} max mt_frac where not paneth: {mt_max}.')
    sc.pl.umap(
        gex,
        color=['log_counts','log_genes','mt_frac','rp_frac'],
        size=20,
        add_outline=True,
        alpha=1,
        outline_width=(0.3, 0.0),
        ncols=4,
        title=folder_name,
    )
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# 2D histogram of log counts against log genes for every filtered sample.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/GEX_1_done.h5mu')
    gex = mdata.mod['rna']
    _joint_kwargs = dict(kind="hist", bins=100, cmap="rocket_r", color="#f69c73", space=0)
    sb.jointplot(data=gex.obs, x="log_counts", y="log_genes", **_joint_kwargs)
    plt.suptitle(f'{folder_name}')
    plt.show()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

## Preprocessing ATAC

### Load samples, keep only barcodes shared by GEX and ATAC, and run Signac QC

In [None]:
# Keep only the cells that passed GEX QC, carry the GEX embedding and clustering
# over to the ATAC modality, compute the Signac QC metrics and save ATAC_signac.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/GEX_1_done.h5mu')
    # Restrict every modality to the barcodes they have in common.
    mu.pp.intersect_obs(mdata)
    gex = mdata.mod['rna']
    atac = mdata.mod['atac']
    atac.var_names_make_unique()

    # Reuse the GEX clustering and UMAP coordinates for the ATAC plots.
    atac.obs['leiden'] = gex.obs['leiden']
    atac.obsm['X_umap'] = gex.obsm['X_umap']
    atac = signac_qc_metrics(
        atac,
        aggregated=False,
        species='Mmusculus',
        genome="mm10",
        sample =folder_name,
        ensembl_release="v102",
        cr_path=base_path,
    )
    mu.write(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu',mdata)
    # Release the sample before loading the next one.
    del mdata
    gc.collect()
    

### counts

In [None]:
# Per sample: joint KDE of log ATAC counts vs. log ATAC features, followed by a
# histogram of the raw ATAC counts used to pick the count thresholds.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    # Shared arguments of the KDE joint plot; only `clip` differs in the fallback.
    _kde_kwargs = dict(x=atac.obs['log_nCount_atac'], y=atac.obs['log_nFeature_atac'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73")
    try:
        sb.jointplot(**_kde_kwargs).plot_joint(sb.scatterplot, alpha=0)
    except ValueError:
        # The unclipped KDE failed: discard the half-drawn figure and retry with
        # the density evaluation restricted to a fixed window.
        plt.close()
        sb.jointplot(**_kde_kwargs, clip=((6,12),(5,11))).plot_joint(sb.scatterplot, alpha=0)
    plt.suptitle(folder_name)
    plt.show()
    plt.close()
    #Thresholding decision: counts
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(atac.obs['nCount_atac'], kde=True)
        plt.title(folder_name)
        plt.show()
        # Was `plt.close` without parentheses, i.e. the figure was never closed.
        plt.close()
    # Clear memory
    del mdata
    gc.collect()

##### set min count

In [None]:
# Per-sample ATAC count thresholds, keyed by sample folder name.
# Each value is [min_counts, max_counts]; cells are kept when
# min_counts < nCount_atac < max_counts (both bounds exclusive).
count_dict = {'597_NVF_Crypts_Rep1': [3500,120000],
 '598_FVF_Crypts_Rep1': [6500,120000],
 '599_FVF_Crypts_Rep2': [4500,120000],
 '604_NVF_Crypts_Rep2': [6000,130000],
 'FVF-high': [5000,140000],
 'FVF-low': [7500,120000]}

In [None]:
# Zoom window for the low end of the ATAC count histogram plotted below
# (that plot only applies the upper bound, x_lim[1]).
x_lim = [0,20000]

In [None]:
# Histogram of the low end of the ATAC count distribution (below x_lim[1]) with
# the sample's minimum-count threshold drawn as a vertical line.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _n_counts = atac.obs['nCount_atac']
    _below_upper = _n_counts < x_lim[1]
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_n_counts[_below_upper], kde=True, bins=60)
        plt.axvline(count_dict[folder_name][0], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Zoom window (lower, upper) for the high end of the ATAC count histogram plotted below.
x_lim = [40000,200000]

In [None]:
# Histogram of the high end of the ATAC count distribution (inside x_lim) with
# the sample's maximum-count threshold drawn as a vertical line.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _n_counts = atac.obs['nCount_atac']
    _in_window = (_n_counts>x_lim[0]) & (_n_counts<x_lim[1])
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_n_counts[_in_window], kde=True, bins=60)
        plt.axvline(count_dict[folder_name][1], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Build the per-sample count filter (boolean Series per cell) and visualise both
# thresholds on counts-vs-features scatter plots, in linear and in log scale.
counts_filters= {}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    min_counts_atac = count_dict[folder_name][0]
    max_counts_atac = count_dict[folder_name][1]
    _n_counts = atac.obs['nCount_atac']
    counts_filters[folder_name] = (_n_counts > min_counts_atac) & (_n_counts < max_counts_atac)

    _feat_lo = min(atac.obs['nFeature_atac'])
    _feat_hi = max(atac.obs['nFeature_atac'])

    # Linear scale: dashed lines span from 0 to the largest feature count.
    _ax_linear = sc.pl.scatter(atac, 'nCount_atac', 'nFeature_atac', color='nucleosome_signal', show=False)
    _ax_linear.vlines(
        x=[min_counts_atac, max_counts_atac],
        ymin=[0,0],
        ymax=[_feat_hi, _feat_hi],
        color="black", lw=0.5,
    ).set_linestyle("--")
    plt.suptitle(folder_name)
    plt.show()
    plt.close()

    # Log scale: thresholds and line extents are log-transformed to match the axes.
    _ax_log = sc.pl.scatter(atac, 'log_nCount_atac', 'log_nFeature_atac', color='nucleosome_signal', show=False)
    _ax_log.vlines(
        x=[np.log(min_counts_atac), np.log(max_counts_atac)],
        ymin=[np.log(_feat_lo), np.log(_feat_lo)],
        ymax=[np.log(_feat_hi), np.log(_feat_hi)],
        color="black", lw=0.5,
    ).set_linestyle("--")
    plt.suptitle(folder_name)
    plt.show()
    plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()


### TSS Enrichment

In [None]:
# TSS enrichment distribution per sample: cells passing the count filter first,
# then all cells, overlaid in one figure.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _tss = atac.obs['TSS.enrichment']
    # Thresholding decision: TSS enrichment
    with rc_context({'figure.figsize': (8, 3)}):
        for _values in (_tss[counts_filters[folder_name]], _tss):
            sb.histplot(_values, kde=True, bins=60)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Per-sample TSS enrichment thresholds, keyed by sample folder name.
# Each value is [min_tss, max_tss]; cells are kept when
# min_tss < TSS.enrichment < max_tss (both bounds exclusive).
tss_dict = {'597_NVF_Crypts_Rep1': [3.8,8.5],
 '598_FVF_Crypts_Rep1': [4,8],
 '599_FVF_Crypts_Rep2': [4,8],
 '604_NVF_Crypts_Rep2': [3.8,8.5],
 'FVF-high': [4,8.5],
 'FVF-low': [4,8]}

In [None]:
# Zoom window (lower, upper) around the minimum TSS enrichment threshold for the histogram below.
x_lim = [2,5.2]

In [None]:
# Zoomed TSS enrichment histograms (inside x_lim) with the minimum threshold
# marked: count-filtered cells first, then all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _tss = atac.obs['TSS.enrichment']
    _in_window = (_tss>x_lim[0]) & (_tss<x_lim[1])
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_tss[_in_window & counts_filters[folder_name]], kde=True, bins=60)
        sb.histplot(_tss[_in_window], kde=True, bins=60)
        plt.axvline(tss_dict[folder_name][0], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Zoom window (lower, upper) around the maximum TSS enrichment threshold for the histogram below.
x_lim = [6,12]

In [None]:
# Zoomed TSS enrichment histograms (inside x_lim) with the maximum threshold
# marked: count-filtered cells first, then all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _tss = atac.obs['TSS.enrichment']
    _in_window = (_tss>x_lim[0]) & (_tss<x_lim[1])
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_tss[_in_window & counts_filters[folder_name]], kde=True, bins=60)
        sb.histplot(_tss[_in_window], kde=True, bins=60)
        plt.axvline(tss_dict[folder_name][1], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()


#### save tss filter and plot again

In [None]:
# Build the per-sample TSS enrichment filter (boolean Series per cell) and plot
# the distribution once more with both thresholds marked.
tss_filters_atac = {}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    # Wide plotting window so that both thresholds are visible.
    x_lim = [0,15]
    min_tss = tss_dict[folder_name][0]
    max_tss = tss_dict[folder_name][1]
    _tss = atac.obs['TSS.enrichment']
    tss_filters_atac[folder_name] = (_tss > min_tss) & (_tss < max_tss)
    _in_window = (_tss>x_lim[0]) & (_tss<x_lim[1])
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_tss[_in_window & counts_filters[folder_name]], kde=True, bins=60)
        sb.histplot(_tss[_in_window], kde=True, bins=60)
        for _threshold in (min_tss, max_tss):
            plt.axvline(_threshold, 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()


### Nucleosome signal

In [None]:
# Nucleosome signal distribution per sample: cells passing the count and TSS
# filters first, then all cells, overlaid in one figure.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _nuc = atac.obs['nucleosome_signal']
    _passes_previous = counts_filters[folder_name] & tss_filters_atac[folder_name]
    # Thresholding decision: nucleosome signal
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_nuc[_passes_previous], kde=True, bins=60)
        sb.histplot(_nuc, kde=True, bins=60)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Per-sample nucleosome signal thresholds, keyed by sample folder name.
# Each value is [min_nuc, max_nuc]; cells are kept when
# min_nuc < nucleosome_signal < max_nuc (both bounds exclusive).
nuc_dict = {'597_NVF_Crypts_Rep1': [0.35,0.8],
 '598_FVF_Crypts_Rep1': [0.35,0.8],
 '599_FVF_Crypts_Rep2': [0.4,0.8],
 '604_NVF_Crypts_Rep2': [0.35,0.78],
 'FVF-high': [0.45,1],
 'FVF-low': [0.45,1]}

In [None]:
# Zoom window (lower, upper) around the minimum nucleosome signal threshold for the histogram below.
x_lim = [0,0.6]

In [None]:
# Zoomed nucleosome signal histograms (inside x_lim) with the minimum threshold
# marked. Three overlays, from most to least filtered:
# count+TSS filtered cells, count filtered cells, all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _nuc = atac.obs['nucleosome_signal']
    _in_window = (_nuc>x_lim[0]) & (_nuc<x_lim[1])
    _in_window_counts = _in_window & counts_filters[folder_name]
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_nuc[_in_window_counts & tss_filters_atac[folder_name]], kde=True, bins=60)
        sb.histplot(_nuc[_in_window_counts], kde=True, bins=60)
        sb.histplot(_nuc[_in_window], kde=True, bins=60)
        plt.axvline(nuc_dict[folder_name][0], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Zoom window (lower, upper) around the maximum nucleosome signal threshold,
# used by the histogram in the filter-building cell below.
x_lim = [0.7,1.2]

#### set nucl filter and plot

In [None]:
# Build the per-sample nucleosome signal filter (boolean Series per cell) and
# plot the zoomed distribution (inside x_lim) with the maximum threshold marked.
nuc_filters_atac={}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    min_nuc = nuc_dict[folder_name][0]
    max_nuc = nuc_dict[folder_name][1]
    _nuc = atac.obs['nucleosome_signal']
    nuc_filters_atac[folder_name] = (_nuc > min_nuc) & (_nuc < max_nuc)
    _in_window = (_nuc>x_lim[0]) & (_nuc<x_lim[1])
    _in_window_counts = _in_window & counts_filters[folder_name]
    with rc_context({'figure.figsize': (8, 3)}):
        # Overlays from most to least filtered.
        sb.histplot(_nuc[_in_window_counts & tss_filters_atac[folder_name]], kde=True, bins=60)
        sb.histplot(_nuc[_in_window_counts], kde=True, bins=60)
        sb.histplot(_nuc[_in_window], kde=True, bins=60)
        plt.axvline(max_nuc, 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

### FRiP, blacklist and mitochondrial read fractions

In [None]:
# Per-sample FRiP (fraction of fragments in peaks) thresholds, keyed by sample folder name.
# Only the first entry is read by the cells below: cells are kept when
# cr_fraction_fragments_in_peaks > value[0].
# NOTE(review): the second entry repeats the max-count values of count_dict and
# is not used by any cell visible here - confirm whether it can be dropped.
frip_dict = {'597_NVF_Crypts_Rep1': [0.55,120000],
 '598_FVF_Crypts_Rep1': [0.55,120000],
 '599_FVF_Crypts_Rep2': [0.55,120000],
 '604_NVF_Crypts_Rep2': [0.55,130000],
 'FVF-high': [0.55,140000],
 'FVF-low': [0.55,120000]}

In [None]:
# Plotting window (lower, upper) for the FRiP histogram below.
x_lim = [0,1]

In [None]:
# FRiP histograms (inside x_lim) with the minimum threshold marked: cells that
# pass the count, TSS and nucleosome filters first, then all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _frip = atac.obs['cr_fraction_fragments_in_peaks']
    _in_window = (_frip>x_lim[0]) & (_frip<x_lim[1])
    _in_window_filtered = (
        _in_window
        & counts_filters[folder_name]
        & tss_filters_atac[folder_name]
        & nuc_filters_atac[folder_name]
    )
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_frip[_in_window_filtered], kde=True, bins=60)
        sb.histplot(_frip[_in_window], kde=True, bins=60)
        plt.axvline(frip_dict[folder_name][0], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Plotting window (lower, upper) for the blacklist-fraction histogram below.
x_lim = [0,0.1]

In [None]:
# Per-sample upper bound on the fraction of counts in blacklisted regions,
# keyed by sample folder name; cells are kept when
# fraction_counts_in_blacklist < value.
max_black_dict = {'597_NVF_Crypts_Rep1': 0.045,
 '598_FVF_Crypts_Rep1': 0.04,
 '599_FVF_Crypts_Rep2': 0.04,
 '604_NVF_Crypts_Rep2': 0.05,
 'FVF-high': 0.04,
 'FVF-low': 0.04}

In [None]:
# Blacklist-fraction histograms (inside x_lim) with the maximum threshold marked:
# cells that pass the count, TSS and nucleosome filters first, then all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _blacklist = atac.obs['fraction_counts_in_blacklist']
    _in_window = (_blacklist>x_lim[0]) & (_blacklist<x_lim[1])
    _in_window_filtered = (
        _in_window
        & counts_filters[folder_name]
        & tss_filters_atac[folder_name]
        & nuc_filters_atac[folder_name]
    )
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_blacklist[_in_window_filtered], kde=True, bins=60)
        sb.histplot(_blacklist[_in_window], kde=True, bins=60)
        plt.axvline(max_black_dict[folder_name], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

In [None]:
# Plotting window (lower, upper) for the mitochondrial read fraction histogram below.
x_lim = [0,0.1]

In [None]:
# Per-sample upper bound on the fraction of ATAC reads mapping to mitochondria,
# keyed by sample folder name; cells are kept when
# cr_fraction_reads_in_mito < value.
max_mito_dict = {'597_NVF_Crypts_Rep1': 0.0075,
 '598_FVF_Crypts_Rep1': 0.0075,
 '599_FVF_Crypts_Rep2': 0.0075,
 '604_NVF_Crypts_Rep2': 0.0075,
 'FVF-high': 0.0075,
 'FVF-low': 0.0075}

In [None]:
# Mitochondrial read fraction histograms (inside x_lim) with the maximum
# threshold marked: cells that pass the count, TSS and nucleosome filters first,
# then all cells.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _mito = atac.obs['cr_fraction_reads_in_mito']
    _in_window = (_mito>x_lim[0]) & (_mito<x_lim[1])
    _in_window_filtered = (
        _in_window
        & counts_filters[folder_name]
        & tss_filters_atac[folder_name]
        & nuc_filters_atac[folder_name]
    )
    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(_mito[_in_window_filtered], kde=True, bins=60)
        sb.histplot(_mito[_in_window], kde=True, bins=60)
        plt.axvline(max_mito_dict[folder_name], 0, 1)
        plt.title(folder_name)
        plt.show()
        plt.close()
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

#### define filter

In [None]:
# Build the combined FRiP / blacklist / mito filter (boolean Series per cell)
# for every selected sample.
fbm_filters_atac={}
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    min_frip = frip_dict[folder_name][0]
    max_black= max_black_dict[folder_name]
    max_mito= max_mito_dict[folder_name]
    _frip_ok = atac.obs['cr_fraction_fragments_in_peaks'] > min_frip
    _blacklist_ok = atac.obs['fraction_counts_in_blacklist'] < max_black
    _mito_ok = atac.obs['cr_fraction_reads_in_mito'] < max_mito
    fbm_filters_atac[folder_name] = _frip_ok & _blacklist_ok & _mito_ok
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

### Filtering

In [None]:
# Combine all ATAC QC filters into one per-cell flag ('True' = keep), store it
# as atac.obs['filtered_cells'] and save the result as ATAC_signac_filtered.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac.h5mu')
    atac = mdata.mod['atac']
    _passes_all = (
        counts_filters[folder_name]
        & tss_filters_atac[folder_name]
        & nuc_filters_atac[folder_name]
        & fbm_filters_atac[folder_name]
    )
    # Stored as the strings 'True'/'False' in a categorical column.
    atac.obs['filtered_cells']=pd.Categorical([str(_flag) for _flag in _passes_all])
    # The thresholds are not used further in this cell; the assignments are kept
    # so the same notebook-level variables stay defined for interactive checks.
    min_frip = frip_dict[folder_name][0]
    max_black= max_black_dict[folder_name]
    max_mito= max_mito_dict[folder_name]
    min_counts_atac, max_counts_atac = count_dict[folder_name][0], count_dict[folder_name][1]
    min_tss, max_tss = tss_dict[folder_name][0], tss_dict[folder_name][1]
    min_nuc, max_nuc = nuc_dict[folder_name][0], nuc_dict[folder_name][1]
    mu.write(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu',mdata)
    # Release the sample before loading the next one.
    del mdata
    gc.collect()

#### run this individually:

In [None]:
# For every QC metric draw two violin panels per sample: the metric per Leiden
# cluster (left) and the metric split by the QC keep/discard flag (right).
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu')
    atac = mdata.mod['atac']
    for col in ['log_nCount_atac', 'nCount_atac', 'log_nFeature_atac', 'TSS.enrichment', 'nucleosome_signal']:
        fig, axes = plt.subplots(1,2, figsize=(10, 4), gridspec_kw=dict(width_ratios=[2,1],wspace = 0.3))
        # `hue` must be the categorical x variable so that `palette` colours one
        # violin per group. It used to be `hue=col`, i.e. the continuous y
        # variable, which creates one hue level per unique metric value.
        sb.violinplot(x='leiden', y=col, data=atac.obs, ax=axes[0], hue='leiden', palette="bright")
        sb.violinplot(x='filtered_cells', y=col, data=atac.obs, ax=axes[1], hue='filtered_cells', palette="colorblind")
        # The hue only repeats the x axis, so drop any legend seaborn may have added.
        for _violin_ax in axes:
            _violin_legend = _violin_ax.get_legend()
            if _violin_legend is not None:
                _violin_legend.remove()
        axes[1].set_ylabel(None)
        fig.suptitle(col)
        plt.show()
        plt.close()
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Per-sample overview of the ATAC QC decision: UMAPs coloured by the QC metrics
# and by the keep/discard flag, a stacked bar chart with the percentage of
# kept/discarded cells per Leiden cluster, and two scatter plots of the flag
# against nucleosome signal and TSS enrichment.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu')
    atac = mdata.mod['atac']
    sc.pl.umap(atac, color=['log_nCount_atac', 'log_nFeature_atac', 'TSS.enrichment', 'nucleosome_signal', 'cr_fraction_fragments_in_peaks', 'cr_fraction_reads_in_mito', 'fraction_counts_in_blacklist'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, title = folder_name)
    sc.pl.umap(atac, color=['filtered_cells','leiden'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=6)

    ###############################################################################
    ###############################################################################

    with rc_context({'figure.figsize': (6, 4)}): #rcParams['figure.figsize']=(6,4)
        key = 'leiden'
        labels = list(atac.obs[key].cat.categories)
        keep_pct = []
        filter_pct = []
        width = 0.85       # the width of the bars: can also be len(x) sequence

        for label in labels:
            # Count the flags of this cluster once instead of four times.
            _flag_counts = atac.obs['filtered_cells'][atac.obs[key]==label].value_counts()
            _n_cluster_cells = _flag_counts.sum()
            # .get(..., 0) keeps the loop alive when a sample contains no 'True'
            # (or no 'False') cells at all, in which case the label is missing
            # from value_counts() and plain indexing raises KeyError.
            keep_pct.append(_flag_counts.get('True', 0)/_n_cluster_cells*100)
            filter_pct.append(_flag_counts.get('False', 0)/_n_cluster_cells*100)

        fig, ax = plt.subplots()

        # Discarded cells at the bottom, kept cells stacked on top (sums to 100 %).
        ax.bar(labels, filter_pct, width, label='Filter Out', edgecolor='0', linewidth=0.5)
        ax.bar(labels, keep_pct, width, bottom=filter_pct, label='Keep', edgecolor='0', linewidth=0.5)

        ax.set_ylabel('%')
        ax.set_title(f'Percentage of Filtered Cells in {folder_name}')
        ax.axes.set_xticklabels(labels=labels, rotation=90)
        ax.legend(bbox_to_anchor=(1, .5),loc='center left', edgecolor='1')

        plt.ylim([-2.5,100+2.5])
        plt.xlim([-1+0.25,len(labels)-0.25])

        plt.show()
        plt.close()

    #################################################################################
    #################################################################################

    sc.pl.scatter(atac, 'log_nCount_atac', 'nucleosome_signal', color='filtered_cells', title=folder_name)
    plt.show()
    plt.close()
    sc.pl.scatter(atac, 'log_nCount_atac', 'TSS.enrichment', color='filtered_cells', title=folder_name)
    plt.show()
    plt.close()
    # Clear memory
    del mdata
    gc.collect()


In [None]:
# Remove the cells that failed ATAC QC and overwrite ATAC_signac_filtered.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu')
    atac = mdata.mod['atac']
    # Filter cells according to identified QC thresholds.
    # The decision is read back from the 'filtered_cells' column that was stored
    # in this file (the combined counts/TSS/nucleosome/FRiP-blacklist-mito
    # filter), mirroring the GEX filtering step. Because this cell overwrites its
    # own input, the in-memory filter Series (indexed on the unfiltered cells)
    # would no longer match the data on a second run; the stored column always does.
    atac = qc_filter_mdata(mdata, atac, modality='atac', qc_filter=atac.obs['filtered_cells']=='True')
    mu.write(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu',mdata)
    # Clear memory
    del mdata
    gc.collect()

## Filter ATAC and save results

In [None]:
# Re-run the Signac QC metrics on the filtered ATAC modality of every selected sample.
# NOTE(review): unlike the first signac_qc_metrics pass, this loop never writes
# `mdata` back to disk, so whatever the call stores in `atac` is discarded when
# the loop moves on. Confirm whether the call is only wanted for side effects
# it may have (not visible here) or whether a mu.write(...) is missing.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu')
    atac = mdata.mod['atac']
    atac = signac_qc_metrics(atac, aggregated=False, species='Mmusculus', genome="mm10", sample = folder_name, ensembl_release="v102", cr_path=base_path)
    # Clear memory
    del mdata
    gc.collect()

In [None]:
# Build the final per-sample multiome object: keep only cells that passed both
# GEX and ATAC QC, make the matrices sparse again, normalise a few fields so the
# object can be serialised, and save it as multiome_1_done.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/ATAC_signac_filtered.h5mu')
    # retain only cell passing GEX & ATAC QC
    try:
        mu.pp.intersect_obs(mdata)
        mdata_f = mdata.copy()
    except ValueError: # workaround if current mdata is view
        mdata_f = mdata.copy()
        mu.pp.intersect_obs(mdata_f)
    atac = mdata_f.mod['atac']
    # Make matrices sparse again
    sparsify_mdata(mdata_f)
    # Cast to str first, then to category, before the object is written.
    mdata_f.mod['rna'].var['is_ambient'] = mdata_f.mod['rna'].var['is_ambient'].astype(str).astype('category').copy()
    # https://github.com/scverse/muon/issues/65
    mdata_f.mod['atac'].uns['files'] = dict(mdata_f.mod['atac'].uns['files'])
    mdata_f.mod['atac'].uns['atac'] = dict(mdata_f.mod['atac'].uns['atac'])
    # Drop the QC flag from the object that is actually written. The column used
    # to be deleted from `mdata`, but `mdata_f` is a copy, so the flag was still
    # saved in multiome_1_done.h5mu.
    del mdata_f.mod['atac'].obs['filtered_cells']
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done.h5mu',mdata_f)
    # Clear memory (both the loaded object and its filtered copy)
    del mdata
    del mdata_f
    gc.collect()