# Setup

In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
import os
import torch

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import colors
import seaborn as sb
from plotnine import *
from adjustText import adjust_text
import umap.umap_ as umap
#import pegasus as pg


# Analysis
import muon as mu
import scanpy as sc
import scanpy.external as sce
import scrublet as scr
import doubletdetection
import scvi

#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

# Warnings
import warnings
# Suppresses every warning for the rest of the session (the 'once' variant is kept as a
# commented-out alternative on the same line).
warnings.filterwarnings('ignore') #(action='once')


#garbage collector
import gc

In [None]:
# Record the package versions used for this run in the notebook output.
sc.logging.print_versions()

In [None]:
# Plot settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Directory
# Folder scanpy uses for figures that plotting calls are asked to save.
sc.settings.figdir='/mnt/hdd/Notebooks/Gut_project/Figures'

## Plotting parameters
# NOTE(review): sc.set_figure_params and sb.set_theme below also write to rcParams, and
# set_theme passes its own (3,3) figure size, so this (5,5) default is presumably
# overridden — confirm which size is intended.
rcParams['figure.figsize']=(5,5) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)
sb.set_theme(rc={'figure.figsize':(3,3)})

## Font
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = ['Source Sans 3']

## Grid & Ticks
# Grid lines are made fully transparent; tick marks are drawn on the bottom and left axes.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True


## Embed font
# pdf.fonttype 42 stores text as TrueType fonts in saved PDFs.
plt.rc('pdf', fonttype=42)

## Define new default settings
#plt.rcParamsDefault = plt.rcParams

In [None]:
# Custom colormap: 20 colours sampled from a narrow slice of Greys_r (0.7-0.8),
# followed by 128 colours spanning the whole Reds map.
colors3 = plt.cm.Greys_r(np.linspace(0.7, 0.8, num=20))
colors2 = plt.cm.Reds(np.linspace(0, 1, num=128))
colorsComb = np.concatenate((colors3, colors2), axis=0)
mymap = colors.LinearSegmentedColormap.from_list(name='my_colormap', colors=colorsComb)

## setup R

In [None]:
# Execute the helper notebook.
# NOTE(review): setup_R, read_h5mu_to_mudata, umap_plot and run_scDblFinder_ATAC are used
# below but not defined in this notebook; presumably they come from utils.ipynb — confirm.
%run utils.ipynb

In [None]:
# Configure the R integration with the R directory of the scUV environment.
# NOTE(review): setup_R is not defined in this notebook; presumably provided by utils.ipynb — confirm.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R
# Show the library search paths of the R session, to check the expected environment is active.
.libPaths()

## Load Data

In [None]:
import os
import glob

In [None]:
# Root folder with one sub-folder per sample, and the output sub-directory inside each sample folder.
base_path, outs_path = '/mnt/hdd/data/Multiome/', '/outs/'

In [None]:
# Sample folders found directly under base_path, in alphabetical order
folder_names = sorted(
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

# Map every folder name to a label of the form "sample<N>", numbering from 46
folder_variables = {
    folder_name: f"sample{index}"
    for index, folder_name in enumerate(folder_names, start=46)
}
folder_variables

In [None]:
# Expose every sample label as a global variable that holds its folder name.
# folder_variables maps folder name -> label, so items() yields (folder_name, label).
# Unpacking in that order matters: swapped, the folder name (not a valid identifier)
# would become the global's name and the label its value.
for folder_name, variable_name in folder_variables.items():
    globals()[variable_name] = folder_name

In [None]:
# Folders to process; the loops below skip every other folder found under base_path.
samples = [
    '597_NVF_Crypts_Rep1',
    '598_FVF_Crypts_Rep1',
    '599_FVF_Crypts_Rep2',
    '604_NVF_Crypts_Rep2',
    'FVF-high',
    'FVF-low',
]

# Continue GEX doublet detection

In [None]:
# Show the DoubletFinder annotations next to the QC metrics on each sample's RNA UMAP.
doubletfinder_keys = ['n_genes','n_counts','pANN.sct','DF_classifications_1.sct','DF_classifications_2.sct']
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_doubletfinder.h5mu')
        adata = mdata.mod['rna']
        sc.pl.umap(
            adata,
            color=doubletfinder_keys,
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=5,
            title=f'n_genes {folder_name}',
        )
        # Release the sample before the next one is loaded
        del mdata
        del adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

## Doublet detection (DoubletDetection)

In [None]:
# Run DoubletDetection on the RNA modality of every selected sample, plot its diagnostics
# and save the per-cell calls and scores to multiome_1_done_dd2.h5mu.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_doubletfinder.h5mu')
    adata = mdata.mod['rna']
    # n_jobs was lowered from 64 because only up to 20 threads are allowed; random_state is fixed.
    adata_clf = doubletdetection.BoostClassifier(n_iters=200, clustering_algorithm='leiden', standard_scaling=True, n_jobs=20, random_state=42)
    adata_dd_doublets = adata_clf.fit(adata.X).predict(p_thresh=1e-6, voter_thresh=0.8)
    # Diagnostics: convergence across iterations, threshold grid, and calls on a UMAP
    doubletdetection.plot.convergence(adata_clf, show=False, p_thresh=1e-6, voter_thresh=0.8)
    doubletdetection.plot.threshold(adata_clf, show=True, p_step=5, log_p_grid=np.arange(-15, -1))
    umap_plot(adata.X, adata_dd_doublets, random_state=1, show=True)
    # Every dd_* column is written straight from the classifier output, so no zero-filled
    # placeholders are created (in particular no 'dd_log_p_values' column: the real values
    # are stored under 'dd_-log_p_values').
    # NOTE(review): if predict() returns NaN for cells it cannot score, astype('bool')
    # turns those into True — confirm.
    adata.obs['dd_doublets'] = adata_dd_doublets.astype('bool')
    adata.obs['dd_-log_p_values'] = np.mean(adata_clf.all_log_p_values_, axis=0) * -1
    adata.obs['dd_voting_average'] = adata_clf.voting_average_
    adata.obs['dd_scores'] = np.mean(adata_clf.all_scores_, axis=0)
    # String-category copy of the boolean call, for categorical colouring in plots
    adata.obs['dd_doublets_cat'] = adata.obs['dd_doublets'].astype(str).astype('category')
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd2.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

In [None]:
#still 597
umap_plot(adata.X, adata_dd_doublets, random_state=1, show=True)
adata.obs['dd_doublets']=adata_dd_doublets.astype('bool')
adata.obs['dd_-log_p_values']=np.mean(adata_clf.all_log_p_values_, axis=0) * -1
adata.obs['dd_voting_average']=adata_clf.voting_average_
adata.obs['dd_scores']=np.mean(adata_clf.all_scores_, axis=0)
adata.obs['dd_doublets_cat'] = adata.obs['dd_doublets'].astype(str).astype('category')
mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd2.h5mu',mdata)
del mdata
del adata
gc.collect()

In [None]:
# Same DoubletDetection run as for the first sample, for every selected sample except
# '597_NVF_Crypts_Rep1'.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples or folder_name == '597_NVF_Crypts_Rep1':
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_doubletfinder.h5mu')
    adata = mdata.mod['rna']
    # n_jobs was lowered from 64 because only up to 20 threads are allowed; random_state is fixed.
    adata_clf = doubletdetection.BoostClassifier(n_iters=200, clustering_algorithm='leiden', standard_scaling=True, n_jobs=20, random_state=42)
    adata_dd_doublets = adata_clf.fit(adata.X).predict(p_thresh=1e-6, voter_thresh=0.8)
    # Diagnostics: convergence across iterations, threshold grid, and calls on a UMAP
    doubletdetection.plot.convergence(adata_clf, show=False, p_thresh=1e-6, voter_thresh=0.8)
    doubletdetection.plot.threshold(adata_clf, show=True, p_step=5, log_p_grid=np.arange(-15, -1))
    umap_plot(adata.X, adata_dd_doublets, random_state=1, show=True)
    # Every dd_* column is written straight from the classifier output, so no zero-filled
    # placeholders are created (in particular no 'dd_log_p_values' column: the real values
    # are stored under 'dd_-log_p_values').
    # NOTE(review): if predict() returns NaN for cells it cannot score, astype('bool')
    # turns those into True — confirm.
    adata.obs['dd_doublets'] = adata_dd_doublets.astype('bool')
    adata.obs['dd_-log_p_values'] = np.mean(adata_clf.all_log_p_values_, axis=0) * -1
    adata.obs['dd_voting_average'] = adata_clf.voting_average_
    adata.obs['dd_scores'] = np.mean(adata_clf.all_scores_, axis=0)
    # String-category copy of the boolean call, for categorical colouring in plots
    adata.obs['dd_doublets_cat'] = adata.obs['dd_doublets'].astype(str).astype('category')
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd2.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

In [None]:
# Plot the DoubletDetection results on each sample's UMAP and report the doublet rate.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd2.h5mu')
    adata = mdata.mod['rna']
    with rc_context({'figure.figsize': (6, 3)}):
        sc.pl.umap(adata, color=['n_genes','n_counts','dd_doublets_cat','dd_-log_p_values','dd_voting_average','dd_scores'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3, title = f'{folder_name} n_genes')
    # Count flagged cells with sum() instead of value_counts()[1]: the latter raises when a
    # sample has no doublets and picks the wrong entry if doublets are the majority.
    # The rate is relative to all cells in the file (adata.n_obs), like the SOLO rate.
    n_dd_doublets = int(adata.obs['dd_doublets'].sum())
    print('DoubletDetection doublet rate:', n_dd_doublets/adata.n_obs*100, '% (',n_dd_doublets,' cells)' )
    del mdata
    del adata
    gc.collect()

## Solo sample wise

#### sample 597_NVF_Crypts_Rep1

In [None]:
# Load the DoubletDetection-annotated object of sample 597; SOLO works on its RNA modality.
solo_input = f'{base_path}597_NVF_Crypts_Rep1{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, space=0, height = 4, marginal_kws=dict(bins=200, kde=True))
p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
# Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
# axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must not
# be passed there; the defaults already span the full axes.
p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
p.ax_joint.legend(frameon=False)
plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}597_NVF_Crypts_Rep1{outs_path}/multiome_1_done_dd2.h5mu',mdata)


#### sample 598_FVF_Crypts_Rep1

In [None]:
# Load the DoubletDetection-annotated object of sample 598; SOLO works on its RNA modality.
solo_input = f'{base_path}598_FVF_Crypts_Rep1{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, space=0, height = 4, marginal_kws=dict(bins=200, kde=True))
p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
# Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
# axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must not
# be passed there; the defaults already span the full axes.
p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
p.ax_joint.legend(frameon=False)
plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}598_FVF_Crypts_Rep1{outs_path}/multiome_1_done_dd2.h5mu',mdata)


#### sample 599_FVF_Crypts

In [None]:
# Load the DoubletDetection-annotated object of sample 599; SOLO works on its RNA modality.
solo_input = f'{base_path}599_FVF_Crypts_Rep2{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
with rc_context({'figure.figsize': (5, 5)}):
    p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, height = 4, space=0, marginal_kws=dict(bins=200, kde=True))
    p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
    p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
    # Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
    # axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must
    # not be passed there; the defaults already span the full axes.
    p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.legend(frameon=False)
    plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}599_FVF_Crypts_Rep2{outs_path}/multiome_1_done_dd2.h5mu',mdata)


#### sample 604_NVF_Crypts

In [None]:
# Load the DoubletDetection-annotated object of sample 604; SOLO works on its RNA modality.
solo_input = f'{base_path}604_NVF_Crypts_Rep2{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
with rc_context({'figure.figsize': (5, 5)}):
    p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, space=0, height = 4, marginal_kws=dict(bins=200, kde=True))
    p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
    p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
    # Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
    # axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must
    # not be passed there; the defaults already span the full axes.
    p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.legend(frameon=False)
    plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}604_NVF_Crypts_Rep2{outs_path}/multiome_1_done_dd2.h5mu',mdata)


#### sample FVF-high

In [None]:
# Load the DoubletDetection-annotated object of sample FVF-high; SOLO works on its RNA modality.
solo_input = f'{base_path}FVF-high{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
with rc_context({'figure.figsize': (5, 5)}):
    p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, height = 4, space=0, marginal_kws=dict(bins=200, kde=True))
    p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
    p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
    # Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
    # axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must
    # not be passed there; the defaults already span the full axes.
    p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.legend(frameon=False)
    plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}FVF-high{outs_path}/multiome_1_done_dd2.h5mu',mdata)


#### sample FVF-low

In [None]:
# Load the DoubletDetection-annotated object of sample FVF-low; SOLO works on its RNA modality.
solo_input = f'{base_path}FVF-low{outs_path}/multiome_1_done_dd2.h5mu'
mdata = read_h5mu_to_mudata(solo_input)
adata = mdata.mod['rna']

In [None]:
# Train an scVI model on adata, then a SOLO doublet classifier built from it.
# Both trainings are stochastic: fix the scvi-tools seed (same value as the
# DoubletDetection random_state) so the doublet calls are reproducible across re-runs.
# NOTE(review): assumes adata.X holds the counts scVI should model — confirm.
scvi.settings.seed = 42
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata, n_hidden=256, n_latent=20, gene_likelihood='nb')
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()
# Per-cell scores for both classes, plus the hard class label
predictions = solo.predict()
predictions['solo_doublet_class'] = solo.predict(soft=False)

In [None]:
# Give the prediction columns 'solo_'-prefixed names.
# Assumes predict() ordered its score columns doublet, singlet — TODO confirm.
# (Trimming a two-character suffix from the index is not applied.)
predictions.columns = [
    'solo_doublet_score',
    'solo_singlet_score',
    'solo_doublet_class',
]

In [None]:
# SOLO singlet vs doublet score per cell with marginal histograms; points are drawn
# first in black (slightly larger) and then coloured by the hard call on top.
with rc_context({'figure.figsize': (5, 5)}):
    p=sb.jointplot(data=predictions, x='solo_singlet_score', y='solo_doublet_score', s=2, kind='scatter', linewidth=0, height = 4, space=0, marginal_kws=dict(bins=200, kde=True))
    p.plot_joint(sb.scatterplot, color="black", s=3 ,data=predictions, linewidth=0)
    p.plot_joint(sb.scatterplot, s=2, hue='solo_doublet_class',data=predictions, linewidth=0)
    # Dashed zero lines across the whole panel. The ymin/ymax and xmin/xmax arguments of
    # axvline/axhline are axes fractions in [0, 1], not data values, so data maxima must
    # not be passed there; the defaults already span the full axes.
    p.ax_joint.axvline(x=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.axhline(y=0, color="black", lw=0.5).set_linestyle("--")
    p.ax_joint.legend(frameon=False)
    plt.show()

In [None]:
# Share of this sample's cells that SOLO labels as doublets
n_solo_doublets = (predictions.solo_doublet_class == 'doublet').sum()
print('SOLO doublet rate:', n_solo_doublets/adata.n_obs*100, '% (',n_solo_doublets,' cells)' )

In [None]:
# Append the SOLO columns to the RNA obs table (concat aligns on the index), derive a
# boolean doublet flag from the hard label, and write back to the file that was loaded.
adata.obs = pd.concat([adata.obs, predictions], axis=1)
adata.obs['solo_doublets'] = adata.obs['solo_doublet_class'] == 'doublet'
mu.write(f'{base_path}FVF-low{outs_path}/multiome_1_done_dd2.h5mu',mdata)

## Summary

In [None]:
# Plot the SOLO scores and class on each sample's UMAP, then save the object under the
# dd_summary file name used by the cells that follow.
solo_keys = ['solo_doublet_score', 'solo_singlet_score','solo_doublet_class']
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd2.h5mu')
        adata = mdata.mod['rna']
        with rc_context({'figure.figsize': (5, 5)}):
            sc.pl.umap(
                adata,
                color=solo_keys,
                size=20,
                add_outline=True,
                alpha=1,
                outline_width=(0.3, 0.0),
                ncols=3,
                cmap='RdBu_r',
                vcenter=0,
                title=f'{folder_name} solo_doublet_score',
            )
        mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu',mdata)
        # Release the sample before the next one is loaded
        del mdata
        del adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Per-tool doublet flags on the RNA modality; 'doublet_calls' is their row-wise sum,
# i.e. how many of these tools flag a cell.
rna_doublet_flags = ['predicted_doublet','dd_doublets','solo_doublets','df_doublets','sdf_doublets','scds_doublets']
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        summary_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu'
        mdata = read_h5mu_to_mudata(summary_file)
        adata = mdata.mod['rna']
        adata.obs['doublet_calls'] = adata.obs[rna_doublet_flags].sum(axis=1)
        sc.pl.umap(
            adata,
            color=['sample','doublet_calls'],
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
        )
        # Written back to the same summary file it was read from
        mu.write(summary_file, mdata)
        del mdata
        del adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Number of calls
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu')
    adata = mdata.mod['rna'] 
    print(f'\n{folder_name} :')
    print(adata.obs['doublet_calls'].value_counts())
    del mdata
    del adata
    gc.collect()

In [None]:
# Mark a cell as a final doublet when more than 3 tools flag it, plot and save the result.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        summary_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu'
        mdata = read_h5mu_to_mudata(summary_file)
        adata = mdata.mod['rna']
        adata.obs['final_doublets'] = adata.obs['doublet_calls'] > 3
        # String-category copy of the flag, for categorical colouring in plots
        adata.obs['final_doublets_cat'] = adata.obs['final_doublets'].astype(str).astype('category')
        sc.pl.umap(
            adata,
            color=['final_doublets_cat','doublet_calls'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
        )
        mu.write(summary_file, mdata)
        del mdata
        del adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Number of final doublets
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu')
    adata = mdata.mod['rna'] 
    print(f'{folder_name} :')
    print('Number of doublets:')
    print(adata.obs['final_doublets'].value_counts())

    # Percentage:
    print('\nOverall doublet rate: ',adata.obs['final_doublets'].value_counts()[1]/len(adata.obs['final_doublets'])*100,'%')
    del mdata
    del adata
    gc.collect()

In [None]:
# Annotate the data sets
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu')
    adata = mdata.mod['rna'] 
    print(f'\n{folder_name} :')
    print(adata.obs['sample'].value_counts())
    # Checking the total size of the data set
    adata.shape
    del mdata
    del adata
    gc.collect()

#### Doublet rates per cluster

In [None]:
import pegasus as pg

# Composition plot per sample: for every 'leiden' cluster, the frequency of cells by
# number of doublet calls. The categorical helper column is saved with the object.
doublet_call_palette = ['#FFD700', '#FF7F50', '#8B0000', '#0000CD', '#6495ED', '#008080', '#B0C4DE', '#696969']
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        summary_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu'
        mdata = read_h5mu_to_mudata(summary_file)
        adata = mdata.mod['rna']
        print(f'{folder_name} :')
        adata.obs['doublets_shown'] = adata.obs['doublet_calls'].astype(str).astype('category')
        pg.compo_plot(
            adata,
            'leiden',
            'doublets_shown',
            style='frequency',
            sort_function=None,
            palette=doublet_call_palette,
            dpi=150,
        )
        plt.show()
        plt.close()
        mu.write(summary_file, mdata)
        del mdata
        del adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

# Doublet Identification ATAC

## AMULET & scDblFinder



In [None]:
# Score ATAC doublets for every selected sample and save the result as *_summary_atac.h5mu.
# NOTE(review): run_scDblFinder_ATAC is not defined in this notebook; it is expected to add
# 'atac.combined.score' to atac.obs, which is read right after the call — confirm.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu')
    atac = mdata.mod['atac']
    # Register this sample's fragments file on the ATAC modality
    fragment_file = f'{base_path}{folder_name}{outs_path}/atac_fragments.tsv.gz'
    atac.uns['files']['fragments'] = fragment_file
    run_scDblFinder_ATAC(atac, repeats_file=f'{base_path}AMULET_exclusion_regions_noChr.bed')
    # One titled figure per sample: without show()/close() every histogram of the loop is
    # drawn onto the same axes and the samples cannot be told apart.
    sb.histplot(atac.obs['atac.combined.score'], kde=True, bins=100)
    plt.title(folder_name)
    plt.show()
    plt.close()
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu',mdata)
    del mdata
    del atac
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Resume the ATAC doublet scoring for the samples not yet processed: the previous chunk
# was stopped because it took too long, so the first three samples are skipped here.
# NOTE(review): run_scDblFinder_ATAC is not defined in this notebook; it is expected to add
# 'atac.combined.score' to atac.obs, which is read right after the call — confirm.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples or folder_name in ['597_NVF_Crypts_Rep1', '598_FVF_Crypts_Rep1', '599_FVF_Crypts_Rep2']:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary.h5mu')
    atac = mdata.mod['atac']
    # Register this sample's fragments file on the ATAC modality
    fragment_file = f'{base_path}{folder_name}{outs_path}/atac_fragments.tsv.gz'
    atac.uns['files']['fragments'] = fragment_file
    run_scDblFinder_ATAC(atac, repeats_file=f'{base_path}AMULET_exclusion_regions_noChr.bed')
    # One titled figure per sample: without show()/close() every histogram of the loop is
    # drawn onto the same axes and the samples cannot be told apart.
    sb.histplot(atac.obs['atac.combined.score'], kde=True, bins=100)
    plt.title(folder_name)
    plt.show()
    plt.close()
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu',mdata)
    del mdata
    del atac
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Re-draw the ATAC combined-score histogram of every sample from the saved objects,
# one titled figure per sample.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu')
        atac = mdata.mod['atac']
        with rc_context({'figure.figsize': (5, 3)}):
            combined_scores = atac.obs['atac.combined.score']
            sb.histplot(combined_scores, kde=True, bins=100)
            plt.title(folder_name)
            plt.show()
            plt.close()
        # Release the sample before the next one is loaded
        del mdata
        del atac
        gc.collect()
        torch.cuda.empty_cache()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Per-sample cut-offs, keyed by folder name. The first tuple element is the threshold
# applied to 'atac.combined.score'; the second element is not read in this notebook.
# (An alternative first value would be
#  min(atac[atac.obs['hybrid_class'] == 'doublet'].obs['atac.combined.score']).)
cutoff_dict = dict([
    ('597_NVF_Crypts_Rep1', (0.98, 0.55)),
    ('598_FVF_Crypts_Rep1', (0.98, 0.55)),
    ('599_FVF_Crypts_Rep2', (0.99, 0.55)),
    ('604_NVF_Crypts_Rep2', (0.99, 0.55)),
    ('FVF-high', (0.99, 0.55)),
    ('FVF-low', (0.99, 0.55)),
])

In [None]:
# Show each sample's score distribution near 1 together with its cut-off, and report the
# ATAC doublet rate that cut-off would give.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu')
    atac = mdata.mod['atac']
    # Adjust cut-off as doublet rate is too high
    cut_off = cutoff_dict[folder_name][0]
    scores = atac.obs['atac.combined.score']

    with rc_context({'figure.figsize': (5, 3)}):
        # Only the upper tail (0.95, 1] is plotted so the cut-off region is visible
        sb.histplot(scores[(scores>0.95) & (scores<=1)], kde=True, bins=100)
        plt.axvline(cut_off, 0, 1, color="black", lw=1).set_linestyle("--")
        plt.title(folder_name)
        plt.show()
        plt.close()
    print(folder_name)
    # '>=' matches the comparison used when the doublet class is assigned, so the rate
    # printed here is the rate that is applied. sum() replaces value_counts()[1], which
    # raises when no cell exceeds the cut-off; the rate is relative to all ATAC cells.
    n_atac_doublets = int((scores >= cut_off).sum())
    print('ATAC doublet rate:', n_atac_doublets/atac.n_obs*100, '% (',n_atac_doublets,' cells)' )
    del mdata
    del atac
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Classify ATAC cells as doublet/singlet with the per-sample cut-off, report the rate,
# plot the result and save it back to the *_summary_atac.h5mu file.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu')
    atac = mdata.mod['atac']
    # Look the cut-off up for THIS sample. Without this line the loop silently reuses
    # whatever value an earlier cell left in cut_off, applying one threshold to every sample.
    cut_off = cutoff_dict[folder_name][0]
    is_doublet = atac.obs['atac.combined.score'] >= cut_off
    atac.obs['atac.combined.class'] = 'singlet'
    atac.obs['atac.combined.class'] = pd.Categorical(atac.obs['atac.combined.class'], categories=['doublet','singlet'])
    atac.obs.loc[is_doublet,'atac.combined.class'] = 'doublet'
    # Boolean version of the class, summed with the RNA-based flags later on
    atac.obs['atac_sdf_doublets'] = atac.obs['atac.combined.class'] == 'doublet'

    # sum() replaces value_counts()[1], which raises when a sample has no doublets
    n_atac_doublets = int(atac.obs['atac_sdf_doublets'].sum())
    print('ATAC AMULTET & scDblFinder doublet rate:', n_atac_doublets/atac.n_obs*100, '% (',n_atac_doublets,' cells)' )
    sc.pl.umap(atac, color=['atac.combined.score','atac.combined.class'], size=2, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3, title =f'{folder_name} atac.combined.score')
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu',mdata)
    del mdata
    del atac
    gc.collect()
    

# Aggregate Doublet Calls

In [None]:
# Share the per-tool doublet flags between the RNA modality, the ATAC modality and the
# MuData-level obs table, then count per cell how many of the seven tools flag it.
rna_flag_cols = ['predicted_doublet','dd_doublets','solo_doublets','df_doublets','sdf_doublets','scds_doublets']
all_flag_cols = rna_flag_cols + ['atac_sdf_doublets']
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac.h5mu')
        atac = mdata.mod['atac']
        adata = mdata.mod['rna']

        # ATAC flag -> RNA modality and global obs
        adata.obs['atac_sdf_doublets'] = atac.obs['atac_sdf_doublets'].copy()
        mdata.obs['atac_sdf_doublets'] = atac.obs['atac_sdf_doublets'].copy()

        # RNA flags -> global obs and ATAC modality
        mdata.obs[rna_flag_cols] = adata.obs[rna_flag_cols].copy()
        atac.obs[rna_flag_cols] = adata.obs[rna_flag_cols].copy()

        # Sum up doublets from different tools
        mdata.obs['doublet_calls'] = mdata.obs[all_flag_cols].sum(axis=1)
        # Reuse the RNA UMAP as the embedding of the MuData object
        mdata.obsm['X_umap'] = mdata.mod['rna'].obsm['X_umap']
        mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac_summary.h5mu',mdata)
        del mdata
        del adata
        del atac
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Per sample: distribution of the aggregated number of doublet calls, and the calls on the UMAP.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac_summary.h5mu')
        print(f'{folder_name}: ')
        # Number of calls
        call_counts = mdata.obs['doublet_calls'].value_counts()
        print(call_counts)
        with rc_context({'figure.figsize': (3, 2)}):
            sc.pl.umap(
                mdata,
                color=['sample','doublet_calls'],
                size=10,
                add_outline=True,
                alpha=1,
                outline_width=(0.3, 0.0),
            )
        del mdata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# Final doublet decision on the aggregated calls: a cell is a final doublet when more than
# 3 tools flag it. The result is reported, plotted, copied to both modalities and saved.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_summary_atac_summary.h5mu')
    atac = mdata.mod['atac']
    adata = mdata.mod['rna']
    mdata.obs['final_doublets'] = mdata.obs['doublet_calls'] > 3
    # String-category copy of the flag, for categorical colouring in plots
    mdata.obs['final_doublets_cat'] = mdata.obs['final_doublets'].astype(str).astype('category')
    # Cells per sample label
    print(mdata.obs['sample'].value_counts())

    # Total size of the data set. A bare expression inside a loop is never displayed by
    # the notebook, so the shape has to be printed explicitly.
    print(mdata.shape)
    # Number of final doublets
    print('Number of doublets:')
    print(mdata.obs['final_doublets'].value_counts())

    # Percentage. The True entries are counted with sum(): value_counts()[1] raises when a
    # sample has no final doublets and relies on positional fallback indexing.
    n_final_doublets = mdata.obs['final_doublets'].sum()
    print('\nOverall doublet rate: ',n_final_doublets/len(mdata.obs['final_doublets'])*100,'%')

    sc.pl.umap(mdata, color=['final_doublets_cat','doublet_calls'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0))
    # Copy the final columns to both modalities
    final_cols = ['final_doublets','final_doublets_cat','doublet_calls']
    adata.obs[final_cols] = mdata.obs[final_cols].copy()
    atac.obs[final_cols] = mdata.obs[final_cols].copy()
    mu.write(f'{base_path}{folder_name}{outs_path}/raw_feature_bc_matrix_filtered_markedDoublets.h5mu',mdata)
    del mdata
    del adata
    del atac
    gc.collect()

#### save adata

In [None]:
# Export the RNA modality of every doublet-marked sample as a stand-alone .h5ad file.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/raw_feature_bc_matrix_filtered_markedDoublets.h5mu')
    adata = mdata.mod['rna']
    # Make sure the target directory exists; writing fails otherwise on a sample whose
    # count_matrices folder has not been created yet.
    os.makedirs(f'{base_path}{folder_name}/count_matrices', exist_ok=True)
    adata.write(f'{base_path}{folder_name}/count_matrices/Doublets_detected.h5ad')
    del mdata
    del adata
    gc.collect()