# Setup

In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
import os
import torch

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import colors
import seaborn as sb
from plotnine import *
from adjustText import adjust_text
import umap.umap_ as umap
#import pegasus as pg


# Analysis
import muon as mu
import scanpy as sc
import scanpy.external as sce
import scrublet as scr
import doubletdetection
import scvi

#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

# Warnings
import warnings
warnings.filterwarnings('ignore') #(action='once')


#garbage collector
import gc

In [None]:
# Print package versions through scanpy's logging helper so the software
# environment of this run is recorded in the notebook output.
sc.logging.print_versions()

In [None]:
# Plot settings
%matplotlib inline

## Directory
# Folder that scanpy uses when it saves figures.
sc.settings.figdir='/mnt/hdd/Notebooks/Gut_project/Figures'

## Plotting parameters
# NOTE(review): the statements in this cell are order-dependent. The seaborn
# call below passes its own 'figure.figsize', so the (5,5) set here is
# presumably not the size that ends up in effect -- TODO confirm.
rcParams['figure.figsize']=(5,5) #rescale figures
# Previous variant of the call below that additionally set color_map='tab10':
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)
sb.set_theme(rc={'figure.figsize':(3,3)})

## Font
# Optional font override, currently disabled.
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = ['Source Sans 3']

## Grid & Ticks
# Make grid lines fully transparent and draw ticks on the bottom and left axes.
# These are set after the scanpy/seaborn calls above, so they are applied last.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True


## Embed font
# Sets the PDF backend's fonttype option to 42.
plt.rc('pdf', fonttype=42)

## Define new default settings
# Disabled: would replace matplotlib's stored defaults with the current settings.
#plt.rcParamsDefault = plt.rcParams

In [None]:
# Custom colormap used for continuous values in the UMAP plots: a short
# segment sampled from the reversed "Greys" map (positions 0.7-0.8, 20 steps)
# followed by the full "Reds" map (128 steps).
colors3 = plt.cm.Greys_r(np.linspace(0.7, 0.8, num=20))
colors2 = plt.cm.Reds(np.linspace(0, 1, num=128))
# Stack the grey rows on top of the red rows (one RGBA colour per row).
colorsComb = np.concatenate((colors3, colors2), axis=0)
mymap = colors.LinearSegmentedColormap.from_list(name='my_colormap', colors=colorsComb)

## Setup R

In [None]:
%run utils.ipynb

In [None]:
# Configure the R side of the notebook with the R library directory of the
# scUV environment. `setup_R` is not defined in this file -- presumably it is
# provided by utils.ipynb, which is run in the cell above; TODO confirm.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R
.libPaths()

## Load Data

In [None]:
import os
import glob

In [None]:
# Root directory containing one sub-folder per sample.
base_path = '/mnt/hdd/data/Multiome/'
# Sub-directory inserted after the sample folder name when file paths are built.
# NOTE(review): the paths in this notebook are assembled as
# f'{base_path}{folder_name}{outs_path}/<file>', so the trailing slash here
# yields a double slash ("/outs//<file>") in every path.
outs_path = '/outs/'

In [None]:
# Discover the sample folders directly under base_path (alphabetical order)
# and label each one "sample<N>", numbering from 46.
folder_names = sorted(
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

# Mapping: folder name -> label.
folder_variables = {}
for index, folder_name in enumerate(folder_names, start=46):
    variable_name = f"sample{index}"
    folder_variables[folder_name] = variable_name

# Show the mapping as the cell output.
folder_variables

In [None]:
# Expose every sample as a notebook-level variable: sample<N> = '<folder name>'.
# folder_variables maps folder name -> variable name, so the pair has to be
# unpacked in that order. With the names swapped, the folder names themselves
# (not valid identifiers) were registered as globals holding the "sample<N>"
# strings, and no sample<N> variable was ever created.
for folder_name, variable_name in folder_variables.items():
    globals()[variable_name] = folder_name

In [None]:
# Folder names of the samples to process. The per-sample loops in this
# notebook skip every folder that is not listed here.
samples = [
    '597_NVF_Crypts_Rep1',
    '598_FVF_Crypts_Rep1',
    '599_FVF_Crypts_Rep2',
    '604_NVF_Crypts_Rep2',
    'FVF-high',
    'FVF-low',
]
# To re-run a single sample, restrict the list instead, e.g.:
#samples = [ 'FVF-low']

In [None]:
# QC overview: for every selected sample, load the processed multiome object
# and plot the RNA UMAP coloured by the QC covariates.
# `read_h5mu_to_mudata` is not defined in this file (presumably from utils.ipynb).
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done.h5mu')
        gex = mdata.mod['rna']
        sc.pl.umap(
            gex,
            color=['sample','leiden','n_counts','log_counts','n_genes','log_genes','mt_frac','rp_frac'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=4,
            color_map=mymap,
            title=f'{folder_name}',
        )
        plt.show()
        plt.close()
        # Drop the references and collect before the next sample is loaded.
        del mdata, gex
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

# Doublet detection GEX

### Scrublet

In [None]:
# Per-sample Scrublet settings, keyed by sample folder name.
# Each value is (expected_doublet_rate, threshold) as passed to the Scrublet call.
settings_dict = {
    '597_NVF_Crypts_Rep1': (0.09, 0.28),
    '598_FVF_Crypts_Rep1': (0.1, 0.16),
    '599_FVF_Crypts_Rep2': (0.09, 0.22),
    '604_NVF_Crypts_Rep2': (0.09, 0.255),
    'FVF-high': (0.09, 0.21),
    'FVF-low': (0.09, 0.3),
}

In [None]:
# Scrublet: for every selected sample, score doublets on the RNA modality with
# the per-sample settings, plot the score distribution and the UMAP, and save
# the object under the "_dd" file name.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        mdata = read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done.h5mu')
        adata = mdata.mod['rna']
        # settings_dict values are (expected_doublet_rate, threshold).
        expected_rate, call_threshold = settings_dict[folder_name]
        sce.pp.scrublet(
            adata,
            sim_doublet_ratio=5,
            expected_doublet_rate=expected_rate,
            threshold=call_threshold,
        )
        sce.pl.scrublet_score_distribution(adata)
        plt.show()
        plt.close()
        # String-valued categorical copy of the boolean call, for plotting.
        predicted = adata.obs['predicted_doublet']
        adata.obs['scrublet_doublets_cat'] = predicted.astype(str).astype('category')
        sc.pl.umap(
            adata,
            color=['n_genes','n_counts','scrublet_doublets_cat','doublet_score'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=4,
            title=f'{folder_name} n_genes',
            color_map=mymap,
        )
        mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu', mdata)
        del mdata, adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

### SCDS

In [None]:
# SCDS: run scds on the RNA modality of every selected sample, report the
# doublet rate of its hybrid classification and save the object in place.
# `run_SCDS` is not defined in this file (presumably from utils.ipynb); the
# code below relies on it having filled obs['hybrid_class'] / obs['hybrid_score'].
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu')
    adata = mdata.mod['rna']
    run_SCDS(adata)

    # Count doublets with a boolean sum. The previous value_counts()[1] used an
    # integer key on a boolean index: it fails when a sample has no doublets,
    # returns the singlet count when doublets are the majority, and relies on
    # positional fallback that pandas has deprecated.
    is_doublet = adata.obs['hybrid_class'] == 'doublet'
    n_doublets = is_doublet.sum()
    # Size of the most frequent `sample` label (presumably all cells, one
    # sample per file -- TODO confirm); .iloc keeps the lookup positional.
    n_cells = adata.obs['sample'].value_counts().iloc[0]
    print('Scds doublet rate hybrid class:', n_doublets/n_cells*100, '% (',n_doublets,' cells)' )
    # Lowest score that was still called a doublet; NaN when there are none
    # (the built-in min() raised on an empty selection).
    print('Cut-off:', adata.obs.loc[is_doublet, 'hybrid_score'].min())

    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

In [None]:
# Manually chosen SCDS score cut-offs per sample, keyed by folder name.
# Each value is (cut-off on hybrid_score, cut-off on hybrid_score_sct).
# For reference, the thresholds implied by the scds classification itself are:
#   min(adata[adata.obs['hybrid_class'] == 'doublet'].obs['hybrid_score']),
#   min(adata[adata.obs['hybrid_class_sct'] == 'doublet'].obs['hybrid_score_sct'])
cutoff_dict = {
    '597_NVF_Crypts_Rep1': (1.2, 0.5),
    '598_FVF_Crypts_Rep1': (1.25, 0.48),
    '599_FVF_Crypts_Rep2': (1.2, 0.5),
    '604_NVF_Crypts_Rep2': (1.2, 0.55),
    'FVF-high': (1.15, 0.52),
    'FVF-low': (1.4, 0.65),
}

In [None]:
# SCDS, hybrid_score: for every selected sample, plot the score distribution
# with the manual cut-off and report the doublet rates that result from the
# manual cut-off and from the sct-based scds classification.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu')
    adata = mdata.mod['rna']
    # Adjust cut-off as doublet rate is too high 
    cut_off = cutoff_dict[folder_name][0]

    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(adata.obs['hybrid_score'], kde=True, bins=100)
        #sb.histplot(adata.obs['hybrid_score'][(adata.obs['hybrid_score']>0.35) & (adata.obs['hybrid_score']<2)], kde=True, bins=100)
        plt.axvline(cut_off, 0, 1, color="black", lw=1).set_linestyle("--")
        plt.title(folder_name)
        plt.show()
        plt.close()

    # Count with boolean sums. The previous value_counts()[1] used an integer
    # key on a boolean index: it fails when nothing is flagged, returns the
    # wrong class when doublets are the majority, and relies on positional
    # fallback that pandas has deprecated.
    n_cells = adata.obs['sample'].value_counts().iloc[0]  # most frequent `sample` label
    n_above = (adata.obs['hybrid_score'] > cut_off).sum()
    is_doublet_sct = adata.obs['hybrid_class_sct'] == 'doublet'
    n_doublets_sct = is_doublet_sct.sum()

    print('Scds doublet rate above cutoff:', n_above/n_cells*100, '% (',n_above,' cells)' )
    print('Scds doublet rate class sct:', n_doublets_sct/n_cells*100, '% (',n_doublets_sct,' cells)' )
    # Lowest sct score still called a doublet; NaN when there are none.
    print('Cut-off:', adata.obs.loc[is_doublet_sct, 'hybrid_score_sct'].min())
    # NOTE(review): nothing in this cell modifies the object, so this write
    # looks redundant -- kept to preserve the existing workflow.
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

In [None]:
# SCDS, hybrid_score_sct: for every selected sample, plot the sct score
# distribution with the manual cut-off and report the resulting doublet rate.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu')
    adata = mdata.mod['rna']
    #Adjust cut-off as doublet rate is too high 
    cut_off = cutoff_dict[folder_name][1]

    with rc_context({'figure.figsize': (8, 3)}):
        sb.histplot(adata.obs['hybrid_score_sct'], kde=True, bins=100)
        #sb.histplot(adata.obs['hybrid_score_sct'][(adata.obs['hybrid_score_sct']>0.35) & (adata.obs['hybrid_score_sct']<0.85)], kde=True, bins=100)
        plt.axvline(cut_off, 0, 1, color="black", lw=1).set_linestyle("--")
        plt.title(folder_name)
        plt.show()
        plt.close()

    # Count with a boolean sum. The previous value_counts()[1] used an integer
    # key on a boolean index: it fails when nothing exceeds the cut-off,
    # returns the wrong class when doublets are the majority, and relies on
    # positional fallback that pandas has deprecated.
    n_above = (adata.obs['hybrid_score_sct'] > cut_off).sum()
    n_cells = adata.obs['sample'].value_counts().iloc[0]  # most frequent `sample` label
    print('Scds doublet rate above cutoff sct:', n_above/n_cells*100, '% (',n_above,' cells)' )
    # NOTE(review): nothing in this cell modifies the object, so this write
    # looks redundant -- kept to preserve the existing workflow.
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

In [None]:
# SCDS, final calls: re-classify every cell of every selected sample with the
# manual cut-offs, combine both calls into obs['scds_doublets'], plot and save.
for folder_name, sample_name in folder_variables.items():
    if folder_name not in samples:
        print(f'skipping {folder_name}...')
        continue
    mdata= read_h5mu_to_mudata(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu')
    adata = mdata.mod['rna']
    cut_off = cutoff_dict[folder_name][0]
    cut_off_sct = cutoff_dict[folder_name][1]

    # A cell is a doublet when its score is strictly above the cut-off
    # (missing scores compare False and therefore stay 'singlet').
    above_cut_off = adata.obs['hybrid_score'] > cut_off
    above_cut_off_sct = adata.obs['hybrid_score_sct'] > cut_off_sct

    # Build the class columns as categoricals in one step. Previously the
    # categorical conversion was immediately overwritten by the scalar
    # 'singlet' assignment, so it had no effect and the columns ended up as
    # plain strings.
    class_labels = ['doublet', 'singlet']
    adata.obs['hybrid_class_sct'] = pd.Categorical(
        np.where(above_cut_off_sct, 'doublet', 'singlet'), categories=class_labels)
    adata.obs['hybrid_class'] = pd.Categorical(
        np.where(above_cut_off, 'doublet', 'singlet'), categories=class_labels)

    # Combined call: doublet according to either score.
    adata.obs['scds_doublets'] = above_cut_off | above_cut_off_sct

    # Boolean sum instead of value_counts()[1], which fails when no cell is
    # flagged and depends on deprecated positional fallback.
    n_doublets = adata.obs['scds_doublets'].sum()
    n_cells = adata.obs['sample'].value_counts().iloc[0]  # most frequent `sample` label
    print('Scds doublet rate:', n_doublets/n_cells*100, '% (',n_doublets,' cells)' )
    sc.pl.umap(adata, color=['n_genes','n_counts','hybrid_class','hybrid_score','hybrid_class_sct','hybrid_score_sct'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=6, title= f'{folder_name} n_genes')
    mu.write(f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu',mdata)
    del mdata
    del adata
    gc.collect()

## scDblFinder

In [None]:
# scDblFinder: run it on the RNA modality of every selected sample and save
# the object back to the same "_dd" file.
# `run_scDblFinder` is not defined in this file (presumably from utils.ipynb).
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        dd_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu'
        mdata = read_h5mu_to_mudata(dd_file)
        adata = mdata.mod['rna']
        run_scDblFinder(adata)
        mu.write(dd_file, mdata)
        # Free the sample before loading the next one.
        del mdata, adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# scDblFinder: plot classes and scores (plain and sct variants) on the RNA UMAP
# of every selected sample, then write the object back to the same file.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        dd_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd.h5mu'
        mdata = read_h5mu_to_mudata(dd_file)
        adata = mdata.mod['rna']
        sc.pl.umap(
            adata,
            color=['n_genes','n_counts','scDblFinder.class','scDblFinder.score','scDblFinder.class.sct','scDblFinder.score.sct'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=6,
            title=f'n_genes {folder_name}',
        )
        mu.write(dd_file, mdata)
        del mdata, adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

## DoubletFinder

In [None]:
%%R
sessionInfo()

#### 597

In [None]:

# DoubletFinder for sample 597_NVF_Crypts_Rep1: load, run on the RNA modality,
# save back to the same file.
# `run_DoubletFinder` is not defined in this file (presumably from utils.ipynb).
# The output path is built once and reused for reading and writing. The write
# previously used `folder_name`, a leftover loop variable from an earlier
# cell, so this sample's result was written over another sample's file.
dd_file = f'{base_path}597_NVF_Crypts_Rep1{outs_path}/multiome_1_done_dd.h5mu'
mdata= read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file,mdata)
del mdata
del adata
gc.collect()

#### 598

In [None]:
# DoubletFinder for sample 598_FVF_Crypts_Rep1: load, run on the RNA modality,
# save back to the same file, then free the memory.
dd_file = f'{base_path}598_FVF_Crypts_Rep1{outs_path}/multiome_1_done_dd.h5mu'
mdata = read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file, mdata)
del mdata, adata
gc.collect()

In [None]:
# Manual re-save of the 598 result. The preceding cell already writes this
# file and then deletes `mdata`, so executing the notebook top to bottom failed
# here with a NameError. Only repeat the write when an `mdata` object is still
# loaded in the session.
if 'mdata' in globals():
    mu.write(f'{base_path}598_FVF_Crypts_Rep1{outs_path}/multiome_1_done_dd.h5mu',mdata)

#### 599

In [None]:
# DoubletFinder for sample 599_FVF_Crypts_Rep2: load, run on the RNA modality,
# save back to the same file, then free the memory.
dd_file = f'{base_path}599_FVF_Crypts_Rep2{outs_path}/multiome_1_done_dd.h5mu'
mdata = read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file, mdata)
del mdata, adata
gc.collect()

#### 604

In [None]:
# DoubletFinder for sample 604_NVF_Crypts_Rep2: load, run on the RNA modality,
# save back to the same file, then free the memory.
dd_file = f'{base_path}604_NVF_Crypts_Rep2{outs_path}/multiome_1_done_dd.h5mu'
mdata = read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file, mdata)
del mdata, adata
gc.collect()

#### FVF-high

In [None]:
# DoubletFinder for sample FVF-high: load, run on the RNA modality, save back
# to the same file, then free the memory.
dd_file = f'{base_path}FVF-high{outs_path}/multiome_1_done_dd.h5mu'
mdata = read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file, mdata)
del mdata, adata
gc.collect()

#### FVF-low

In [None]:

# DoubletFinder for sample FVF-low: load, run on the RNA modality, save back
# to the same file, then free the memory.
dd_file = f'{base_path}FVF-low{outs_path}/multiome_1_done_dd.h5mu'
mdata = read_h5mu_to_mudata(dd_file)
adata = mdata.mod['rna']
run_DoubletFinder(adata)
mu.write(dd_file, mdata)
del mdata, adata
gc.collect()

In [None]:
# DoubletFinder: plot pANN and both classifications on the RNA UMAP of every
# selected sample and save the object under the "_dd_doubletfinder" file name.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        sample_dir = f'{base_path}{folder_name}{outs_path}'
        mdata = read_h5mu_to_mudata(f'{sample_dir}/multiome_1_done_dd.h5mu')
        adata = mdata.mod['rna']
        sc.pl.umap(
            adata,
            color=['n_genes','n_counts','pANN','DF_classifications_1','DF_classifications_2'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=5,
            title=f'n_genes {folder_name}',
        )
        mu.write(f'{sample_dir}/multiome_1_done_dd_doubletfinder.h5mu', mdata)
        del mdata, adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')

In [None]:
# DoubletFinder (sct variant): plot pANN.sct and both sct classifications on
# the RNA UMAP of every selected sample, then write the object back to the
# same "_dd_doubletfinder" file.
for folder_name, sample_name in folder_variables.items():
    if folder_name in samples:
        df_file = f'{base_path}{folder_name}{outs_path}/multiome_1_done_dd_doubletfinder.h5mu'
        mdata = read_h5mu_to_mudata(df_file)
        adata = mdata.mod['rna']
        sc.pl.umap(
            adata,
            color=['n_genes','n_counts','pANN.sct','DF_classifications_1.sct','DF_classifications_2.sct'],
            size=20,
            add_outline=True,
            alpha=1,
            outline_width=(0.3, 0.0),
            ncols=5,
            title=f'n_genes {folder_name}',
        )
        mu.write(df_file, mdata)
        del mdata, adata
        gc.collect()
    else:
        print(f'skipping {folder_name}...')