In [None]:
# Third-party / stdlib modules used throughout the notebook.
import numpy as np
import scanpy as sc
import os
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import sys
# The project helper module `helperVDF` is not imported from an installed package:
# its directory is appended to `sys.path` right before the import below.
# NOTE(review): this is an absolute, user-specific Windows path - adapt it before
# running the notebook on another machine.
path_helper = ["C:\\","Users","vfriedrich","projects","monkey_IZI","git_documentation","scRNAseq_cross_species_primate_human","analysis","helper"]
sys.path.append(os.path.join(*path_helper))
import helperVDF as h
import seaborn as sns
# Show which Python interpreter this kernel is running on.
print(sys.executable)

In [None]:
# Record the package versions at the start of the run (output format is defined
# by the helper module; the same call is repeated at the end of the notebook).
h.print_main_versions()

In [None]:
#env: scArches_env

In [None]:
# Run configuration.
# `pre`   : prefix put in front of every table / plot file name written below.
# `drive` : Windows drive letter under which the project data is stored.
pre = "H21"
drive = 'F'
base_model_path, base_table_path, base_plots_path, base_anndata_objects = h.return_local_paths(
    drive=drive,
    pre=pre,
    add_path=True,
)

# Per-cell annotation exported from the R part of the analysis. It must contain a
# 'species' column (used for filtering later on); the file name suggests it also
# carries the doublet-caller results - TODO confirm against the R export.
# The location is built from `drive` so that changing the drive letter above
# relocates this input together with all other paths (identical to the former
# hard-coded 'F:\\monkey_IZI\\analysisR' while drive == 'F').
path_s0110_scrublet_andother = os.path.join(drive + ':\\monkey_IZI\\analysisR', 's0110_scrublet_andother.txt')
s0110_scrublet_andother = pd.read_csv(path_s0110_scrublet_andother, index_col=0)

In [None]:
# --- Human samples: load counts, attach R annotation, apply the standard filter ---
species = 'human'
warnings.filterwarnings("ignore")  # silences all warnings for the rest of the session

IDs = h.return_cellranger_IDs(species)
adata_all_human = h.read_bg_corrected_counts(
    drive,
    cellranger_IDs=IDs,
    pre_bg_correction='H05_01',
    species=species,
)

# Keep only the human rows of the R-side annotation and add every column of it
# to the AnnData object (rsuffix='_R' is passed through to the helper).
s0110_scrublet_andother_human = s0110_scrublet_andother.loc[s0110_scrublet_andother['species'] == 'human']
adata_all_human = h.add_anno_to_adata(
    adata=adata_all_human,
    anno_df=s0110_scrublet_andother_human,
    anno_columns_to_add=s0110_scrublet_andother_human.columns,
    rsuffix='_R',
)

# Standard filtering: drop cells without any expressed gene and genes that are
# detected in fewer than 50 cells.
adata_all_human, nr_removed_cells, nr_removed_genes = h.standard_scanpy_filter(
    adata_all_human,
    min_genes_per_cell=1,
    min_cells_per_gene=50,
)

print(f'nr_removed_cells : {nr_removed_cells}')
print(f'nr_removed_genes : {nr_removed_genes}')

## Low quality QC
- performed separately for each experiment (`experiment_ori`)
- partly based on https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html
- cutoffs for "log1p_n_genes_by_counts","log1p_total_counts" obtained via MAD with nmad factor 6
- mito-cutoff: 15% (`pct_counts_mt > 15`, applied here to the human data)
- low quality cluster cutoff: 0.25 

In [None]:
# Low-quality QC, run independently for every experiment.
# For each experiment the cell writes: one violin plot per QC metric, the MAD
# cutoff table, the per-cluster low-quality plot, a pie chart of the QC outcome
# and the resulting obs annotation (CSV, re-read when the QC results are combined).
experiments = list(pd.unique(adata_all_human.obs['experiment_ori']))
columns_QC = ["log1p_n_genes_by_counts", "log1p_total_counts", "pct_counts_mt"]
cutoff_bad_cluster = 0.25   # clusters with a larger low-quality fraction are flagged
cutoff_pct_counts_mt = 15   # cells above this mitochondrial percentage are outliers

for exp in experiments:
    print(exp)
    # Shared stem of every file written for this experiment. The pie chart
    # previously used `pre + exp` (no '_' separator) and therefore did not follow
    # the naming scheme of the other outputs; all files now use the same stem.
    file_stem = pre + '_' + exp + '_' + species
    clustering_key = 'louvain_res_7_' + exp

    adata_exp = h.filter_adata_obs(adata=adata_all_human, col_name='experiment_ori', val=exp)
    # Expected to add the QC metrics and the MAD-based `outlier_*` columns that
    # are read further down - TODO confirm in helperVDF.
    adata_exp = h.best_practice_base_preprocessing(adata_exp, nmad=6)
    adata_exp.obs['outlier_pct_counts_mt'] = adata_exp.obs["pct_counts_mt"] > cutoff_pct_counts_mt

    for column in columns_QC:
        h.violinplot_QC(adata=adata_exp,
                        column=column,
                        save=os.path.join(base_plots_path, file_stem + '_violin_' + column + '.pdf'),
                        show=True)
    h.save_QC_cutoffsMAD(adata=adata_exp,
                         columns=columns_QC,
                         save=os.path.join(base_table_path, file_stem + '_cutoffs_basicQC.csv'),
                         return_df=False)

    # High-resolution clustering so that small groups of low-quality cells end up
    # in their own clusters.
    sc.pp.neighbors(adata_exp)
    sc.tl.louvain(adata_exp, resolution=7, key_added=clustering_key)

    # A cell is low quality as soon as one of the three outlier flags is set.
    adata_exp.obs['low_quality_cell'] = (adata_exp.obs['outlier_log1p_total_counts']
                                         | adata_exp.obs['outlier_log1p_n_genes_by_counts']
                                         | adata_exp.obs['outlier_pct_counts_mt'])
    # String-labelled copy of the flag; its label 'low_quality_cell' is the column
    # name used in `perc_df` below.
    adata_exp.obs['low_quality_cell2'] = adata_exp.obs['low_quality_cell'].map(
        {True: 'low_quality_cell', False: 'high_quality_cell'})

    adata_exp, bad_clusters, perc_df = h.do_clusterbasedQC_basicQC(adata=adata_exp,
                                                                   obs_column_qc='low_quality_cell2',
                                                                   obs_column_clustering=clustering_key,
                                                                   sort_by='low_quality_cell',
                                                                   ID='basic_QC',
                                                                   cutoff_bad_cluster=cutoff_bad_cluster)
    # Clusters without any low-quality cell have no entry -> treat as fraction 0.
    perc_df['low_quality_cell'] = perc_df['low_quality_cell'].fillna(0.0)
    h.basic_QC_plot_doublett_cutoff(df=perc_df,
                                    column='low_quality_cell',
                                    cutoff=cutoff_bad_cluster,
                                    xlabel='Cluster',
                                    ylabel='Low quality percentage',
                                    title='Low quality fraction per cluster ' + exp,
                                    save=os.path.join(base_plots_path, file_stem + '_QC_cutoff_bad_cluster_' + species + '.pdf'))
    h.make_pie_plot_QC(adata_exp.obs['basic_QC'].value_counts(),
                       title='QC overview basic QC ' + exp,
                       save=os.path.join(base_plots_path, file_stem + '_QC_basic_QC_piechart.pdf'),
                       show=True)
    adata_exp.obs.to_csv(os.path.join(base_table_path, file_stem + '_anno_basicQC.csv'))

## Doublet QC

In [None]:
# Prepare the full human object for UMAP plotting (the exact steps are defined in
# the helper - presumably normalisation, neighbours and the embedding; TODO confirm).
h.prepare_umap(adata_all_human)

# Overview of the embedding coloured by the sample metadata columns.
sc.pl.umap(adata_all_human,color = ['timepoint','individual'])

In [None]:
# UMAP coloured by three obs columns; the same three keys are assigned to
# db_tool_1..3 (the doublet detection tools) further down.
sc.pl.umap(adata_all_human,color = ['scDblFinder.class2','dblt_doubldetect_guess2','dblt_scrublet_predicted2'])


In [None]:
# Louvain clustering of all human cells at resolution 7; stored in
# obs['louvain_res_7'], which the cluster-based doublet QC uses as its clustering column.
sc.tl.louvain(adata_all_human,resolution=7,key_added='louvain_res_7')

sc.pl.umap(adata_all_human,color = 'louvain_res_7')

In [None]:
# three doublet detection tools
db_tool_1 = 'scDblFinder.class2'
db_tool_2 = 'dblt_doubldetect_guess2'
db_tool_3 = 'dblt_scrublet_predicted2'

In [None]:
# Merge the calls of the three doublet tools into one per-cell summary
# (the merging rule is defined in the helper).
adata_all_human = h.add_doublet_summary_three_tools(
    adata=adata_all_human,
    db_tool_1=db_tool_1,
    db_tool_2=db_tool_2,
    db_tool_3=db_tool_3,
)

# Cluster-level doublet QC on the resolution-7 louvain clustering with a
# cutoff of 0.25; also returns the flagged clusters and a per-cluster table.
adata_all_human, bad_clusters, doub_perc_df = h.do_clusterbased_QCv2(
    adata=adata_all_human,
    obs_column_clustering='louvain_res_7',
    cutoff_bad_cluster=0.25,
)

In [None]:
# Plot the per-cluster 'doublet' column of `doub_perc_df` together with the
# 0.25 cutoff and save the figure as PDF.
h.basic_QC_plot_doublett_cutoff(
    df=doub_perc_df,
    column='doublet',
    cutoff=0.25,
    xlabel='Cluster',
    ylabel='Doublette percentage',
    title='Doublette fraction per cluster',
    save=os.path.join(base_plots_path, pre + '_QC_cutoff_bad_cluster_' + species + '.pdf'),
)

In [None]:
# Number of cells per doublet-QC category, shown as a pie chart and saved as PDF.
QC_summary = adata_all_human.obs['doublet_QC'].value_counts()
h.make_pie_plot_QC(
    QC_summary,
    title='QC overview human doublettes',
    save=os.path.join(base_plots_path, pre + '_human_QC_doublet_piechart.pdf'),
    show=True,
)

## Combine low quality QC and doublet QC

In [None]:
# Stack the per-experiment basic-QC annotation tables (written by the low-quality
# QC loop) into one frame, indexed by the first CSV column (the saved obs index).
#
# * The column list must not contain duplicates: 'low_quality_cell' is the boolean
#   flag and 'low_quality_cell2' its string-labelled counterpart; both are created
#   in the low-quality QC loop. Listing 'low_quality_cell' twice would yield
#   duplicated column names in the combined frame (and later in adata.obs).
# * All tables are read first and concatenated once with `pd.concat`;
#   `DataFrame.append` does not exist in pandas >= 2.0 and growing a frame inside
#   a loop is quadratic.
columns_basic_QC = ['low_quality_cell', 'low_quality_cell2', 'cell_quality_clusterbasic_QC', 'basic_QC']
tables_basic_QC = [
    pd.read_csv(
        os.path.join(base_table_path, pre + '_' + exp + '_' + species + '_anno_basicQC.csv'),
        index_col=0,
    )[columns_basic_QC]
    for exp in experiments
]
if tables_basic_QC:
    anno_basic_QC = pd.concat(tables_basic_QC)
else:
    # No experiments: keep an empty frame with the expected columns
    # (pd.concat raises on an empty list).
    anno_basic_QC = pd.DataFrame(columns=columns_basic_QC)

In [None]:
# Attach every column of the combined basic-QC annotation to the full human object.
adata_all_human = h.add_anno_to_adata(
    adata=adata_all_human,
    anno_df=anno_basic_QC,
    anno_columns_to_add=anno_basic_QC.columns,
)

In [None]:
# A cell passes the overall QC only if it passes both the doublet QC and the
# basic (low-quality) QC; everything else is labelled 'failedQC'.
adata_all_human.obs['QC_summary'] = np.where(
    (adata_all_human.obs['doublet_QC'] == 'good_cluster_good_cell')
    & (adata_all_human.obs['basic_QC'] == 'high_quality_cluster_high_quality_cell'),
    'passed_QC',
    'failedQC',
)

# Pie chart of the passed / failed counts, saved as PDF.
h.make_pie_plot_QC(
    adata_all_human.obs['QC_summary'].value_counts(),
    title='QC summary human',
    save=os.path.join(base_plots_path, pre + '_human_QC_summary_piechart.pdf'),
    show=True,
)

In [None]:
# Write the complete obs table, including all QC columns, to the table directory.
adata_all_human.obs.to_csv(
    os.path.join(base_table_path, pre + '_' + species + '_anno_QC.csv')
)
# Writing the AnnData object itself is currently disabled:
#adata_all_human.write_h5ad(os.path.join(base_anndata_objects,pre + '_QC_bg_corrected.h5ad'))

# Save session

In [None]:
# Store the package versions of this session under the notebook prefix `pre`
# (target location and file format are defined by the helper) and print the
# main versions once more for the record.
base_package_version_path = h.return_package_version_local_path(drive=drive)
h.save_package_versions(base_package_version_path,pre,do_print = True)
h.print_main_versions()