# CITE-seq Analysis Pipeline

This pipeline enables you to analyze and visualize your Cellular Indexing of Transcriptomes and Epitopes by Sequencing (CITE-seq) datasets with an array of algorithms and data visualization methods. The pipeline includes quality control, library size analysis, normalization, plotting most highly expressed genes, plotting samples, clustering, and enrichment analysis.

In [None]:
#%%appyter init
# Initialise the appyter magic. The lambda, when called, returns globals(),
# so appyter is handed a way to reach this notebook's namespace.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
# Basic libraries
import pandas as pd
import random
import time
import numpy as np
import warnings

# Visualization
import scipy.stats as ss
import IPython
from IPython.display import HTML, display, Markdown, IFrame, FileLink, Image
from itertools import combinations
from scipy import stats

# Data analysis
from sklearn.decomposition import PCA
from sklearn.preprocessing import quantile_transform
from sklearn import cluster
from sklearn.manifold import TSNE
import umap
from rpy2 import robjects
from rpy2.robjects import r, pandas2ri
import scanpy as sc
import anndata

from maayanlab_bioinformatics.enrichment.crisp import enrich_crisp, fisher_overlap


# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Span, Select, Legend, PreText, Paragraph, LinearColorMapper, ColorBar, CategoricalColorMapper
from bokeh.layouts import layout, row, column, gridplot
from bokeh.palettes import all_palettes
import colorcet as cc
from bokeh.palettes import Category20

# External Code
from utils import *
# Configure Bokeh so that plots are displayed in the notebook output.
output_notebook()


In [None]:
%%appyter hide_code_exec
{% do SectionField(
    name='Data_Section',
    title='Load your Dataset 1',
    subtitle='Load your expression data. In comma/tab separated formats, genes should be in rows and samples shoud be in columns. You can also upload Cell Ranger files (matrix, genes, and barcords files). ',
    img='analysis.png'
    
) %}


{% do SectionField(
    name='Data_Section2',
    title='Load your Dataset 2',
    subtitle='Load your expression data. In comma/tab separated formats, genes should be in rows and samples shoud be in columns. You can also upload Cell Ranger files (matrix, genes, and barcords files). ',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Data_Section3',
    title='Load your Dataset 3',
    subtitle='Load your expression data. In comma/tab separated formats, genes should be in rows and samples shoud be in columns. You can also upload Cell Ranger files (matrix, genes, and barcords files). ',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Data_Section4',
    title='Load your Dataset 4',
    subtitle='Load your expression data. In comma/tab separated formats, genes should be in rows and samples shoud be in columns. You can also upload Cell Ranger files (matrix, genes, and barcords files). ',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Normalization_Section',
    title='Select Normalization Methods',
    subtitle='',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Visualization_Section',
    title='Select Visualization Parameters',
    subtitle='',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='DEG_Section',
    title='Select Differentially Exprssed Gene Analysis Parameters',
    subtitle='',
    img='analysis.png'
    
) %}


In [None]:
%%appyter code_exec

{% set file_kind = TabField(
    name='file_kind',
    label='Dataset 1',
    default='.mtx from 10x Genomics',
    description='Upload your expression files',
    choices={
        'Plain text': [             
            FileField(
                name='rnaseq_data_filename1', 
                label='RNA-seq data file (.csv, .txt or .tsv)', 
                default='GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv',
                examples={'GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv': 'https://appyters.maayanlab.cloud/storage/CITEseq/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv'}, 
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section'
            ),
            FileField(
                name='adt_data_filename1', 
                label='ADT data file (.csv, .txt or .tsv)', 
                default='GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv',
                examples={'GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv': 'https://appyters.maayanlab.cloud/storage/CITEseq/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv'}, 
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section'
            ),
            FileField(
                name='meta_data_filename1', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                examples={'': ''}, 
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section'
            ),         
            StringField(
                name='meta_class_column_name1', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section'
            )


        ],
        
        '.mtx from 10x Genomics': [
            DescriptionField(name = 'Des1_1', text = 'Upload RNA-seq data files', section = 'Data_Section'),
            FileField(
                name='mtx_data_filename_rna1', 
                label='RNA-seq data file (.mtx)', 
                default="GSM4552996_SLN208_D2_matrix.mtx",
                examples={'GSM4552996_SLN208_D2_matrix.mtx': 'https://appyters.maayanlab.cloud/storage/CITEseq/GSM4552996_SLN208_D2_matrix.mtx'}, 
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section'
            ),
            FileField(
                name='gene_data_filename_rna1', 
                label='Feature infomation file (.tsv)', 
                default="GSM4552996_SLN208_D2_features.tsv",
                examples={'GSM4552996_SLN208_D2_features.tsv': 'https://appyters.maayanlab.cloud/storage/CITEseq/GSM4552996_SLN208_D2_features.tsv'}, 
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section'
            ),
            FileField(
                name='barcode_data_filename_rna1', 
                label='Barcode information file (.tsv)', 
                default="GSM4552996_SLN208_D2_barcodes.tsv",
                examples={'GSM4552996_SLN208_D2_barcodes.tsv': 'https://appyters.maayanlab.cloud/storage/CITEseq/GSM4552996_SLN208_D2_barcodes.tsv'}, 
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section'
            ),
            DescriptionField(name = 'sep1_1', text = '<hr>', section = 'Data_Section'),
            DescriptionField(name = 'Des1_2', text = 'If your RNA-seq files do not include Antibody-Derived Tags (ADT) data, upload ADT data files.', section = 'Data_Section'),
            
            FileField(
                name='mtx_data_filename_adt1', 
                label='ADT count data file (.mtx)', 
                default="",                
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section'
            ),
            FileField(
                name='gene_data_filename_adt1', 
                label='ADT feature infomation file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section'
            ),
            FileField(
                name='barcode_data_filename_adt1', 
                label='ADT barcode information file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section'
            ),
            DescriptionField(name = 'sep1_2', text = '<hr>', section = 'Data_Section'),
            
            FileField(
                name='meta_data_filename_mtx1', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section'
            ),         
            StringField(
                name='meta_class_column_name_mtx1', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section'
            )
        ]
        
    },
    section = 'Data_Section',
) %}
{% set dataset_name = StringField(
    name='dataset_name', 
    label='Dataset Name', 
    default='Dataset1', 
    description='', 
    section='Data_Section'
    )
%}

In [None]:
%%appyter code_exec

{% set file_kind2 = TabField(
    name='file_kind2',
    label='Dataset 2',
    default='Plain text',
    description='Upload your expression files',
    choices={
        'Plain text': [             
            FileField(
                name='rnaseq_data_filename2', 
                label='RNA-seq data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section2'
            ),
            FileField(
                name='adt_data_filename2', 
                label='ADT data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section2'
            ),
            FileField(
                name='meta_data_filename2', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section2'
            ),         
            StringField(
                name='meta_class_column_name2', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section2'
            )


        ],
        
        '.mtx from 10x Genomics': [
            DescriptionField(name = 'Des2_1', text = 'Upload RNA-seq data files', section = 'Data_Section2'),
            FileField(
                name='mtx_data_filename_rna2', 
                label='RNA-seq data file (.mtx)', 
                default="",
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section2'
            ),
            FileField(
                name='gene_data_filename_rna2', 
                label='Feature infomation file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section2'
            ),
            FileField(
                name='barcode_data_filename_rna2', 
                label='Barcode information file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section2'
            ),
            DescriptionField(name = 'sep2_1', text = '<hr>', section = 'Data_Section2'),
            DescriptionField(name = 'Des2_2', text = 'If your RNA-seq files do not include Antibody-Derived Tags (ADT) data, upload ADT data files.', section = 'Data_Section2'),
            
            FileField(
                name='mtx_data_filename_adt2', 
                label='ADT count data file (.mtx)', 
                default="",
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section2'
            ),
            FileField(
                name='gene_data_filename_adt2', 
                label='ADT feature infomation file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section2'
            ),
            FileField(
                name='barcode_data_filename_adt2', 
                label='ADT barcode information file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section2'
            ),
            DescriptionField(name = 'sep2_2', text = '<hr>', section = 'Data_Section2'),
            
            FileField(
                name='meta_data_filename_mtx2', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section2'
            ),         
            StringField(
                name='meta_class_column_name_mtx2', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section2'
            )
        ]
        
    },
    section = 'Data_Section2',
) %}
{% set dataset_name2 = StringField(
    name='dataset_name2', 
    label='Dataset Name', 
    default='Dataset2', 
    description='', 
    section='Data_Section2'
)
%}

In [None]:
%%appyter code_exec

{% set file_kind3 = TabField(
    name='file_kind3',
    label='Dataset 3',
    default='Plain text',
    description='Upload your expression files',
    choices={
        'Plain text': [             
            FileField(
                name='rnaseq_data_filename3', 
                label='RNA-seq data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section3'
            ),
            FileField(
                name='adt_data_filename3', 
                label='ADT data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section3'
            ),
            FileField(
                name='meta_data_filename3', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section3'
            ),         
            StringField(
                name='meta_class_column_name3', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section3'
            )


        ],
        
        '.mtx from 10x Genomics': [
            DescriptionField(name = 'Des3_1', text = 'Upload RNA-seq data files', section = 'Data_Section3'),
            FileField(
                name='mtx_data_filename_rna3', 
                label='RNA-seq data file (.mtx)', 
                default="",
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section3'
            ),
            FileField(
                name='gene_data_filename_rna3', 
                label='Feature infomation file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section3'
            ),
            FileField(
                name='barcode_data_filename_rna3', 
                label='Barcode information file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section3'
            ),
            DescriptionField(name = 'sep3_1', text = '<hr>', section = 'Data_Section3'),
            DescriptionField(name = 'Des3_2', text = 'If your RNA-seq files do not include Antibody-Derived Tags (ADT) data, upload ADT data files.', section = 'Data_Section3'),
            
            FileField(
                name='mtx_data_filename_adt3', 
                label='ADT count data file (.mtx)', 
                default="",
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section3'
            ),
            FileField(
                name='gene_data_filename_adt3', 
                label='ADT feature infomation file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section2'
            ),
            FileField(
                name='barcode_data_filename_adt3', 
                label='ADT barcode information file (.tsv)', 
                default="",
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section3'
            ),
            DescriptionField(name = 'sep3_2', text = '<hr>', section = 'Data_Section3'),
            
            FileField(
                name='meta_data_filename_mtx3', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section3'
            ),         
            StringField(
                name='meta_class_column_name_mtx3', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section3'
            )
        ]
        
    },
    section = 'Data_Section3',
) %}
{% set dataset_name3 = StringField(
    name='dataset_name3', 
    label='Dataset Name', 
    default='Dataset3', 
    description='', 
    section='Data_Section3'
    )
%}

In [None]:
%%appyter code_exec

{% set file_kind4 = TabField(
    name='file_kind4',
    label='Dataset 4',
    default='Plain text',
    description='Upload your expression files',
    choices={
        'Plain text': [             
            FileField(
                name='rnaseq_data_filename4', 
                label='RNA-seq data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section4'
            ),
            FileField(
                name='adt_data_filename4', 
                label='ADT data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload RNA-seq expression data as comma seperated or tab seperated format. The index of the dataset are genes, the columns are samples.', 
                section='Data_Section4'
            ),
            FileField(
                name='meta_data_filename4', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section4'
            ),         
            StringField(
                name='meta_class_column_name4', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section4'
            )


        ],
        
        '.mtx from 10x Genomics': [
            DescriptionField(name = 'Des4_1', text = 'Upload RNA-seq data files', section = 'Data_Section2'),
            FileField(
                name='mtx_data_filename_rna4', 
                label='RNA-seq data file (.mtx)', 
                default='',
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section4'
            ),
            FileField(
                name='gene_data_filename_rna4', 
                label='Feature infomation file (.tsv)', 
                default='',
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section4'
            ),
            FileField(
                name='barcode_data_filename_rna4', 
                label='Barcode information file (.tsv)', 
                default='',
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section4'
            ),
            DescriptionField(name = 'sep4_1', text = '<hr>', section = 'Data_Section4'),
            DescriptionField(name = 'Des4_2', text = 'If your RNA-seq files do not include Antibody-Derived Tags (ADT) data, upload ADT data files.', section = 'Data_Section4'),
            
            FileField(
                name='mtx_data_filename_adt4', 
                label='ADT count data file (.mtx)', 
                default='',
                description='Expression data file from 10x Genomics need to be in .mtx format which store the expression data in sparse matrix.',
                section='Data_Section4'
            ),
            FileField(
                name='gene_data_filename_adt4', 
                label='ADT feature infomation file (.tsv)', 
                default='',
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the secound column corresponds to gene symbols.', 
                section='Data_Section4'
            ),
            FileField(
                name='barcode_data_filename_adt4', 
                label='ADT barcode information file (.tsv)', 
                default='',
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section4'
            ),
            DescriptionField(name = 'sep4_2', text = '<hr>', section = 'Data_Section4'),
            
            FileField(
                name='meta_data_filename_mtx4', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                description='Upload metadata as two-column comma seperated or tab seperated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section4'
            ),         
            StringField(
                name='meta_class_column_name_mtx4', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section4'
            )
        ]
        
    },
    section = 'Data_Section4',
) %}
{% set dataset_name4 = StringField(
    name='dataset_name4', 
    label='Dataset Name', 
    default='Dataset4', 
    description='', 
    section='Data_Section4'
    )
%}

In [None]:
%%appyter code_exec
{% set qc_filter_genes = BoolField(
    name='qc_filter_genes', 
    label='Filter cells by quality control?', 
    default='true', 
    description='Check if you want cells to be filtered by mitochondrial gene expressions', 
    section='Normalization_Section')
%}

{% set qc_threshold = FloatField(
    name='qc_threshold', 
    label='Mitochondria Quality Control threshold', 
    default='0.05', 
    description='Remove cells that have too many mitochondrial genes expressed.', 
    section='Normalization_Section')
%} 

{% set qc_filter_doublets = BoolField(
    name='qc_filter_doublets', 
    label='Filter cells by doublet prediction?', 
    default='true', 
    description='Check if you want cells to be filtered by doublet prediction', 
    section='Normalization_Section')
%}

{% set log_normalization = BoolField(
    name='log_normalization', 
    label='Log normalization?', 
    default='true', 
    description='Check if you want the dataset to be log-transformed', 
    section='Normalization_Section')
%}
{% set normalization_method = ChoiceField(
    name='normalization_method',
    label='Normalization',
    choices={'None': 'None', 'Satija et al. (2015, Nature Biotechnology)': 'Seurat', 'Zheng et al. (2017, Nature Communications)': 'Zheng17','Weinreb et al. (2018, PNAS)': 'Weignreb17'},
    default='Satija et al. (2015, Nature Biotechnology)', 
    description='Standard normlization recipe for scRNA-seq datasets', 
    section='Normalization_Section')
%}


In [None]:
%%appyter code_exec

{% set nr_genes = IntField(
    name='nr_genes', 
    label='Genes for dimension reduction', 
    min=0, 
    max=30000, 
    default=500, 
    description='The maximum number of genes for dimension reduction analysis', 
    section='Visualization_Section')
%}
gene_list_for_clustergrammer = {{TextField(
    name='gene_list_for_clustergrammer', 
    label='Gene List for Clustergrammer (Optional)', 
    default='', 
    description='Paste your gene list (One gene per row) for Clustergrammer heatmap plots.', 
    section = 'Visualization_Section')}}

{% set clustering_topk = IntField(
    name='clustering_topk', 
    label='Genes for clustergrammer', 
    min=0, 
    max=1000, 
    default=800, 
    description='The number of genes with largest variance for Clustergrammer', 
    section='Visualization_Section')
%}


In [None]:
%%appyter code_exec

{% set integration_option = MultiChoiceField(
    name='integration_option',
    label='Analysis option for multiple datasets',
    choices=['intra-dataset','integrated-dataset', 'inter-dataset'],
    default=['intra-dataset'], 
    description='If you upload multiple datasets, select an analysis option.', 
    section='DEG_Section')
%}

{% set enrichment_groupby = ChoiceField(
    name='enrichment_groupby',
    label='Group for differentially expressed gene analysis',
    choices={'Cluster': 'Cluster', 'User-defined class': 'user_defined_class'},
    default='Cluster', 
    description='Specify groups for enrichment analysis. Clusters will be automatically generated by a clustering method', 
    section='DEG_Section')
%}

{% set diff_gex_method = ChoiceField(
    name='diff_gex_method',
    label='Differential expression analysis method',
    choices={'limma': 'limma','wilcoxon':'wilcoxon', 'characteristic direction': 'characteristic_direction', 'edgeR': 'edgeR', 'DESeq2': 'DESeq2'},
    default='wilcoxon', 
    description='Set a method to get differentially expressed genes', 
    section='DEG_Section')
%}



{% set gene_topk = IntField(
    name='gene_topk', 
    label='Maximum genes for Enrichr', 
    min=0, 
    max=1000, 
    default=200, 
    description='The maximum number of genes discovered by the DEG method', 
    section='DEG_Section')
%}


{% set libraries_tab = TabField(
    name='libraries_tab',
    label='Enrichr libraries?',
    default='Yes',
    description='',
    choices={
        'Yes': [             
            MultiChoiceField(
                name='enrichr_libraries',
                label='Select Enrichr Libraries (upto 2)',
                descriptions='Enrichr libraries to be visualized. Select one or two libraries',
                choices=['Gene Ontology',
                        'Pathway',
                        'Kinase',
                        'Transcription Factor',
                        'miRNA',
                        'Cell Type',
                         'Disease'],
                default=['Gene Ontology',
                        'Pathway',
                        'Kinase',],
                section='DEG_Section'
                ),
            

        ],
        
        'No': [
            FileField(
                name='library_filename', 
                label='Upload your library file (.gmt)', 
                default='GSE117498_HSPC_celltypes.gmt',
                examples={'example_kidney_gene_sets.gmt': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/example_kidney_gene_sets.gmt'}, 
                description='',
                section='Data_Section'
            ),
            
        ]
        
    },
    section = 'DEG_Section',
) %}
{% set nr_genesets = IntField(
    name='nr_genesets', 
    label='Top ranked gene sets', 
    min=0, 
    max=100,
    default=15, 
    description='the number of result gene sets', 
    section='DEG_Section')
%}


In [None]:
%%appyter code_exec
# Input paths for dataset 1. Every path starts out as an empty string; only the
# template branch matching the selected input format (`file_kind`) overrides
# the ones that apply to that format.
# NOTE(review): meta_data_filename and meta_class_column_name are assigned only
# inside the two branches below - presumably file_kind always matches one of
# them; otherwise they stay undefined.
# NOTE(review): the .mtx branch reads tab entries 1-3, 6-8, 10 and 11; the
# skipped positions presumably hold non-file entries of the tab - confirm
# against the TabField definition.
rnaseq_data_filename = ""
adt_data_filename = ""
mtx_data_filename_rna = ""
gene_data_filename_rna = ""
barcode_data_filename_rna = ""
mtx_data_filename_adt = ""
gene_data_filename_adt = ""
barcode_data_filename_adt = ""

{%- if file_kind.raw_value == 'Plain text' %}
rnaseq_data_filename = {{ file_kind.value[0] }}
adt_data_filename = {{ file_kind.value[1] }}
meta_data_filename = {{ file_kind.value[2] }}
meta_class_column_name = {{ file_kind.value[3] }}

{%- elif file_kind.raw_value == '.mtx from 10x Genomics' %}
mtx_data_filename_rna = {{ file_kind.value[1] }}
gene_data_filename_rna = {{ file_kind.value[2] }}
barcode_data_filename_rna = {{ file_kind.value[3] }}

mtx_data_filename_adt = {{ file_kind.value[6] }}
gene_data_filename_adt = {{ file_kind.value[7] }}
barcode_data_filename_adt = {{ file_kind.value[8] }}

meta_data_filename = {{ file_kind.value[10] }}
meta_class_column_name = {{ file_kind.value[11] }}

{%- endif %}
dataset_name = "{{dataset_name.value}}"

In [None]:
%%appyter code_exec
# Input paths for dataset 2: empty-string defaults, then the template branch
# matching `file_kind2` fills in the paths relevant to that input format.
# NOTE(review): meta_data_filename2 / meta_class_column_name2 are assigned only
# inside the branches - presumably file_kind2 always matches one of them.
rnaseq_data_filename2 = ""
adt_data_filename2 = ""
mtx_data_filename_rna2 = ""
gene_data_filename_rna2 = ""
barcode_data_filename_rna2 = ""
mtx_data_filename_adt2 = ""
gene_data_filename_adt2 = ""
barcode_data_filename_adt2 = ""
{%- if file_kind2.raw_value == 'Plain text' %}
rnaseq_data_filename2 = {{ file_kind2.value[0] }}
adt_data_filename2 = {{ file_kind2.value[1] }}
meta_data_filename2 = {{ file_kind2.value[2] }}
meta_class_column_name2 = {{ file_kind2.value[3] }}

{%- elif file_kind2.raw_value == '.mtx from 10x Genomics' %}
mtx_data_filename_rna2 = {{ file_kind2.value[1] }}
gene_data_filename_rna2 = {{ file_kind2.value[2] }}
barcode_data_filename_rna2 = {{ file_kind2.value[3] }}

mtx_data_filename_adt2 = {{ file_kind2.value[6] }}
gene_data_filename_adt2 = {{ file_kind2.value[7] }}
barcode_data_filename_adt2 = {{ file_kind2.value[8] }}

meta_data_filename2 = {{ file_kind2.value[10] }}
meta_class_column_name2 = {{ file_kind2.value[11] }}

{%- endif %}
dataset_name2 = "{{dataset_name2.value}}"

In [None]:
%%appyter code_exec
# Input paths for dataset 3: empty-string defaults, then the template branch
# matching `file_kind3` fills in the paths relevant to that input format.
# NOTE(review): meta_data_filename3 / meta_class_column_name3 are assigned only
# inside the branches - presumably file_kind3 always matches one of them.
rnaseq_data_filename3 = ""
adt_data_filename3 = ""
mtx_data_filename_rna3 = ""
gene_data_filename_rna3 = ""
barcode_data_filename_rna3 = ""
mtx_data_filename_adt3 = ""
gene_data_filename_adt3 = ""
barcode_data_filename_adt3 = ""
{%- if file_kind3.raw_value == 'Plain text' %}
rnaseq_data_filename3 = {{ file_kind3.value[0] }}
adt_data_filename3 = {{ file_kind3.value[1] }}
meta_data_filename3 = {{ file_kind3.value[2] }}
meta_class_column_name3 = {{ file_kind3.value[3] }}

{%- elif file_kind3.raw_value == '.mtx from 10x Genomics' %}
mtx_data_filename_rna3 = {{ file_kind3.value[1] }}
gene_data_filename_rna3 = {{ file_kind3.value[2] }}
barcode_data_filename_rna3 = {{ file_kind3.value[3] }}

mtx_data_filename_adt3 = {{ file_kind3.value[6] }}
gene_data_filename_adt3 = {{ file_kind3.value[7] }}
barcode_data_filename_adt3 = {{ file_kind3.value[8] }}

meta_data_filename3 = {{ file_kind3.value[10] }}
meta_class_column_name3 = {{ file_kind3.value[11] }}

{%- endif %}
dataset_name3 = "{{dataset_name3.value}}"

In [None]:
%%appyter code_exec
# Input paths for dataset 4: empty-string defaults, then the template branch
# matching `file_kind4` fills in the paths relevant to that input format.
# NOTE(review): meta_data_filename4 / meta_class_column_name4 are assigned only
# inside the branches - presumably file_kind4 always matches one of them.
rnaseq_data_filename4 = ""
adt_data_filename4 = ""
mtx_data_filename_rna4 = ""
gene_data_filename_rna4 = ""
barcode_data_filename_rna4 = ""
mtx_data_filename_adt4 = ""
gene_data_filename_adt4 = ""
barcode_data_filename_adt4 = ""
{%- if file_kind4.raw_value == 'Plain text' %}
rnaseq_data_filename4 = {{ file_kind4.value[0] }}
adt_data_filename4 = {{ file_kind4.value[1] }}
meta_data_filename4 = {{ file_kind4.value[2] }}
meta_class_column_name4 = {{ file_kind4.value[3] }}

{%- elif file_kind4.raw_value == '.mtx from 10x Genomics' %}
mtx_data_filename_rna4 = {{ file_kind4.value[1] }}
gene_data_filename_rna4 = {{ file_kind4.value[2] }}
barcode_data_filename_rna4 = {{ file_kind4.value[3] }}

mtx_data_filename_adt4 = {{ file_kind4.value[6] }}
gene_data_filename_adt4 = {{ file_kind4.value[7] }}
barcode_data_filename_adt4 = {{ file_kind4.value[8] }}

meta_data_filename4 = {{ file_kind4.value[10] }}
meta_class_column_name4 = {{ file_kind4.value[11] }}

{%- endif %}
dataset_name4 = "{{dataset_name4.value}}"

In [None]:
%%appyter code_exec
# Gene-set library source for enrichment analysis. Exactly one of the two
# variables is filled from the first entry of the selected tab; the other is
# set to an empty list. `libraries_tab` records which tab ('Yes'/'No') was
# chosen and is passed on to the enrichment step later in the notebook.
{%- if libraries_tab.raw_value == 'Yes' %}
enrichr_libraries_filename = []
enrichr_libraries = {{ libraries_tab.value[0] }}

{%- else %}
enrichr_libraries_filename = {{ libraries_tab.value[0] }}
enrichr_libraries = []
{%- endif %}
libraries_tab = "{{libraries_tab.raw_value}}"

In [None]:
%%appyter code_exec
# Selected analysis mode(s). The analysis cells further down test membership
# with `in`, e.g. `"intra-dataset" in integration_option`.
integration_option = {{integration_option.value}}

In [None]:
%%appyter code_exec

# Render the remaining form values into plain Python variables used by the
# analysis cells below.
# Quality control and normalization options.
qc_filter_genes = {{qc_filter_genes.value}}
qc_threshold = {{qc_threshold.value}}
qc_filter_doublets = {{qc_filter_doublets.value}}
log_normalization = {{log_normalization.value}}
normalization_method = "{{normalization_method.value}}"

nr_genes = {{nr_genes.value}}
clustering_topk = {{clustering_topk.value}}

# Differential expression / enrichment options.
# NOTE(review): integration_option is also rendered by the previous cell; this
# second assignment produces the same value and looks redundant.
integration_option = {{integration_option.value}}
enrichment_groupby = "{{enrichment_groupby.value}}"
diff_gex_method = "{{diff_gex_method.value}}"
gene_topk = {{gene_topk.value}}
nr_genesets = {{nr_genesets.value}}


In [None]:
# Global runtime setup: silence warnings, seed Python's RNG and switch on the
# pandas <-> R conversion layer.
warnings.filterwarnings('ignore')
random.seed(0)
pandas2ri.activate()

# scanpy settings
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
sc.settings.figdir = "./"

# Notebook-wide state shared by the cells below: plotting switches, a result
# store, and running numbers for table and figure captions.
plot_type = 'interactive'
bool_plot = True
results = dict()
table_counter = figure_counter = 1

# Load datasets

In [None]:
def load_seurat_files(mtx_filename, gene_filename, barcodes_filename, bool_adt=False):
    """Load a matrix / features / barcodes file triplet into AnnData.

    The matrix is read with ``anndata.read_mtx`` and transposed, so the rows of
    ``gene_filename`` label the variables and the lines of
    ``barcodes_filename`` label the observations.

    Args:
        mtx_filename: Path to the Matrix Market (.mtx) count matrix.
        gene_filename: Tab-separated feature table without a header. The first
            column holds feature ids, the second (if present) feature symbols
            and the third (required when ``bool_adt`` is true) the feature
            type.
        barcodes_filename: Text file with one cell barcode per line.
        bool_adt: When true, the matrix contains both RNA and ADT features and
            is split on the feature type ``"Antibody Capture"``.

    Returns:
        The AnnData object when ``bool_adt`` is false; otherwise a tuple
        ``(adata, adata_adt)`` of the non-antibody and the antibody features.

    Raises:
        ValueError: If ``bool_adt`` is true but the feature table has no third
            (feature type) column.
    """
    adata = anndata.read_mtx(mtx_filename).T
    with open(barcodes_filename, "r") as f:
        cells = [line.strip() for line in f]
    genes = pd.read_csv(gene_filename, header=None, sep='\t')

    adata.var['gene_ids'] = genes.iloc[:, 0].values
    # Fall back to the id column when the table has no separate symbol column.
    symbol_column = 1 if genes.shape[1] > 1 else 0
    adata.var['gene_symbols'] = genes.iloc[:, symbol_column].values
    adata.var_names = adata.var['gene_symbols']
    adata.var_names_make_unique(join="-")

    adata.obs['barcode'] = cells
    adata.obs_names = cells
    adata.obs_names_make_unique(join="-")

    if not bool_adt:
        return adata

    if genes.shape[1] < 3:
        raise ValueError(
            f"{gene_filename} needs a third (feature type) column to separate "
            f"'Antibody Capture' features, but has only {genes.shape[1]} column(s)"
        )

    # Split by row position rather than by symbol: the feature table rows line
    # up one-to-one with adata.var, so an RNA gene that happens to share its
    # symbol with an antibody is no longer moved into the ADT object.
    is_adt = (genes.iloc[:, 2] == "Antibody Capture").values
    adata_adt = adata[:, is_adt]
    adata = adata[:, ~is_adt]
    return adata, adata_adt

In [None]:
# RNA and protein (ADT) objects of every dataset that was actually loaded,
# kept in the same order in both lists.
adata_list = []
adata_adt_list = []

# Load the (up to four) configured datasets. The running table counter is
# threaded through every call; the metadata class column returned for
# dataset 1 is the one kept for the rest of the notebook.
adata1, adata_adt1, table_counter, meta_class_column_name = load_data(
    dataset_name, rnaseq_data_filename, adt_data_filename,
    mtx_data_filename_rna, gene_data_filename_rna, barcode_data_filename_rna,
    mtx_data_filename_adt, gene_data_filename_adt, barcode_data_filename_adt,
    meta_data_filename, meta_class_column_name, table_counter,
)
adata2, adata_adt2, table_counter, _ = load_data(
    dataset_name2, rnaseq_data_filename2, adt_data_filename2,
    mtx_data_filename_rna2, gene_data_filename_rna2, barcode_data_filename_rna2,
    mtx_data_filename_adt2, gene_data_filename_adt2, barcode_data_filename_adt2,
    meta_data_filename2, meta_class_column_name2, table_counter,
)
adata3, adata_adt3, table_counter, _ = load_data(
    dataset_name3, rnaseq_data_filename3, adt_data_filename3,
    mtx_data_filename_rna3, gene_data_filename_rna3, barcode_data_filename_rna3,
    mtx_data_filename_adt3, gene_data_filename_adt3, barcode_data_filename_adt3,
    meta_data_filename3, meta_class_column_name3, table_counter,
)
adata4, adata_adt4, table_counter, _ = load_data(
    dataset_name4, rnaseq_data_filename4, adt_data_filename4,
    mtx_data_filename_rna4, gene_data_filename_rna4, barcode_data_filename_rna4,
    mtx_data_filename_adt4, gene_data_filename_adt4, barcode_data_filename_adt4,
    meta_data_filename4, meta_class_column_name4, table_counter,
)

# Keep only the datasets whose RNA object is not None
# (presumably what load_data returns for a slot left empty - confirm in utils).
loaded_pairs = (
    (adata1, adata_adt1),
    (adata2, adata_adt2),
    (adata3, adata_adt3),
    (adata4, adata_adt4),
)
for adata, adata_adt in loaded_pairs:
    if adata is None:
        continue
    adata_list.append(adata)
    adata_adt_list.append(adata_adt)

In [None]:
# Show the section heading and background text only when doublet filtering
# was requested.
if qc_filter_doublets:
    display(Markdown("# Predict Doublet Scores"))
    display(Markdown("Single cell RNA-seq often generates technical artifacts known as doublets where multiple cells receive the same barcode. The appyter identifies problematic doublets by using Scrublet (Wolock, Samuel L., et al. 2019) and filters out predicted doublets. Scrublet predicts doublets using a nearest-neighbor classifier trained on observed transcriptome and simulated doublets. "))

In [None]:
# Remove predicted doublets from every dataset, keeping the RNA and protein
# objects restricted to the same cells.
if qc_filter_doublets == True:
    singlet_rna_list = []
    singlet_adt_list = []
    for rna, adt in zip(adata_list, adata_adt_list):
        # The "predicted_doublet" column read on the next line is expected to
        # be added to rna.obs by this call.
        sc.external.pp.scrublet(rna)
        is_singlet = rna.obs["predicted_doublet"] == False
        singlet_barcodes = rna.obs[is_singlet].index

        rna = rna[singlet_barcodes]
        adt = adt[singlet_barcodes]

        # Label the statistics with the first batch value found in this dataset.
        batch_label = rna.obs["batch"].unique()[0]
        display_statistics(rna, f"### Statistics of RNA data in {batch_label}###")
        display_statistics(adt, f"### Statistics of protein data in {batch_label}###")

        singlet_rna_list.append(rna)
        singlet_adt_list.append(adt)

    adata_list = singlet_rna_list
    adata_adt_list = singlet_adt_list

In [None]:
# Stack the per-dataset RNA objects, and separately the per-dataset protein
# (ADT) objects, into one merged AnnData each.
# NOTE(review): anndata.concat is called with its defaults - confirm that the
# default join behaviour keeps the intended genes/proteins across datasets.
adata_merged = anndata.concat(adata_list)
adata_adt_merged = anndata.concat(adata_adt_list)

In [None]:
%%appyter markdown
{%if qc_filter_genes.value == True%}
# Filter Out Cells Based On Mitochondrial Genes
High expression levels of mitochondrial genes could be an indicator of poor quality cells (Islam, Saiful, et al. 2014, Ilicic, Tomislav, et al. 2016). In a situation where the cell membrane is broken, cytoplasmic RNA will be lost, but RNAs enclosed in the mitochondria will be retained. This analysis removes single cells that are likely to have a broken cell membrane.
{% endif %}

In [None]:
%%appyter code_exec
{%if qc_filter_genes.value == True %}
if adata_merged.var_names[0].startswith("ENSG") == False:
    mito_genes = adata_merged.var_names.str.startswith('MT-')    
else:
    gene_symbol_var_names = adata_merged.var_names
    mito_genes = [True if x in gene_id_map_dict and str(gene_id_map_dict[x]).startswith('MT-') else False for x in gene_symbol_var_names ]

# for each cell compute fraction of counts in mito genes vs. all genes
adata_merged.obs['percent_mito'] = np.sum(
    adata_merged[:, mito_genes].X, axis=1) / np.sum(adata_merged.X, axis=1)

sc.pl.violin(adata_merged, ['percent_mito'],
         jitter=0.4, multi_panel=True, show=True, save=True)
figure_counter = display_object(figure_counter, "Violin plot of the percentage of mitochondrial gene expression counts in each cell", istable=False)
display_link("violin.pdf", "Download figure")
adata_merged = adata_merged[adata_merged.obs.percent_mito < qc_threshold, :]
adata_adt_merged = adata_adt_merged[adata_merged.obs.index.tolist(), :]
display_statistics(adata_merged, "### Statistics of RNA data after QC ###")
display_statistics(adata_adt_merged, "### Statistics of protein data after QC ###")    
{% endif %}

In [None]:
# Store the current (pre-normalization) state of the merged RNA data under
# .raw before the normalization cell below runs.
adata_merged.raw = adata_merged

In [None]:
%%appyter markdown
{% if normalization_method.value != "None" %}
# Normalization
Various normalization methods can be applied based on the selection made by the user (Zheng, Grace XY, et al. 2017, Weinreb, Caleb, et al. 2018, Butler, Andrew, et al. 2018). These normalization methods convert the raw read counts into standardized measures of gene expression by removing factors that may negatively affect the analysis.
{% endif %}

In [None]:
%%appyter code_exec
{% if normalization_method.value != "None" %}
adata_norm = normalize(adata_merged, normalization_method, log_normalization)
adata_adt_merged = adata_adt_merged[adata_norm.obs.index]
adata_adt_norm = normalize(adata_adt_merged, normalization_method="CLR", log_normalization=log_normalization)
{% endif %}

table_counter = display_object(table_counter, "Normalized RNA data. The table displays the expression values after normalization.", adata_norm.to_df().T.head(), istable=True)
table_counter = display_object(table_counter, "Normalized protein data. The table displays the expression values after normalization.", adata_adt_norm.to_df().T.head(), istable=True)
display_statistics(adata_norm, "### Statistics of RNA data after normalization ###")
display_statistics(adata_adt_norm, "### Statistics of protein data after normalization ###")


# Clustergrammer

Clustergrammer (Fernandez, Nicolas F., et al. 2017) is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also allows you to identify the genes which contribute to the clustering. By default it visualizes the 800 genes with the largest variance; alternatively, you can upload your own gene list.

## Using RNA Expression

In [None]:
# Caption shown with the RNA heatmap
rna_heatmap_caption = "Clustered heatmap plot. The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment."

# Run Clustergrammer on the normalized RNA data and keep the result
rna_clustergrammer = run_clustergrammer(
    dataset=adata_norm,
    meta_class_column_name=meta_class_column_name,
    gene_list=gene_list_for_clustergrammer,
)
results['clustergrammer'] = rna_clustergrammer

# Render the heatmap, then its numbered caption
plot_clustergrammar(rna_clustergrammer)
figure_counter = display_object(figure_counter, rna_heatmap_caption, istable=False)
 

## Using Protein Expression

In [None]:
# Run analysis on the normalized protein (ADT) data. The result is stored
# under its own key so the RNA result in results['clustergrammer'] is not
# overwritten.
results['clustergrammer_protein'] = run_clustergrammer(dataset=adata_adt_norm, meta_class_column_name=meta_class_column_name, gene_list=gene_list_for_clustergrammer)

# Display results
plot_clustergrammar(results['clustergrammer_protein'])
figure_counter = display_object(figure_counter, "Clustered heatmap plot. The figure contains an interactive heatmap displaying protein expression for each sample in the CITE-seq dataset. Every row of the heatmap represents an antibody-derived tag (ADT), every column represents a sample, and every cell displays normalized protein expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.", istable=False)
 

In [None]:
# The batch-effect section only applies when more than one dataset was loaded.
if len(adata_list) > 1:
    # Section heading followed by the background paragraph.
    for batch_intro_text in (
        "# Batch Effect Correction",
        "A batch effect is a difference between samples from different batches caused by non-biological factors such as sequencing platforms and laboratory conditions. These differences can lead to misunderstanding the biological signals from the data and resulting in inaccurate conclusions. To avoid such effects, there are several batch effect correction methods to remove the batch effects from the data. The Appyter uses Batch Balanced KNN (BBKNN) which is a fast batch effect removal algorithm (Polański, Krzysztof, et al. 2020). ",
    ):
        display(Markdown(batch_intro_text))

In [None]:
def _plot_batches_on_umap(joint_adata, caption, figure_counter):
    """Scatter-plot the UMAP embedding of ``joint_adata`` colored by batch.

    Returns whatever ``plot_scatter`` returns, which the caller stores as the
    updated figure counter.
    """
    # umap info into dataframe
    embedding = pd.DataFrame(joint_adata.obsm["X_umap"])
    embedding.columns = ['x', 'y']

    batch_column = joint_adata.obs["batch"]
    return plot_scatter(
        embedding,
        {"batch": batch_column.values},
        ["batch"],
        joint_adata.obs.index.tolist(),
        caption,
        category_list_dict={"batch": list(sorted(batch_column.unique()))},
        category=True,
        dropdown=False,
        figure_counter=figure_counter,
    )


if len(adata_list) > 1:
    # PCA, neighbor graph and UMAP for each modality (RNA first, then protein)
    for modality_adata in (adata_norm, adata_adt_norm):
        sc.tl.pca(modality_adata)
        sc.pp.neighbors(modality_adata, n_neighbors=30)
        sc.tl.umap(modality_adata, min_dist=0.1)

    # joint clustering
    joint = adata_norm.copy()
    joint.obsm['protein'] = adata_adt_norm.to_df()

    # The neighbors entry is filled key by key, in the original order: how
    # these keys are stored may depend on the anndata version, so the
    # statements are deliberately left as individual assignments.
    joint.uns['neighbors'] = {}
    joint.uns['neighbors']['connectivities'] = average_graphs([adata_norm.uns['neighbors']['connectivities'], adata_adt_norm.uns['neighbors']['connectivities']])
    joint.uns['neighbors']['connectivities_key'] = 'connectivities'
    joint.uns['neighbors']['distances_key'] = 'distances'
    joint.uns['neighbors']['params'] = {'n_neighbors': 30, 'method': 'umap', 'random_state': 0, 'metric': 'euclidean'}

    sc.tl.leiden(joint, resolution=1.0)
    sc.tl.umap(joint, min_dist=0.1)

    # display samples before batch effect correction
    figure_counter = _plot_batches_on_umap(
        joint,
        "UMAP Plot of Samples before Batch Effect Correction. The figure contains an interactive scatter plot displaying samples colored by ",
        figure_counter,
    )

    # batch correction
    sc.external.pp.bbknn(joint, batch_key="batch")

    # display samples after batch effect correction
    sc.tl.umap(joint, min_dist=0.1)
    figure_counter = _plot_batches_on_umap(
        joint,
        "UMAP Plot of Samples after Batch Effect Correction. The figure contains an interactive scatter plot displaying samples colored by ",
        figure_counter,
    )


# Differential Gene Expression Analysis
In the following section, the appyter computes differentially expressed genes for each cluster and performs enrichment analysis using the top genes. You can select analysis options such as intra dataset, integrated dataset, and inter dataset. The option “intra dataset” will analyze the data within each dataset. This is useful when you want to analyze datasets from different conditions separately and see the difference between them. The option “integrated dataset” will analyze the integrated data. It is useful when you have multiple datasets from the same condition. The option “inter dataset” will analyze the difference between datasets. It is similar to bulk RNA-seq analysis.<br>
- The appyter clusters samples using the Leiden algorithm (Traag, Vincent A., et al. 2019) to identify well-connected clusters in networks. In this procedure, the appyter takes into account both modalities of the data by integrating connectivity graphs generated from each modality (ref https://scanpy-tutorials.readthedocs.io/en/multiomics/cite-seq/pbmc5k.html#Clustering). It visualizes the samples colored by clusters. <br>
- It visualizes protein levels of samples. You can select other proteins listed in the dropdown menu. <br>
- Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods (Ritchie, Matthew E., et al. 2015, Clark, Neil R., et al. 2014), which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation.
<br>
- Enrichment analysis is a bioinformatics method used to identify prior knowledge terms which are over-represented in a given gene set by comparing the gene set to many annotated gene sets. The prior-knowledge gene sets can represent cell signaling pathways, molecular functions, diseases, and a wide variety of other terms obtained by processing data from multiple resources. The appyter allows you to input user-created gene set libraries or to select Enrichr libraries (Kuleshov, M.V., et al. 2016) which is a web-based application that performs enrichment analysis against a large collection of gene-set libraries. The appyter performs enrichment analysis for each cluster and visualizes the top enriched term for each cluster. <br>
- The appyter summarizes the enrichment analysis results by showing the number of samples for each enriched term. 

In [None]:
# Analysed joint object of every analysis mode, keyed by a display name;
# consumed by the download cell at the end of the notebook.
joint_object_dict = {}

# A single dataset is always analysed on its own, whichever option was chosen.
analyse_each_dataset = "intra-dataset" in integration_option or len(adata_list) == 1
if analyse_each_dataset:
    for batch_name in adata_norm.obs["batch"].unique():
        display(Markdown(f"### Analysis results of {batch_name}"))

        # Restrict both modalities to this dataset and build their neighbor graphs
        batch_rna = adata_norm[adata_norm.obs["batch"]==batch_name].copy()
        batch_adt = adata_adt_norm[adata_adt_norm.obs["batch"]==batch_name].copy()
        sc.pp.neighbors(batch_rna, n_neighbors=30)
        sc.pp.neighbors(batch_adt, n_neighbors=30)

        # clustering
        display(Markdown(f"#### Clustering of {batch_name}"))
        batch_joint, figure_counter = clustering(
            batch_rna, batch_adt, batch_name,
            bool_plot=bool_plot, figure_counter=figure_counter, batch_correction=False,
        )

        # protein expression level
        display(Markdown(f"#### Scatter Plot with Protein Expression Levels of {batch_name}"))
        figure_counter = protein_levels(batch_rna, batch_adt, batch_joint, bool_plot, figure_counter)

        # differential_gene_expression_analysis
        display(Markdown(f"#### Differential Gene Expression Analysis of {batch_name}"))
        signatures, bool_cluster, table_counter = differential_gene_expression_analysis(
            batch_joint, diff_gex_method, enrichment_groupby, table_counter,
        )

        # enrichment analysis
        display(Markdown(f"#### Enrichment Analysis of {batch_name}"))
        batch_joint, option_list, figure_counter, table_counter = visualize_enrichment_analysis(
            batch_joint, signatures, meta_class_column_name, diff_gex_method,
            enrichr_libraries_filename, enrichr_libraries, enrichment_groupby,
            libraries_tab, gene_topk, bool_cluster, bool_plot,
            figure_counter, table_counter,
        )

        # summary
        display(Markdown(f"#### Summary of Enrichment Analysis of {batch_name}"))
        option_list.append('leiden')
        table_counter = summary(batch_joint, option_list, table_counter)
        joint_object_dict[batch_name] = batch_joint

In [None]:
# Integrated analysis: all datasets together, with batch correction enabled in
# the clustering step. Only runs when several datasets were loaded.
if "integrated-dataset" in integration_option and len(adata_list) > 1:
    display(Markdown("### Analysis Results of Integrated Dataset"))

    # clustering
    display(Markdown("#### Clustering"))
    joint, figure_counter = clustering(
        adata_norm,
        adata_adt_norm,
        dataset="integrated",
        bool_plot=bool_plot,
        figure_counter=figure_counter,
        batch_correction=True,
    )

    # protein expression level
    display(Markdown("#### Scatter Plot with Protein Expression Levels"))
    figure_counter = protein_levels(adata_norm, adata_adt_norm, joint, bool_plot, figure_counter)

    # differential_gene_expression_analysis
    display(Markdown("#### Differential Gene Expression Analysis"))
    signatures, bool_cluster, table_counter = differential_gene_expression_analysis(
        joint, diff_gex_method, enrichment_groupby, table_counter,
    )

    # enrichment analysis
    display(Markdown("#### Enrichment Analysis"))
    joint, option_list, figure_counter, table_counter = visualize_enrichment_analysis(
        joint, signatures, meta_class_column_name, diff_gex_method,
        enrichr_libraries_filename, enrichr_libraries, enrichment_groupby,
        libraries_tab, gene_topk, bool_cluster, bool_plot,
        figure_counter, table_counter,
    )

    # summary
    display(Markdown("#### Summary of Enrichment Analysis"))
    option_list.append('leiden')
    table_counter = summary(joint, option_list, table_counter)
    joint_object_dict["Integrated Dataset"] = joint

In [None]:
# Inter-dataset analysis: the datasets are compared with each other by grouping
# on "batch" instead of the user-selected grouping. Reuses the `joint` object
# left behind by an earlier cell.
if "inter-dataset" in integration_option and len(adata_list) > 1:
    display(Markdown("### Analysis Results of Inter Dataset"))
    bool_plot = True

    # differential_gene_expression_analysis
    display(Markdown("#### Differential Gene Expression Analysis"))
    signatures, bool_cluster, table_counter = differential_gene_expression_analysis(
        joint, diff_gex_method, "batch", table_counter,
    )

    # enrichment analysis
    display(Markdown("#### Enrichment Analysis"))
    joint, option_list, figure_counter, table_counter = visualize_enrichment_analysis(
        joint, signatures, meta_class_column_name, diff_gex_method,
        enrichr_libraries_filename, enrichr_libraries, "batch",
        libraries_tab, gene_topk, bool_cluster, bool_plot,
        figure_counter, table_counter,
    )

    # summary
    display(Markdown("#### Summary of Enrichment Analysis"))
    option_list.append('batch')
    table_counter = summary(joint, option_list, table_counter)
    joint_object_dict["Inter_Dataset"] = joint

# Download Analysis Results Table

In [None]:
# For every analysed joint object, offer its per-cell annotation table (.obs)
# as CSV and then the full object as an h5ad file.
for result_label, joint_obj in joint_object_dict.items():
    obs_table_link = create_download_link(joint_obj.obs, filename=f"analysis_results_{result_label}.csv")
    display(obs_table_link)
    h5ad_link = create_download_link(joint_obj, title="Download h5ad file: {}", filename=f"data_{result_label}.h5ad")
    display(h5ad_link)

# References 

Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.
<br>
Butler, Andrew, et al. "Integrating single-cell transcriptomic data across different conditions, technologies, and species." Nature biotechnology 36.5 (2018): 411-420.
<br>
Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.
<br>
Clark, Neil R., et al. "The characteristic direction: a geometrical approach to identify differentially expressed genes." BMC bioinformatics 15.1 (2014): 79.
<br>
Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.
<br>
Croft, David, et al. "The Reactome pathway knowledgebase." Nucleic acids research 42.D1 (2014): D472-D477.
<br>
Fernandez, Nicolas F., et al. "Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data." Scientific data 4 (2017): 170151.
<br>
Ilicic, Tomislav, et al. "Classification of low quality cells from single-cell RNA-seq data." Genome biology 17.1 (2016): 29.
<br>
Islam, Saiful, et al. "Quantitative single-cell RNA-seq with unique molecular identifiers." Nature methods 11.2 (2014): 163.
<br>
Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.
<br>
Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.
<br>
Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.
<br>
Lachmann, Alexander, and Avi Ma'ayan. "KEA: kinase enrichment analysis." Bioinformatics 25.5 (2009): 684-686.
<br>
Maaten, Laurens van der, and Geoffrey Hinton. "Visualizing data using t-SNE." Journal of machine learning research 9.Nov (2008): 2579-2605.
<br>
McInnes, Leland, John Healy, and James Melville. "Umap: Uniform manifold approximation and projection for dimension reduction." arXiv preprint arXiv:1802.03426 (2018).
<br>
Polański, Krzysztof, et al. "BBKNN: fast batch alignment of single cell transcriptomes." Bioinformatics 36.3 (2020): 964-965.
<br>
Ritchie, Matthew E., et al. "limma powers differential expression analyses for RNA-sequencing and microarray studies." Nucleic acids research 43.7 (2015): e47-e47.
<br>
Traag, Vincent A., Ludo Waltman, and Nees Jan van Eck. "From Louvain to Leiden: guaranteeing well-connected communities." Scientific reports 9.1 (2019): 1-12.
<br>
Weinreb, Caleb, et al. "Fundamental limits on dynamic inference from single-cell snapshots." Proceedings of the National Academy of Sciences 115.10 (2018): E2467-E2476.
<br>
Wolock, Samuel L., Romain Lopez, and Allon M. Klein. "Scrublet: computational identification of cell doublets in single-cell transcriptomic data." Cell systems 8.4 (2019): 281-291.
<br>
Zheng, Grace XY, et al. "Massively parallel digital transcriptional profiling of single cells." Nature communications 8.1 (2017): 1-12.