# scRNA-seq Analysis Pipeline

This pipeline enables you to analyze and visualize your single cell RNA sequencing datasets with an array of algorithms and data visualization methods. The pipeline includes quality control, library size analysis, normalization, plotting most highly expressed genes, plotting samples, MAGIC normalization, clustering, enrichment analysis, trajectory inference, and cell type prediction.

In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
# Basic libraries
import pandas as pd
import random
import time
import numpy as np
import warnings

# Visualization
import seaborn as sns
import scipy.stats as ss
import plotly
from plotly import tools
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib import rcParams
from matplotlib.lines import Line2D
from matplotlib_venn import venn2, venn3
import IPython
from IPython.display import HTML, display, Markdown, IFrame, FileLink, Image
from itertools import combinations
from scipy import stats
import chart_studio
import chart_studio.plotly as py
from PIL import Image

# Data analysis
from sklearn.decomposition import PCA
from sklearn.preprocessing import quantile_transform
from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import umap
from rpy2 import robjects
from rpy2.robjects import r, pandas2ri
from magic import MAGIC
import scanpy as sc
import anndata
import DigitalCellSorter
from maayanlab_bioinformatics.enrichment.crisp import enrich_crisp, fisher_overlap

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Span, Select, Legend, PreText, Paragraph, LinearColorMapper, ColorBar, CategoricalColorMapper
from bokeh.layouts import layout, row, column, gridplot
from bokeh.palettes import all_palettes
import colorcet as cc
from bokeh.palettes import Category20

# External Code
from utils import *
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = False)
output_notebook()

In [None]:
%%appyter hide_code_exec
{# Form section that groups the expression-data and metadata upload fields. #}
{% do SectionField(
    name='Data_Section',
    title='Load your Data',
    subtitle='Load your expression data. In comma/tab separated formats, genes should be in rows and samples should be in columns. You can also upload Cell Ranger files (matrix, genes, and barcodes files). ',
    img='analysis.png'
) %}

{% do SectionField(
    name='Normalization_Section',
    title='Select Normalization Methods',
    subtitle='',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Visualization_Section',
    title='Select Visualization Parameters',
    subtitle='',
    img='analysis.png'
    
) %}

{# Form section that groups the differential-expression and enrichment fields. #}
{% do SectionField(
    name='DEG_Section',
    title='Select Differentially Expressed Gene Analysis Parameters',
    subtitle='',
    img='analysis.png'
) %}

{% do SectionField(
    name='TI_Section',
    title='Select Trajectory Inference Parameters',
    subtitle='',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='Time_TI_Section',
    title='Select Time-series Trajectory Inference Parameters',
    subtitle='',
    img='analysis.png'
    
) %}

{% do SectionField(
    name='CT_Section',
    title='Select Cell Type Prediction Parameters',
    subtitle='',
    img='analysis.png'
    
) %}

In [None]:
%%appyter code_exec

{% set file_kind = TabField(
    name='file_kind',
    label='Data file',
    default='Plain text',
    description='Upload your expression files',
    choices={
        'Plain text': [             
            FileField(
                name='rnaseq_data_filename', 
                label='RNA-seq data file (.csv, .txt or .tsv)', 
                default='GSE110499_example_expression.txt',
                examples={'GSE110499_example_expression.txt': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/GSE110499-expression.txt'}, 
                description='Upload RNA-seq expression data in comma-separated or tab-separated format. The rows (index) of the dataset are genes and the columns are samples.', 
                section='Data_Section'
            ),
            FileField(
                name='meta_data_filename', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='GSE110499_example_metadata.txt',
                examples={'GSE110499_example_metadata.txt': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/GSE110499-metadata.txt'}, 
                description='Upload metadata in two-column comma-separated or tab-separated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section'
            ),         
            StringField(
                name='meta_class_column_name', 
                label='(Optional) Class column name in metadata', 
                default='prep-site', 
                description='class column name of metadata', 
                section='Data_Section'
            )


        ],
        
        '.mtx from 10x Genomics': [
            FileField(
                name='mtx_data_filename', 
                label='RNA-seq data file (.mtx)', 
                default='example_matrix.mtx',
                examples={'example_matrix.mtx': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/example_matrix.mtx'}, 
                description='The expression data file from 10x Genomics needs to be in .mtx format, which stores the expression data as a sparse matrix.',
                section='Data_Section'
            ),
            FileField(
                name='gene_data_filename', 
                label='Gene information file (.tsv)', 
                default='example_genes.tsv',
                examples={'example_genes.tsv': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/example_genes.tsv'}, 
                description='A tab delimited file of the corresponding genes in the .mtx expression matrix file. The first column should be the gene IDs whereas the second column corresponds to gene symbols.', 
                section='Data_Section'
            ),
            FileField(
                name='barcode_data_filename', 
                label='Barcode information file (.tsv)', 
                default='example_barcodes.tsv',
                examples={'example_barcodes.tsv': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/example_barcodes.tsv'}, 
                description='A tab delimited file of the corresponding barcodes in the .mtx expression matrix file. The first column should be the unique barcodes for cells.', 
                section='Data_Section'
            ),
            FileField(
                name='meta_data_filename_mtx', 
                label='(Optional) Meta data file (.csv, .txt or .tsv)', 
                default='',
                examples={'example_metadata.tsv': 'https://appyters.maayanlab.cloud/storage/SC_RNA_seq/example_metadata.tsv'}, 
                description='Upload metadata in two-column comma-separated or tab-separated format. The first column contains sample IDs and the other column contains sample labels',
                section='Data_Section'
            ),         
            StringField(
                name='meta_class_column_name_mtx', 
                label='(Optional) Class column name in metadata', 
                default='', 
                description='class column name of metadata', 
                section='Data_Section'
            )
        ]
        
    },
    section = 'Data_Section',
) %}



In [None]:
%%appyter code_exec
{% set qc_filter_genes = BoolField(
    name='qc_filter_genes', 
    label='Filter genes by quality control?', 
    default='true', 
    description='Check if you want cells to be filtered by quality control', 
    section='Normalization_Section')
%}

{% set qc_threshold = FloatField(
    name='qc_threshold', 
    label='Mitochondria Quality Control threshold', 
    default='0.05', 
    description='Remove cells that have too many mitochondrial genes expressed.', 
    section='Normalization_Section')
%} 

{% set log_normalization = BoolField(
    name='log_normalization', 
    label='Log normalization?', 
    default='true', 
    description='Check if you want the dataset to be log-transformed', 
    section='Normalization_Section')
%}
{# The choice values (right-hand side) are compared verbatim against string
   literals in the normalization cell ("Zheng17", "Weinreb17", "Seurat"), so
   they must match those literals exactly; a mismatch silently skips
   normalization. #}
{% set normalization_method = ChoiceField(
    name='normalization_method',
    label='Normalization',
    choices={'None': 'None', 'Satija et al. (2015, Nature Biotechnology)': 'Seurat', 'Zheng et al. (2017, Nature Communications)': 'Zheng17','Weinreb et al. (2018, PNAS)': 'Weinreb17'},
    default='Satija et al. (2015, Nature Biotechnology)', 
    description='Standard normalization recipe for scRNA-seq datasets', 
    section='Normalization_Section')
%}
{% set magic = BoolField(
    name='magic', 
    label='MAGIC?', 
    default='true', 
    description='Check if you want impute missing data with Markov Affinity-based Graph Imputation of Cells (MAGIC)', 
    section='Normalization_Section')
%}

In [None]:
%%appyter code_exec
{% set highest_expr_n_genes = IntField(
    name='highest_expr_n_genes', 
    label='Genes for highest expression', 
    min=0, 
    max=100, 
    default=10, 
    description='The number of genes with highest expression values', 
    section='Visualization_Section')
%}

{% set nr_genes = IntField(
    name='nr_genes', 
    label='Genes for dimension reduction', 
    min=0, 
    max=30000, 
    default=500, 
    description='The maximum number of genes for dimension reduction analysis', 
    section='Visualization_Section')
%}
gene_list_for_clustergrammer = {{TextField(
    name='gene_list_for_clustergrammer', 
    label='Gene List for Clustergrammer (Optional)', 
    default='', 
    description='Paste your gene list (One gene per row) for Clustergrammer heatmap plots.', 
    section = 'Visualization_Section')}}
{% set clustering_topk = IntField(
    name='clustering_topk', 
    label='Genes for clustergrammer', 
    min=0, 
    max=1000, 
    default=800, 
    description='The number of genes with largest variance for Clustergrammer', 
    section='Visualization_Section')
%}


In [None]:
%%appyter code_exec

{% set deg = BoolField(
    name='deg',
    label='Differentially expressed gene analysis?',
    default='true',
    description='Check if you want differentially expressed gene analysis', 
    section='DEG_Section')
%}
{% set bar_chart = BoolField(
    name='bar_chart', 
    label='Bar chart?', 
    default='false', 
    description='Check if you want to see bar charts. It may take a while.', 
    section='DEG_Section')
%}
{% set enrichment_groupby = ChoiceField(
    name='enrichment_groupby',
    label='Group for differentially expressed gene analysis',
    choices={'Cluster': 'Cluster', 'User-defined class': 'user_defined_class'},
    default='User-defined class', 
    description='Specify groups for enrichment analysis. Clusters will be automatically generated by a clustering method', 
    section='DEG_Section')
%}

{% set diff_gex_method = ChoiceField(
    name='diff_gex_method',
    label='Differential expression analysis method',
    choices={'limma': 'limma', 'wilcoxon': 'wilcoxon', 'characteristic direction': 'characteristic_direction', 'edgeR': 'edgeR', 'DESeq2': 'DESeq2'},
    default='limma', 
    description='Set a method to get differentially expressed genes', 
    section='DEG_Section')
%}



{% set gene_topk = IntField(
    name='gene_topk', 
    label='Maximum genes for Enrichr', 
    min=0, 
    max=1000, 
    default=200, 
    description='The maximum number of genes discovered by the DEG method', 
    section='DEG_Section')
%}


{# Enrichr library categories to visualise for the DEG results. The help text
   is passed as `description`, the keyword used by every other field in this
   notebook. #}
{% set enrichr_libraries = MultiChoiceField(
    name='enrichr_libraries',
    label='Enrichr Libraries (up to 2)',
    description='Enrichr libraries to be visualized. Select one or two libraries',
    choices=['Gene Ontology',
            'Pathway',
            'Kinase',
            'Transcription Factor',
            'miRNA',
            'Cell Type',
             'Disease'],
    default=['Gene Ontology'],
    section='DEG_Section'
    )
%}

{% set nr_genesets = IntField(
    name='nr_genesets', 
    label='Top ranked gene sets', 
    min=0, 
    max=100,
    default=15, 
    description='the number of result gene sets', 
    section='DEG_Section')
%}


In [None]:
%%appyter code_exec
{% set trajectory = BoolField(
    name='trajectory',
    label='Trajectory Inference?',
    default='true',
    description='Check if you want trajectory inference analysis', 
    section='TI_Section')
%}

{% set trajectory_method = ChoiceField(
    name='trajectory_method',
    label='Trajectory inference method',
    choices={'DPT(diffusion pseudotime)': 'dpt', 'monocle': 'monocle'},
    default='DPT(diffusion pseudotime)', 
    description='Trajectory inference algorithm', 
    section='TI_Section')
%}


In [None]:
%%appyter code_exec
{% set time_series_trajectory = BoolField(
    name='time_series_trajectory', 
    label='Time series trajectory analysis?', 
    default='false', 
    description='Check if you want time-series trajectory analysis', 
    section='Time_TI_Section')
%}
{% set timepoint_labels_column_name = StringField(
    name='timepoint_labels_column_name', 
    label='Timepoint column name in metadata', 
    default='Timepoints', 
    description='Timepoint column name of metadata', 
    section='Time_TI_Section')
%}
timepoint_labels = {{ TextField(
    name='timepoint_labels', 
    label='Ordered timepoint labels in the timepoint label column', 
    default='', 
    description='Paste your timepoint labels in order (One timepoint per row). e.g., 0H, 12H, 24H', 
    section = 'Time_TI_Section') }}


In [None]:
%%appyter code_exec
{% set cell_type_prediction = BoolField(
    name='cell_type_prediction', 
    label='Cell type prediction?', 
    default='false', 
    description='Check if you want cell type prediction', 
    section='CT_Section')
%}

In [None]:
%%appyter code_exec

{%- if file_kind.raw_value == 'Plain text' %}
rnaseq_data_filename = {{ file_kind.value[0] }}
meta_data_filename = {{ file_kind.value[1] }}
meta_class_column_name = {{ file_kind.value[2] }}

{%- elif file_kind.raw_value == '.mtx from 10x Genomics' %}
mtx_data_filename = {{ file_kind.value[0] }}
gene_data_filename = {{ file_kind.value[1] }}
barcode_data_filename = {{ file_kind.value[2] }}
meta_data_filename = {{ file_kind.value[3] }}
meta_class_column_name = {{ file_kind.value[4] }}

{%- elif file_kind.raw_value == '.h5 from 10x Genomics' %}
h5_data_filename = {{ file_kind.value[0] }}
key = {{ file_kind.value[1] }}
{%- endif %}

In [None]:
%%appyter code_exec

qc_filter_genes = {{qc_filter_genes.value}}
qc_threshold = {{qc_threshold.value}}
log_normalization = {{log_normalization.value}}
normalization_method = "{{normalization_method.value}}"
magic = {{magic.value}}

highest_expr_n_genes = {{highest_expr_n_genes.value}}
nr_genes = {{nr_genes.value}}
clustering_topk = {{clustering_topk.value}}

deg = {{deg.value}}
bar_chart = {{bar_chart.value}}
enrichment_groupby = "{{enrichment_groupby.value}}"
diff_gex_method = "{{diff_gex_method.value}}"
gene_topk = {{gene_topk.value}}
enrichr_libraries = {{enrichr_libraries.value}}
nr_genesets = {{nr_genesets.value}}

trajectory = {{trajectory.value}}
trajectory_method = "{{trajectory_method.value}}"

time_series_trajectory = {{time_series_trajectory.value}}
timepoint_labels_column_name = "{{timepoint_labels_column_name.value}}"

cell_type_prediction = {{cell_type_prediction.value}}

In [None]:
# Global notebook setup: silence warnings, seed the RNG, and configure the
# plotting / R-bridge libraries used by the cells below.
warnings.filterwarnings('ignore')
# Seeds only Python's standard-library `random` module.
random.seed(0)
# NOTE(review): a Chart Studio username and API key are hard-coded here and are
# therefore embedded in every rendered notebook -- consider loading them from
# the environment and rotating this key.
chart_studio.tools.set_credentials_file(username='mjjeon', api_key='v0rpMa6lhST28Sq7XqtM')
# Enable rpy2's pandas <-> R object conversion.
pandas2ri.activate()
plot_type='interactive'
# Analysis outputs are collected here, keyed by analysis name.
results = {}
# Running numbers threaded through display_object() for table/figure captions.
table_counter = 1
figure_counter = 1
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
# Figures saved by scanpy go to the current working directory.
sc.settings.figdir="./"

# Load datasets

In [None]:
def load_metadata(adata, meta_data_filename, meta_class_column_name):
    """Attach a sample-class column to ``adata.obs`` and de-duplicate gene names.

    If ``meta_data_filename`` is non-empty, the file is read with its first
    column as the index (comma-separated for ``.csv``, tab-separated
    otherwise) and the column ``meta_class_column_name`` is copied into
    ``adata.obs``.  If it is empty, every observation is labelled
    ``"Class0"`` in a column named ``"Class"``.

    Args:
        adata: AnnData-like object exposing ``obs``, ``n_obs`` and
            ``var_names_make_unique()``.
        meta_data_filename: Path to the metadata file, or ``""`` for none.
        meta_class_column_name: Name of the class column in the metadata.

    Returns:
        Tuple ``(adata, meta_class_column_name)``; the returned name is
        ``"Class"`` when no metadata file was given.

    Raises:
        Exception: If a metadata file is given but the column name is empty,
            or if ``check_df`` rejects the column (the original error is
            chained as ``__cause__``).
    """
    if meta_data_filename != "":
        separator = "," if meta_data_filename.endswith(".csv") else "\t"
        meta_df = pd.read_csv(meta_data_filename, sep=separator, index_col=0)
        if meta_class_column_name == "":
            raise Exception("Run time error: Please provide a proper column name for sample classes in metadata")
        try:
            # check_df comes from the project's utils module; presumably it
            # raises when the column is absent -- confirm against utils.
            check_df(meta_df, meta_class_column_name)
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit propagate, and keep the root cause attached.
            raise Exception(f"Error! Column '{meta_class_column_name}' is not in metadata") from err
        adata.obs[meta_class_column_name] = meta_df.loc[:, meta_class_column_name]
    else:
        # No metadata supplied: put every observation in one placeholder class.
        meta_class_column_name = "Class"
        adata.obs[meta_class_column_name] = ["Class0"] * adata.n_obs

    # Both branches need unique gene names.
    adata.var_names_make_unique()
    return adata, meta_class_column_name
def display_statistics(data, description=""):
    """Print ``description`` followed by the sample and feature counts of ``data``.

    ``data`` must expose ``n_obs`` (printed as the sample size) and ``n_vars``
    (printed as the feature size).
    """
    print(description)
    for label, attribute in (("Sample size:", "n_obs"), ("Feature size:", "n_vars")):
        print(label, getattr(data, attribute))
    

In [None]:
%%appyter code_exec

{%- if file_kind.raw_value == 'Plain text' %}
# Plain-text input: read a genes x samples table and convert it to AnnData.
check_files(rnaseq_data_filename)

try:
    # .csv is read comma-separated; every other extension is treated as tab-separated.
    if rnaseq_data_filename.endswith(".csv"):
        expr_df = pd.read_csv(rnaseq_data_filename, index_col=0).sort_index()
    else:
        expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()

    # convert df into anndata
    # adata matrix: sample x gene
    adata = anndata.AnnData(expr_df.T)
    adata.X = adata.X.astype('float64')

except Exception as err:
    # Stop here with the intended message: merely printing it let execution
    # continue into a NameError on `expr_df` / `adata` below.
    raise Exception("Error! Input files are in a wrong format. \
    Please check if the index of the expression data are genes and the columns are sample IDs. \
    Sample IDs in the expression data and the metadata should be matched") from err

# The DataFrame is no longer needed once AnnData holds the matrix.
del expr_df

{%- elif file_kind.raw_value == '.mtx from 10x Genomics' %}
adata = load_seurat_files(mtx_data_filename, gene_data_filename, barcode_data_filename)

{%- endif -%}
# load meta data
adata, meta_class_column_name = load_metadata(adata, meta_data_filename, meta_class_column_name)    

In [None]:
# Show three summary tables of the loaded data, then its dimensions.
# display_object() comes from utils; its return value is assigned back to the
# running table counter.
# Preview: slice 10 samples x 5 genes, transpose to genes x samples, show up to 5 rows.
table_counter = display_object(table_counter, "Raw data. The table displays the first 5 rows of the quantified RNA-seq expression dataset. Rows represent genes, columns represent samples, and values show the number of mapped reads.", adata.to_df().iloc[:10,:5].T.head(), istable=True)
# First rows of the per-sample annotations.
table_counter = display_object(table_counter, "Metadata. The table displays the metadata associated with the samples in the RNA-seq dataset. Rows represent RNA-seq samples, columns represent metadata categories.", adata.obs.head(), istable=True)
# Count samples per class; reset_index() turns the sample IDs into a column so count() has something to count.
table_counter = display_object(table_counter, "Sample size for each class. The table displays the number of samples in each class.", adata.obs.reset_index().groupby(meta_class_column_name).count(), istable=True)
display_statistics(adata, "### Statistics of data ###")

In [None]:
%%appyter markdown
{%if qc_filter_genes.value == True %}
# Filter Out Cells Based On Mitochondrial Genes
High expression levels of mitochondrial genes can be an indicator of poor-quality cells (Islam, Saiful, et al. 2014; Ilicic, Tomislav, et al. 2016). When the cell membrane is broken, cytoplasmic RNA is lost, but RNAs enclosed in the mitochondria are retained. This analysis removes single cells that are likely to have a broken cell membrane.
{% endif %}

In [None]:
%%appyter code_exec
{%if qc_filter_genes.value == True %}
# Build a boolean mask over genes marking the mitochondrial ones.
if adata.var_names[0].startswith("ENSG"):
    # Names start with "ENSG": look each one up in gene_id_map_dict and test the mapped value.
    gene_symbol_var_names = adata.var_names
    mito_genes = [
        gene_id in gene_id_map_dict and str(gene_id_map_dict[gene_id]).startswith('MT-')
        for gene_id in gene_symbol_var_names
    ]
else:
    # Otherwise test the variable names themselves for the 'MT-' prefix.
    mito_genes = adata.var_names.str.startswith('MT-')

# Per-cell fraction of counts that fall in mitochondrial genes.
adata.obs['percent_mito'] = (
    np.sum(adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1)
)

# Plot the distribution and offer the saved figure for download.
sc.pl.violin(
    adata,
    ['percent_mito'],
    jitter=0.4,
    multi_panel=True,
    show=True,
    save=True,
)
figure_counter = display_object(figure_counter, "Violin plot of the percentage of mitochondrial gene expression counts in each cell", istable=False)
display_link("violin.pdf", "Download figure")

# Keep only the cells whose mitochondrial fraction is below the threshold.
adata = adata[adata.obs['percent_mito'] < qc_threshold, :]
display_statistics(adata, "### Statistics of data after QC ###")
{% endif %}

In [None]:
# Snapshot the data as it stands here (after the optional QC filter, before the
# normalization cell below) into adata.raw.
adata.raw = adata

# Library Size Analysis

In order to quantify gene expression in an RNA-seq dataset, reads generated from the sequencing step are mapped to a reference genome and subsequently aggregated into numeric gene counts. Due to experimental variations and random technical noise, samples in an RNA-seq dataset often have variable amounts of the total RNA. Library size analysis calculates and displays the total number of reads mapped for each sample in the RNA-seq dataset, facilitating the identification of outlying samples and the assessment of the overall quality of the data.

In [None]:
# Library size: total counts per sample/cell, stored in adata.obs['reads'].
adata.obs['reads'] = adata.X.sum(axis=1)
library_sum_df = pd.DataFrame(adata.obs['reads'])
if len(library_sum_df.index) > 200:
    # Plot a random subset of 200 samples for large datasets. random_state
    # pins the subset: random.seed() does not affect pandas sampling, so the
    # figure would otherwise differ between runs.
    library_sum_df = library_sum_df.sample(200, random_state=0)
fig = px.histogram(library_sum_df, x='reads')
fig.update_yaxes(title="samples/cells")
fig.show()
figure_counter = display_object(figure_counter, "Histogram of the total number of reads mapped for each sample. The figure contains an interactive bar chart which displays the number of samples according to the total number of reads mapped to each RNA-seq sample in the dataset. Additional information for each sample is available by hovering over the bars.", istable=False)


In [None]:
%%appyter markdown
{% if normalization_method.value != "None" %}
# Normalization
Various normalization methods can be applied based on the selection made by the user (Zheng, Grace XY, et al. 2017, Weinreb, Caleb, et al. 2018, Butler, Andrew, et al. 2018). These normalization methods convert the raw read counts into standardized measures of gene expression by removing factors that may negatively affect the analysis.
{% endif %}

In [None]:
%%appyter code_exec
{% if normalization_method.value != "None" %}
# Apply the selected normalization recipe in place on adata.
# The strings compared here must match the values of the normalization_method
# form field exactly; an unmatched value falls through and leaves adata unnormalized.
if normalization_method == "Zheng17":
    sc.pp.recipe_zheng17(adata, log=log_normalization, plot=False)
elif normalization_method == "Weinreb17":
    sc.pp.recipe_weinreb17(adata, log=log_normalization)
elif normalization_method == "Seurat":
    # Hand-written sequence: filter cells/genes, normalize totals,
    # optionally log-transform, then scale.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    sc.pp.normalize_total(adata, target_sum=1e4)        
    if log_normalization:
        sc.pp.log1p(adata)
    sc.pp.scale(adata, max_value=10)
{% endif %}


{% if magic.value == True %}    
# Run MAGIC via the utils helper; the table below reads its result from adata.uns["magic"].
adata = run_magic(dataset=adata)
table_counter = display_object(table_counter, "Normalized data. The table displays the expression values after normalization.", adata.uns["magic"].T.head(), istable=True)
display_statistics(adata, "### Statistics of data after normalization ###")
# NOTE(review): the download exports adata.to_df() (i.e. adata.X), whereas the table
# above shows adata.uns["magic"] -- confirm this difference is intended.
display(create_download_link(adata.to_df().T, filename="Normalized_data.csv"))

{% elif normalization_method.value != "None" %}
table_counter = display_object(table_counter, "Normalized data. The table displays the expression values after normalization.", adata.to_df().T.head(), istable=True)
display_statistics(adata, "### Statistics of data after normalization ###")
display(create_download_link(adata.to_df().T, filename="Normalized_data.csv"))

{% endif %}

# Most Highly Expressed Genes Across All samples/Cells

This analysis displays the genes that yield the highest fraction of counts in each single cell, across all cells.

In [None]:
# Pick the genes with the highest mean fraction of counts per cell and plot
# them ordered by their median fraction.
df = adata.to_df()
# Divide each row by its row sum so every value is a fraction of that cell's total.
df = df.divide(df.sum(axis=1), axis=0)
mean_df = df.mean().sort_values(ascending=False)
# Top-N genes by mean fraction (N = highest_expr_n_genes).
yaxis_gene_names = list(mean_df.index)[:highest_expr_n_genes]
median_df = df.median().sort_values(ascending=False)
selected_median_df = median_df.loc[yaxis_gene_names]
# Re-order the selected genes by descending median for the plot axis.
sorted_yaxis_gene_names = list(selected_median_df.sort_values(ascending=False).index)
sc.pl.highest_expr_genes(adata, log=True, n_top=highest_expr_n_genes, order=sorted_yaxis_gene_names, show=True, save=True)
figure_counter = display_object(figure_counter, "The {} genes with the highest mean fraction over all cells sorted by median.".format(highest_expr_n_genes), istable=False)
display_link("highest_expr_genes.pdf", "Download figure")

# Dimensionality Reduction

Principal Component Analysis (PCA) (Clark, N.R. and Ma’ayan, A., 2011), T-distributed Stochastic Neighbor Embedding (t-SNE) (Maaten, Laurens van der, and Geoffrey Hinton, 2008), and Uniform Manifold Approximation and Projection (UMAP) (McInnes, Leland, et al. 2018) are statistical techniques used to identify global patterns in high-dimensional datasets. These techniques are commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into 2- or 3-dimensional latent vectors and subsequently visualized using a scatter plot.

In [None]:
# Run analysis: compute PCA, t-SNE and UMAP results with the utils helper
# run_dimension_reduction(), without MAGIC (magic_normalization=False), limited
# to nr_genes genes, and store each result under its own key in `results`.
results['dimension_reduction_pca'] = run_dimension_reduction(dim_reduction_method="PCA", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=False, nr_genes=nr_genes, plot_type=plot_type)
results['dimension_reduction_tsne'] = run_dimension_reduction(dim_reduction_method="t-SNE", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=False, nr_genes=nr_genes, plot_type=plot_type)
results['dimension_reduction_umap'] = run_dimension_reduction(dim_reduction_method="UMAP", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=False, nr_genes=nr_genes, plot_type=plot_type)

# Display results
# PCA
plot_dimension_reduction(results['dimension_reduction_pca'])
figure_counter = display_object(figure_counter, subcounter="A", caption='3D PCA plot for samples using {} genes having largest variance. \
    The figure displays an interactive, three-dimensional scatter plot of the data. \
    Each point represents an RNA-seq sample. \
    Samples with similar gene expression profiles are closer in the three-dimensional space. \
    If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.'.format(nr_genes), istable=False)
# t-SNE
plot_dimension_reduction(results['dimension_reduction_tsne'])
figure_counter = display_object(figure_counter, subcounter="B", caption='3D t-SNE plot for samples using {} genes having largest variance. \
    The figure displays an interactive, three-dimensional scatter plot of the data. \
    Each point represents an RNA-seq sample. \
    Samples with similar gene expression profiles are closer in the three-dimensional space. \
    If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.'.format(nr_genes), istable=False)
# UMAP
plot_dimension_reduction(results['dimension_reduction_umap'])
figure_counter = display_object(figure_counter, subcounter="C", caption='3D UMAP plot for samples using {} genes having largest variance. \
    The figure displays an interactive, three-dimensional scatter plot of the data. \
    Each point represents an RNA-seq sample. \
    Samples with similar gene expression profiles are closer in the three-dimensional space. \
    If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.'.format(nr_genes), istable=False)

In [None]:
%%appyter markdown
{% if magic.value == True %}
# Imputation of Dropouts with MAGIC
Markov Affinity-based Graph Imputation of Cells (MAGIC) (van Dijk, D., et al. 2017) is an algorithm for denoising high-dimensional data. It is most commonly applied to impute single-cell RNA sequencing data. MAGIC learns the manifold data, using the resultant graph to smooth the features and restore the original structure of the data.
{% endif %}

In [None]:
%%appyter code_exec
{% if magic.value == True %}
# Run analysis
results['magic_dimension_reduction_pca'] = run_dimension_reduction(dim_reduction_method="PCA", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=magic, nr_genes=nr_genes, color_by='auto')
results['magic_dimension_reduction_tsne'] = run_dimension_reduction(dim_reduction_method="t-SNE", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=magic, nr_genes=nr_genes, color_by='auto')
results['magic_dimension_reduction_umap'] = run_dimension_reduction(dim_reduction_method="UMAP", dataset=adata, meta_class_column_name=meta_class_column_name, magic_normalization=magic, nr_genes=nr_genes, color_by='auto')

# Display results
plot_dimension_reduction(results['magic_dimension_reduction_pca'])
figure_counter = display_object(figure_counter, subcounter="A", caption="3D PCA plot for samples using {} genes having largest variance after MAGIC imputation. The figure displays an interactive, three-dimensional scatter plot of the data. Each point represents an RNA-seq sample. Samples with similar gene expression profiles are closer in the three-dimensional space. If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.".format(nr_genes), istable=False)

plot_dimension_reduction(results['magic_dimension_reduction_tsne'])
figure_counter = display_object(figure_counter, subcounter="B", caption="3D t-SNE plot for samples using {} genes having largest variance after MAGIC imputation. The figure displays an interactive, three-dimensional scatter plot of the data. Each point represents an RNA-seq sample. Samples with similar gene expression profiles are closer in the three-dimensional space. If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.".format(nr_genes), istable=False)

plot_dimension_reduction(results['magic_dimension_reduction_umap'])
figure_counter = display_object(figure_counter, subcounter="C", caption="3D UMAP plot for samples using {} genes having largest variance after MAGIC imputation. The figure displays an interactive, three-dimensional scatter plot of the data. Each point represents an RNA-seq sample. Samples with similar gene expression profiles are closer in the three-dimensional space. If provided, sample groups are indicated using different colors, allowing for easier interpretation of the results.".format(nr_genes), istable=False)

# Replace the working matrix with the one stored under adata.uns['magic']:
# every later cell that reads adata.X operates on these values.
adata.X = adata.uns['magic']
{% endif %}

# Clustergrammer

Clustergrammer (Fernandez, Nicolas F., et al. 2017) is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also helps identify the genes that contribute to the clustering. By default, it visualizes the 800 genes with the largest variance; alternatively, you can provide your own gene list.

In [None]:
# Run analysis: build the Clustergrammer heatmap via the utils helper, using
# the user's gene list when one was pasted into the form.
# NOTE(review): `clustering_topk` from the form is not passed here -- confirm
# whether run_clustergrammer() is meant to receive it.
results['clustergrammer'] = run_clustergrammer(dataset=adata, meta_class_column_name=meta_class_column_name, gene_list=gene_list_for_clustergrammer)

# Display results
plot_clustergrammar(results['clustergrammer'])
# The caption has no placeholders, so it is passed as a plain string.
figure_counter = display_object(figure_counter, "Clustered heatmap plot. The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.", istable=False)
 

# Automatic Identification of Clusters
The Leiden algorithm (Traag, Vincent A., et al. 2019) is a method to identify well-connected clusters in networks. 

In [None]:
# Build the neighborhood graph, run Leiden clustering on it, and compute a
# UMAP embedding (stored by scanpy in adata.obs["leiden"] / adata.obsm["X_umap"]).
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.leiden(adata, resolution=1.0)
sc.tl.umap(adata, min_dist=0.1)
# Expose the per-sample cluster labels through a download link.
display(create_download_link(adata.obs["leiden"], filename="clustering.csv"))

# sort by clusters
# Reorder the samples so that members of the same cluster are contiguous;
# later cells read adata in this order.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order, :]

In [None]:
# Collect the UMAP coordinates in a dataframe with named x/y columns
umap_df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])

# The only colouring option for this plot is the Leiden cluster label
values_dict = {"Clusters": adata.obs["leiden"].values}

figure_counter = plot_scatter(
    umap_df,
    values_dict,
    ["Clusters"],
    adata.obs.index.tolist(),
    "Scatter plot of the samples. Each dot represents a sample and it is colored by ",
    category=True,
    dropdown=False,
    figure_counter=figure_counter,
)

In [None]:
%%appyter markdown
{%if deg.value == True %}
# Differential Gene Expression Analysis
Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods (Ritchie, Matthew E., et al. 2015, Clark, Neil R., et al. 2014), which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True %}

{% if enrichment_groupby.value == "user_defined_class" %}
classes = adata.obs[meta_class_column_name].unique().tolist()
cluster = False
if len(classes) < 2:
    print("Warning: Please provide at least 2 classes in the metadata")
{% else %}
meta_class_column_name = "leiden"
classes = sorted(adata.obs["leiden"].unique().tolist())
classes.sort(key=int)
cluster=True

{% endif %}
if len(classes) > 5 and adata.n_obs > 5000:
    if diff_gex_method == "wilcoxon":
        print('Warning: There are too many cells/clusters. It cannot execute the analysis code for the data. The appyter randomly select 1000 samples. If you want to execute it with the whole data, please run it locally.')
    else:
        print('Warning: There are too many cells/clusters. It cannot execute the analysis code for the data. The appyter switched to Wilcoxon rank-sum method and randomly select 1000 samples. If you want to execute it with the whole data, please run it locally.')
        diff_gex_method = "wilcoxon"
    # randomly select 5K samples
    random_selected_samples = random.sample(adata.obs.index.tolist(), 5000)
    adata_random_sampled = adata[random_selected_samples, :]
    signatures = get_signatures(classes, adata_random_sampled, method=diff_gex_method, meta_class_column_name=meta_class_column_name, cluster=cluster)
else:
    signatures = get_signatures(classes, adata, method=diff_gex_method, meta_class_column_name=meta_class_column_name, cluster=cluster)
    
    
if len(classes) > 1:
    
    for label, signature in signatures.items():
        table_counter = display_object(table_counter, "Differentially expressed genes between {} using {}. Every row of the table represents a gene; the columns display the estimated measures of differential expression.".format(label, diff_gex_method), signature, istable=True)
        display(create_download_link(signature, filename="DEG_{}.csv".format(label)))

{% endif %}

In [None]:
%%appyter markdown
{%if deg.value == True %}
# Enrichment Analysis
Enrichment analysis is a bioinformatics method used to identify prior knowledge terms which are over-represented in a given gene set by comparing the gene set to many annotated gene sets. The prior-knowledge gene sets can represent cell signaling pathways, molecular functions, diseases, and a wide variety of other terms obtained by processing data from multiple resources. Enrichr (Kuleshov, M.V., et al. 2016) is a web-based application that  performs enrichment analysis against a large collection of gene-set libraries. It provides various interactive visualizations to display enrichment results.
{% endif %}

In [None]:
%%appyter code_exec
{%if deg.value == True %}
# Pick, for the selected differential expression method, the fold-change
# column and the column/direction used to rank genes for enrichment.
# NOTE(review): there is no fallback branch; if diff_gex_method matches none
# of these values, fc_colname/sort_genes_by/ascending stay unset - confirm
# that the value is validated upstream.
if diff_gex_method == "characteristic_direction":
    fc_colname = "CD-coefficient"
    sort_genes_by = "CD-coefficient"
    ascending = False
elif diff_gex_method == "limma":
    fc_colname = "logFC"
    sort_genes_by = "t"
    ascending = False
elif diff_gex_method == "edgeR":
    fc_colname = "logFC"
    sort_genes_by = "PValue"
    ascending = True
elif diff_gex_method == "DESeq2":
    fc_colname = "log2FoldChange"
    sort_genes_by = "padj"
    ascending = True
elif diff_gex_method == "wilcoxon":
    fc_colname = "logfoldchanges"
    sort_genes_by = "pvals"
    ascending = True

results['enrichr'] = {}
# Draw the UMAP coloured by the grouping used for the signatures. The marker
# size shrinks with the number of samples and is capped at 100; node_size is
# read again by the trajectory cell further down.
{% if enrichment_groupby.value != "user_defined_class" %}
default_node_size = 120000 / len(adata.obs.index)
node_size = min(100, 120000 / len(adata.obs.index))
sc.pl.umap(adata, color=['leiden'], size=node_size)
{% else %}
default_node_size = 120000 / len(adata.obs.index)
node_size = min(100, 120000 / len(adata.obs.index))
sc.pl.umap(adata, color=[meta_class_column_name], size=node_size)

{% endif %}
# Submit every signature to Enrichr and link to the up/down-regulated results.
for label, signature in signatures.items():
    # Run analysis
    # The label is split on " vs. "; which side names the case group depends
    # on how the signatures were grouped.
    {% if enrichment_groupby.value == "user_defined_class" %}
    case_name = label.split(" vs. ")[1]
    {% else %}
    case_name = label.split(" vs. ")[0]
    {% endif %}
    results['enrichr'][label] = run_enrichr(signature=signature, signature_label=label, fc_colname=fc_colname,geneset_size=gene_topk, sort_genes_by = sort_genes_by,ascending=ascending)
    display(Markdown("*Enrichment Analysis Result: {} (Up-regulated in {})*".format(label, case_name)))
    display_link("https://amp.pharm.mssm.edu/Enrichr/enrich?dataset={}".format(results['enrichr'][label]["upregulated"]["shortId"]))
    display(Markdown("*Enrichment Analysis Result: {} (Down-regulated in {})*".format(label, case_name)))
    display_link("https://amp.pharm.mssm.edu/Enrichr/enrich?dataset={}".format(results['enrichr'][label]["downregulated"]["shortId"]))
# A single caption is emitted after all links have been displayed.
table_counter = display_object(table_counter, "The table displays links to Enrichr containing the results of enrichment analyses generated by analyzing the up-regulated and down-regulated genes from a differential expression analysis. By clicking on these links, users can interactively explore and download the enrichment results from the Enrichr website.", istable=True)
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True %}
# Bar Charts for Selected Libraries
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True and "Gene Ontology" in enrichr_libraries.value %}
### Gene Ontology Enrichment Analysis
Gene Ontology (GO) (Ashburner, M., et al. 2000) is a major initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify enriched biological processes, molecular functions and cellular components which are over-represented in the up-regulated genes from each cluster.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Gene Ontology" in enrichr_libraries.value %}

results['go_enrichment'] = {}
for label, signature in signatures.items():
    # Run analysis
    results['go_enrichment'][label] = get_enrichr_results_by_library(results['enrichr'][label], label, library_type='go', version='2018')
{% endif %}

{% if deg.value == True and "Gene Ontology" in enrichr_libraries.value and bar_chart.value == True %}    
for label, signature in signatures.items():
    {% if enrichment_groupby.value == "user_defined_class" %}

    case_name = label.split(" vs. ")[1]
    {% else %}
    case_name = label.split(" vs. ")[0]
    {% endif %}

    # Create dataframe
    enrichment_results = results['go_enrichment'][label]
    enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

    # Plot barcharts
    libraries = enrichment_dataframe['gene_set_library'].unique()   
    for gene_set_library in libraries:
        plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], case_name, enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type) # 10 300
    figure_counter = display_object(figure_counter, "Enrichment Analysis Results for {} in Gene Onotology. The figure contains interactive bar charts displaying the results of the Gene Ontology enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label), istable=False)
{% endif %}    



In [None]:
%%appyter markdown
{% if deg.value == True and "Pathway" in enrichr_libraries.value %}
### Pathway Enrichment Analysis
Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG (Kanehisa et al. 2000), Reactome (Croft et al. 2014) and WikiPathways (Kelder et al. 2012) contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Pathway" in enrichr_libraries.value %}
# Initialize results
results['pathway_enrichment'] = {}

# Loop through results
for label, signature in signatures.items():
    # Run analysis
    results['pathway_enrichment'][label] = get_enrichr_results_by_library(results['enrichr'][label], label, library_type='pathway')
{% endif %}
{% if deg.value == True and "Pathway" in enrichr_libraries.value and bar_chart.value == True %}    

    for label, signature in signatures.items():
        {% if enrichment_groupby.value == "user_defined_class" %}
        case_name = label.split(" vs. ")[1]
        {% else %}
        case_name = label.split(" vs. ")[0]
        {% endif %}

        # Create dataframe
        enrichment_results = results['pathway_enrichment'][label]
        enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

        # Plot barcharts
        libraries = enrichment_dataframe['gene_set_library'].unique()   
        for gene_set_library in libraries:
            # Display results
            plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], case_name, enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type)
        figure_counter = display_object(figure_counter, "Enrichment Analysis Results for {} in KEGG Pathways, WikiPathways and Reactome Pathways. The figure contains interactive bar charts displaying the results of the Gene Ontology enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label), istable=False)
    
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True and "Transcription Factor" in enrichr_libraries.value %}
### Transcription Factor Enrichment Analysis
Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA (Lachmann et al. 2010) and ENCODE (Consortium, 2004) contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Transcription Factor" in enrichr_libraries.value %}
# Initialize results
results['tf_enrichment'] = {}
# Loop through results
for label, signature in signatures.items():
    # Run analysis
    results['tf_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=results['enrichr'][label], signature_label=label, library_type='tf')
    {% if bar_chart.value == True%}
        table_counter = display_table(results['tf_enrichment'][label], "Transcription Factor", table_counter)
    {% endif %} 
{% endif %} 

In [None]:
%%appyter markdown
{% if deg.value == True and "Kinase" in enrichr_libraries.value %}
### Kinase Enrichment Analysis
Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA (Lachmann et al. 2009) contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Kinase" in enrichr_libraries.value %}
# Initialize results
results['kinase_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
    # Run analysis
    results['kinase_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=enrichr_results, signature_label=label, library_type="ke")

    # Display results
    {% if bar_chart.value == True%}
        table_counter = display_table(results['kinase_enrichment'][label], "Kinase", table_counter)
    {% endif %}
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True and "miRNA" in enrichr_libraries.value %}
### miRNA Enrichment Analysis
microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan (Agarwal et al. 2015) and MiRTarBase (Chou et al. 2016) contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "miRNA" in enrichr_libraries.value %}

results['mirna_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
    # Run analysis
    results['mirna_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=enrichr_results, signature_label=label, library_type="mirna")

    # Display results
    {% if bar_chart.value == True%}
        table_counter = display_table(results['mirna_enrichment'][label], "miRNA", table_counter)
    {% endif %}
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True and "Cell Type" in enrichr_libraries.value %}
### Cell Type Enrichment Analysis
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Cell Type" in enrichr_libraries.value %}

results['celltype_enrichment'] = {}

for label, signature in signatures.items():
    # Run analysis
    results['celltype_enrichment'][label] = get_enrichr_results_by_library(results['enrichr'][label], label, library_type='celltype')

{% endif %}
{% if deg.value == True and "Cell Type" in enrichr_libraries.value and bar_chart.value == True %}    

    for label, signature in signatures.items():
        {% if enrichment_groupby.value == "user_defined_class" %}
        case_name = label.split(" vs. ")[1]
        {% else %}
        case_name = label.split(" vs. ")[0]
        {% endif %}

        # Create dataframe
        enrichment_results = results['celltype_enrichment'][label]
        enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

        # Plot barcharts
        libraries = enrichment_dataframe['gene_set_library'].unique()   
        for gene_set_library in libraries:
            plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], case_name, enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type) # 10 300
        figure_counter = display_object(figure_counter, "Enrichment Analysis Results for {} in Cell Type. The figure contains interactive bar charts displaying the results of the Gene Ontology enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label), istable=False)
{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True and "Disease" in enrichr_libraries.value %}
### Disease Enrichment Analysis
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True and "Disease" in enrichr_libraries.value %}

results['disease_enrichment'] = {}
for label, signature in signatures.items():
    # Run analysis
    results['disease_enrichment'][label] = get_enrichr_results_by_library(results['enrichr'][label], label, library_type='disease', version='2018')
{% endif %}
{% if deg.value == True and "Disease" in enrichr_libraries.value and bar_chart.value == True %}    
    
    for label, signature in signatures.items():
        {% if enrichment_groupby.value == "user_defined_class" %}
        case_name = label.split(" vs. ")[1]
        {% else %}
        case_name = label.split(" vs. ")[0]
        {% endif %}

        # Create dataframe
        enrichment_results = results['disease_enrichment'][label]
        enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

        # Plot barcharts
        libraries = enrichment_dataframe['gene_set_library'].unique()   
        for gene_set_library in libraries:
            plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], case_name, enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type) # 10 300
        figure_counter = display_object(figure_counter, "Enrichment Analysis Results for {} in Disease. The figure contains interactive bar charts displaying the results of the Gene Ontology enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label), istable=False)
    

{% endif %}

In [None]:
%%appyter markdown
{% if deg.value == True %}
# Interactive Scatter Plots of Enriched Terms
{% endif %}

In [None]:
%%appyter code_exec
{% if deg.value == True%}
    
# Annotate every sample with the top enriched term of each gene-set library
# for its group, then plot the UMAP coloured by those terms (one dropdown
# option per library).
library_option_list = set()
for label, signature in signatures.items():
    # Labels are split on " vs. "; for clusters only the first part is used,
    # with the "Cluster " prefix stripped so it matches the 'leiden' values.
    if cluster == True:
        cluster_names = [label.split(" vs. ")[0].replace("Cluster ", "")]
    else:
        cluster_names = label.split(" vs. ")
    
    # Visit every result set whose key ends with "enrichment" (GO, pathway, ...).
    for key in results.keys():
        if key.endswith("enrichment") == False:
            continue
        enrichment_results = results[key][label]
        meta_df = adata.obs
        if cluster == True:
            for cluster_name in cluster_names:            
                # Only up-regulated terms are used to label clusters.
                for direction in ['upregulated']:
                    if direction in enrichment_results:
                        enrichment_dataframe = enrichment_results[direction]
                    else:
                        enrichment_dataframe = enrichment_results["enrichment_dataframe"]
                    libraries = enrichment_dataframe['gene_set_library'].unique()  
                    for library in libraries:
                        enrichment_dataframe_library = enrichment_dataframe[enrichment_dataframe['gene_set_library']==library]
                        # First row of the library's table is taken as its top term.
                        top_term = enrichment_dataframe_library.iloc[0]['term_name']
                        # NOTE(review): this check repeats the enclosing one, so
                        # the else branch below is never reached.
                        if cluster == True:
                            meta_df.loc[meta_df['leiden']==cluster_name, library] = top_term
                        else:
                            meta_df.loc[meta_df[meta_class_column_name]==cluster_name, library] = top_term
                        library_option_list.add(library)
        else: # cluster == False
            for direction in ['upregulated', 'downregulated']:
                if direction in enrichment_results:
                    enrichment_dataframe = enrichment_results[direction]
                    libraries = enrichment_dataframe['gene_set_library'].unique()  
                    for library in libraries:
                        enrichment_dataframe_library = enrichment_dataframe[enrichment_dataframe['gene_set_library']==library]
                        top_term = enrichment_dataframe_library.iloc[0]['term_name']
                        # Up-regulated terms label the first class of the label,
                        # down-regulated terms the second one.
                        if direction == "upregulated":
                            meta_df.loc[meta_df[meta_class_column_name]==cluster_names[0], library] = top_term
                        else:
                            meta_df.loc[meta_df[meta_class_column_name]==cluster_names[1], library] = top_term
                        library_option_list.add(library)
                else:
                    # Results without separate directions: the first row labels
                    # the first class and the last row labels the second class.
                    enrichment_dataframe = enrichment_results["enrichment_dataframe"]
                    libraries = enrichment_dataframe['gene_set_library'].unique()  
                    for library in libraries:
                        enrichment_dataframe_library = enrichment_dataframe[enrichment_dataframe['gene_set_library']==library]
                        top_term = enrichment_dataframe_library.iloc[0]['term_name']
                        bottom_term = enrichment_dataframe_library.iloc[-1]['term_name']
                        meta_df.loc[meta_df[meta_class_column_name]==cluster_names[0], library] = top_term
                        meta_df.loc[meta_df[meta_class_column_name]==cluster_names[1], library] = bottom_term
                        library_option_list.add(library)
                    
library_option_list = list(library_option_list)

# umap info into dataframe 
umap_df = pd.DataFrame(adata.obsm['X_umap'])
umap_df.columns = ['x', 'y']

# One entry per library column; samples without a term get the string "NaN".
option_list = library_option_list  
adata_selected = adata.obs[option_list].fillna("NaN")
values_dict = dict(zip(adata_selected.T.index.tolist(), adata_selected.T.values))

figure_counter = plot_scatter(umap_df, values_dict, option_list, adata.obs.index.tolist(), "Scatter plot of the samples. Each dot represents a sample and it is colored by enriched terms in library ", location='below', category=True, dropdown=True, figure_counter=figure_counter)

{% endif %}


In [None]:
%%appyter markdown
{% if trajectory.value == True %}
# Trajectory Inference
Trajectory inference is a computational technique used in single-cell transcriptomics to arrange cells based on their progression through the differentiation process. It orders single cells in pseudotime, placing them along a trajectory corresponding to a biological process such as cell differentiation by taking advantage of single cells’ asynchronous progression through those processes. Diffusion pseudotime (DPT) (Haghverdi, Laleh, et al. 2016), which measures transitions between cells using diffusion-like random walks, is one of the methods used to identify such trajectories. Monocle (Trapnell, Cole, et al. 2014) is a method for ordering cells by learning an explicit principal graph from the single cell expression data with advanced machine learning techniques (Reversed Graph Embedding), which robustly and accurately resolves complicated biological processes.
{% endif %}

In [None]:
%%appyter code_exec
{% if trajectory.value == True %}
if trajectory_method == "monocle":
    # Run analysis
    results['monocle'] = run_monocle(dataset=adata, plot_type=plot_type, color_by='Pseudotime')

    # Display results
    plot_monocle(results['monocle'])
    
elif trajectory_method == "dpt":
    adata.uns['iroot'] = 0
    sc.pl.umap(adata, color=['leiden'], size=node_size)
    sc.tl.dpt(adata)
    sc.pl.umap(adata, color=['dpt_pseudotime'], size=node_size)
    display_link("draw_graph_fa.pdf", "Download figure")
figure_counter = display_object(figure_counter, "Trajectory inference result using {}. Each point represents an RNA-seq sample. Sample colors are based on pseudotime.".format(trajectory_method), istable=False)
{% endif %}

In [None]:
%%appyter markdown
{% if time_series_trajectory.value == True %}
# Time-series Trajectory Analysis
When time-series data is available, it is possible to order cells by their progression through a dynamic
process using time information. Tempora (Tran, Thinh N., and Gary Bader 2019), a pathway-based cell trajectory inference method that orders cells using time information from the data, infers developmental lineages based on biological pathway information. This time-series based analysis can generate insights into dynamic processes and their biological regulation. 
{% endif %}

In [None]:
%%appyter code_exec
{% if time_series_trajectory.value == True %}
# Run Tempora on the dataset using the user-selected time point column/labels.
run_tempora(adata, timepoint_labels_column_name, timepoint_labels)
# Image is PIL's Image here (the PIL import follows the IPython one at the top
# of the notebook), so the saved plot is opened from disk and displayed.
# presumably run_tempora writes Tempora_plot.jpg - verify against its definition
display(Image.open("Tempora_plot.jpg"))
figure_counter = display_object(figure_counter, "Time-series trajectory inference result using Tempora. Tempora visualizes the result as a network, with the piechart at each node representing the composition of cells collected at different time points in the experiment and the arrow connecting each pair of nodes representing lineage relationship between them.", istable=False)
# Link to the same image file for download.
display(FileLink("Tempora_plot.jpg", result_html_prefix="Download figure"))
{% endif %}

In [None]:
%%appyter markdown
{% if cell_type_prediction.value == True %}
# Cell Type Prediction with DigitalCellSorter 
DigitalCellSorter (Domanskyi, Sergii, et al. 2019) is an unbiased cell type recognition algorithm for scRNA-seq. It leverages a manual assessment using a few selected markers of cell types and infers the cell of origin for each of the single cells independently.
{% endif %}

In [None]:
%%appyter code_exec
{% if cell_type_prediction.value == True %}
# Configure DigitalCellSorter: 3 PCA components, the 'pDCS' annotation method,
# and no plots generated by the library itself.
DCS = DigitalCellSorter.DigitalCellSorter(nComponentsPCA=3)
DCS.annotationMethod='pDCS'
DCS.makePlots = False
# Feed it the raw expression matrix, transposed relative to adata's dataframe.
df_expr = adata.raw.to_adata().to_df().T
DCS.prepare(df_expr)
DCS.process()

# Annotation result; it is indexed below by the third element of each column key.
ann= DCS.annotate()

# Run analysis
results['dimension_reduction'] = run_dimension_reduction(dim_reduction_method, dataset=adata, meta_class_column_name=meta_class_column_name, nr_genes=nr_genes, plot_type=plot_type)
# presumably each column of DCS.df_expr is a tuple whose second element is the
# sample id and whose third is the cluster assigned by DCS - TODO confirm
ids = [x[1] for x in DCS.df_expr.columns]
clusters = [ann[x[2]] for x in DCS.df_expr.columns] 
# Replace the sample metadata of the embedding with the predicted cell types,
# stored under the class column name and indexed by sample id.
new_cluster_df = pd.DataFrame([ids, clusters]).T.set_index(0)
new_cluster_df.columns = [meta_class_column_name]
results['dimension_reduction']['sample_metadata'] = new_cluster_df

# Display results
plot_dimension_reduction(results['dimension_reduction'])
figure_counter = display_object(figure_counter, "Cell type prediction result using DigitalCellSorter. Each point represents an RNA-seq sample. Predicted sample cell types are indicated using different colors.", istable=False)

{% endif %}

# References 

Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.
<br>
Butler, Andrew, et al. "Integrating single-cell transcriptomic data across different conditions, technologies, and species." Nature biotechnology 36.5 (2018): 411-420.
<br>
Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.
<br>
Clark, Neil R., et al. "The characteristic direction: a geometrical approach to identify differentially expressed genes." BMC bioinformatics 15.1 (2014): 79.
<br>
Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.
<br>
Croft, David, et al. "The Reactome pathway knowledgebase." Nucleic acids research 42.D1 (2014): D472-D477.
<br>
Domanskyi, Sergii, et al. "Polled Digital Cell Sorter (p-DCS): Automatic identification of hematological cell types from single cell RNA-sequencing clusters." BMC bioinformatics 20.1 (2019): 369.
<br>
Fernandez, Nicolas F., et al. "Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data." Scientific data 4 (2017): 170151.
<br>
Haghverdi, Laleh, et al. "Diffusion pseudotime robustly reconstructs lineage branching." Nature methods 13.10 (2016): 845.
<br>
Ilicic, Tomislav, et al. "Classification of low quality cells from single-cell RNA-seq data." Genome biology 17.1 (2016): 29.
<br>
Islam, Saiful, et al. "Quantitative single-cell RNA-seq with unique molecular identifiers." Nature methods 11.2 (2014): 163.
<br>
Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.
<br>
Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.
<br>
Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.
<br>
Lachmann, Alexander, and Avi Ma'ayan. "KEA: kinase enrichment analysis." Bioinformatics 25.5 (2009): 684-686.
<br>
Maaten, Laurens van der, and Geoffrey Hinton. "Visualizing data using t-SNE." Journal of machine learning research 9.Nov (2008): 2579-2605.
<br>
McInnes, Leland, John Healy, and James Melville. "Umap: Uniform manifold approximation and projection for dimension reduction." arXiv preprint arXiv:1802.03426 (2018).
<br>
Ritchie, Matthew E., et al. "limma powers differential expression analyses for RNA-sequencing and microarray studies." Nucleic acids research 43.7 (2015): e47-e47.
<br>
Traag, Vincent A., Ludo Waltman, and Nees Jan van Eck. "From Louvain to Leiden: guaranteeing well-connected communities." Scientific reports 9.1 (2019): 1-12.
<br>
Tran, Thinh N., and Gary Bader. "Tempora: cell trajectory inference using time-series single-cell RNA sequencing data." bioRxiv (2019): 846907.
<br>
Trapnell, Cole, et al. "The dynamics and regulators of cell fate decisions are revealed by pseudotemporal ordering of single cells." Nature biotechnology 32.4 (2014): 381.
<br>
van Dijk, D., Nainys, J., Sharma, R., Kathail, P., Carr, A.J., Moon, K.R., Mazutis, L., Wolf, G., Krishnaswamy, S. and Pe'er, D. (2017) MAGIC: A diffusion-based imputation method reveals gene-gene interactions in single-cell RNA-sequencing data. BioRxiv, 111591.
<br>
Weinreb, Caleb, et al. "Fundamental limits on dynamic inference from single-cell snapshots." Proceedings of the National Academy of Sciences 115.10 (2018): E2467-E2476.
<br>
Zheng, Grace XY, et al. "Massively parallel digital transcriptional profiling of single cells." Nature communications 8.1 (2017): 1-12.