# Bulk RNA-seq Analysis pipeline

This pipeline enables you to analyze and visualize your bulk RNA sequencing datasets with an array of downstream analysis and visualization tools. The pipeline includes: PCA/UMAP/t-SNE analysis, Clustergrammer interactive heatmap, library size analysis, differential gene expression analysis, enrichment analysis, and L1000 small molecule search.

In [None]:
#%%appyter init
# NOTE(review): presumably registers the %%appyter cell magics used by the
# cells below -- confirm against the appyter documentation.
# The callable handed to magic.init returns this notebook's globals() when called.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
# Basic libraries
import pandas as pd
import os
import random
import time
import numpy as np
import warnings
import base64  
import json
from pandas.api.types import CategoricalDtype

# Visualization
import plotly
from plotly import tools
import plotly.express as px
import plotly.graph_objs as go

import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib import rcParams

import IPython
from IPython.display import HTML, display, Markdown, IFrame

# Data analysis
from itertools import combinations
import scipy.spatial.distance as dist
import scipy.stats as ss
from sklearn.decomposition import PCA
from sklearn.preprocessing import quantile_transform

from rpy2 import robjects
from rpy2.robjects import r, pandas2ri

# External Code
from utils import *

from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

%matplotlib inline

In [None]:
%%appyter hide_code_exec
{# Form sections; the input fields declared in the following cells attach to one of these through their `section` argument. #}
{% do SectionField(
    name='Data_Section',
    title='Load your Data',
    subtitle='Load your metadata and expression data in comma/tab separated formats. Genes should be in rows and samples should be in columns. Example files are downloadable here: <a href="https://appyters.maayanlab.cloud/storage/Bulk_RNA_seq/GSE70466.zip"> link </a>',
    img='analysis.png'
) %}

{% do SectionField(
    name='Normalization_Section',
    title='Select Normalization Methods',
    subtitle='',
    img='analysis.png'
) %}

{% do SectionField(
    name='Visualization_Section',
    title='Select Visualization Parameters',
    subtitle='',
    img='analysis.png'
) %}

{% do SectionField(
    name='DEG_Section',
    title='Select Differentially Expressed Gene Analysis Parameters',
    subtitle='',
    img='analysis.png'
) %}

In [None]:
%%appyter code_exec
{# Input files and class labels shown in the 'Load your Data' section of the form. #}
{% set meta_data_filename = FileField(
    name='meta_data_filename',
    label='Meta data file (.csv or .txt)',
    default='GSE70466_example_metadata.txt',
    examples={'GSE70466_example_metadata.txt': "https://appyters.maayanlab.cloud/storage/Bulk_RNA_seq/GSE70466-metadata.txt"},
    description='Upload metadata as two-column comma separated or tab separated format. One column contains sample ids and the other column contains sample labels',
    section='Data_Section')
%}
{% set rnaseq_data_filename = FileField(
    name='rnaseq_data_filename',
    label='RNA-seq data file (.csv or .txt)',
    default='GSE70466_example_expression.txt',
    examples={'GSE70466_example_expression.txt': "https://appyters.maayanlab.cloud/storage/Bulk_RNA_seq/GSE70466-expression.txt"},
    description='Upload RNA-seq expression data as comma separated or tab separated format. The index of the dataset are genes, the columns are samples.',
    section='Data_Section')
%}
{% set meta_class_column_name = StringField(
    name='meta_class_column_name',
    label='Class column name in metadata',
    default='cell line',
    description='class column name of metadata',
    section='Data_Section')
%}

{% set control_name = StringField(
    name='control_name',
    label='Control label',
    default='PrEC',
    description='name of control class',
    section='Data_Section')
%}


In [None]:
%%appyter code_exec
{# Gene filtering and normalization switches shown in the 'Select Normalization Methods' section. #}
{% set filter_genes = BoolField(
    name='filter_genes',
    section='Normalization_Section',
    label='Filter genes?',
    description='Check if User wants to filter genes with lowest variances',
    default='true'
) %}

{% set low_expression_threshold = FloatField(
    name='low_expression_threshold',
    section='Normalization_Section',
    label='Low expression threshold',
    description='Threshold to filter out low expression genes. The value should vary based on the user dataset.',
    default=0.3,
    min=0,
    max=15.0,
    step=0.01
) %}

{% set logCPM_normalization = BoolField(
    name='logCPM_normalization',
    section='Normalization_Section',
    label='logCPM normalization?',
    description='Check if User wants the dataset to be logCPM-transformed',
    default='true'
) %}

{% set log_normalization = BoolField(
    name='log_normalization',
    section='Normalization_Section',
    label='log normalization?',
    description='Check if User wants the dataset to be log-transformed',
    default='false'
) %}

{% set z_normalization = BoolField(
    name='z_normalization',
    section='Normalization_Section',
    label='Z normalization?',
    description='Check if User wants the dataset to be normalized with Z-normalized method',
    default='true'
) %}

{% set q_normalization = BoolField(
    name='q_normalization',
    section='Normalization_Section',
    label='Quantile normalization?',
    description='Check if User wants the dataset to be normalized with Quantile normalization method',
    default='false'
) %}

In [None]:
%%appyter code_exec
{# Plotting options shown in the 'Select Visualization Parameters' section. #}
{% set interactive_plot = BoolField(
    name='interactive_plot',
    section='Visualization_Section',
    label='Interactive plots?',
    description='Check if User wants interactive plots',
    default='false'
) %}

{% set visualization_method = ChoiceField(
    name='visualization_method',
    section='Visualization_Section',
    label='Visualization Methods',
    description='Select a visualization method',
    choices={'PCA': 'PCA', 'UMAP': 'UMAP', 't-SNE': 't-SNE'},
    default='PCA'
) %}

{% set nr_genes = IntField(
    name='nr_genes',
    section='Visualization_Section',
    label='Genes for Dimension Reduction',
    description='The maximum number of genes for dimension reduction',
    default=2500,
    min=0,
    max=30000
) %}

{% set gene_list_for_clustergrammer = TextField(
    name='gene_list_for_clustergrammer',
    section='Visualization_Section',
    label='Gene List for Clustergrammer (Optional)',
    description='Paste your gene list (One gene per row) for Clustergrammer heatmap plots.',
    default=''
) %}

{% set clustering_topk = IntField(
    name='clustering_topk',
    section='Visualization_Section',
    label='Genes for clustergrammer',
    description='The number of genes with largest variance for Clustergrammer',
    default=800,
    min=0,
    max=1000
) %}

In [None]:
%%appyter code_exec
{# Differential expression, enrichment and small-molecule options shown in the DEG section of the form. #}
{% set diff_gex_method = ChoiceField(
    name='diff_gex_method',
    label='Differential expression analysis method',
    choices={'limma': 'limma','characteristic direction': 'characteristic_direction', 'edgeR': 'edgeR', 'DESeq2': 'DESeq2'},
    default='limma',
    description='Set a method to get differentially expressed genes',
    section='DEG_Section')
%}
{% set diff_gex_plot_method = ChoiceField(
    name='diff_gex_plot_method',
    label='Differential expression analysis plotting method',
    choices={'Volcano plot': 'volcano','MA plot': 'MA_plot'},
    default='Volcano plot',
    description='Set a plot method to see differentially expressed genes. Available for limma/edgeR/DESeq2.',
    section='DEG_Section')
%}


{% set pvalue_threshold = FloatField(
    name='pvalue_threshold',
    label='P-value threshold',
    min=0,
    max=1,
    default=0.05,
    description='Threshold to highlight significantly differentially expressed genes.',
    section='DEG_Section')
%}
{% set logfc_threshold = FloatField(
    name='logfc_threshold',
    label='logFC threshold',
    min=0,
    max=1000,
    default=1.5,
    description='Threshold to highlight differentially expressed genes.',
    section='DEG_Section')
%}

{% set gene_topk = IntField(
    name='gene_topk',
    label='Maximum genes for Enrichr',
    min=0,
    max=1000,
    default=500,
    description='The maximum number of genes discovered by the Characteristic Direction method',
    section='DEG_Section')
%}


{# `description` (singular) is the keyword every other field in this notebook uses for its help text. #}
{% set enrichr_libraries = MultiChoiceField(
    name='enrichr_libraries',
    label='Enrichr Libraries (up to 2)',
    description='Enrichr libraries to be visualized. Select one or two libraries',
    choices=['Gene Ontology',
            'Pathway',
            'Kinase',
            'Transcription Factor',
            'miRNA'],
    default=['Gene Ontology', 'Pathway'],
    section='DEG_Section'
    )
%}


{% set nr_genesets = IntField(
    name='nr_genesets',
    label='Top ranked gene sets',
    min=0,
    max=20,
    default=15,
    description='The number of result gene sets',
    section='DEG_Section')
%}

{% set small_molecule_method = ChoiceField(
    name='small_molecule_method',
    label='Small molecule analysis method',
    choices={'L1000CDS2': 'L1000CDS2','L1000FWD': 'L1000FWD'},
    default='L1000FWD',
    description='Set a small molecule analysis method',
    section='DEG_Section')
%}

{% set l1000_topk = IntField(
    name='l1000_topk',
    label='Genes for L1000CDS2 or L1000FWD',
    min=0,
    max=1000,
    default=500,
    description='The number of genes to L1000CDS2 or L1000FWD',
    section='DEG_Section')
%}

{% set nr_drugs = IntField(
    name='nr_drugs',
    label='Top ranked drugs from L1000CDS2 or L1000FWD',
    min=0,
    max=20,
    default=7,
    description='The number of result drugs',
    section='DEG_Section')
%}


In [None]:
%%appyter code_exec
rnaseq_data_filename = "{{rnaseq_data_filename.value}}"
meta_data_filename = "{{meta_data_filename.value}}"
meta_class_column_name = "{{meta_class_column_name.value}}"
control_name = "{{control_name.value}}"

interactive_plot = {{interactive_plot.value}}
filter_genes = {{filter_genes.value}}
low_expression_threshold = {{low_expression_threshold.value}}


nr_genes = {{nr_genes.value}}
gene_list_for_clustergrammer = "{{gene_list_for_clustergrammer.value}}"
clustering_topk = {{clustering_topk.value}}

diff_gex_method = "{{diff_gex_method.value}}"
diff_gex_plot_method = "{{diff_gex_plot_method.value}}"
pvalue_threshold = {{pvalue_threshold.value}}
logfc_threshold = {{logfc_threshold.value}}
gene_topk = {{gene_topk.value}}
enrichr_libraries = {{enrichr_libraries.value}}
nr_genesets = {{nr_genesets.value}}

small_molecule_method = "{{small_molecule_method.value}}"
l1000_topk = {{l1000_topk.value}}
nr_drugs = {{nr_drugs.value}}

In [None]:
# Global notebook state: silence warnings, seed Python's RNG and activate pandas2ri.
warnings.filterwarnings('ignore')
random.seed(0)
pandas2ri.activate()

# Bookkeeping for the tables, figures and input parameters produced by the notebook.
notebook_metadata = {key: dict() for key in ("tables", "figures", "input_parameters")}

plot_type = 'interactive' if interactive_plot == True else 'static'

# Analysis results keyed by analysis name, plus running numbers for captions.
results = dict()
table_counter = figure_counter = 1

In [None]:
%%appyter code_exec


notebook_metadata["input_parameters"]["rnaseq_data_filename"] = rnaseq_data_filename
notebook_metadata["input_parameters"]["meta_data_filename"] = meta_data_filename
notebook_metadata["input_parameters"]["meta_class_column_name"] = meta_class_column_name
notebook_metadata["input_parameters"]["control_name"] = control_name

notebook_metadata["input_parameters"]["filter_genes"] = filter_genes
notebook_metadata["input_parameters"]["low_expression_threshold"] = low_expression_threshold
notebook_metadata["input_parameters"]["logCPM_normalization"] = {{logCPM_normalization.value}}
notebook_metadata["input_parameters"]["log_normalization"] = {{log_normalization.value}}
notebook_metadata["input_parameters"]["z_normalization"] = {{z_normalization.value}}
notebook_metadata["input_parameters"]["q_normalization"] = {{q_normalization.value}}

notebook_metadata["input_parameters"]["visualization_method"] = "{{visualization_method.value}}"
notebook_metadata["input_parameters"]["nr_genes"] = nr_genes
notebook_metadata["input_parameters"]["gene_list_for_clustergrammer"] = gene_list_for_clustergrammer
notebook_metadata["input_parameters"]["clustering_topk"] = clustering_topk

notebook_metadata["input_parameters"]["diff_gex_method"] = diff_gex_method
notebook_metadata["input_parameters"]["diff_gex_plot_method"] = diff_gex_plot_method
notebook_metadata["input_parameters"]["pvalue_threshold"] = pvalue_threshold
notebook_metadata["input_parameters"]["logfc_threshold"] = logfc_threshold
notebook_metadata["input_parameters"]["gene_topk"] = gene_topk
notebook_metadata["input_parameters"]["enrichr_libraries"] = enrichr_libraries
notebook_metadata["input_parameters"]["nr_genesets"] = nr_genesets
notebook_metadata["input_parameters"]["small_molecule_method"] = small_molecule_method
notebook_metadata["input_parameters"]["l1000_topk"] = l1000_topk
notebook_metadata["input_parameters"]["nr_drugs"] = nr_drugs



# Load datasets

In [None]:
%%appyter code_exec
try:
    check_files(rnaseq_data_filename)
except:
    print("Error! Please load an RNA-seq expression file in txt, tsv or csv format")
    pass
    
try:    
    check_files(meta_data_filename)
except:
    print("Error! Please load a metadata file in txt, tsv or csv format")
    pass

if rnaseq_data_filename.endswith(".csv"):
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0).sort_index()
else:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
if meta_data_filename.endswith(".csv"):
    meta_df = pd.read_csv(meta_data_filename, index_col=0, dtype=str)
else:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
    
meta_df.index = meta_df.index.map(str)
  
# Match samples between the metadata and the datasets
try:
    check_df(meta_df, meta_class_column_name)
except:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")

meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)


expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format. \
    Please check if the index of the expression data are genes and the columns are sample IDs. \
    Sample IDs in the expression data and the metadata should be matched")
dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df

In [None]:
%%appyter markdown
{% if filter_genes.value == True %}
Filter out lowly expressed genes.
{% endif %}

In [None]:
%%appyter code_exec
{% if filter_genes.value == True %}
## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]

## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df
{% endif %}

In [None]:
dataset['dataset_metadata'] = meta_df

# Preview of the current expression table
table_counter, notebook_metadata = display_object(
    table_counter,
    "Raw RNA-seq expression data. The table displays the first 5 rows of the quantified RNA-seq expression dataset. Rows represent genes, columns represent samples, and values show the number of mapped reads.",
    notebook_metadata,
    "raw_exp.csv",
    dataset[current_dataset].head(),
    istable=True,
)

# Preview of the sample metadata
table_counter, notebook_metadata = display_object(
    table_counter,
    "Metadata. The table displays the metadata associated with the samples in the RNA-seq dataset. Rows represent RNA-seq samples, columns represent metadata categories.",
    notebook_metadata,
    "metadata.csv",
    dataset['dataset_metadata'].head(),
    istable=True,
)

# Row counts of the metadata grouped by the class column
table_counter, notebook_metadata = display_object(
    table_counter,
    "Sample size for each class. The table displays the number of samples in each class.",
    notebook_metadata,
    "num_of_samples_in_class.csv",
    dataset['dataset_metadata'].reset_index().groupby(meta_class_column_name).count(),
    istable=True,
)

In [None]:
%%appyter markdown
{% if logCPM_normalization.value == True or log_normalization.value == True or z_normalization.value == True or q_normalization.value == True %}
Normalization methods (
{% if logCPM_normalization.value %}count per million (CPM), {% endif %} {% if log_normalization.value %} log transformation, {% endif %} {% if z_normalization.value %} Z normalization, {% endif %}  {% if q_normalization.value %}quantile normalization {% endif %}) will be applied to convert raw read counts into informative measures of gene expression and remove factors that affect the analysis.
{% endif %}

In [None]:
%%appyter code_exec
{% if logCPM_normalization.value == True or log_normalization.value == True or z_normalization.value == True or q_normalization.value == True %}

dataset, normalization = normalize(dataset, current_dataset, {{logCPM_normalization}}, {{log_normalization}}, {{z_normalization}}, {{q_normalization}})
table_counter, notebook_metadata = display_object(table_counter,"Normalized data. The table displays the expression values after normalization.",  notebook_metadata, "normalized_exp.csv", dataset[normalization].head(), istable=True)
display(create_download_link(dataset[normalization], filename="normalized_exp.csv"))

{% endif %}

# Visualize Samples

In [None]:
%%appyter markdown
{% if visualization_method.value == "PCA" %}
Principal Component Analysis (PCA) (Clark et al. 2011) is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into Principal Components (PCs), a set of linearly uncorrelated features which represent the most relevant sources of variance in the data, and subsequently visualized using a scatter plot.
{% endif %}

In [None]:
%%appyter code_exec
{% if visualization_method.value == "PCA" %}

method = "PCA"
{% elif visualization_method.value == "UMAP"%}

method = "UMAP"
{% elif visualization_method.value == "t-SNE"%}

method = "t-SNE"
{% endif %}

# Run analysis
results[method] = run_dimension_reduction(dataset=dataset, method=method,\
                         nr_genes=nr_genes, normalization=normalization, plot_type=plot_type)
# Display results
plot_name = "{}_plot_of_samples.png".format(method)
figure_counter, notebook_metadata = plot_samples(results[method], meta_class_column_name=meta_class_column_name, counter=figure_counter, plot_name=plot_name, notebook_metadata=notebook_metadata, plot_type=plot_type)


# Clustergrammer

Clustergrammer (Fernandez et al. 2017) is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also makes it possible to identify the genes which contribute to the clustering.

In [None]:
# Build the clustered heatmap and keep its result under results['clustergrammer']
clustergrammer_result = run_clustergrammer(
    dataset=dataset,
    meta_class_column_name=meta_class_column_name,
    nr_genes=clustering_topk,
    normalization=normalization,
    z_score=True,
    gene_list=gene_list_for_clustergrammer,
)
results['clustergrammer'] = clustergrammer_result

# Show the heatmap, then register it with its caption
plot_clustergrammar(clustergrammer_result)
caption = "Clustered heatmap plot. The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment."

figure_counter, notebook_metadata = display_object(
    figure_counter,
    caption,
    notebook_metadata,
    saved_filename=clustergrammer_result,
    istable=False,
)

# Library size analysis

In order to quantify gene expression in an RNA-seq dataset, reads generated from the sequencing step are mapped to a reference genome and subsequently aggregated into numeric gene counts. Due to experimental variations and random technical noise, samples in an RNA-seq dataset often have variable amounts of total RNA. Library size analysis calculates and displays the total number of reads mapped for each sample in the RNA-seq dataset, facilitating the identification of outlying samples and the assessment of the overall quality of the data.

In [None]:
# Column sums of the expression table, stored alongside the metadata
meta_df['sum'] = expr_df.sum().tolist()

plot_name = "library_size_plot.png"

fig = px.histogram(meta_df["sum"])
fig.update_yaxes(title="samples/cells")
fig.update_xaxes(title="reads")

# Static mode asks for the PNG renderer; otherwise the default renderer is used
show_options = {"renderer": "png"} if plot_type == "static" else {}
fig.show(**show_options)

fig.write_image(plot_name)
figure_counter, notebook_metadata = display_object(
    figure_counter,
    "Histogram of the total number of reads mapped for each sample. The figure contains an interactive bar chart which displays the number of samples according to the total number of reads mapped to each RNA-seq sample in the dataset. Additional information for each sample is available by hovering over the bars.",
    notebook_metadata,
    saved_filename=plot_name,
    istable=False,
)

# Differential Gene Expression 

Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods (Ritchie et al. 2015, Clark et al. 2014), which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation. 

In [None]:
# `signatures` maps a comparison label to its differential expression table
signatures = get_signatures(classes, dataset, normalization, diff_gex_method, meta_class_column_name, filter_genes)

for label, signature in signatures.items():
    # The same file name is used for the displayed table and its download link
    deg_filename = "DEG_results_{}.csv".format(label)
    caption = "Differentially expressed genes between {} using {}. The figure displays a browsable table containing the gene expression signature generated from a differential gene expression analysis. Every row of the table represents a gene; the columns display the estimated measures of differential expression.".format(label, diff_gex_method)
    table_counter, notebook_metadata = display_object(table_counter, caption, notebook_metadata, deg_filename, signature, istable=True)
    display(create_download_link(signature, filename=deg_filename))

In [None]:
%%appyter code_exec
{% if diff_gex_method.value == "limma" or diff_gex_method.value == "edgeR" or diff_gex_method.value == "DESeq2"%}
    
{% if diff_gex_plot_method.value == "volcano" %}
results['volcano_plot'] = {}
# Loop through signatures
for label, signature in signatures.items():
    results['volcano_plot'][label] = run_volcano(signature, label, dataset, pvalue_threshold, logfc_threshold, plot_type)
    plot_name = plot_volcano(results['volcano_plot'][label])
    figure_counter, notebook_metadata = display_object(figure_counter, "Volcano plot for {}. The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis. Genes with logFC > {} and p-value < {} in red and genes with logFC < -{} and p-value < {} in blue. Additional information for each gene is available by hovering over it.".format(label, logfc_threshold, pvalue_threshold, logfc_threshold, pvalue_threshold), notebook_metadata, plot_name, istable=False)

{% elif diff_gex_plot_method.value == "MA_plot" %}
# Initialize results
results['ma_plot'] = {}

# Loop through signatures
for label, signature in signatures.items():
    # Run analysis
    results['ma_plot'][label] = run_maplot(signature=signature, signature_label=label, pvalue_threshold=pvalue_threshold, logfc_threshold=logfc_threshold, plot_type=plot_type)
    # Display results
    plot_name = plot_maplot(results['ma_plot'][label])
    figure_counter, notebook_metadata = display_object(figure_counter, "MA plot for {}. The figure contains an interactive scatter plot which displays the average expression and statistical significance of each gene calculated by performing differential gene expression analysis. Genes with logFC > {} and p-value < {} in red and genes with logFC < -{} and p-value < {} in blue. Additional information for each gene is available by hovering over it.".format(label, logfc_threshold, pvalue_threshold, logfc_threshold, pvalue_threshold), notebook_metadata, plot_name, istable=False)

{% endif %}
{% endif %}

# Enrichment Analysis using Enrichr

Enrichment analysis is a statistical procedure used to identify biological terms which are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr (Kuleshov et al. 2016) is a web-based application which allows users to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results.

In [None]:
# Submit the up- and down-regulated genes of every signature to Enrichr.
# NOTE: `results` is deliberately not re-created here; it already holds the
# output of the earlier analysis cells and resetting it would discard them.
results['enrichr'] = {}

# Each differential expression method names its columns differently:
# method -> (fold-change column, column used to rank genes, ascending sort?)
enrichr_sort_settings = {
    "characteristic_direction": ("CD-coefficient", "CD-coefficient", False),
    "limma": ("logFC", "t", False),
    "edgeR": ("logFC", "PValue", True),
    "DESeq2": ("log2FoldChange", "padj", True),
}
# An unsupported method fails here with a KeyError naming it, rather than
# later with an undefined-variable error.
fc_colname, sort_genes_by, ascending = enrichr_sort_settings[diff_gex_method]

enrichr_link_template = "<a href=https://maayanlab.cloud/Enrichr/enrich?dataset={} target=\"_blank\">link to Enrichr</a>"
enrichr_link_dict = dict()
for label, signature in signatures.items():
    # Labels have the form "<control> vs. <case>"; the case name goes into the titles
    case_label = label.split(" vs. ")[1]
    # Run analysis
    results['enrichr'][label] = run_enrichr(signature=signature, signature_label=label, fc_colname=fc_colname, geneset_size=gene_topk, sort_genes_by=sort_genes_by, ascending=ascending)
    title_up = f"Enrichment Analysis Result: {label} (up-regulated in {case_label})"
    title_down = f"Enrichment Analysis Result: {label} (down-regulated in {case_label})"
    enrichr_link_dict[title_up] = {"link": enrichr_link_template.format(results['enrichr'][label]["upregulated"]["shortId"])}
    enrichr_link_dict[title_down] = {"link": enrichr_link_template.format(results['enrichr'][label]["downregulated"]["shortId"])}

# One row per gene set, with a single "link" column
enrichr_link_df = pd.DataFrame.from_dict(enrichr_link_dict).T
table_counter, notebook_metadata = display_object(table_counter, "The table displays links to Enrichr containing the results of enrichment analyses generated by analyzing the up-regulated and down-regulated genes from a differential expression analysis. By clicking on these links, users can interactively explore and download the enrichment results from the Enrichr website.", notebook_metadata=notebook_metadata, saved_filename="enrichr_links.csv", df=enrichr_link_df, ishtml=True)

In [None]:
%%appyter markdown
{% if "Gene Ontology" in enrichr_libraries.value %}
# GO Enrichment Analysis
Gene Ontology (GO) (Ashburner et al. 2000) is a major bioinformatics initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify the biological processes, molecular functions and cellular components which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if "Gene Ontology" in enrichr_libraries.value %}

results['go_enrichment'] = {}
for label, signature in signatures.items():
    # Run analysis
    results['go_enrichment'][label] = get_enrichr_results_by_library(results['enrichr'][label], label, library_type='go', version='2018')
    
for label, signature in signatures.items():
    # Create dataframe
    enrichment_results = results['go_enrichment'][label]
    enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

    # Plot barcharts
    libraries = enrichment_dataframe['gene_set_library'].unique()   
    for gene_set_library in libraries:
        plot_name = "{}_barchart_{}.png".format(gene_set_library, label)
        plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type, plot_name=plot_name) # 10 300
        figure_counter, notebook_metadata = display_object(figure_counter, "Enrichment Analysis Results for {} in Gene Onotology ({}). The figure contains interactive bar charts displaying the results of the Gene Ontology enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label, gene_set_library), notebook_metadata, saved_filename=plot_name, istable=False)
{% endif %}

In [None]:
%%appyter markdown
{% if "Pathway" in enrichr_libraries.value %}
# Pathway Enrichment Analysis
Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG (Kanehisa et al. 2000), Reactome (Croft et al. 2014) and WikiPathways (Kelder et al. 2012) contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if "Pathway" in enrichr_libraries.value %}
# Initialize results
results['pathway_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
    # Run analysis
    results['pathway_enrichment'][label] = get_enrichr_results_by_library(enrichr_results=enrichr_results, signature_label=label, plot_type=plot_type, library_type='pathway', sort_results_by='pvalue')

for label, signature in signatures.items():
    # Create dataframe
    enrichment_results = results['pathway_enrichment'][label]
    enrichment_dataframe = pd.concat([enrichment_results['upregulated'], enrichment_results['downregulated']])

    # Plot barcharts
    libraries = enrichment_dataframe['gene_set_library'].unique()   
    for gene_set_library in libraries:
        # Display results
        plot_name = "{}_barchart_{}.png".format(gene_set_library, label)
        plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type)
        figure_counter, notebook_metadata = display_object(figure_counter, "Enrichment Analysis Results for {} in {}. The figure contains interactive bar charts displaying the results of the pathway enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.".format(label, gene_set_library), notebook_metadata, saved_filename=plot_name, istable=False)
{% endif %}

In [None]:
%%appyter markdown
{% if "Transcription Factor" in enrichr_libraries.value %}
# Transcription Factor Enrichment Analysis
Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA (Lachmann et al. 2010) and ENCODE (Consortium, 2004) contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if "Transcription Factor" in enrichr_libraries.value %}
# Initialize results
results['tf_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
#     # Run analysis
    results['tf_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=enrichr_results, signature_label=label)
    table_counter, notebook_metadata = display_table(results['tf_enrichment'][label], "Transcription Factor", notebook_metadata, table_counter)
    
{% endif %}   

In [None]:
%%appyter markdown
{% if "Kinase" in enrichr_libraries.value %}
# Kinase Enrichment Analysis
Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA (Lachmann et al. 2009) contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if "Kinase" in enrichr_libraries.value %}
# Initialize results
results['kinase_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
    # Run analysis
    results['kinase_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=enrichr_results, signature_label=label, library_type="ke")

    # Display results
    table_counter, notebook_metadata = display_table(results['kinase_enrichment'][label], "Kinase", notebook_metadata, table_counter)
{% endif %}

In [None]:
%%appyter markdown
{% if "miRNA" in enrichr_libraries.value %}
# miRNA Enrichment Analysis
microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan (Agarwal et al. 2015) and MiRTarBase (Chou et al. 2016) contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.
{% endif %}

In [None]:
%%appyter code_exec
{% if "miRNA" in enrichr_libraries.value %}

results['mirna_enrichment'] = {}

# Loop through results
for label, enrichr_results in results['enrichr'].items():
    # Run analysis
    results['mirna_enrichment'][label] = get_enrichr_result_tables_by_library(enrichr_results=enrichr_results, signature_label=label, library_type="mirna")

    # Display results
    table_counter, notebook_metadata = display_table(results['mirna_enrichment'][label], "miRNA", notebook_metadata, table_counter)
{% endif %}

In [None]:
%%appyter markdown
{% if small_molecule_method.value == "L1000CDS2" %}
# L1000CDS2 Query
L1000CDS2 (Duan et al. 2016) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project. It is commonly used to identify small molecules which mimic or reverse the effects of a gene expression signature generated from a differential gene expression analysis.
{% endif %}

In [None]:
%%appyter code_exec
{% if small_molecule_method.value == "L1000CDS2" %}
# Initialize results
results['l1000cds2'] = {}

# Loop through signatures
for label, signature in signatures.items(): 
    # Run analysis
    results['l1000cds2'][label] = run_l1000cds2(signature=signature, nr_genes=l1000_topk, signature_label=label, plot_type=plot_type)

    # Display results
    plot_name = "L1000CDS2_{}.png".format(label)
    figure_counter, notebook_metadata = plot_l1000cds2(results['l1000cds2'][label], counter=figure_counter, nr_drugs=nr_drugs, notebook_metadata=notebook_metadata, plot_name=plot_name)
{% endif %}    

In [None]:
%%appyter markdown
{% if small_molecule_method.value == "L1000FWD" %}
# L1000FWD Query
L1000FWD (Wang et al. 2018) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project.
{% endif %}

In [None]:
%%appyter code_exec
{% if small_molecule_method.value == "L1000FWD" %}
# Initialize results
results['l1000fwd'] = {}

# Loop through signatures
for label, signature in signatures.items():
    display(Markdown("*L1000FWD for {}*".format(label)))
    
    # Run analysis
    results['l1000fwd'][label] = run_l1000fwd(signature=signature, signature_label=label, nr_genes=l1000_topk)

    # Display results
    figure_counter, table_counter, notebook_metadata = plot_l1000fwd(results['l1000fwd'][label], figure_counter=figure_counter, table_counter=table_counter, notebook_metadata=notebook_metadata)
{% endif %}

In [None]:
# Write the accumulated notebook metadata as JSON to notebook_metadata.json
# (relative path, so it lands in the current working directory)
with open("notebook_metadata.json", mode="w") as fw:
    fw.write(json.dumps(notebook_metadata))

# References

Agarwal, Vikram, et al. "Predicting effective microRNA target sites in mammalian mRNAs." elife 4 (2015): e05005.
<br>
Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.
<br>
Chou, Chih-Hung, et al. "miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database." Nucleic acids research 44.D1 (2016): D239-D247.
<br>
Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.
<br>
Clark, Neil R., et al. "The characteristic direction: a geometrical approach to identify differentially expressed genes." BMC bioinformatics 15.1 (2014): 79.
<br>
Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.
<br>
Croft, David, et al. "The Reactome pathway knowledgebase." Nucleic acids research 42.D1 (2014): D472-D477.
<br>
Duan, Q., et al. "L1000CDS2: LINCS L1000 characteristic direction signatures search engine." NPJ systems biology and applications 2 (2016): 16015.
<br>
Fernandez, Nicolas F., et al. "Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data." Scientific data 4 (2017): 170151.
<br>
Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.
<br>
Kelder, Thomas, et al. "WikiPathways: building research communities on biological pathways." Nucleic acids research 40.D1 (2012): D1301-D1307.
<br>
Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.
<br>
Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.
<br>
Lachmann, Alexander, and Avi Ma'ayan. "KEA: kinase enrichment analysis." Bioinformatics 25.5 (2009): 684-686.
<br>
Ritchie, Matthew E., et al. "limma powers differential expression analyses for RNA-sequencing and microarray studies." Nucleic acids research 43.7 (2015): e47-e47.
<br>
Wang, Zichen, et al. "L1000FWD: fireworks visualization of drug-induced transcriptomic signatures." Bioinformatics 34.12 (2018): 2150-2152.