# Independent Enrichment Analysis

This Appyter performs enrichment analysis of a given input signature against a library of background signatures. It also visualizes the enrichment analysis results.

In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
from maayanlab_bioinformatics.enrichment.crisp import enrich_crisp, fisher_overlap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display, FileLink, Markdown, HTML

# Manhattan Plot Imports
import matplotlib.patches as mpatches
import matplotlib.cm as cm

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Span
from bokeh.layouts import layout, row, column, gridplot
from bokeh.palettes import all_palettes

import base64

In [None]:
%%appyter hide_code_exec
{% do SectionField(
    name='Signature_Section',
    title='Submit Your Gene Set',
    subtitle='Upload a text file containing your gene set or copy and paste your gene set into the text box below (one gene per row). You can also try the default gene set provided.',
    img='analysis.png'
    
) %}
{% do SectionField(
    name='Library_Section',
    title='Submit Your Gene Set Library',
    subtitle='Load your gene set library. Upload a GMT file containing your gene signatures. You can also load the default gene set library.',
    img='analysis.png'
    
) %}


In [None]:
%%appyter hide_code

{% set gene_set_kind = TabField(
    name='gene_set_kind',
    label='Gene Set',
    default='Paste',
    description='Paste or upload your gene set',
    choices={
        'Paste': [
            TextField(
                name='gene_set_input',
                label='Gene Set',
                default='hexachlorophene\nlopinavir\nbazedoxifene\nabemaciclib\ncamostat\nmefloquine\ncyclosporine\nanidulafungin\nchloroquine\namodiaquine\nloperamide\nalmitrine\nhydroxychloroquine\nniclosamide\nivacaftor\nproscillaridin\nremdesivir',
                description='Paste your gene set (one gene per row). Gene names in the gene set should match the gene names in the GMT file.',
                section = 'Signature_Section'
            ),
        ],
        'Upload': [
            FileField(
                name='gene_set_filename',
                label='Gene Set File',
                default='',
                description='Upload your gene set as a text file (one gene per row). Gene names in the gene set should match the gene names in the GMT file.',
                section = 'Signature_Section'
            ),
        ],
    },
    section = 'Signature_Section',
) %}

In [None]:
%%appyter code_exec
{% set library_filename = FileField(
    name='library_filename', 
    label='Library file (.gmt or .txt)', 
    default='L1000FWD_GO_Biological_Processes_Down.txt',
    examples={'L1000FWD_GO_Biological_Processes_Down.txt': "https://maayanlab.cloud/DrugEnrichr/geneSetLibrary?mode=text&libraryName=L1000FWD_GO_Biological_Processes_Down"}, 
    description='A tab-delimited file format that describes gene sets. Visit https://bit.ly/35crtXQ for more information.', 
    section='Library_Section')

%}

In [None]:
%%appyter code_exec

{%- if gene_set_kind.raw_value == 'Paste' %}
# 'Paste' tab selected: bind the value of the tab's first field (the pasted text).
gene_set_input = {{ gene_set_kind.value[0] }}
{%- else %}
# Other tab selected: bind the value of the tab's first field (the uploaded file).
gene_set_filename = {{ gene_set_kind.value[0] }}
{%- endif %}
# Gene set library file selected in the form.
library_filename = "{{library_filename.value}}"
# Display name used in captions and plot titles: underscores become spaces and
# every ".txt" / ".gmt" substring is removed from the file name.
library_name = library_filename.replace("_", " ").replace(".txt", "").replace(".gmt", "")

In [None]:
# Configure Bokeh (imported from bokeh.io above) for notebook output.
output_notebook()
# Table Parameters
# NOTE(review): significance_value is only referenced by a commented-out
# filter in the results-table cell of this notebook; the table is limited by
# display_topk instead.
significance_value = 0.05
# Maximum number of rows shown in the results table.
display_topk = 20

# Bar Chart Parameters
# File extensions the bar chart is saved with (one output file per entry).
figure_file_format = ['png', 'svg']
# Base name (without extension) of the saved bar chart files.
output_file_name = 'Enrichment_analysis_results_bar'
# Color passed to enrichr_figure as the bar color.
color = 'lightskyblue'
# '<output_file_name>.<extension>' for every entry of figure_file_format,
# in the same order.
final_output_file_names = ['{0}.{1}'.format(output_file_name, file_type) for file_type in figure_file_format]
# Number of top terms drawn in the bar chart.
topk = 10

# Manhattan Plot Parameters
# Palette for the Manhattan plot; manhattan() draws with the first entry.
manhattan_colors = ['#003f5c', '#7a5195', '#ef5675', '#ffa600']

In [None]:
%%appyter code_exec

{%- if gene_set_kind.raw_value == 'Paste' %}
genes = gene_set_input.split('\n')
genes = [x.strip() for x in genes]
{%- else %}
open_gene_set_file = open(gene_set_filename,'r')
lines = open_gene_set_file.readlines()
genes = [x.strip() for x in lines]
open_gene_set_file.close()
{%- endif %}

In [None]:
def load_library(library_filename):
    """Read a gene set library from a tab-delimited (GMT-style) text file.

    Every non-blank line is stripped and split on tabs. The first field is
    the gene set name, the second field is skipped, and the remaining fields
    are the members of the set.

    Args:
        library_filename: Path of the library file to read.

    Returns:
        A tuple ``(library_data, geneset_names)``. ``library_data`` is a list
        of ``[name, members]`` pairs where ``members`` is the member fields
        joined by single spaces; ``geneset_names`` lists the names in file
        order.
    """
    library_data = []
    geneset_names = []
    with open(library_filename, "r") as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            fields = line.strip().split("\t")
            name = fields[0]
            if not name:
                # A blank or whitespace-only line would otherwise be recorded
                # as a gene set with an empty name and no members.
                continue
            library_data.append([name, " ".join(fields[2:])])
            geneset_names.append(name)

    return library_data, geneset_names


# Enrichment analysis
def get_library_iter(library_data):
    """Yield ``(term, gene_set)`` pairs from loaded library data.

    Each entry is read as ``entry[0]`` (the term) and ``entry[1]`` (a string
    of members, which is split on single spaces into a list).
    """
    for entry in library_data:
        yield entry[0], entry[1].split(' ')

def get_enrichment_results(genes, library_data):
    """Run ``enrich_crisp`` for ``genes`` over the library, sorted by p-value.

    Returns a list of the items produced by ``enrich_crisp``, ordered by the
    ``pvalue`` attribute of each item's second element (ascending).
    """
    def by_pvalue(result):
        return result[1].pvalue

    # 20000 and True are passed positionally as the 3rd and 4th arguments --
    # presumably the background size and an overlap-preserving flag; confirm
    # against the enrich_crisp documentation.
    results = list(enrich_crisp(genes, get_library_iter(library_data), 20000, True))
    results.sort(key=by_pvalue)
    return results

def get_pvalue(row, unzipped_results, all_results):
    """Return the p-value recorded for ``row['Name']``, or 1 if it is absent.

    The name is searched in ``unzipped_results[0]``; the p-value is read from
    the ``pvalue`` attribute of the second element of the matching entry in
    ``all_results``.
    """
    names = list(unzipped_results[0])
    try:
        position = names.index(row['Name'])
    except ValueError:
        # Name not present among the results.
        return 1
    return all_results[position][1].pvalue
    
# Unpack enrichment result objects into parallel lists of their fields
def get_values(obj_list):
    """Split result objects into four parallel lists.

    Returns a tuple ``(pvals, odds_ratio, n_overlap, overlap)`` holding, in
    input order, the ``pvalue``, ``odds_ratio``, ``n_overlap`` and ``overlap``
    attribute of every object in ``obj_list``.
    """
    # Materialize once so one-shot iterables can be traversed per attribute.
    records = list(obj_list)
    pvals = [record.pvalue for record in records]
    odds_ratio = [record.odds_ratio for record in records]
    n_overlap = [record.n_overlap for record in records]
    overlap = [record.overlap for record in records]
    return pvals, odds_ratio, n_overlap, overlap


def enrichment_analysis(genes, library_filename):
    """Run the enrichment analysis of ``genes`` against a library file.

    Args:
        genes: The input gene set.
        library_filename: Path of the gene set library file.

    Returns:
        A tuple ``(terms, pvalues, df)``: ``terms`` is a one-element list
        holding the list of gene set names, ``pvalues`` is a one-element list
        holding the matching p-values (both in ascending p-value order), and
        ``df`` is a DataFrame with the columns "Gene Set", "p value",
        "odds_ratio", "n_overlap", "overlap" and "-log(p value)".
        When the enrichment produces no results, the inner lists and the
        DataFrame are empty.
    """
    library_data, _ = load_library(library_filename)
    all_results = get_enrichment_results(genes, library_data)
    if all_results:
        unzipped_results = list(zip(*all_results))
        terms = list(unzipped_results[0])
        stats = unzipped_results[1]
    else:
        # zip(*[]) yields nothing, so indexing the unzipped results would
        # raise IndexError; fall back to empty columns instead.
        terms, stats = [], []
    pvals, odds_ratio, n_overlap, overlap = get_values(stats)
    df = pd.DataFrame({
        "Gene Set": terms,
        "p value": pvals,
        "odds_ratio": odds_ratio,
        "n_overlap": n_overlap,
        "overlap": overlap,
    })
    # astype(float) keeps the log well-defined for an empty (object) column.
    df["-log(p value)"] = -np.log10(df["p value"].astype(float))
    return [terms], [pvals], df

# Build a download link that serves a table as a CSV file
def create_download_link(df, title = "Download CSV file of this table", filename = "data.csv"):
    """Return an ``HTML`` anchor that downloads ``df`` as a CSV file.

    The table is serialized with ``df.to_csv(index=False)``, base64-encoded
    and embedded in the link as a ``data:text/csv`` URI.

    Args:
        df: Table to export.
        title: Text of the link.
        filename: Value of the anchor's ``download`` attribute.
    """
    encoded_csv = base64.b64encode(df.to_csv(index = False).encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link_template.format(filename=filename, payload=encoded_csv, title=title))


In [None]:
# Bar Chart Functions
def enrichr_figure(all_terms, all_pvalues, plot_names, all_libraries, bar_color, topk=10, significance=0.05):
    """Draw, save and show a horizontal bar chart of the top enriched terms.

    Only the first entry (index 0) of ``all_terms``, ``all_pvalues`` and
    ``all_libraries`` is plotted. Bars have length -log10(p-value); each bar
    is annotated with its term and p-value, and p-values below
    ``significance`` are marked with ``*``.

    Args:
        all_terms: List whose first element is the list of term names.
        all_pvalues: List whose first element is the list of p-values,
            parallel to the terms.
        plot_names: File names the figure is saved to via ``plt.savefig``.
        all_libraries: List whose first element is used as the plot title.
        bar_color: Color of bars whose p-value is below ``significance``.
        topk: Number of leading terms to plot.
        significance: p-value threshold below which a term is highlighted.

    Raises:
        ValueError: If there are no terms or p-values to plot.
    """
    terms = list(all_terms[0][:topk])
    pvalues = list(all_pvalues[0][:topk])
    if not terms or not pvalues:
        # Fail before an empty figure is created and saved.
        raise ValueError("No enrichment results to plot.")

    # Bar colors: non-significant bars are grey, unless the significant
    # color is itself grey, in which case they are outlined white bars.
    if bar_color != 'lightgrey':
        bar_color_not_sig = 'lightgrey'
        edgecolor = None
        linewidth = 0
    else:
        bar_color_not_sig = 'white'
        edgecolor = 'black'
        linewidth = 1

    neg_log_pvalues = -np.log10(pvalues)
    max_neg_log = max(neg_log_pvalues)
    bar_colors = [bar_color if p < significance else bar_color_not_sig for p in pvalues]

    plt.figure(figsize=(24, 12))
    ax = sns.barplot(x=neg_log_pvalues, y=terms, palette=bar_colors, edgecolor=edgecolor, linewidth=linewidth)
    # Term names are written on top of the bars instead of on the y-axis.
    ax.axes.get_yaxis().set_visible(False)
    ax.set_title(all_libraries[0], fontsize=26)
    ax.set_xlabel('−log₁₀(p‐value)', fontsize=25)
    ax.tick_params(axis='x', which='major', labelsize=20)
    if max_neg_log < 1:
        # Use finer ticks when every bar is shorter than one unit.
        ax.xaxis.set_ticks(np.arange(0, max_neg_log, 0.1))

    # Small left offset for the labels, proportional to the x-axis extent.
    text_x = max(ax.axes.get_xlim()) / 200
    for row, (term, pvalue) in enumerate(zip(terms, pvalues)):
        separator = '  *' if pvalue < significance else '  '
        label = separator.join([term, np.format_float_scientific(pvalue, precision=2)])
        ax.text(text_x, row, label, ha='left', wrap=True, fontsize=26)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    # Save results
    for plot_name in plot_names:
        plt.savefig(plot_name, bbox_inches='tight')

    # Show plot
    plt.show()

In [None]:
# Create Manhattan Plots
def manhattan(df):
    """Build a Bokeh Manhattan-style scatter plot of the enrichment results.

    Gene sets are placed along a categorical x-axis in sorted name order and
    plotted against their "-log(p value)" value. Hovering a point shows the
    gene set name and its p-value.

    Args:
        df: DataFrame with the columns "Gene Set" and "-log(p value)".

    Returns:
        The configured Bokeh figure (not yet shown).
    """
    df = df.sort_values("Gene Set")
    neg_log_pvalues = df["-log(p value)"].values.tolist()

    # define the output figure and the features we want
    p = figure(x_range = df["Gene Set"].values.tolist(), plot_height=300, plot_width=750, tools='pan, box_zoom, hover, reset, save')

    # p-values for the hover tooltip, recovered from -log(p value) and
    # formatted as strings in scientific notation
    actual_pvalues = ["{:.5e}".format(10**(-1*log_value)) for log_value in neg_log_pvalues]

    # define ColumnDataSource with our data for this library
    source = ColumnDataSource(data=dict(
        x = df["Gene Set"].values.tolist(),
        y = neg_log_pvalues,
        pvalue = actual_pvalues,
    ))

    # a single library is plotted, so only the first palette color is used
    point_color = manhattan_colors[0]
    p.circle(x = 'x', y = 'y', size=5, fill_color=point_color, line_color = point_color, line_width=1, source = source)

    p.background_fill_color = 'white'
    # hide x-axis ticks and labels: there is one category per gene set
    p.xaxis.major_tick_line_color = None
    p.xaxis.major_label_text_font_size = '0pt'
    p.y_range.start = 0
    p.yaxis.axis_label = '-log(p value)'

    p.hover.tooltips = [
        ("Gene Set", "@x"),
        ("p value", "@pvalue"),
    ]
    p.output_backend = "svg"

    # returns the plot
    return p

# Enrichment Analysis

Enrichment analysis is a statistical procedure used to identify gene sets from a library that are over-represented in a given input gene set. The table below displays the top 20 enrichment analysis results for the given gene set library. The table contains the gene set names, p-values, odds ratios, the number of overlapping genes, the overlapping genes, and the -log(p value) values. The table is sorted by p-value in ascending order. The full results are downloadable in CSV format.

In [None]:
results, pvals, results_df = enrichment_analysis(genes, library_filename)

In [None]:
if 'p value' in results_df.columns:
    sorted_df = results_df.sort_values(by='p value')
    # Only the first display_topk rows are shown; no significance filter is
    # applied. The download link still exports the full sorted table.
    filtered_df = sorted_df.iloc[:display_topk]
    if len(filtered_df) > 0:
        table_html = filtered_df.to_html(index=False)
        table_caption = f"*Table 1. Enrichment analysis results of {library_name}*"
        display(HTML(table_html))
        display(Markdown(table_caption))
        display(create_download_link(sorted_df))

# Bar Chart

In [None]:
display(Markdown(f"The bar chart below shows the top {topk} enriched terms in a given gene set library. Colored bars correspond to terms with significant p-values (<0.05). The bar chart is downloadable as an image in the PNG and SVG formats. "))

In [None]:
enrichr_figure(results, pvals, final_output_file_names, [library_name], color, topk)
figure_caption = f"*Figure 1. Bar chart of the top {topk} enriched terms in {library_name}, along with their corresponding p-values. Colored bars correspond to terms with significant p-values (<0.05).*"
display(Markdown(figure_caption))

# Download links for the bar chart, one per saved image format
for file_format, file in zip(figure_file_format, final_output_file_names):
    display(FileLink(file, result_html_prefix='Download ' + file_format + ': '))
    


# Manhattan Plot

In the Manhattan plot below, each position on the x-axis denotes a single gene set from the library, while the y-axis shows the −log₁₀(p‐value) for each gene set. Hovering over a point will display the name of the gene set and the associated p-value. You can also zoom, pan, and save the plot as an SVG file using the toolbar on the right.

In [None]:
show(manhattan(results_df))
display(Markdown(f"*Figure 2. Manhattan plot that displays gene sets from {library_name} and their p-values on a -log10 scale.*"))     