# GSEA Appyter
### This appyter performs gene set enrichment analysis. A table will display the enrichment statistics for the top gene sets chosen by the input criteria (with the full results available for download as a CSV file), and an interactive Enrichment Plot and Hit Indices Plot pair will also be generated for these sets.

In [None]:
#%%appyter init
from appyter import magic
# Hand appyter a zero-argument callable: the lambda's default argument binds
# the built-in `globals`, so calling the lambda with no arguments returns the
# result of globals().
# NOTE(review): presumably appyter uses that namespace to make the declared
# form fields available to later cells - confirm against the appyter docs.
magic.init(lambda _=globals: _())

In [None]:
#imports
import os
import numpy as np
import pandas as pd
import scipy
import math
from decimal import Decimal

#loading Enrichr libraries
import requests
import urllib.request

#performing GSEA
import gseapy as gp
from gseapy.gsea import GSEA

#creating visualizations
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display, HTML, Markdown, FileLink

In [None]:
%%appyter hide
{# Declares the four sections of the input form: 'gsea' (introduction), 'library', 'data' and 'result'.
   The fields defined in the following cells attach to these sections through their `section` argument. #}
{% do SectionField(
    name='gsea', 
    title='Gene Set Enrichment Analysis', 
    subtitle='Perform GSEA by inputting the following files. Follow the link to learn about the proper data formats: https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats.',
    img='plot_icon.png'
) %}
{% do SectionField(
    name='library', 
    title='Library', 
    subtitle='Choose an Enrichr gene set library or upload your own (.gmt).',
    img='library_icon.png'
) %}
{% do SectionField(
    name='data', 
    title='Data Submission', 
    subtitle='Option 1: Upload a preranked gene list (.rnk). Option 2: Upload an expression dataset with gene counts (.gct) and a phenotype labels file (.cls), and then choose a ranking method. Please make sure that the gene symbols used in the expression file match the types used in the library file for the appyter to work properly.',
    img='data_icon.png'
) %}
{% do SectionField(
    name='result', 
    title='Result Options', 
    subtitle='Choose how to get the top gene sets. Note: the appyter may not be able to support showing a large amount of gene sets.',
    img='results_icon.png'
) %}

In [None]:
%%appyter hide
{# Gene set library input, shown in the 'library' section as two tabs:
   'Upload' takes a user-supplied GMT file, and 'Select an Enrichr Library' (the default tab)
   offers the Enrichr library names listed below. #}
{% set library_tab = TabField(
    name='library_tab',
    label='Submit Your Library',
    default='Select an Enrichr Library',
    description='',
    choices={'Upload':[FileField(
            name='geneset_filename',
            label='Library',
            description='GMT file containing sample gene set.',
            default='',
            examples={
        'Descartes_Cell_Types_and_Tissue_2021.gmt': 'https://appyters.maayanlab.cloud/storage/GSEA_Appyter/Descartes_Cell_Types_and_Tissue_2021.gmt'}
            )],
        'Select an Enrichr Library': [ChoiceField(
            name='enrichr_choice',
            label='Enrichr Library',
            default='Data_Acquisition_Method_Most_Popular_Genes',
            description='',
            choices=['ARCHS4_Cell-lines',
            'ARCHS4_IDG_Coexp',
            'ARCHS4_Kinases_Coexp',
            'ARCHS4_TFs_Coexp',
            'ARCHS4_Tissues',
            'Achilles_fitness_decrease',
            'Achilles_fitness_increase',
            'Aging_Perturbations_from_GEO_down',
            'Aging_Perturbations_from_GEO_up',
            'Allen_Brain_Atlas_10x_scRNA_2021',
            'Allen_Brain_Atlas_down',
            'Allen_Brain_Atlas_up',
            'BioCarta_2013',
            'BioCarta_2015',
            'BioCarta_2016',
            'BioPlanet_2019',
            'BioPlex_2017',
            'CCLE_Proteomics_2020',
            'CORUM',
            'COVID-19_Related_Gene_Sets',
            'Cancer_Cell_Line_Encyclopedia',
            'ChEA_2013',
            'ChEA_2015',
            'ChEA_2016',
            'Chromosome_Location',
            'Chromosome_Location_hg19',
            'ClinVar_2019',
            'DSigDB',
            'Data_Acquisition_Method_Most_Popular_Genes',
            'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
            'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
            'Descartes_Cell_Types_and_Tissue_2021',
            'DisGeNET',
            'Disease_Perturbations_from_GEO_down',
            'Disease_Perturbations_from_GEO_up',
            'Disease_Signatures_from_GEO_down_2014',
            'Disease_Signatures_from_GEO_up_2014',
            'DrugMatrix',
            'Drug_Perturbations_from_GEO_2014',
            'Drug_Perturbations_from_GEO_down',
            'Drug_Perturbations_from_GEO_up',
            'ENCODE_Histone_Modifications_2013',
            'ENCODE_Histone_Modifications_2015',
            'ENCODE_TF_ChIP-seq_2014',
            'ENCODE_TF_ChIP-seq_2015',
            'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
            'ESCAPE',
            'Elsevier_Pathway_Collection',
            'Enrichr_Libraries_Most_Popular_Genes',
            'Enrichr_Submissions_TF-Gene_Coocurrence',
            'Enrichr_Users_Contributed_Lists_2020',
            'Epigenomics_Roadmap_HM_ChIP-seq',
            'GO_Biological_Process_2013',
            'GO_Biological_Process_2015',
            'GO_Biological_Process_2017',
            'GO_Biological_Process_2017b',
            'GO_Biological_Process_2018',
            'GO_Cellular_Component_2013',
            'GO_Cellular_Component_2015',
            'GO_Cellular_Component_2017',
            'GO_Cellular_Component_2017b',
            'GO_Cellular_Component_2018',
            'GO_Molecular_Function_2013',
            'GO_Molecular_Function_2015',
            'GO_Molecular_Function_2017',
            'GO_Molecular_Function_2017b',
            'GO_Molecular_Function_2018',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
            'GWAS_Catalog_2019',
            'GeneSigDB',
            'Gene_Perturbations_from_GEO_down',
            'Gene_Perturbations_from_GEO_up',
            'Genes_Associated_with_NIH_Grants',
            'Genome_Browser_PWMs',
            'HMDB_Metabolites',
            'HMS_LINCS_KinomeScan',
            'HomoloGene',
            'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression',
            'HumanCyc_2015',
            'HumanCyc_2016',
            'Human_Gene_Atlas',
            'Human_Phenotype_Ontology',
            'InterPro_Domains_2019',
            'Jensen_COMPARTMENTS',
            'Jensen_DISEASES',
            'Jensen_TISSUES',
            'KEA_2013',
            'KEA_2015',
            'KEGG_2013',
            'KEGG_2015',
            'KEGG_2016',
            'KEGG_2019_Human',
            'KEGG_2019_Mouse',
            'KEGG_2021_Human',
            'Kinase_Perturbations_from_GEO_down',
            'Kinase_Perturbations_from_GEO_up',
            'L1000_Kinase_and_GPCR_Perturbations_down',
            'L1000_Kinase_and_GPCR_Perturbations_up',
            'LINCS_L1000_Chem_Pert_down',
            'LINCS_L1000_Chem_Pert_up',
            'LINCS_L1000_Ligand_Perturbations_down',
            'LINCS_L1000_Ligand_Perturbations_up',
            'Ligand_Perturbations_from_GEO_down',
            'Ligand_Perturbations_from_GEO_up',
            'MCF7_Perturbations_from_GEO_down',
            'MCF7_Perturbations_from_GEO_up',
            'MGI_Mammalian_Phenotype_2013',
            'MGI_Mammalian_Phenotype_2017',
            'MGI_Mammalian_Phenotype_Level_3',
            'MGI_Mammalian_Phenotype_Level_4',
            'MGI_Mammalian_Phenotype_Level_4_2019',
            'MSigDB_Computational',
            'MSigDB_Hallmark_2020',
            'MSigDB_Oncogenic_Signatures',
            'Microbe_Perturbations_from_GEO_down',
            'Microbe_Perturbations_from_GEO_up',
            'Mouse_Gene_Atlas',
            'NCI-60_Cancer_Cell_Lines',
            'NCI-Nature_2015',
            'NCI-Nature_2016',
            'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
            'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
            'NIH_Funded_PIs_2017_Human_AutoRIF',
            'NIH_Funded_PIs_2017_Human_GeneRIF',
            'NURSA_Human_Endogenous_Complexome',
            'OMIM_Disease',
            'OMIM_Expanded',
            'Old_CMAP_down',
            'Old_CMAP_up',
            'PPI_Hub_Proteins',
            'Panther_2015',
            'Panther_2016',
            'Pfam_Domains_2019',
            'Pfam_InterPro_Domains',
            'PheWeb_2019',
            'Phosphatase_Substrates_from_DEPOD',
            'ProteomicsDB_2020',
            'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
            'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
            'Rare_Diseases_AutoRIF_Gene_Lists',
            'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
            'Rare_Diseases_GeneRIF_Gene_Lists',
            'Reactome_2013',
            'Reactome_2015',
            'Reactome_2016',
            'SILAC_Phosphoproteomics',
            'SubCell_BarCode',
            'SysMyo_Muscle_Gene_Sets',
            'TF-LOF_Expression_from_GEO',
            'TF_Perturbations_Followed_by_Expression',
            'TG_GATES_2020',
            'TRANSFAC_and_JASPAR_PWMs',
            'TRRUST_Transcription_Factors_2019',
            'Table_Mining_of_CRISPR_Studies',
            'TargetScan_microRNA',
            'TargetScan_microRNA_2017',
            'Tissue_Protein_Expression_from_Human_Proteome_Map',
            'Tissue_Protein_Expression_from_ProteomicsDB',
            'Transcription_Factor_PPIs',
            'UK_Biobank_GWAS_v1',
            'Virus-Host_PPI_P-HIPSTer_2020',
            'VirusMINT',
            'Virus_Perturbations_from_GEO_down',
            'Virus_Perturbations_from_GEO_up',
            'WikiPathway_2021_Human',
            'WikiPathways_2013',
            'WikiPathways_2015',
            'WikiPathways_2016',
            'WikiPathways_2019_Human',
            'WikiPathways_2019_Mouse',
            'dbGaP',
            'huMAP',
            'lncHUB_lncRNA_Co-Expression',
            'miRTarBase_2017'])]
        },
    section='library')
%}

In [None]:
%%appyter hide
{# Data input, shown in the 'data' section as two tabs:
   'Option 1' (the default tab) takes a preranked gene list (.rnk);
   'Option 2' takes an expression dataset (.gct), a phenotype labels file (.cls)
   and the ranking method used to turn the expression data into a ranked list. #}
{% set data_tab = TabField(
    name='data_tab',
    label='Submit Your Data',
    default='Option 1',
    description='',
    choices={
        'Option 1': [FileField(
                name='ranked_filename',
                label='Preranked Gene List',
                default='logFC_ranked_GSE70466.rnk',
                examples={'logFC_ranked_GSE70466.rnk': 'https://appyters.maayanlab.cloud/storage/GSEA_Appyter/logFC_ranked_GSE70466.rnk'},
                description='RNK file containing preranked genelist.',
                section = 'data'
            )],
        'Option 2': [FileField(
                name='expression_filename',
                label='Expression Dataset',
                description='GCT file containing sample expression dataset.',
                default='GSE70466.gct',
                examples={
        'GSE70466.gct': 'https://appyters.maayanlab.cloud/storage/GSEA_Appyter/GSE70466.gct'},
                section='data'),
            FileField(
                name='phenotype_filename',
                label='Phenotype Labels',
                description='CLS file containing sample phenotype labels.',
                default='GSE70466.cls',
                examples={
        'GSE70466.cls': 'https://appyters.maayanlab.cloud/storage/GSEA_Appyter/GSE70466.cls'},
                section='data'),
            ChoiceField(
                name='ranking_method',
                label='Method for Ranking Genes',
                choices={'Log2 Ratio of Class Means': 'log2_ratio_of_classes', 'Difference of Class Means': 'diff_of_classes', 'Ratio of Class Means (Fold Change)': 'ratio_of_classes', 'T-test': 't_test', 'Signal-to-Noise': 'signal_to_noise'},
                default='Log2 Ratio of Class Means',
                description='The method used to calculate a correlation or ranking.',
                section='data')]
            },
        section='data')
    %}

In [None]:
%%appyter hide
{# Result options, shown in the 'result' section: how many top gene sets to display (1 to 50, default 5)
   and which statistic column ('es', 'nes', 'pval' or 'fdr') is used to pick them. #}
{% set result_number = IntField(
    name='result_number',
    label='Number of Top Gene Sets',
    min=1,
    max=50,
    default=5,
    description='The number of gene sets that will be displayed in the enrichment results table and plots.',
    section='result')
%}
{% set result_criteria = ChoiceField(
    name='result_criteria',
    label='Criteria for Top Gene Sets',
    choices={'Enrichment Score': 'es', 'Normalized Enrichment Score': 'nes', 'P-Value': 'pval', 'False Discovery Rate': 'fdr'},
    default='P-Value',
    description='The top gene sets will be calculated with the chosen criteria.',
    section='result')
%}

In [None]:
%%appyter code_exec
{# Renders the form selections into plain Python assignments used by the analysis cells below.
   Only the branch matching the selected tab is emitted, so the Python code sees either
   ranked_filename (Option 1) or expression_filename / phenotype_filename / ranking_method (Option 2).
   File and integer values are emitted without surrounding quotes while choice values are quoted here;
   presumably a file field renders itself as a quoted string literal - TODO confirm. #}
{%- if library_tab.raw_value == 'Upload'%}
library_tab = 'Upload'
library_filename = {{ library_tab.value[0] }}
library_name = library_filename.replace('_', ' ').replace('.gmt', '')

{%- else %}
library_tab = 'Select an Enrichr Library'
library_filename = '{{ library_tab.value[0] }}'
library_name = '{{ library_tab.value[0] }}'
{%- endif %}

{%- if data_tab.raw_value == 'Option 1'%}
data_tab = 'Option 1'
ranked_filename = {{ data_tab.value[0] }}

{%- else %}
data_tab = 'Option 2'
expression_filename = {{ data_tab.value[0] }}
phenotype_filename = {{ data_tab.value[1] }}
ranking_method = '{{ data_tab.value[2] }}'
{%- endif %}

result_number = {{ result_number.value }}
result_criteria = '{{ result_criteria.value }}'

In [None]:
%%appyter markdown
<h2>Loading Data</h2>

This may take a while. For inputs similar in size to the default examples and input choices, please allow up to about five minutes for the data to load. Once the data has finished loading, a message will be printed underneath this section.

In [None]:
#checks if inputs are valid
def checkInputs():
    """Validate the module-level input selections before any data is loaded.

    Reads the globals ``data_tab``, ``library_tab``, ``library_filename`` and,
    depending on the selected option, ``ranked_filename`` or
    ``expression_filename`` / ``phenotype_filename``.

    Raises:
        Exception: if a required filename is empty or a file does not carry
            the expected extension (.rnk, .gct, .cls, or .gmt for an
            uploaded library).
    """
    if data_tab == 'Option 1':
        #a library and a preranked list are both required
        if any(name == '' for name in (library_filename, ranked_filename)):
            raise Exception('Please submit necessary materials for Option 1 to continue.')
        if not ranked_filename.endswith('.rnk'):
            raise Exception('Please upload a RNK file (ends in .rnk).')
    else:
        #a library, an expression dataset and phenotype labels are all required
        required = (library_filename, expression_filename, phenotype_filename)
        if any(name == '' for name in required):
            raise Exception('Please submit necessary materials for Option 2 to continue.')
        if not expression_filename.endswith('.gct'):
            raise Exception('Please upload a GCT file (ends in .gct).')
        if not phenotype_filename.endswith('.cls'):
            raise Exception('Please upload a CLS file (ends in .cls).')
    #the extension is only checked for uploaded libraries
    if library_tab == 'Upload' and not library_filename.endswith('.gmt'):
        raise Exception('Please upload a GMT file (ends in .gmt).')

#performs GSEA
def gsea():
    """Run the enrichment analysis with gseapy for the selected data option.

    Option 1 reads the tab-separated, header-less ranked list named by the
    global ``ranked_filename`` and passes it to ``gp.prerank``; any other
    option passes the expression and phenotype files to ``gp.gsea``. Both
    calls use the global ``library_filename`` as the gene sets and
    ``max_size=500``.

    Returns:
        The object returned by the gseapy call.
    """
    if data_tab != 'Option 1':
        return gp.gsea(data=expression_filename, gene_sets=library_filename,
                       cls=phenotype_filename, max_size=500)
    ranked_table = pd.read_csv(ranked_filename, header=None, sep="\t")
    return gp.prerank(rnk=ranked_table, gene_sets=library_filename, max_size=500)

In [None]:
#loads a preranked list and converts it to a dataframe for Option 1
#creates a ranked list with chosen ranking method and converts to dataframe for Option 2
def loadRanked():
    """Build the ranked gene list as a dataframe with 'Gene' and 'Rank' columns.

    Option 1 reads the tab-separated, header-less file named by the global
    ``ranked_filename`` and orders it by its second column, highest first.
    Option 2 derives the ranking from the expression and phenotype files
    through gseapy, using the global ``ranking_method``.

    Returns:
        pandas.DataFrame with columns 'Gene' and 'Rank'.

    Raises:
        Exception: in Option 1, if the ranked list has fewer than 5000 rows.
    """
    if data_tab == 'Option 1':
        values = pd.read_csv(ranked_filename, header=None, sep="\t")
        if values.shape[0] < 5000:
            raise Exception('Current ranked gene list has less than 5000 genes. We expect all human coding genes to be in the list, which is around 20000.')
        #order by the score column (second column), largest first, and renumber the rows
        score_column = values.columns[1]
        values = values.sort_values(by=score_column, ascending=False).reset_index(drop=True)
        values.columns = ['Gene', 'Rank']
    elif data_tab == 'Option 2':
        pos, neg, classes = gp.parser.gsea_cls_parser(phenotype_filename)
        loader = GSEA(data=expression_filename, gene_sets=library_filename,
                      classes=classes)
        exp, classesDict = loader.load_data(classes)
        scores = gp.algorithm.ranking_metric(df=exp, pos=pos, neg=neg,
                                             method=ranking_method,
                                             classes=classesDict, ascending=False)
        values = pd.DataFrame({'Gene': scores.index, 'Rank': scores.values})
    return values

In [None]:
#loads libraries
def downloadLibrary(name):
    """Download an Enrichr gene set library and save it to a local file.

    The library text is fetched completely before the output file is opened,
    so a failed request does not leave an empty or truncated file behind.

    Args:
        name: Enrichr library name; also used as the local output filename.

    Raises:
        urllib.error.URLError: if the request to Enrichr fails.
    """
    url = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName={name}'
    with urllib.request.urlopen(url) as response:
        library_text = response.read().decode('utf-8')
    #written in one call; closing the file flushes it
    with open(f"{name}", "w") as fw:
        fw.write(library_text)
def loadLibrary(library_filename):
    """Parse a gene set library file into a dict of gene set name -> genes.

    When the global ``library_tab`` selects an Enrichr library, the file is
    first downloaded with ``downloadLibrary``. Each line is split on tabs:
    the first field is the gene set name, the second field is skipped, and
    the remaining fields are the genes, stored upper-cased. Lines without
    any gene are ignored.

    Args:
        library_filename: path of the library file (or Enrichr library name).

    Returns:
        dict mapping each gene set name to its list of upper-cased genes.

    Raises:
        Exception: if the library holds fewer gene sets than the global
            ``result_number``.
    """
    if library_tab == 'Select an Enrichr Library':
        downloadLibrary(library_filename)
    library_data = {}
    with open(library_filename, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            term, genes = fields[0], fields[2:]
            if genes:
                # to upper case
                library_data[term] = [gene.upper() for gene in genes]
    if result_number > len(library_data):
        raise Exception('There cannot be less gene sets in the library than the number of gene sets displayed for the results.')
    return library_data

In [None]:
%%appyter_code_exec
#loading all needed data
#NOTE(review): a failed input check is only printed here; the loading steps
#below still run afterwards, so invalid inputs may surface again as errors
#from those calls - confirm whether execution should stop at this point
try:
    checkInputs()
except Exception as e:
    print(e)
#enrichment statistics taken from the gseapy run
result = gsea().results
#ranked gene list as a dataframe with 'Gene' and 'Rank' columns
ranked = loadRanked()
#gene set name -> gene list mapping parsed from the chosen library
library_data = loadLibrary(library_filename)
print('Data loaded successfully!')

In [None]:
%%appyter markdown
<h2>Enrichment Results Table</h2>

The following table displays the enrichment analysis result statistics of the top gene sets with the chosen criteria. The columns can be re-arranged by dragging. 

The table is available for download as a PNG file by clicking the camera icon on the upper right. The CSV file containing the full version of the results sorted by the input criteria is available for download by clicking the link below the table.

In [None]:
#makes values appropriate for table viewing
def validateNumber(num):
    """Format a statistic for display in the results table.

    Non-zero values with a magnitude below 0.01 are returned as a string in
    scientific notation with two decimals (e.g. '1.23E-3'). All other values,
    including zero, are returned rounded to four decimal places.

    The magnitude check is made on the unrounded value: rounding first would
    collapse tiny values to zero, and zero itself formats through Decimal as
    the misleading '0.00E+2'.

    Args:
        num: numeric value to format.

    Returns:
        The rounded number, or a scientific-notation string for small values.
    """
    if num != 0 and abs(num) < 0.01:
        return '{:.2E}'.format(Decimal(num)) #scientific notation
    return round(num, 4)

#create a downloadable csv file of all the results
def create_download_link(df, title = "Download CSV file of all results", filename = "GSEA_Enrichment_Results_data.csv"):
    """Write ``df`` to a CSV file and return an HTML link pointing at it.

    Args:
        df: dataframe to save (written without its index).
        title: text shown for the link.
        filename: path of the CSV file to write; also used as the link target.

    Returns:
        IPython ``HTML`` object containing the anchor element.
    """
    df.to_csv(filename, index = False)
    return HTML(f'<a href="{filename}" target=_blank>{title}</a>')

In [None]:
#organizes gsea results into sorted dataframe (for download)
result_df = pd.DataFrame.from_dict(result, orient='index')
result_df.index.name = 'gene set'
#p-value and FDR rank best-first when ascending, ES and NES when descending;
#an unknown criteria fails here with a KeyError
ascent = {'pval': True, 'fdr': True, 'es': False, 'nes': False}[result_criteria]
sorted_result_df = result_df.sort_values(by=result_criteria, ascending=ascent)
sorted_result_df.reset_index(inplace=True)

#gets top gene sets' data (calculated statistics only) needed for the table display
#drop() returns a new dataframe, so the full sorted results stay untouched for the download;
#the columns keyword is used because pandas 2.0 removed the positional axis argument
top_result_df = sorted_result_df.drop(columns=['geneset_size', 'matched_size', 'genes', 'ledge_genes', 'RES', 'hit_indices'])
#take the first result_number rows; the index is 0-based after reset_index,
#so a label-based truncate(before=1, ...) would skip the best-ranked gene set
top_result_df = top_result_df.head(result_number)
top_result_df = top_result_df.set_index(keys='gene set')
top_result_df = top_result_df.applymap(validateNumber)

In [None]:
#displays this section
#plotly table with one row per top gene set: the gene set name column is four
#times as wide as each of the statistic columns (ES, NES, P-value, FDR)
fig1 = go.Figure(data=[go.Table(columnwidth=[200, 50, 50, 50, 50],
    header=dict(values=['Gene Set', 'ES', 'NES', 'P-value', 'FDR'], height=40), 
    cells=dict(values=[top_result_df.index, top_result_df.es, top_result_df.nes, 
    top_result_df.pval, top_result_df.fdr], height=30))])
fig1.update_layout(width=750, font_size=14)
fig1.show()
#writes the full sorted results to a CSV file and shows a link to that file
display(create_download_link(sorted_result_df))

In [None]:
%%appyter markdown
<h2>Generating GSEA Plots</h2>

For the chosen gene set library, the top gene sets according to the chosen criteria will be plotted below. Choose which gene set to view with the drop-down menu. Note that the Hit Indices plot is only shown when an individual gene set is selected.

Various interactive options are available in the toolbar on the upper right. Each line can be toggled on and off by clicking its name in the legend. Hovering over the running sum line on the Enrichment Plot will display the gene, its ranking on the list of genes, and the running sum score at that point. Also be aware that zooming in and out of the Enrichment Plot will simultaneously zoom the Hit Indices plot.

This plot pairing is available for download as a PNG file by clicking the camera icon on the toolbar.

In [None]:
#breaks strings that are too long into new lines
def lineBreak(string):
    """Insert '<br>' breaks into a long label so it wraps in the plot legend.

    Words (split on single spaces) are collected into a chunk, each followed
    by a space. Once a chunk is longer than 20 characters and more words
    remain, the chunk is closed with '<br>' and a new one is started. The
    returned string always ends with the trailing space of the last word.

    Args:
        string: label to wrap.

    Returns:
        The label with '<br>' inserted between chunks.
    """
    words = string.split(" ")
    last_position = len(words) - 1
    chunks = []
    current = ""
    for position, word in enumerate(words):
        current += word + " "
        #never break after the final word
        if position < last_position and len(current) > 20:
            chunks.append(current + "<br>")
            current = ""
    chunks.append(current)
    return "".join(chunks)

In [None]:
#plain-list copies of the ranked genes, their ranking scores and the top gene set names
rankedGenes = ranked['Gene'].tolist()
rankedCorrelation = ranked['Rank'].tolist()
top = top_result_df.index.tolist()

#two stacked panels sharing the x axis: running-sum curves on top, hit markers underneath
fig2 = make_subplots(
    rows=2, cols=1, row_heights=[0.8, 0.2],
    shared_xaxes=True, vertical_spacing=0.10,
    subplot_titles=("Enrichment Plot", "Hit Indices"))

#pair of arbitrary lines that maintain scale of plot
arbitrary = [
    {"type": "line", "xref": "x2", "yref": "y2", "x0": 0, "y0": 0,
     "x1": len(rankedGenes), "y1": 0, "line": {"color": "White", "width": 0.5}},
    {"type": "line", "xref": "x2", "yref": "y2", "x0": 0, "y0": -1,
     "x1": 0, "y1": 1, "line": {"color": "White", "width": 0.5}},
]
fig2.update_layout(shapes=arbitrary, height=600, width=750, font_size=16)

#first menu entry shows every curve, with only the scale-keeping lines in the lower panel
buttons = [{"label": 'All Top Gene Sets', "method": "update",
            "args": [{"visible": [True] * len(top)}, {"shapes": arbitrary}]}]

#values that are the same for every curve
xvals = list(range(len(rankedGenes)))
hoverText = ['Gene: {}'.format(gene) for gene in rankedGenes]
hoverTemplate = ('%{text}' +
    '<br>Ranking: %{x}' +
    '<br>Running Sum: %{y}')

#plots values for each top gene set
for index, geneSet in enumerate(top):
    #gets hit indices and running sum vector
    scoreParts = gp.algorithm.enrichment_score(rankedGenes, rankedCorrelation, library_data[geneSet])
    hits = scoreParts[2]
    yvals = scoreParts[3]
    #creates enrichment plot
    curve = go.Scatter(x=xvals, y=yvals, mode='lines', showlegend=True,
        name=lineBreak(geneSet), text=hoverText, hovertemplate=hoverTemplate)
    fig2.add_trace(curve, row=1, col=1)
    #creates hit indices: one vertical line in the lower panel per hit
    lines = [{"type": 'line', "x0": hit, "y0": -1, "x1": hit, "y1": 1,
              "xref": 'x2', "yref": 'y2'} for hit in hits]
    #creates drop down menu option that shows only this gene set's curve and hits
    visible = [position == index for position in range(len(top))]
    buttons.append({"label": f'{geneSet}', "method": "update",
                    "args": [{"visible": visible}, {"shapes": lines}]})

#make drop down menu
dropDown = {"buttons": buttons, "pad": {"r": 10, "t": 10}, "showactive": True,
            "x": 0, "xanchor": "left", "y": 1.2, "yanchor": "top"}
fig2.update_layout(updatemenus=[dropDown])

fig2.update_xaxes(title_text="Gene Rankings", row=2, col=1)
fig2.update_yaxes(title_text="Enrichment Score (ES)", row=1, col=1)
fig2.update_yaxes(visible=False, showticklabels=False, row=2, col=1)

fig2.show()