In [None]:
#%%appyter init
# `magic` presumably provides the `%%appyter ...` cell magics used by later cells — TODO confirm.
from appyter import magic
# Hand appyter a zero-argument callable that returns this notebook's globals().
# NOTE(review): presumably used so rendered cells execute in the notebook namespace — confirm against appyter docs.
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown
# Gene Set Library Synopsis
This Appyter counts the total genes, total gene sets, and individual gene frequencies in a submitted gene set library.
Results are visualized as tables and bar graphs.

In [None]:
#%% Imports
import base64
import html
import math
import re
import urllib.parse
import urllib.request

import appyter
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt; plt.rcdefaults()
%matplotlib inline

import IPython
from IPython.display import HTML, display, FileLink, Markdown, IFrame

import urllib

from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

from itertools import chain

from bokeh.io import output_notebook, export_svg
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Span, ranges, LabelSet
from bokeh.layouts import layout, row, column, gridplot
from bokeh.palettes import all_palettes
output_notebook()

In [None]:
%%appyter hide_code
# Declare the sections of the input form. The fields defined in the next cell
# attach themselves to a section through their `section=` argument.
# No field visible in this notebook refers to 'introSection'.
{% do SectionField(
    name='introSection', 
    title='Gene Set Library Synopsis', 
    subtitle='This Appyter performs basic analysis on a submitted gene set library.', 
    img='gene-library-3.png'
) %}
# Section holding the library upload / Enrichr library selection fields.
{% do SectionField(
    name='geneEntrySection', 
    title='1. Enter a Gene Library',
    subtitle='Upload a GMT file containing a gene set library or select an existing Enrichr library. You may also choose to use the default gene set library provided.',
    img='data-upload-icon.png'
) %}
# Section holding every bar graph option (on/off, gene count, orientation, color, labels).
{% do SectionField(
    name='barGraphSection', 
    title='2. Bar Graph',
    subtitle='Generate a bar graph of the most common genes in the library.',
    img='bar-icon.png'
) %}

In [None]:
%%appyter code_exec
# Inputting libraries and settings
{% set library_kind = TabField(
    name = 'library_kind',
    label = 'Library',
    default = 'Upload a library',
    description = '',
    choices = {
        'Upload a library': [
            FileField(
                name = 'library_filename',
                label = 'Gene Library File (.gmt or .txt)',
                default = 'Cancer_Cell_Line_Encyclopedia.gmt', 
                examples = {'Cancer_Cell_Line_Encyclopedia.gmt': url_for('static', filename = 'Cancer_Cell_Line_Encyclopedia.gmt'),
                            'Cancer_Cell_Line_Encyclopedia.txt': url_for('static', filename = 'Cancer_Cell_Line_Encyclopedia.txt')}, 
                description = 'GMT is a tab-delimited file format that describes sets. Visit https://bit.ly/35crtXQ for more information and http://www.molmine.com/magma/fileformats.html to create your own.',
                section = 'geneEntrySection')
        ],
        
        'Select a library from Enrichr': [
            ChoiceField(
                name = 'enrichr_library',
                description = 'Select one Enrichr library whose genes will be counted',
                label = 'Enrichr Library',
                default = 'Cancer_Cell_Line_Encyclopedia',
                section = 'geneEntrySection',
                choices = [
                    'ARCHS4_Cell-lines',
                    'ARCHS4_IDG_Coexp',
                    'ARCHS4_Kinases_Coexp',
                    'ARCHS4_TFs_Coexp',
                    'ARCHS4_Tissues',
                    'Achilles_fitness_decrease',
                    'Achilles_fitness_increase',
                    'Aging_Perturbations_from_GEO_down',
                    'Aging_Perturbations_from_GEO_up',
                    'Allen_Brain_Atlas_10x_scRNA_2021',
                    'Allen_Brain_Atlas_down',
                    'Allen_Brain_Atlas_up',
                    'BioCarta_2013',
                    'BioCarta_2015',
                    'BioCarta_2016',
                    'BioPlanet_2019',
                    'BioPlex_2017',
                    'CCLE_Proteomics_2020',
                    'CORUM',
                    'COVID-19_Related_Gene_Sets',
                    'Cancer_Cell_Line_Encyclopedia',
                    'ChEA_2013',
                    'ChEA_2015',
                    'ChEA_2016',
                    'Chromosome_Location',           
                    'Chromosome_Location_hg19',
                    'ClinVar_2019',
                    'dbGaP',
                    'DSigDB',
                    'Data_Acquisition_Method_Most_Popular_Genes',
                    'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
                    'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
                    'DisGeNET',
                    'Disease_Perturbations_from_GEO_down',
                    'Disease_Perturbations_from_GEO_up',
                    'Disease_Signatures_from_GEO_down_2014',
                    'Disease_Signatures_from_GEO_up_2014',
                    'DrugMatrix',
                    'Drug_Perturbations_from_GEO_2014',
                    'Drug_Perturbations_from_GEO_down',
                    'Drug_Perturbations_from_GEO_up',
                    'ENCODE_Histone_Modifications_2013',
                    'ENCODE_Histone_Modifications_2015',
                    'ENCODE_TF_ChIP-seq_2014',
                    'ENCODE_TF_ChIP-seq_2015',
                    'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                    'ESCAPE',
                    'Elsevier_Pathway_Collection',
                    'Enrichr_Libraries_Most_Popular_Genes',
                    'Enrichr_Submissions_TF-Gene_Coocurrence',
                    'Enrichr_Users_Contributed_Lists_2020',
                    'Epigenomics_Roadmap_HM_ChIP-seq',
                    'GO_Biological_Process_2013',
                    'GO_Biological_Process_2015',
                    'GO_Biological_Process_2017',
                    'GO_Biological_Process_2017b',
                    'GO_Biological_Process_2018',
                    'GO_Cellular_Component_2013',
                    'GO_Cellular_Component_2015',
                    'GO_Cellular_Component_2017',
                    'GO_Cellular_Component_2017b',
                    'GO_Cellular_Component_2018',
                    'GO_Molecular_Function_2013',
                    'GO_Molecular_Function_2015',
                    'GO_Molecular_Function_2017',
                    'GO_Molecular_Function_2017b',
                    'GO_Molecular_Function_2018',
                    'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
                    'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
                    'GWAS_Catalog_2019',
                    'GeneSigDB',
                    'Gene_Perturbations_from_GEO_down',
                    'Gene_Perturbations_from_GEO_up',
                    'Genes_Associated_with_NIH_Grants',
                    'Genome_Browser_PWMs',
                    'HMDB_Metabolites',
                    'HMS_LINCS_KinomeScan',
                    'HomoloGene',
                    'HumanCyc_2015',
                    'HumanCyc_2016',
                    'Human_Gene_Atlas',
                    'Human_Phenotype_Ontology',
                    'huMAP',
                    'InterPro_Domains_2019',
                    'Jensen_COMPARTMENTS',
                    'Jensen_DISEASES',
                    'Jensen_TISSUES',
                    'KEA_2013',
                    'KEA_2015',
                    'KEGG_2013',
                    'KEGG_2015',
                    'KEGG_2016',
                    'KEGG_2019_Human',
                    'KEGG_2019_Mouse',
                    'Kinase_Perturbations_from_GEO_down',
                    'Kinase_Perturbations_from_GEO_up',
                    'L1000_Kinase_and_GPCR_Perturbations_down',
                    'L1000_Kinase_and_GPCR_Perturbations_up',
                    'LINCS_L1000_Chem_Pert_down',
                    'LINCS_L1000_Chem_Pert_up',
                    'LINCS_L1000_Ligand_Perturbations_down',
                    'LINCS_L1000_Ligand_Perturbations_up',
                    'Ligand_Perturbations_from_GEO_down',
                    'Ligand_Perturbations_from_GEO_up',
                    'lncHUB_lncRNA_Co-Expression',
                    'MCF7_Perturbations_from_GEO_down',
                    'MCF7_Perturbations_from_GEO_up',
                    'MGI_Mammalian_Phenotype_2013',
                    'MGI_Mammalian_Phenotype_2017',
                    'MGI_Mammalian_Phenotype_Level_3',
                    'MGI_Mammalian_Phenotype_Level_4',
                    'MGI_Mammalian_Phenotype_Level_4_2019',
                    'MSigDB_Computational',
                    'MSigDB_Hallmark_2020',
                    'MSigDB_Oncogenic_Signatures',
                    'Microbe_Perturbations_from_GEO_down',
                    'Microbe_Perturbations_from_GEO_up',
                    'miRTarBase_2017',
                    'Mouse_Gene_Atlas',
                    'NCI-60_Cancer_Cell_Lines',
                    'NCI-Nature_2015',
                    'NCI-Nature_2016',
                    'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
                    'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
                    'NIH_Funded_PIs_2017_Human_AutoRIF',
                    'NIH_Funded_PIs_2017_Human_GeneRIF',
                    'NURSA_Human_Endogenous_Complexome',
                    'OMIM_Disease',
                    'OMIM_Expanded',
                    'Old_CMAP_down',
                    'Old_CMAP_up',
                    'PPI_Hub_Proteins',
                    'Panther_2015',
                    'Panther_2016',
                    'Pfam_Domains_2019',
                    'Pfam_InterPro_Domains',
                    'PheWeb_2019',
                    'Phosphatase_Substrates_from_DEPOD',
                    'ProteomicsDB_2020',
                    'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
                    'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
                    'Rare_Diseases_AutoRIF_Gene_Lists',
                    'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
                    'Rare_Diseases_GeneRIF_Gene_Lists',
                    'Reactome_2013',
                    'Reactome_2015',
                    'Reactome_2016',
                    'SILAC_Phosphoproteomics',
                    'SubCell_BarCode',
                    'SysMyo_Muscle_Gene_Sets'
                    'TF-LOF_Expression_from_GEO',
                    'TF_Perturbations_Followed_by_Expression',
                    'TG_GATES_2020',
                    'TRANSFAC_and_JASPAR_PWMs',
                    'TRRUST_Transcription_Factors_2019',
                    'Table_Mining_of_CRISPR_Studies',
                    'TargetScan_microRNA',
                    'TargetScan_microRNA_2017',
                    'Tissue_Protein_Expression_from_Human_Proteome_Map',
                    'Tissue_Protein_Expression_from_ProteomicsDB.csv',
                    'Transcription_Factor_PPIs',
                    'UK_Biobank_GWAS_v1',
                    'Virus-Host_PPI_P-HIPSTer_2020',
                    'VirusMINT',
                    'Virus_Perturbations_from_GEO_down',
                    'Virus_Perturbations_from_GEO_up',
                    'WikiPathways_2013',
                    'WikiPathways_2015',
                    'WikiPathways_2016',
                    'WikiPathways_2019_Human',
                    'WikiPathways_2019_Mouse'
                ]
            )
        ],
    },
    section = 'geneEntrySection',
) %}

# Bar Graph Field
# Checked by the bar graph cell (`if bargraph:`) to decide whether a plot is drawn.
bargraph = {{ BoolField(
    name = 'bargraph',
    label = 'Bar Graph?',
    default = 'true',
    description = 'Select \'Yes\' if you would like to generate a bar graph. Otherwise, select \'No\'',
    section = 'barGraphSection'
)}}

# Choose number of genes in bar graph
# Also used to cut the gene count table preview to the same number of rows.
num_bar_genes = {{ IntField(
    name='num_bar_genes', 
    label='Displayed Genes', 
    min=2, 
    max=20, 
    default=20, 
    description='Select the number of genes to display in the bar graph.', 
    section='barGraphSection'
)}}

# Choose the orientation of the graph: horizontal or vertical bars
# The field sits inside double quotes, so the choice presumably renders as bare text
# and becomes a Python string here — TODO confirm. It is compared with 'Vertical' / 'Horizontal' later.
orient = "{{ ChoiceField(name = 'orient', label = 'Orientation', choices = ['Horizontal', 'Vertical'], default = 'Horizontal', description = 'Choose whether your bar graph will be displayed horizontally or vertically', section = 'barGraphSection') }}"

# Choose color of bars
# The chosen label is looked up in `color_conversion`, so every choice here needs a key there.
color = "{{ ChoiceField(name = 'color', label = 'Color', choices = ['Black', 'Blue', 'Red', 'Green', 'Grey', 'Orange', 'Purple', 'Yellow', 'Pink'], default = 'Black', section = 'barGraphSection') }}"

# Choose whether gene counts are displayed on bar graph
# Checked by the bar graph cell (`if counts:`) before the count labels are added.
counts = {{ BoolField(name = 'counts', label = 'Show Counts?', default = 'true', description = 'Choose \'Yes\' to label the bars with the gene count (number of appearances in library).', section = 'barGraphSection') }}

In [None]:
# Translate the color label picked in the form into the named color used for the bars
color_conversion = dict(
    Black='black',
    Blue='lightskyblue',
    Red='tomato',
    Green='mediumspringgreen',
    Grey='lightgrey',
    Orange='orange',
    Purple='plum',
    Yellow='yellow',
    Pink='lightpink',
)

# Raises KeyError if `color` is not one of the labels above
bar_color = color_conversion[color]

In [None]:
%%appyter code_exec
{%- if library_kind.raw_value == 'Upload a library' %}
# Uploaded file. The field is emitted without surrounding quotes, so it presumably
# renders as an already-quoted path literal — TODO confirm against appyter's FileField.
library_kind = "Upload a library"
library_filename = {{ library_kind.value[0] }}
# Display name: underscores become spaces and every ".txt" / ".gmt" is removed
library_name = library_filename.replace("_", " ").replace(".txt", "").replace(".gmt", "")

{%- else %}
# Enrichr library. The selected library name is used as both the file name and the display name.
library_kind = "Select a library from Enrichr"
library_filename = "{{ library_kind.value[0] }}"
library_name = "{{ library_kind.value[0] }}"
{%- endif %}

In [None]:
# Download library from the Enrichr site using its file name
def download_library(library_filename):
    """Download an Enrichr gene set library into the working directory.

    The library text is requested from Enrichr's `geneSetLibrary` endpoint and
    written line by line to a local file named `library_filename`.

    Parameters
    ----------
    library_filename : str
        Enrichr library name. It is used both as the `libraryName` query
        value and as the path of the file that is written.

    Raises
    ------
    urllib.error.URLError
        If the request cannot be completed.
    UnicodeDecodeError
        If the response is not valid UTF-8.
    """
    # URL-encode the name so unusual characters cannot corrupt the query string
    query = urllib.parse.urlencode({'mode': 'text', 'libraryName': library_filename})
    url = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?{query}'
    # Open the connection first: a failed request must not leave an empty or truncated file behind
    with urllib.request.urlopen(url) as response:
        with open(f"{library_filename}", "w") as fw:
            # Stream the response instead of holding every line in memory
            for line in response:
                fw.write(line.decode('utf-8'))

In [None]:
# Load library
def load(library_filename):
    """Return `(library_data, library_genes)` for the selected library.

    When the module-level `library_kind` says an Enrichr library was chosen,
    the library is downloaded first; the file is then parsed by `load_library`.
    """
    from_enrichr = library_kind == "Select a library from Enrichr"
    if from_enrichr:
        download_library(library_filename)
    # Gene names come back upper-cased from load_library
    parsed_data, parsed_genes = load_library(library_filename)
    return parsed_data, parsed_genes

# Returns a dictionary (library_data) where the values are all the elements
def load_library(library_filename):
    """Parse a tab-delimited gene set library file.

    Every line is split on tabs. The first column is the term name, the
    second column is skipped, and the remaining columns are gene names.
    Gene names are stripped of surrounding whitespace and upper-cased; empty
    cells (for example from repeated tabs) are ignored.

    Parameters
    ----------
    library_filename : str
        Path of the library file to read.

    Returns
    -------
    library_data : dict
        Maps each term name to its list of gene names. Lines without any
        gene are left out; a repeated term name overwrites the earlier one.
    library_genes : list of str
        One entry per line of the file: the line's gene names joined with
        single spaces, or '' when the line has no genes.
    """
    library_data = dict()
    library_genes = []
    with open(library_filename, "r") as f:
        for line in f:
            fields = line.strip().split("\t")
            # Columns 0 and 1 are the term name and its description, not genes
            genes = [cell.strip().upper() for cell in fields[2:] if cell.strip()]
            if genes:
                library_data[fields[0]] = genes
            # Keep one slot per input line so positions match the file
            library_genes.append(' '.join(genes))
    return library_data, library_genes

In [None]:
%%appyter code_exec
# Count the number of each gene
library_data, library_genes = load(library_filename)
if library_kind == "Select a library from Enrichr":
    library_name = library_name.replace("_", " ")
vals = list(library_data.values())
# Flatten all gene sets into one list. Empty names are dropped here so they can
# never be counted, reported in the totals or drawn in the bar graph.
all_genes = [gene for gene in chain.from_iterable(vals) if len(gene) > 0]
all_genes_unique = np.unique(np.array(all_genes))
all_sets = list(library_data.keys())

# One row per gene with its number of appearances. Sorting by name first and then
# with a stable sort by count keeps genes with equal counts in alphabetical order.
count_frame = (
    pd.Series(all_genes, dtype='object')
    .value_counts()
    .sort_index()
    .rename_axis('GENE')
    .reset_index(name='COUNT')
    .sort_values(by='COUNT', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)
top_genes = count_frame.iloc[0:num_bar_genes]

unique_genes = len(count_frame)
unique_sets = len(library_data)

In [None]:
# Output a table of genes and counts.
def create_download_link(df, title, filename):
    """Build an HTML link that downloads `df` as a CSV file.

    The frame is serialized without its index, base64-encoded and embedded
    in a `data:` URI, so no file is written to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to offer for download.
    title : str
        Visible text of the link.
    filename : str
        File name placed in the link's `download` attribute.

    Returns
    -------
    IPython.display.HTML
        The rendered `<a>` element.
    """
    csv_text = df.to_csv(index = False)
    payload = base64.b64encode(csv_text.encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    # Both values can derive from an uploaded file name, so escape them:
    # a quote or angle bracket must not break out of the attribute or the tag.
    link = link_template.format(
        payload=payload,
        title=html.escape(str(title)),
        filename=html.escape(str(filename), quote=True),
    )
    return HTML(link)

In [None]:
%%appyter markdown
# Gene Counts
The table below displays the count (number of appearances throughout the entire library) of each gene. This preview shows only the most frequent genes; the full table is available for download as a CSV file.

In [None]:
%%appyter code_exec
# Summarize the library, preview the count_frame dataframe and offer it as a CSV
counts_filename = "{}_gene_counts.csv".format(library_name.replace(" ", "_"))

# Library-wide totals (genes and gene sets / terms) come first
print(f"Total genes: {unique_genes}")
print(f"Total gene sets: {unique_sets}")

# Only the first num_bar_genes rows are previewed; the link carries the whole table
preview_html = count_frame.iloc[:num_bar_genes].to_html(index=False)
table_caption = "*Table 1. Gene count results for {} library*".format(library_name)
csv_link = create_download_link(count_frame, "Download this table as a CSV", counts_filename)
for rendered in (HTML(preview_html), Markdown(table_caption), csv_link):
    display(rendered)

In [None]:
%%appyter markdown
# Unmapped Gene Names
This Appyter checks whether your gene set library contains gene names that were converted to dates, i.e. names of the form `N-DEC`, `N-MAR`, or `N-SEP`, where `N` is a number. These conversions frequently occur when gene names are loaded into Excel: for example, both MARC1 and MARCH1 are automatically converted to '1-MAR'. Read this article for more information: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1044-7

In [None]:
# Check for unmapped genes and display them
# Gene symbols that a spreadsheet turned into dates look like '<number>-<MONTH>', e.g. '1-MAR'
MONTH_GENE_PATTERN = re.compile(r'\d+-(?:DEC|MAR|SEP)')

def month_sorter(month):
    """Return the first letter of the three-letter month suffix ('D', 'M' or 'S')."""
    return month[-3]

def date_sorter(month):
    """Return the number in front of the dash as an integer."""
    dash = month.index('-')
    return int(month[:dash])

# Test every unique gene on its own. Date-like names are not adjacent in sorted
# order ('2-MAR' sorts after every name starting with '1'), so scanning for one
# contiguous run would miss some of them.
month_gene_names = [
    str(gene) for gene in all_genes_unique
    if MONTH_GENE_PATTERN.fullmatch(str(gene))
]
# Group by month, then order numerically within each month
month_gene_names = sorted(month_gene_names, key=lambda x: (month_sorter(x), date_sorter(x)))
month_genes = pd.DataFrame(data=month_gene_names, columns=['GENE NAME'])

# Display if incorrect genes 
if len(month_genes) > 0:
    # Same naming scheme as the gene counts download: underscores and a .csv extension
    month_genes_filename = 'unmapped_gene_names_' + library_name.replace(" ", "_") + '.csv'
    print('' + str(len(month_genes)) + ' unmapped gene names found.')
    display(HTML(month_genes.to_html(index=False)))
    display(Markdown(f"*Table 2. Unmapped gene names in {library_name} library*"))
    display(create_download_link(month_genes, "Download this table as a CSV", month_genes_filename))
else:
    print("No unmapped gene names found")

In [None]:
%%appyter markdown
# Bar Graph
The bar graph displays the most common genes in your gene set library.

In [None]:
# Bokeh bar graph
if bargraph:
    # Label positions for the vertical layout: gene on the x axis, count on the y axis
    barsource_v = ColumnDataSource(
            dict(
            x = top_genes['GENE'],
            y = top_genes['COUNT']
        )
    )
    # Label positions for the horizontal layout. The order is reversed so the
    # most common gene ends up at the top of the y axis.
    barsource_h = ColumnDataSource(
            dict(
            x = top_genes['COUNT'][::-1],
            y = top_genes['GENE'][::-1]
        )
    )

    # Report the number of genes actually plotted: the library can hold fewer
    # genes than were requested through num_bar_genes.
    bar_title = '' + str(len(top_genes)) + ' Most Common Genes in ' + library_name

    if orient == 'Vertical':
        bokbar = figure(x_range=top_genes['GENE'], plot_height=350, title=bar_title, toolbar_location=None, tools="hover", tooltips='@top', x_axis_label='Genes', y_axis_label='Counts')
        bokbar.vbar(x=top_genes['GENE'], top=top_genes['COUNT'], width=.5, color=bar_color, hover_alpha=.7)
        bokbar.xaxis.major_label_orientation = math.pi/5
        bokbar.xgrid.grid_line_color = None
        bokbar.y_range.start = 0

        if counts:
            # Print each count next to its bar
            labels = LabelSet(x='x', y='y', text='y', level='annotation',
                x_offset=-7, y_offset=0, source=barsource_v, render_mode='canvas', text_font_size = '11px')

            bokbar.add_layout(labels)

    else:
        # 'Horizontal' is the only other choice offered by the form. Using `else`
        # guarantees that `bokbar` exists before it is styled and shown below.
        bokbar = figure(y_range = top_genes['GENE'][::-1], plot_height=400, title=bar_title, toolbar_location=None, tools="hover", tooltips='@right', x_axis_label='Counts', y_axis_label='Genes')
        bokbar.hbar(y=top_genes['GENE'][::-1],right=top_genes['COUNT'][::-1], height=.5, color=bar_color, hover_alpha=.7)
        bokbar.xgrid.grid_line_color = None
        
        if counts:
            # Print each count next to its bar
            labels = LabelSet(x='x', y='y', text='x', level='annotation',
                x_offset=2, y_offset=-6, source=barsource_h, render_mode='canvas', text_font_size = '11px')

            bokbar.add_layout(labels)

    bokbar.xaxis.axis_label_text_font_style = 'normal'
    bokbar.yaxis.axis_label_text_font_style = 'normal'
    bokbar.title.align = 'center'

    show(bokbar)