In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown

<center>
    <h1 id = "top-of-app"> 
        <div style="font-size:3rem;font-weight:500"> <img src="{{ url_for('static', filename='logo.png') }}" style="height:45px;padding:0 5px;display:inline"/> Gene Set Library Synopsis Appyter</div>
    </h1>
    <br>
    <div style="font-size:2rem;font-weight:500">An appyter for the analysis and visualization of gene set libraries</div>
</center>

In [None]:
%%appyter markdown
# Gene Set Library Synopsis
This Appyter processes, analyzes, and visualizes a collection of gene sets, also known as a gene set library. 

First it will generate summary statistics describing the size of the library and its component gene sets, as well as how well studied the genes and gene sets are. 

Then the Appyter will use text vectorization (TF-IDF) and dimensionality reduction (UMAP) to visualize the library as a scatterplot.

To assess gene set similarity, pairwise Jaccard Indexes will be calculated, and this statistic will serve as the basis for a heatmap. The Appyter will also produce a set of figures focusing on the gene sets with the highest overlap.

Finally, the Appyter will present additional plots describing the composition of your library, including bar graphs of most frequent and most studied genes, a scatterplot of gene sets arranged by size and publication count, and a scatterplot of the library among all Enrichr libraries.

In [None]:
#%% Imports
import appyter
import pandas as pd
import numpy as np
import base64
import math
import seaborn as sns
import fastcluster

import matplotlib.pyplot as plt; plt.rcdefaults()
import matplotlib.colors as colors
%matplotlib inline

import IPython
from IPython.display import HTML, display, FileLink, Markdown, IFrame

import urllib

import itertools
from itertools import chain
from scipy.spatial.distance import squareform, pdist, jaccard
from scipy.cluster.hierarchy import linkage

from bokeh.io import output_notebook, export_svg
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Span, ranges, LabelSet, BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.layouts import layout, row, column, gridplot
from bokeh.palettes import all_palettes, linear_palette, Turbo256, Spectral6
from bokeh.transform import factor_cmap, transform

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import umap.umap_ as umap
from sklearn.decomposition import NMF

output_notebook()

In [None]:
# Notebook display functions 
def figure_legend(label,title="",content=""):
    """Display an HTML caption for a figure or table.

    label   -- bold prefix of the caption (e.g. "Table 1").
    title   -- optional italicised title; left out when empty.
    content -- descriptive text that follows the label/title.
    """
    if len(title) == 0:
        caption = f"<div><b>{label}</b>: {content} </div>"
    else:
        caption = f"<div><b>{label}</b>: <i>{title}</i>. {content} </div>"
    display(HTML(caption))

# Figure and table counters
# Both start at 1. Cells read them to number a caption and are responsible for
# incrementing them afterwards.
fig_count = 1
table_count = 1

def create_download_link(df, title, filename):
    """Return an HTML anchor that downloads `df` as a CSV file.

    The dataframe is serialised without its index, base64-encoded and embedded
    in a data: URI, so no file is written to disk.

    df       -- dataframe to export.
    title    -- link text shown to the user.
    filename -- value of the anchor's `download` attribute.
    """
    encoded_csv = base64.b64encode(df.to_csv(index = False).encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor.format(payload=encoded_csv, title=title, filename=filename))

In [None]:
%%appyter hide_code
{% do SectionField(
    name='geneEntrySection', 
    title='1. Submit a Gene Set Library',
    subtitle='Upload a GMT file containing a gene set library (gene names must be in gene symbol format) or select an existing Enrichr library. You may also choose to use the default gene library provided.',
    img='data-upload-icon.png'
) %}
{% do SectionField(
    name='barGraphSection', 
    title='2. Bar Graphs and Histograms',
    subtitle='Bar graphs and histograms describing your library will be generated. Choose parameters to customize these visualizations',
    img='bar-icon.png'
) %}
{% do SectionField(
    name='simSection', 
    title='3. Gene Set Similarity',
    subtitle='Similarity between gene sets will be assessed using the Jaccard Index. You may choose how this information is displayed.',
    img='set-similarity.png'
) %}

In [None]:
%%appyter code_exec
# Inputting libraries and settings
{% set library_kind = TabField(
    name = 'library_kind',
    label = 'Library',
    default = 'Upload a library',
    description = '',
    choices = {
        'Upload a library': [
            FileField(
                name = 'library_filename',
                label = 'Gene Library File (.gmt or .txt)',
                default = 'CellMarker_Augmented_2021.gmt', 
                examples = {'CellMarker_Augmented_2021.gmt': url_for('static', filename = 'CellMarker_Augmented_2021.gmt'),
                            'CellMarker_Augmented_2021.txt': url_for('static', filename = 'CellMarker_Augmented_2021.txt')}, 
                description = 'GMT is a tab-delimited file format that describes sets. Visit https://bit.ly/35crtXQ for more information and http://www.molmine.com/magma/fileformats.html to create your own.',
                section = 'geneEntrySection'),
            MultiCheckboxField(
                name='species',
                label='Species',
                description='Which species are represented by your gene set library?',
                default=['human'],
                section='geneEntrySection',
                choices=[
                    'human',
                    'mouse',
                    'other',
                ],
            )
        ],
        
        'Select a library from Enrichr': [
            ChoiceField(
                name = 'enrichr_library',
                description = 'Select one Enrichr library whose genes will be counted',
                label = 'Enrichr Library',
                default = 'CellMarker_Augmented_2021',
                section = 'geneEntrySection',
                choices = [
                    'ARCHS4_Cell-lines',
                    'ARCHS4_IDG_Coexp',
                    'ARCHS4_Kinases_Coexp',
                    'ARCHS4_TFs_Coexp',
                    'ARCHS4_Tissues',
                    'Achilles_fitness_decrease',
                    'Achilles_fitness_increase',
                    'Aging_Perturbations_from_GEO_down',
                    'Aging_Perturbations_from_GEO_up',
                    'Allen_Brain_Atlas_10x_scRNA_2021',
                    'Allen_Brain_Atlas_down',
                    'Allen_Brain_Atlas_up',
                    'Azimuth_Cell_Types_2021',
                    'BioCarta_2013',
                    'BioCarta_2015',
                    'BioCarta_2016',
                    'BioPlanet_2019',
                    'BioPlex_2017',
                    'CCLE_Proteomics_2020',
                    'CORUM',
                    'COVID-19_Related_Gene_Sets',
                    'Cancer_Cell_Line_Encyclopedia',
                    'CellMarker_Augmented_2021',
                    'ChEA_2013',
                    'ChEA_2015',
                    'ChEA_2016',
                    'Chromosome_Location',           
                    'Chromosome_Location_hg19',
                    'ClinVar_2019',
                    'dbGaP',
                    'DSigDB',
                    'Data_Acquisition_Method_Most_Popular_Genes',
                    'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
                    'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
                    'Descartes_Cell_Types_and_Tissue_2021',
                    'DisGeNET',
                    'Disease_Perturbations_from_GEO_down',
                    'Disease_Perturbations_from_GEO_up',
                    'Disease_Signatures_from_GEO_down_2014',
                    'Disease_Signatures_from_GEO_up_2014',
                    'DrugMatrix',
                    'Drug_Perturbations_from_GEO_2014',
                    'Drug_Perturbations_from_GEO_down',
                    'Drug_Perturbations_from_GEO_up',
                    'ENCODE_Histone_Modifications_2013',
                    'ENCODE_Histone_Modifications_2015',
                    'ENCODE_TF_ChIP-seq_2014',
                    'ENCODE_TF_ChIP-seq_2015',
                    'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                    'ESCAPE',
                    'Elsevier_Pathway_Collection',
                    'Enrichr_Libraries_Most_Popular_Genes',
                    'Enrichr_Submissions_TF-Gene_Coocurrence',
                    'Enrichr_Users_Contributed_Lists_2020',
                    'Epigenomics_Roadmap_HM_ChIP-seq',
                    'GO_Biological_Process_2013',
                    'GO_Biological_Process_2015',
                    'GO_Biological_Process_2017',
                    'GO_Biological_Process_2017b',
                    'GO_Biological_Process_2018',
                    'GO_Cellular_Component_2013',
                    'GO_Cellular_Component_2015',
                    'GO_Cellular_Component_2017',
                    'GO_Cellular_Component_2017b',
                    'GO_Cellular_Component_2018',
                    'GO_Molecular_Function_2013',
                    'GO_Molecular_Function_2015',
                    'GO_Molecular_Function_2017',
                    'GO_Molecular_Function_2017b',
                    'GO_Molecular_Function_2018',
                    'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
                    'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
                    'GWAS_Catalog_2019',
                    'GeneSigDB',
                    'Gene_Perturbations_from_GEO_down',
                    'Gene_Perturbations_from_GEO_up',
                    'Genes_Associated_with_NIH_Grants',
                    'Genome_Browser_PWMs',
                    'HMDB_Metabolites',
                    'HMS_LINCS_KinomeScan',
                    'HomoloGene',
                    'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression',
                    'HumanCyc_2015',
                    'HumanCyc_2016',
                    'Human_Gene_Atlas',
                    'Human_Phenotype_Ontology',
                    'huMAP',
                    'InterPro_Domains_2019',
                    'Jensen_COMPARTMENTS',
                    'Jensen_DISEASES',
                    'Jensen_TISSUES',
                    'KEA_2013',
                    'KEA_2015',
                    'KEGG_2013',
                    'KEGG_2015',
                    'KEGG_2016',
                    'KEGG_2019_Human',
                    'KEGG_2019_Mouse',
                    'Kinase_Perturbations_from_GEO_down',
                    'Kinase_Perturbations_from_GEO_up',
                    'L1000_Kinase_and_GPCR_Perturbations_down',
                    'L1000_Kinase_and_GPCR_Perturbations_up',
                    'LINCS_L1000_Chem_Pert_down',
                    'LINCS_L1000_Chem_Pert_up',
                    'LINCS_L1000_Ligand_Perturbations_down',
                    'LINCS_L1000_Ligand_Perturbations_up',
                    'Ligand_Perturbations_from_GEO_down',
                    'Ligand_Perturbations_from_GEO_up',
                    'lncHUB_lncRNA_Co-Expression',
                    'MCF7_Perturbations_from_GEO_down',
                    'MCF7_Perturbations_from_GEO_up',
                    'MGI_Mammalian_Phenotype_2013',
                    'MGI_Mammalian_Phenotype_2017',
                    'MGI_Mammalian_Phenotype_Level_3',
                    'MGI_Mammalian_Phenotype_Level_4',
                    'MGI_Mammalian_Phenotype_Level_4_2019',
                    'MSigDB_Computational',
                    'MSigDB_Hallmark_2020',
                    'MSigDB_Oncogenic_Signatures',
                    'Microbe_Perturbations_from_GEO_down',
                    'Microbe_Perturbations_from_GEO_up',
                    'miRTarBase_2017',
                    'Mouse_Gene_Atlas',
                    'NCI-60_Cancer_Cell_Lines',
                    'NCI-Nature_2015',
                    'NCI-Nature_2016',
                    'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
                    'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
                    'NIH_Funded_PIs_2017_Human_AutoRIF',
                    'NIH_Funded_PIs_2017_Human_GeneRIF',
                    'NURSA_Human_Endogenous_Complexome',
                    'OMIM_Disease',
                    'OMIM_Expanded',
                    'Old_CMAP_down',
                    'Old_CMAP_up',
                    'PanglaoDB_Augmented_2021',
                    'PPI_Hub_Proteins',
                    'Panther_2015',
                    'Panther_2016',
                    'Pfam_Domains_2019',
                    'Pfam_InterPro_Domains',
                    'PheWeb_2019',
                    'Phosphatase_Substrates_from_DEPOD',
                    'ProteomicsDB_2020',
                    'RNAseq_Automatic_GEO_Signatures_Human_Down',
                    'RNAseq_Automatic_GEO_Signatures_Human_Up',
                    'RNAseq_Automatic_GEO_Signatures_Mouse_Down',
                    'RNAseq_Automatic_GEO_Signatures_Mouse_Up',
                    'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
                    'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
                    'Rare_Diseases_AutoRIF_Gene_Lists',
                    'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
                    'Rare_Diseases_GeneRIF_Gene_Lists',
                    'Reactome_2013',
                    'Reactome_2015',
                    'Reactome_2016',
                    'SILAC_Phosphoproteomics',
                    'SubCell_BarCode',
                    'SysMyo_Muscle_Gene_Sets',
                    'TF-LOF_Expression_from_GEO',
                    'TF_Perturbations_Followed_by_Expression',
                    'TG_GATES_2020',
                    'TRANSFAC_and_JASPAR_PWMs',
                    'TRRUST_Transcription_Factors_2019',
                    'Table_Mining_of_CRISPR_Studies',
                    'TargetScan_microRNA',
                    'TargetScan_microRNA_2017',
                    'Tissue_Protein_Expression_from_Human_Proteome_Map',
                    'Tissue_Protein_Expression_from_ProteomicsDB',
                    'Transcription_Factor_PPIs',
                    'UK_Biobank_GWAS_v1',
                    'Virus-Host_PPI_P-HIPSTer_2020',
                    'VirusMINT',
                    'Virus_Perturbations_from_GEO_down',
                    'Virus_Perturbations_from_GEO_up',
                    'WikiPathways_2013',
                    'WikiPathways_2015',
                    'WikiPathways_2016',
                    'WikiPathways_2019_Human',
                    'WikiPathways_2019_Mouse'
                ]
            )
        ],
    },
    section = 'geneEntrySection',
) %}


# Choose the orientation of the graph: horizontal or vertical bars
orient_bar = "{{ ChoiceField(name = 'orient_bar', label = 'Orientation', choices = ['Horizontal', 'Vertical'], default = 'Horizontal', description = 'Choose whether your bar graphs will be displayed horizontally or vertically', section = 'barGraphSection') }}"

# Choose color of bars
color_bar = "{{ ChoiceField(name = 'color_bar', label = 'Bar Color', choices = ['Black', 'Blue', 'Red', 'Green', 'Grey', 'Orange', 'Purple', 'Yellow', 'Pink'], default = 'Black', section = 'barGraphSection') }}"

# Choose whether gene counts are displayed on bar graph
counts_bar = {{ BoolField(name = 'counts_bar', label = 'Show Counts?', default = 'true', description = 'Choose \'Yes\' to label the bars with their lengths.', section = 'barGraphSection') }}

# Choose number of genes in bar graph
num_bar_genes = {{ IntField(
    name='num_bar_genes', 
    label='Top Genes', 
    min=2, 
    max=20, 
    default=20, 
    description='The number of genes that will be included in figures describing top genes (ex: most frequent, most published)', 
    section='barGraphSection'
)}}

# Choose the Jaccard Index threshold above which two gene sets are considered highly similar
jac_cutoff = {{ FloatField(
    name='jac_cutoff', 
    label='Jaccard Index High Threshold', 
    min=0.10, 
    max=0.99, 
    default=0.40, 
    description='The Jaccard Index will be calculated to measure similarity between sets in your library (0 = no shared genes, 1 = identical). Choose a threshold for what is considered a high Jaccard Index.', 
    section='simSection'
)}}
         
# Choose which visualizations are generated for most similar sets
jac_interactive = {{ BoolField(name = 'jac_interactive', label = 'Interactive Heatmap of Most Similar Sets?', default = 'true', description = 'Choose \'Yes\' to generate an interactive heatmap of the gene sets with similarity greater than the threshold you set above.', section = 'simSection') }}

In [None]:
# Color for Bar plot
# Maps the colour name picked in the form (color_bar) to the colour string used when plotting.
color_conversion = dict(
    Black='black',
    Blue='lightskyblue',
    Red='tomato',
    Green='mediumspringgreen',
    Grey='lightgrey',
    Orange='orange',
    Purple='plum',
    Yellow='yellow',
    Pink='lightpink',
)

bar_color = color_conversion[color_bar]

In [None]:
%%appyter code_exec
{%- if library_kind.raw_value == 'Upload a library' %}
library_kind = "Upload a library"
library_filename = {{ library_kind.value[0] }}
library_name = library_filename.replace("_", " ").replace(".txt", "").replace(".gmt", "")
species = {{library_kind.value[1]}}

{%- else %}
library_kind = "Select a library from Enrichr"
library_filename = "{{ library_kind.value[0] }}"
library_name = "{{ library_kind.value[0] }}"
species = ['human']
if 'MOUSE' in library_name.upper().split("_"):
    species = ['mouse']
{%- endif %}

In [None]:
# Download library from the Enrichr site using its file name
def download_library(library_filename):
    """Download an Enrichr gene set library and save it as text.

    The library is requested by name from the Enrichr geneSetLibrary endpoint
    and written, line by line, to a file called `library_filename` in the
    current working directory.

    library_filename -- Enrichr library name; also used as the output file name.

    Raises whatever `urllib.request.urlopen` raises when the request fails
    (e.g. urllib.error.URLError).
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    url = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName={library_filename}'
    # Open the connection before creating the file so that a failed request
    # does not leave an empty library file behind.
    with urllib.request.urlopen(url) as response:
        with open(f"{library_filename}", "w") as fw:
            for line in response:
                fw.write(line.decode('utf-8'))

In [None]:
# Load library 
def remove_comma(gene):
    """Return `gene` truncated at its first comma.

    Values without a comma, and values that cannot be searched for one
    (e.g. None or numbers), are returned unchanged.
    """
    try:
        comma = gene.index(',')
    except (ValueError, AttributeError, TypeError):
        # ValueError: no comma present; AttributeError/TypeError: not a
        # comma-searchable value. Anything else (e.g. KeyboardInterrupt) propagates.
        return gene
    return gene[0:comma]

def load(library_filename, hasNA):
    """Fetch the library if it comes from Enrichr, then parse it.

    When the notebook-level `library_kind` is "Select a library from Enrichr"
    the file is downloaded first; otherwise `library_filename` is read as is.
    Returns the (library_data, library_genes, hasNA) triple produced by
    `load_library`.
    """
    from_enrichr = library_kind == "Select a library from Enrichr"
    if from_enrichr:
        download_library(library_filename)
    parsed_data, parsed_genes, na_found = load_library(library_filename, hasNA)
    return parsed_data, parsed_genes, na_found

def load_library(library_filename, hasNA):
    """Parse a tab-delimited gene set library file.

    Each line is split on tabs; the first field is the gene set name and the
    fields from the third onward are its genes. Genes are upper-cased and cut
    at their first comma (via `remove_comma`). Lines without any gene field
    are skipped.

    library_filename -- path of the file to read.
    hasNA            -- incoming flag; returned as True if any 'NA' gene is found.

    Returns (library_data, library_genes, hasNA):
      library_data  -- dict mapping gene set name to its list of genes.
      library_genes -- one space-joined gene string per input line
                       ('' for skipped lines).
      hasNA         -- True if an 'NA' placeholder was removed, else the input value.
    """
    library_data = dict()
    with open(library_filename, "r") as f:
        lines = f.readlines()

    library_genes = [''] * len(lines)
    for i, line in enumerate(lines):
        splited = line.strip().split("\t")
        elements = splited[2:]
        if len(elements) > 0:
            # to upper case, then drop anything after the first comma
            allxs = [remove_comma(x.upper()) for x in elements]

            if 'NA' in allxs:
                # Remove every NA placeholder, not just the first occurrence
                allxs = [x for x in allxs if x != 'NA']
                hasNA = True
            library_data[splited[0]] = allxs
            library_genes[i] = (' ').join(allxs)
    return library_data, library_genes, hasNA

In [None]:
# Method for gene novelty
def gene_novelty_label(pub_count):
    """Translate a publication count into a novelty label.

    Counts up to 7 are 'highly understudied', up to 25 'understudied',
    up to 87 'studied', and anything above that 'well studied'.
    """
    # (inclusive upper bound, label) pairs, checked from lowest to highest
    tiers = (
        (7, 'highly understudied'),
        (25, 'understudied'),
        (87, 'studied'),
    )
    for upper_bound, label in tiers:
        if pub_count <= upper_bound:
            return label
    return 'well studied'

In [None]:
# Create geneRIF dictionary and novelty mapping dictionaries
# generif_dict maps an upper-cased gene name to the number of rows it has in the GeneRIF table.
generif_df = pd.read_csv(
    "https://appyters.maayanlab.cloud/storage/Gene_Set_Library_Synopsis/generif.tsv",
    delimiter="\t",
    header=None,
).rename(columns={0:'Species',1:'Number',2:'Gene',3:'PMID',4:'Date'})
generif_genes = generif_df['Gene']
generif_s_genes = generif_genes.squeeze().str.upper()
generif_counts = generif_s_genes.value_counts()
generif_dict = generif_counts.to_dict()

# Numeric novelty score <-> novelty term (3 = least studied, 0 = most studied)
novel_map_dict_rev = {3: "highly understudied", 2: "understudied", 1: "studied", 0: "well studied"}
novel_map_dict = {term: score for score, term in novel_map_dict_rev.items()}

In [None]:
%%appyter code_exec
# Load library, create genes list, set list, gene size list
hasNA = False
library_data, library_genes, hasNA = load(library_filename, hasNA)
if library_kind == "Select a library from Enrichr":
    library_name = library_name.replace("_", " ")

# Gene set names and their gene lists, in file order
keys = list(library_data.keys())
vals = list(library_data.values())
all_sets = list(library_data.keys())

# Every gene occurrence across the library, and the sorted unique gene names
all_genes = list(chain(*vals))
all_genes_unique = np.array(list(np.unique(np.array(all_genes))))

# Number of genes in each set, aligned with `vals`
gs_sizes = [len(gene_set) for gene_set in vals]

In [None]:
# Make dataframes of gene sets and their genes in 1) list form, 2) string form
# Both frames are indexed by gene set name and have a single 'Genes' column.
library_data_onemap = {set_name: [genes] for set_name, genes in zip(keys, vals)}
library_data_onemap_str = {set_name: (" ").join(genes) for set_name, genes in zip(keys, vals)}

library_data_onemap = (
    pd.DataFrame(data=library_data_onemap)
    .transpose()
    .rename(columns= {0:'Genes'})
)
library_data_onemap_str = (
    pd.DataFrame(data={0:library_data_onemap_str})
    .rename(columns= {0:'Genes'})
)

In [None]:
%%appyter markdown
# 1. Unmapped Gene Names
This Appyter checks whether your gene set library contains unmapped gene names in _-DEC, _-MAR, and _-SEP formats. These conversions frequently occur when gene names are loaded into Excel. For example, either MARC1 or MARCH1 will automatically become '1-MAR'. Read this article for more information: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1044-7. This section also checks for genes labeled NA, which means Not Available.

In [None]:
def month_sorter(month):
    """Sort key: the third character from the end of `month`.

    For a name such as '1-MAR' this is the first letter of the month
    abbreviation ('M').
    """
    month_initial = month[-3]
    return month_initial

def date_sorter(month):
    """Sort key: the integer in front of the first '-' in `month` (e.g. 10 for '10-SEP')."""
    return int(month[:month.index('-')])

In [None]:
# Check for unmapped genes and display them
# A name counts as unmapped when it is a run of digits followed by -DEC, -MAR or
# -SEP (e.g. 1-MAR). Every gene is tested individually, so matches are found even
# when other digit-leading symbols sort between them, and names with a
# non-numeric prefix cannot reach date_sorter and crash it.
month_suffixes = ('-DEC', '-MAR', '-SEP')
month_genes = [
    str(gene) for gene in all_genes_unique
    if gene[-4:] in month_suffixes and gene[:-4].isdecimal()
]
# Order by month initial first, then by the numeric day prefix
month_genes = sorted(month_genes,key=lambda x: (month_sorter(x), date_sorter(x)))
if hasNA:
    month_genes.append('NA')
month_genes = pd.DataFrame(data=month_genes, columns=['Gene Name'])

# Display if unmapped genes
if len(month_genes) > 0:
    # Same naming scheme as the other downloads: underscores and a .csv extension
    month_genes_filename = 'unmapped_gene_names_' + library_name.replace(" ", "_") + ".csv"
    found_genes_text = '' + str(len(month_genes)) + ' unmapped gene names found.'
    display(Markdown(found_genes_text))
    display(HTML(month_genes.to_html(index=False)))
    figure_legend(f"Table {table_count}", content=f"Unmapped gene names in {library_name}")
    display(create_download_link(month_genes, "Download this table as a CSV", month_genes_filename))
    table_count = table_count + 1
else:
    print("No unmapped gene names found")

In [None]:
%%appyter markdown
# 2. Descriptive Statistics
The Appyter will present descriptive statistics for your library such as: total genes, total gene sets, average genes per set, and frequency of each gene. Results will be displayed in downloadable tables.

In [None]:
# Count the number of each gene
# count_frame: one row per gene with its number of appearances in the library,
# ordered by descending count.
count_frame = pd.Series(all_genes).value_counts().sort_index().reset_index().reset_index(drop=True)
count_frame.columns = ['Gene', 'Count']
count_frame = count_frame.sort_values(by=['Count'], ascending=False).reset_index(drop=True)
# Drop skipped rows (empty names) and NA placeholders; copy so the column
# assignments below act on an independent frame rather than a slice.
mask = (count_frame['Gene'].str.len() > 0) & ~count_frame['Gene'].isin(['NA'])
count_frame = count_frame.loc[mask].copy()

# Genes missing from the GeneRIF counts get 0 publications
count_frame['Publications'] = count_frame['Gene'].map(generif_dict).fillna(0).astype(int)
count_frame['Novelty'] = count_frame['Publications'].apply(gene_novelty_label)
# Publication counts of genes with at least one publication
pubhist_dat = list(count_frame['Publications'].replace(0,np.nan).dropna())

# Make a copy to sort by publications
count_frame2 = count_frame.sort_values(by=['Publications'], ascending=False)
top_genes = count_frame.iloc[0:num_bar_genes]
top_pub_genes = count_frame2.iloc[0:num_bar_genes]

In [None]:
# Calculate novelty statistic for the library as a whole
# Each gene's numeric novelty score is weighted by how often the gene appears.
count_frame2['Novelty Num'] = count_frame2['Novelty'].map(novel_map_dict)
count_sum = count_frame2['Count'].sum()
novelty_weighted_sum = (count_frame2['Count'] * count_frame2['Novelty Num']).sum()

# Weighted mean rounded to one decimal, then floored to pick the novelty term
library_nov_exact = round(novelty_weighted_sum/count_sum, 1)
library_nov = math.floor(library_nov_exact)
library_nov_term = novel_map_dict_rev[library_nov]

In [None]:
# Make table describing gene sets
# One row per gene set: its size, the mean publication count of its genes, and a
# novelty term derived from the genes' numeric novelty scores.
geneset_df = library_data_onemap.copy(deep=True)
geneset_df = geneset_df.reset_index()
geneset_df = geneset_df.rename(columns={'index':'Gene Set'})

set_sizes = []
set_mean_pubs = []
set_novelties = []
for genes in geneset_df['Genes']:
    temp = count_frame2[count_frame2['Gene'].isin(genes)]
    tot_genes = len(genes)
    # Sum first and divide once: dividing every score before summing accumulates
    # rounding error that can push an exact integer just below itself before the
    # floor. A set left without genes scores 0.
    novelty_num = temp['Novelty Num'].sum() / tot_genes if tot_genes else 0

    set_sizes.append(tot_genes)
    set_mean_pubs.append(temp['Publications'].mean())
    # Kept in a per-set list only; the library-level library_nov_term must not
    # be overwritten here.
    set_novelties.append(novel_map_dict_rev[math.floor(novelty_num)])

# Assign whole columns at once instead of chained element-wise assignment
geneset_df['Size'] = set_sizes
geneset_df['Mean Publications/Gene'] = set_mean_pubs
geneset_df['Novelty'] = set_novelties

geneset_df = geneset_df.sort_values(by='Size', ascending=False).drop(columns=['Genes']).reset_index(drop=True)

In [None]:
# Descriptive statistics summary table
# Totals
unique_sets = len(library_data.keys())
unique_genes = len(count_frame['Gene'])
median_genes = round(np.median(gs_sizes), 2)
avg_genes = round(np.mean(gs_sizes), 2)

avg_publications = round(np.mean(count_frame['Publications']), 2)
median_publications = round(np.median(count_frame['Publications']), 2)
tot_pub = np.sum(count_frame['Publications'])

# Novelty counts and percentages
novelty_counts = count_frame['Novelty'].value_counts()
novelty_counts_gs = geneset_df['Novelty'].value_counts()

novelty_terms = ('highly understudied', 'understudied', 'studied', 'well studied')
genes_nov_dict = dict.fromkeys(novelty_terms, 0)
gs_nov_dict = dict.fromkeys(novelty_terms, 0)
genes_nov_pcnt_dict = dict.fromkeys(novelty_terms, '')
gs_nov_pcnt_dict = dict.fromkeys(novelty_terms, '')

# A novelty term that never occurs keeps a count of 0; percentages are taken
# relative to the number of unique genes / gene sets.
for key in novelty_terms:
    genes_nov_dict[key] = novelty_counts.get(key, 0)
    gs_nov_dict[key] = novelty_counts_gs.get(key, 0)

    genes_share = round(genes_nov_dict[key]/unique_genes * 100, 2)
    sets_share = round(gs_nov_dict[key]/unique_sets * 100, 2)
    genes_nov_pcnt_dict[key] = str(genes_share) + "%"
    gs_nov_pcnt_dict[key] = str(sets_share) + "%"

## Load stats for all human and mouse genes
all_human_genes = pd.read_csv("https://appyters.maayanlab.cloud/storage/Gene_Set_Library_Synopsis/all_human_genes_df.csv", header=0)
all_mouse_genes = pd.read_csv("https://appyters.maayanlab.cloud/storage/Gene_Set_Library_Synopsis/all_mouse_genes_df.csv", header=0)

# Change table display based on library composition
# Human is the reference unless the library is mouse-only; a library marked
# only as 'other' gets no species comparison at all.
mouse_reference = 'human' not in species and 'mouse' in species
specname = 'Mouse' if mouse_reference else 'Human'
all_species_genes = all_mouse_genes if mouse_reference else all_human_genes
other = species==['other']
spec_col_name = 'All ' + specname + ' Genes'
    
## Make and display tables
# Row labels and library-side values for the gene-level summary table.
lib_vs_spec_title_col = ["Total Genes", "Total Publications", "Highly Understudied Genes", "Understudied Genes", "Studied Genes", "Well Studied Genes"]
lib_col = [unique_genes, tot_pub, genes_nov_dict['highly understudied'], genes_nov_dict['understudied'], genes_nov_dict['studied'], genes_nov_dict['well studied']]
lib_col_pcnt = ['', '', genes_nov_pcnt_dict['highly understudied'], genes_nov_pcnt_dict['understudied'], genes_nov_pcnt_dict['studied'], genes_nov_pcnt_dict['well studied']]

# Library-only versions of both tables; these are what gets displayed when
# `other` is True. Column headers made of spaces render as blank headings.
lib_vs_spec_df = pd.DataFrame(data = {'': lib_vs_spec_title_col, f"{library_name}": lib_col, ' ': lib_col_pcnt})
lib_col = [avg_publications, median_publications]
lib_vs_spec_mm_df = pd.DataFrame(data = {'': ["Mean Publications/Gene", "Median Publications/Gene"], f"{library_name}": lib_col})

# With a reference species, rebuild both tables with the species columns added.
if not other:
    # Presumably row 0 of all_species_genes holds counts and row 1 percentages
    # (row 1 values get a "%" appended) -- TODO confirm against the CSV.
    spec_col = [all_species_genes['Genes'][0], all_species_genes['Publications'][0], all_species_genes['Highly Understudied'][0], all_species_genes['Understudied'][0], all_species_genes['Studied'][0], all_species_genes['Well Studied'][0]]
    spec_col_pcnt = ['', '', str(all_species_genes['Highly Understudied'][1]) + "%", str(all_species_genes['Understudied'][1]) + "%", str(all_species_genes['Studied'][1]) + "%", str(all_species_genes['Well Studied'][1]) + "%"]
    # lib_col was reused for the mean/median table above, so it is rebuilt here.
    lib_col = [unique_genes, tot_pub, genes_nov_dict['highly understudied'], genes_nov_dict['understudied'], genes_nov_dict['studied'], genes_nov_dict['well studied']]
    lib_col_pcnt = ['', '', genes_nov_pcnt_dict['highly understudied'], genes_nov_pcnt_dict['understudied'], genes_nov_pcnt_dict['studied'], genes_nov_pcnt_dict['well studied']]
    lib_vs_spec_df = pd.DataFrame(data = {'': lib_vs_spec_title_col, f"{library_name}": lib_col, ' ': lib_col_pcnt, spec_col_name: spec_col, '  ': spec_col_pcnt})
    lib_vs_spec_df[spec_col_name] = lib_vs_spec_df[spec_col_name].astype(int)

    lib_col = [avg_publications, median_publications]
    spec_col = [all_species_genes['Mean Publications'][0], all_species_genes['Median Publications'][0]]
    lib_vs_spec_mm_df = pd.DataFrame(data = {'': ["Mean Publications/Gene", "Median Publications/Gene"], f"{library_name}": lib_col, spec_col_name: spec_col})

# Gene-set-level table: number of sets and how many fall under each novelty term.
genestat_title_col = ["Gene Sets", "Highly Understudied Sets", "Understudied Sets", "Studied Sets", "Well Studied Sets"]
genestat_col = [unique_sets,  gs_nov_dict['highly understudied'], gs_nov_dict['understudied'], gs_nov_dict['studied'], gs_nov_dict['well studied']]
genestat_col_pcnt = ['', gs_nov_pcnt_dict['highly understudied'], gs_nov_pcnt_dict['understudied'], gs_nov_pcnt_dict['studied'], gs_nov_pcnt_dict['well studied']]
genestat_df = pd.DataFrame(data = {'': genestat_title_col, 'Total': genestat_col, ' ':genestat_col_pcnt})

# The two names are reused for the mean/median genes-per-set table.
genestat_title_col = ["Mean", "Median"]
genestat_col = [avg_genes, median_genes]
genestat_mm_df = pd.DataFrame(data = {'': genestat_title_col, 'Genes / Set': genestat_col})

# First table pair (A, B): library (vs. species) gene statistics.
display(HTML(lib_vs_spec_df.to_html(index=False)))
display(HTML(lib_vs_spec_mm_df.to_html(index=False)))
if other:
    figure_legend(f"Tables {table_count}A, {table_count}B", title=f"{library_name} Summary Statistics", content="Descriptive statistics for your library. Novelty ratings are based on Geneshot. Highly understudied genes are associated with 0-7 PubMed IDs (PMIDs); understudied genes with 8-25 PMIDs; studied genes with 26-87 PMIDs; and well studied genes with 88+ PMIDs.")
else:
    figure_legend(f"Tables {table_count}A, {table_count}B", title=f"{library_name} vs. All {specname} Genes", content=f"Descriptive statistics comparing your gene set library to the set of all {specname} genes in GeneRIF. Novelty ratings are based on Geneshot. Highly understudied genes are associated with 0-7 PubMed IDs (PMIDs); understudied genes with 8-25 PMIDs; studied genes with 26-87 PMIDs; and well studied genes with 88+ PMIDs.")

# Second table pair (A, B): gene set statistics. The counter is advanced after each pair.
table_count = table_count + 1
display(HTML(genestat_df.to_html(index=False)))
display(HTML(genestat_mm_df.to_html(index=False)))
figure_legend(f"Tables {table_count}A, {table_count}B", title=f"Summary Statistics for Gene Sets in {library_name} Library", content="Statistics describing the composition of gene sets within your library. Novelty ratings were calculated by giving each gene in each set a numerical novelty score (based on its rating of highly understudied, understudied, studied, or well studied), taking a weighted average of those scores, and translating the result back into a term that describes the entire gene set.")
table_count = table_count + 1

In [None]:
def make_bok_hist(title, hist, edges, dat, xaxis_lab, yaxis_lab, tooltips, fill_color,xtype='auto', ytype='auto', xtail=5, ytail=5):
    """Build a Bokeh histogram figure from bin counts and bin edges.

    Parameters
    ----------
    title : str
        Figure title.
    hist : sequence of numbers
        Bar heights, one per bin. Recomputed from ``dat`` when ``ytype == 'log'``.
    edges : sequence of numbers
        Bin edges; ``edges[:-1]`` and ``edges[1:]`` are the left and right
        sides of the bars. Recomputed from ``dat`` when ``ytype == 'log'``.
    dat : sequence of numbers
        Raw values. Only read when ``ytype == 'log'``.
    xaxis_lab, yaxis_lab : str
        Axis labels.
    tooltips
        Passed unchanged to ``figure(tooltips=...)``.
    fill_color
        Fill colour of the bars.
    xtype, ytype : str
        Axis types passed to ``figure`` as ``x_axis_type`` / ``y_axis_type``.
    xtail : number
        Padding added to the largest edge to form the end of the x range.
    ytail : number
        Padding added to the tallest bar to form the end of the y range
        (only used when ``ytype == 'auto'``).

    Returns
    -------
    The configured figure.
    """
    # None lets Bokeh pick its default range when ytype is neither 'log' nor
    # 'auto'; the previous placeholder 0 is not a valid range value.
    yrange = None
    if ytype=='log':
        # Log mode re-bins the raw values into 10 bins and caps the axis at the
        # next power of ten above the largest raw value. The cap is taken from
        # the `dat` argument rather than from a module-level variable, so the
        # function only depends on what it is given.
        hist, edges = np.histogram(dat, density=False, bins=10)
        ycap = math.ceil(math.log10(max(dat)))
        yrange=(10**0, 10**ycap)
        
    ordered_hist = sorted(hist)
    ordered_edges = sorted(edges)
    
    if ytype=='auto':
        yrange=[ordered_hist[0], ordered_hist[-1]+ytail]
    
    p = figure(title=title, tooltips=tooltips, background_fill_color="#fafafa", toolbar_location="below",x_axis_type = xtype,y_axis_type = ytype,
              x_range=[ordered_edges[0], ordered_edges[-1]+xtail], y_range=yrange)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=fill_color, line_color="white", hover_alpha=0.7)
    
    # The y range always starts at 0, overriding the start chosen above.
    p.y_range.start = 0
    p.xaxis.axis_label = xaxis_lab
    p.yaxis.axis_label = yaxis_lab
    p.grid.grid_line_color="white"
    p.title.align = 'center'
    return p

def get_bin_params(dat_max, dat_min = 0, bins=40):
    """Round a data range up so that it divides evenly into ``bins`` bins.

    Parameters
    ----------
    dat_max : number
        Upper end of the data.
    dat_min : number, optional
        Lower end of the data (default 0).
    bins : int, optional
        Number of bins the range is divided into (default 40).

    Returns
    -------
    (maxval, interval)
        ``maxval`` is the (possibly raised) upper end of the range and
        ``interval`` is the width of one bin. When the range is already a
        multiple of ``bins``, ``dat_max`` is returned unchanged.
    """
    span = dat_max - dat_min
    pad = bins - span % bins

    # A pad equal to a whole `bins` means the span is already a multiple.
    if pad == bins:
        return dat_max, span / bins

    padded_span = span + pad
    return dat_min + padded_span, padded_span / bins

In [None]:
%%appyter code_exec
# Show the first num_bar_genes rows of count_frame and offer the whole table as a CSV
counts_filename = library_name.replace(" ", "_") + "_gene_counts.csv"
count_preview_html = count_frame[:num_bar_genes].to_html(index=False)

display(HTML(count_preview_html))
figure_legend(
    f"Table {table_count}",
    title=f"Gene count results for {library_name}",
    content="This table displays the counts (number of appearances throughout the entire library), number of publication associations (PMIDs), and novelty ratings of each gene. The full chart is also available for download. A weighted average statistic has been used to assign a novelty rating to your library as a whole, representing how well-studied the library is.",
)
table_count += 1
display(create_download_link(count_frame, "Download this table as a CSV", counts_filename))
# \033[1m is the ANSI escape sequence that switches the printed text to bold
print(f"Based on the novelty and within-library frequencies of each gene, your gene set library is \033[1m{library_nov_term}.")

In [None]:
# Preview the first 10 rows of the gene set statistics and offer the full table as a CSV
geneset_df_filename = library_name.replace(" ", "_") + "_gene_set_statistics.csv"
geneset_preview_html = geneset_df.head(10).to_html(index=False)
display(HTML(geneset_preview_html))
figure_legend(
    f"Table {table_count}",
    f"Gene Set Statistics in {library_name} Library",
    "Size (number of genes), average publications per gene, and novelty (calculated using a weighted average across all genes in the set) for each gene set in your library.",
)
display(create_download_link(geneset_df, "Download this table as a CSV", geneset_df_filename))
table_count += 1

In [None]:
%%appyter markdown
# 3. Scatterplot Visualization
In this section, the gene sets in your library will be converted into numerical vectors using TF-IDF, transformed into two dimensions using UMAP, and visualized as a scatterplot. 

In [None]:
# One row per gene set: the former index becomes the 'Name' column.
df = library_data_onemap_str.reset_index().rename(columns={'index':'Name'})

# The 'Genes' column is the text that gets vectorized.
gene_list = df['Genes']

try:
    # First attempt: keep terms that occur in at least 3 documents and in at
    # most 0.5% of all documents.
    tfidf_vectorizer = TfidfVectorizer(
        min_df = 3,
        max_df = 0.005,
        max_features = 100000,
        ngram_range=(1, 1)
    )
    tfidf = tfidf_vectorizer.fit_transform(gene_list)
except ValueError:
    # scikit-learn raises ValueError when max_df corresponds to fewer
    # documents than min_df (and when no terms survive pruning). Only that
    # kind of failure triggers the fallback; anything else propagates.
    if unique_sets <= 0:
        # The loop below can never reach 3 with a non-positive multiplier, so
        # re-raise the original error instead of spinning forever.
        raise

    # Grow the proportion in 0.005 steps until it covers at least 3 sets.
    factor = 0.005
    while factor*unique_sets < 3:
        factor = factor + .005
    
    # NOTE(review): factor*unique_sets is a float >= 3. scikit-learn documents
    # a float max_df as a proportion in [0.0, 1.0] and an int as an absolute
    # document count, so this value is neither. Confirm whether `factor`
    # (proportion) or an integer count was intended before changing it.
    tfidf_vectorizer = TfidfVectorizer(
        min_df = 3,
        max_df = factor*unique_sets,
        max_features = 100000,
        ngram_range=(1, 1)
    )
    tfidf = tfidf_vectorizer.fit_transform(gene_list)

# Reduce the TF-IDF matrix with UMAP; the result is stored as columns x and y.
reducer = umap.UMAP()
reducer.fit(tfidf)
embedding = pd.DataFrame(reducer.transform(tfidf), columns=['x','y'])
# Attach the columns of df next to the coordinates.
embedding = pd.concat([embedding, df], axis=1)

In [None]:
# Prepare dimensionality-reduced matrix for clustering:
# drop the 'Genes' column and expose the set name as a 'Gene Set' column.
mapped_df = (
    embedding.copy(deep=True)
    .set_index('Name')
    .drop(columns=['Genes'])
    .rename_axis("Gene Set")
    .reset_index()
)

In [None]:
# Plot clustered gene sets
xlabel = 'UMAP Dimension 1'
ylabel = 'UMAP Dimension 2'

# One point per row of mapped_df, with a constant alpha and size for every point.
source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            size = [7] * mapped_df.shape[0],
            gene_set = mapped_df['Gene Set'],
        )
    )

# Hover template: every opened <div> is closed so the tooltip is well-formed HTML.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
    </div>
    """)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in ' + library_name + ' Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
# The glyph is named "df" so that the hover tool (names=["df"]) attaches to it.
plot_emb.circle('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source2, name="df", 
                fill_color='grey')
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)
figure_legend(f"Fig. {fig_count}", f"Scatterplot of Gene Sets in {library_name} Library", "Gene sets plotted by their UMAP dimensions.")
fig_count = fig_count + 1

In [None]:
%%appyter markdown
# 4. Set Similarity
In this section, the Appyter will compute the Jaccard index for every pair of gene sets in your library as a measure of set similarity. An index close to 1 means the two gene sets have most of their genes in common; an index close to 0 means they share very few genes. The Jaccard indices will serve as the basis for a heatmap. Additional visualizations will be generated for the most similar sets (those with Jaccard indices above the user-specified threshold).

In [None]:
# Put all sets and genes into dataframe where each thing is a list
def jaccard_list(u,v):
    """Jaccard index of two rows whose cells are collections of genes.

    Parameters
    ----------
    u, v : iterable of iterables
        Each argument is flattened one level (``chain(*u)``) and the
        resulting items are treated as a set.

    Returns
    -------
    float
        ``|U & V| / |U | V|``. Two empty sets share nothing, so the result
        is 0.0 in that case rather than a division-by-zero error.
    """
    setu = set(chain(*u))
    setv = set(chain(*v))
    union_size = len(setu | setv)
    if union_size == 0:
        return 0.0
    return len(setu & setv) / union_size

# Condensed vector of pairwise Jaccard indices, then its square form labelled by gene set
res = pdist(library_data_onemap[['Genes']], metric=jaccard_list)
distance = pd.DataFrame(
    data=squareform(res),
    index=library_data_onemap.index,
    columns=library_data_onemap.index,
)

# Count the pairs whose index is not 0; when there are none, the Jaccard plots are skipped
jac_zero_tester = pd.Series(res).replace(0, np.nan).dropna()
jac_zero = len(jac_zero_tester)

In [None]:
# Blank out the diagonal and everything above it, then transpose so the
# remaining values sit in the upper triangle
mask = np.triu(np.ones(distance.shape, dtype=bool))
masked_dist = distance.mask(mask, '').T
masked_dist_filename = library_name.replace(" ", "_") + "_jaccard_matrix.csv"

# Only the top-left 10 x 10 corner is displayed; the download holds the full matrix
masked_preview_html = masked_dist.iloc[:10, :10].to_html(index=True)
display(HTML(masked_preview_html))
figure_legend(
    f"Table {table_count}",
    f"Jaccard Index pairings for {library_name}",
    "Upper triangle of a pairwise Jaccard matrix comparing each set in the library to each other set.",
)
display(create_download_link(masked_dist.reset_index(), "Download this table as a CSV", masked_dist_filename))
table_count += 1

In [None]:
if jac_zero > 0:
    # get_bin_params is given the maximum scaled up by 1000; its results are
    # scaled back down to the 0-1 range of the indices.
    maxval, interval = get_bin_params(max(res)*1000)
    maxval = maxval/1000
    interval = interval/1000
    # Build the edges with linspace so that the last edge is exactly maxval.
    # np.arange(0, maxval, interval) leaves the stop value out, which removed
    # the top bin and silently excluded the largest indices from the histogram.
    n_bins = int(round(maxval/interval))
    jac_hist_bin_arr = np.linspace(0, maxval, n_bins + 1)

    hist, edges = np.histogram(res, density=False, bins=jac_hist_bin_arr)
    title = f"Jaccard Indices for {library_name}"

    tooltips = [
        ("range", "@left{0.00}" + "-" + "@right{0.00}"),
        ("pairs", "@top")
        ]

    xaxis_lab = 'Jaccard Index'
    yaxis_lab = 'Gene Set Pairs'

    # Axis padding is one fifteenth of the largest edge / tallest bar.
    jac_hist = make_bok_hist(title, hist, edges, res, xaxis_lab, yaxis_lab, tooltips, bar_color, xtail=max(edges)/15, ytail=max(hist)/15)
else:
    print("All pairwise Jaccard Indices in your library are equal to 0. This means that no gene set in your library shares a single term (gene) with another gene set. The remaining tables and visualizations in this section of the analysis will not be generated.")

In [None]:
def make_bok_heatmap(df, title):
    """Build a Bokeh heatmap of pairwise Jaccard indices.

    Parameters
    ----------
    df : DataFrame
        Long-format table with the columns ``set1``, ``set2`` and ``jaccard``.
    title : str
        Figure title.

    Returns
    -------
    The Bokeh figure, with a colour bar attached on the right.

    Notes
    -----
    Reads the module-level ``jac_cutoff`` to scale the plot size.
    """
    palette = ['#ffffb2', '#fed976', '#feb24c', '#fd8d3c', '#f03b20', '#bd0026']
    jac_low = df.jaccard.min()
    jac_high = round(df.jaccard.max(), 1)
    mapper = LinearColorMapper(palette=palette, low=jac_low, high=jac_high, nan_color="white")

    rng = pd.unique(df['set1'])

    # Width and height grow by the same whole-number factor; a factor of 0
    # falls back to fixed default dimensions.
    scale = int(math.ceil((len(rng)/(jac_cutoff*100))))
    pwidth = 1000 * scale or 1000
    pheight = 700 * scale or 500

    source = ColumnDataSource(df)

    hover_fields = [
        ('Set 1', '@set1'),
        ('Set 2', '@set2'),
        ('Jaccard Index', '@jaccard')
    ]
    p = figure(title=title,
               x_range=rng, y_range=list(reversed(rng)),
               x_axis_location="above", plot_width=pwidth, plot_height=pheight,
               toolbar_location='below',
               tooltips=hover_fields)

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    # NOTE(review): this compares the number of rows of df (one per pair), not
    # the number of distinct sets - confirm that is the intended threshold.
    label_font_size = "10px" if len(df['set1']) < 15 else "7px"
    p.axis.major_label_text_font_size = label_font_size
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = np.pi/2
    p.title.align = 'center'

    # One unit square per (set2, set1) cell, coloured by its Jaccard index
    p.rect(x = 'set2', y = 'set1', width=1, height=1,
           source=source,
           fill_color=transform('jaccard', mapper),
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="12px",
                         ticker=BasicTicker(desired_num_ticks=len(palette)),
                         formatter=PrintfTickFormatter(format="%.2f"),
                         label_standoff=6, border_line_color=None)
    p.add_layout(color_bar, 'right')
    return p

In [None]:
if jac_zero > 0:
    # Keep only the indices strictly above the user-specified cutoff.
    res_cut = res[res>jac_cutoff]
    if len(res_cut) > 0:
        # Work on a x1000 scale: the lower end is the cutoff rounded down to
        # two decimals, the upper end the maximum rounded up to two decimals.
        datmin = math.floor(jac_cutoff * 100)*10
        datmax = math.ceil(max(res_cut)*100)*10
        maxval, interval = get_bin_params(datmax, dat_min=datmin)
        datmin = datmin/1000
        maxval = maxval/1000
        interval = interval/1000
        # Build the edges with linspace so that the last edge is exactly maxval.
        # np.arange(datmin, maxval, interval) leaves the stop value out, which
        # removed the top bin and could exclude the largest indices.
        n_bins = int(round((maxval - datmin)/interval))
        jac_hist_cut_bin_arr = np.linspace(datmin, maxval, n_bins + 1)
        hist_cut, edges_cut = np.histogram(res_cut, density=False, bins=jac_hist_cut_bin_arr)
        title_cut = f"Jaccard Indices > {jac_cutoff} for {library_name}"
        tooltips_cut = [
            ("range", "@left{0.000}" + "-" + "@right{0.000}"),
            ("pairs", "@top")
            ]
        # xaxis_lab / yaxis_lab are the labels set by the full-range histogram cell.
        jac_hist_cut = make_bok_hist(title_cut, hist_cut, edges_cut, res_cut, xaxis_lab, yaxis_lab, tooltips_cut, bar_color, xtail=max(edges_cut)/15, ytail=max(hist_cut)/15)
    else:
        print(f"There are no gene set pairs with a Jaccard Index greater than {jac_cutoff} in your library. The corresponding tables and visualizations will not be generated.")

In [None]:
# Collect the gene sets that take part in at least one pairing above the cutoff.
# This runs whenever any overlap exists, not only when the interactive heatmap
# is requested, because later cells read dist_cut and dist_heat_df either way.
if jac_zero > 0:
    # Row and column positions of every cell above the cutoff, de-duplicated.
    dist_cut_indexes = np.where(distance > jac_cutoff)
    dist_cut_rows_cols = np.unique(np.array(list(chain(*list(dist_cut_indexes)))))
    dist_cut = distance.iloc[dist_cut_rows_cols, dist_cut_rows_cols]

    # Blank the diagonal and upper triangle, then transpose.
    mask2 = np.zeros_like(dist_cut, dtype=bool)
    mask2[np.triu_indices_from(mask2)] = True
    dist_cut_masked = dist_cut.mask(mask2, np.nan).transpose()

    # Long format, row-major: set1 repeats each label n times, set2 cycles
    # through all labels n times, and the values are the flattened matrix.
    # Labels are handled as sequences (no string join/split), so a label
    # containing the former ",," separator cannot corrupt the pairing.
    cut_labels = list(dist_cut_masked.index)
    dist_set1 = np.array(np.repeat(dist_cut_masked.index, len(cut_labels)))
    dist_set2 = cut_labels * len(cut_labels)
    dist_vals = dist_cut_masked.values.ravel()

    dist_heat_df = pd.DataFrame(data={'set1': dist_set1, 'set2': dist_set2, 'jaccard': dist_vals})

    dist_heat_title = f"Jaccard Indices for {library_name}"

    # If user wants interactive Jaccard heatmap for high indices, create one
    if jac_interactive:
        # Make smaller heatmap
        jac_heat_cut = make_bok_heatmap(dist_heat_df, dist_heat_title)

In [None]:
# Show Jaccard histograms: both side by side when some indices exceed the
# cutoff, otherwise only the full-range histogram
if jac_zero > 0 and len(res_cut) > 0:
    show(row(jac_hist, jac_hist_cut))
    figure_legend(
        f"Fig. {fig_count} and {fig_count +1}",
        "Jaccard Indices Histograms",
        content=f"The histogram on the left displays the full range of Jaccard Indices for your library. The histogram on the right displays only those indices greater than {jac_cutoff}.",
    )
    fig_count += 2
elif jac_zero > 0:
    show(jac_hist)
    figure_legend(
        f"Fig. {fig_count}",
        "Jaccard Indices Histogram",
        content=f"This histogram displays the full range of Jaccard Indices for your library. There were no indices greater than {jac_cutoff}.",
    )
    fig_count += 1

In [None]:
# Table of highest Jaccard Indices
if jac_zero > 0:
    # Keep the pairings above the cutoff and give the columns display names.
    dist_heat_df_disp = dist_heat_df[dist_heat_df['jaccard'] > jac_cutoff].reset_index(drop=True)
    dist_heat_df_disp = dist_heat_df_disp.rename(columns={"set1": "Gene Set 1", "set2": "Gene Set 2", "jaccard": "Jaccard Index"})

    # Sort once, highest index first, so that the 10-row preview really shows
    # the highest indices (an ascending sort previewed the lowest ones). The
    # same ordering is used for the download.
    dist_heat_df_sorted = dist_heat_df_disp.sort_values(by="Jaccard Index", ascending=False)

    display(HTML(dist_heat_df_sorted.iloc[0:10,:].to_html(index=False)))
    figure_legend(f"Table {table_count}",f"Jaccard Indices > {jac_cutoff}", content=f"This table displays all gene set pairings with a Jaccard Index greater than {jac_cutoff}.")
    high_jac_filename = library_name.replace(" ", "_") + "_high_jaccard_indices.csv"
    display(create_download_link(dist_heat_df_sorted, "Download this table as a CSV", high_jac_filename))
    table_count = table_count + 1

In [None]:
# Get row indices in order and offer in table
def sns_heatmap_to_df(g):
    """List the gene sets in the row order of a clustered heatmap.

    Parameters
    ----------
    g
        Object exposing ``dendrogram_row.reordered_ind``, the original row
        positions in their reordered sequence.

    Returns
    -------
    DataFrame
        Columns ``gene set``, ``original index`` (position in the
        module-level ``distance`` matrix) and ``new index`` (position after
        reordering).
    """
    reordered_positions = g.dendrogram_row.reordered_ind

    # Lookup from original row position to gene set name, taken from `distance`
    lookup = (
        distance.reset_index()
        .rename(columns={'index': 'gene set'})
        .reset_index()
        .rename(columns={'index': 'original index'})
    )
    position_to_name = dict(zip(lookup['original index'], lookup['gene set']))

    ordered = pd.DataFrame(data={'original index': reordered_positions})
    ordered['gene set'] = ordered['original index'].map(position_to_name)

    return pd.DataFrame(data={
        'gene set': ordered['gene set'],
        'original index': ordered['original index'],
        'new index': ordered.index,
    })

In [None]:
# Clustered heatmap of full library- only if selected, or if dendrogram is not possible 
sns_clust = None
# Starts False and only becomes True once a heatmap has actually been drawn,
# so cells that check this flag never receive sns_clust = None.
full_heat_possible = False

if jac_zero > 0 and unique_sets > 0:
    try: 
        sns_clust = sns.clustermap(distance, cmap="Reds", figsize=(13,13))
        full_heat_possible = True
    except Exception:
        # Any failure while clustering is reported and the heatmap is skipped.
        print("Unable to generate heatmap. Try a smaller library.")
    
    if full_heat_possible:
        # Hide both dendrograms and move the colour bar.
        sns_clust.ax_row_dendrogram.set_visible(False)
        sns_clust.ax_col_dendrogram.set_visible(False)
        sns_clust.ax_cbar.set_position((0, 0, .03, .4))
        figure_legend(f"Fig. {fig_count}", "Heatmap", f"This heatmap displays the Jaccard Indices of all gene sets in your library.")
        fig_count = fig_count + 1

        # Save the current figure as a PNG and link to it.
        sns_clust_filename = library_name.replace(" ", "_") + "_jaccard_heatmap.png"
        plt.savefig(sns_clust_filename, bbox_inches = 'tight')
        display(FileLink(sns_clust_filename, result_html_prefix = str('Download png' + ': ')))

In [None]:
# Table of the gene sets in heatmap order: 5-row preview plus a CSV of the full table
if full_heat_possible:
    cmap_df = sns_heatmap_to_df(sns_clust)
    cmap_filename = library_name.replace(" ", "_") + "_jaccard_heatmap_reordered_gene_sets.csv"
    cmap_preview_html = cmap_df.head(5).to_html(index=False)
    display(HTML(cmap_preview_html))
    figure_legend(
        f"Table {table_count}",
        f"Reordered gene sets in heatmap of {library_name}",
        "This table lists your gene sets in the order in which they appear in the heatmap. The full table is available for download.",
    )
    display(create_download_link(cmap_df, "Download this table as a CSV", cmap_filename))
    table_count += 1

In [None]:
# If interactive heatmap is possible, display it. Otherwise, create a static heatmap and report the new indices.
jac_static_heat=True
# Either heatmap needs at least one pairing above the cutoff. res_cut only
# exists when jac_zero > 0, so the order of the two checks matters.
has_high_pairs = jac_zero > 0 and len(res_cut) > 0
if jac_interactive and has_high_pairs:
    if len(dist_cut_rows_cols) < 300:
        show(jac_heat_cut)
        figure_legend(f"Fig. {fig_count}", "High Jaccard Indices Heatmap", f"This heatmap includes all gene sets with at least one Jaccard Index greater than {jac_cutoff} in comparison with another set.")
        fig_count = fig_count + 1
        jac_static_heat=False
    else:
        print("There are too many sets to generate an interactive figure. A static heatmap will be generated instead. To see an interactive heatmap of the highest Jaccard Indices, try selecting a higher threshold value.")
        
if jac_static_heat and has_high_pairs:
    # Rebuild the subset matrix here so the static path does not rely on
    # variables that are only created when the interactive heatmap is prepared.
    high_positions = np.unique(np.array(list(chain(*list(np.where(distance > jac_cutoff))))))
    dist_cut = distance.iloc[high_positions, high_positions]

    cmap_cut = sns.clustermap(dist_cut, cmap="Reds")
    cmap_cut.ax_row_dendrogram.set_visible(False)
    cmap_cut.ax_col_dendrogram.set_visible(False)
    cmap_cut.ax_cbar.set_position((0.8, 0, .03, .4))
    figure_legend(f"Fig. {fig_count}", "High Jaccard Indices Heatmap", f"This heatmap includes all gene sets with at least one Jaccard Index greater than {jac_cutoff} in comparison with another set.")
    fig_count = fig_count + 1

In [None]:
%%appyter markdown
# 5. Visualization of Novelty and Size Distributions

In [None]:
# Load the novelty table of all Enrichr libraries (highest 'Stat' first) and
# build a one-row table describing the user's library
all_enrichr_novs = pd.read_csv(
    "https://appyters.maayanlab.cloud/storage/Gene_Set_Library_Synopsis/enrichr_library_novelties.csv",
    header=0,
).sort_values(by='Stat', ascending=False)

my_lib_nov = pd.DataFrame(data={
    'Library': [library_name],
    'Novelty': [library_nov_term],
    'Stat': [library_nov_exact],
    'Genes': [len(all_genes_unique)],
})

# Libraries listed here go into the "mouse" table; all others into the "mix" table
mouse_libs = [
    'KEGG 2019 Mouse',
    'Mouse Gene Atlas',
    'RNAseq Automatic GEO Signatures Mouse Down',
    'RNAseq Automatic GEO Signatures Mouse Up',
    'WikiPathways 2019 Mouse',
]
is_mouse_lib = all_enrichr_novs['Library'].isin(mouse_libs)
all_enrichr_novs_mouse = all_enrichr_novs[is_mouse_lib]
all_enrichr_novs_mix = all_enrichr_novs[~is_mouse_lib]

In [None]:
# Display gene set size and novelty distribution as a scatterplot
novelties = ['well studied', 'studied', 'understudied', 'highly understudied']
xlabel = 'Set Size (Genes)'
ylabel = 'Mean Publications Per Gene'
# Single summary point: the mean of the per-set publication means, rounded to
# a whole number. (The rounding call previously had its closing parenthesis
# misplaced, handing the 0 to np.mean instead of round.)
mean_pubs_per_gene = int(round(np.mean(geneset_df['Mean Publications/Gene'])))
my_gs_stat_df = pd.DataFrame(data={'Gene Set':['Average of all gene sets'], 'Novelty':[library_nov_term], 'Size':[avg_genes], 'Publications/Gene': [mean_pubs_per_gene]})
# Order the gene sets by the numeric value of their novelty term (descending);
# the helper column is dropped again afterwards.
geneset_df['Novelty Num'] = geneset_df['Novelty'].map(novel_map_dict)
geneset_df = geneset_df.sort_values(by='Novelty Num', ascending=False).reset_index(drop=True).drop(columns=['Novelty Num'])

# One point per gene set.
source1 = ColumnDataSource(
        data=dict(
            x = geneset_df.Size,
            y = geneset_df['Mean Publications/Gene'],
            alpha = [0.9] * geneset_df.shape[0],
            size = [7] * geneset_df.shape[0],
            novelty = geneset_df.Novelty,
            geneset = geneset_df['Gene Set'],
        )
    )

# The summary point, drawn slightly larger.
source2 = ColumnDataSource(
        data=dict(
            x = my_gs_stat_df.Size,
            y = my_gs_stat_df['Publications/Gene'],
            alpha = [0.7] * my_gs_stat_df.shape[0],
            size = [10] * my_gs_stat_df.shape[0],
            novelty = my_gs_stat_df.Novelty,
            geneset = my_gs_stat_df['Gene Set']
        )
    )

# Hover template: every opened <div> is closed so the tooltip is well-formed HTML.
hover_gs_nov = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@geneset</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Novelty:</span>
            <span style="font-size: 12px">@novelty</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Set Size (Genes):</span>
            <span style="font-size: 12px">@x</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Mean Publications Per Gene:</span>
            <span style="font-size: 12px">@y</span>
        </div>
    </div>
    """)
tools_gs_nov = [hover_gs_nov, 'pan', 'wheel_zoom', 'reset', 'save']
title_gs_nov = f"Novelties and Sizes of Gene Sets Within {library_name} Library"
plot_gs_nov = figure(plot_width=700, plot_height=700, tools=tools_gs_nov, title=title_gs_nov, x_axis_label=xlabel, y_axis_label=ylabel, x_axis_type='log', y_axis_type='log')
# Gene sets are coloured by novelty term; the summary point is red.
plot_gs_nov.circle('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source1, name="df", 
                fill_color=factor_cmap('novelty', palette=Spectral6, factors=novelties),
               legend_field='novelty')
plot_gs_nov.circle('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source2, name="df", 
                fill_color='red')
plot_gs_nov.xaxis.axis_label_text_font_style = 'normal'
plot_gs_nov.yaxis.axis_label_text_font_style = 'normal'
plot_gs_nov.title.align = 'center'
plot_gs_nov.legend.location = "bottom_right"
plot_gs_nov.xaxis.axis_label_text_font_size = '18px'
plot_gs_nov.yaxis.axis_label_text_font_size = '18px'
plot_gs_nov.title.text_font_size = '16px'

show(plot_gs_nov)
figure_legend(f"Fig. {fig_count}", title=f"Novelties and Sizes of Gene Sets Within {library_name} Library", content=f"Scatterplot showing the size, publication count, and novelty rating of each gene set within your library.")
fig_count = fig_count + 1

In [None]:
# Make a scatterplot of library size by novelty for all Enrichr libraries
xlabel = 'Library Size (Genes)'
ylabel = 'Novelty Statistic'

# Libraries that are not in the mouse list.
source1 = ColumnDataSource(
        data=dict(
            x = all_enrichr_novs_mix.Genes,
            y = all_enrichr_novs_mix.Stat,
            alpha = [0.9] * all_enrichr_novs_mix.shape[0],
            size = [7] * all_enrichr_novs_mix.shape[0],
            novelty = all_enrichr_novs_mix.Novelty,
            lib = all_enrichr_novs_mix.Library,
        )
    )

# The user's library, drawn slightly larger.
source2 = ColumnDataSource(
        data=dict(
            x = my_lib_nov.Genes,
            y = my_lib_nov.Stat,
            alpha = [0.7] * my_lib_nov.shape[0],
            size = [10] * my_lib_nov.shape[0],
            novelty = my_lib_nov.Novelty,
            lib = my_lib_nov.Library,
        )
    )

# Libraries in the mouse list.
source3 = ColumnDataSource(
    data=dict(
            x = all_enrichr_novs_mouse.Genes,
            y = all_enrichr_novs_mouse.Stat,
            alpha = [0.9] * all_enrichr_novs_mouse.shape[0],
            size = [7] * all_enrichr_novs_mouse.shape[0],
            novelty = all_enrichr_novs_mouse.Novelty,
            lib = all_enrichr_novs_mouse.Library,
        )
    )


# Hover template: every opened <div> is closed so the tooltip is well-formed HTML.
hover_nov = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set Library:</span>
            <span style="font-size: 12px">@lib</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Novelty:</span>
            <span style="font-size: 12px">@novelty</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Total Genes:</span>
            <span style="font-size: 12px">@x</span>
        </div>
    </div>
    """)
tools_nov = [hover_nov, 'pan', 'wheel_zoom', 'reset','save']
title_nov = f"Novelty and Size of {library_name} Library Among All Enrichr Libraries"
plot_nov = figure(plot_width=700, plot_height=700, tools=tools_nov, title=title_nov, x_axis_label=xlabel, y_axis_label=ylabel, x_axis_type='log')
# Non-mouse libraries are circles, mouse libraries squares; both are coloured by novelty term.
plot_nov.circle('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source1, name="df", 
                fill_color=factor_cmap('novelty', palette=Spectral6, factors=novelties),
               legend_field='novelty')
plot_nov.square('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source3, name="df", 
                fill_color=factor_cmap('novelty', palette=Spectral6, factors=novelties))
# The user's library is drawn in red; its marker shape depends on `other` / `specname`.
if other:
    plot_nov.triangle('x', 'y', size='size', 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source2, name="df", 
                fill_color='red')
if specname=='Mouse':
    plot_nov.square('x', 'y', size='size', 
                    alpha='alpha', line_alpha=0, line_width=0.01, source=source2, name="df", 
                    fill_color='red')
if specname=='Human':
    plot_nov.circle('x', 'y', size=10, 
                alpha='alpha', line_alpha=0, line_width=0.01, source=source2, name="df", 
                fill_color='red')
plot_nov.xaxis.axis_label_text_font_style = 'normal'
plot_nov.yaxis.axis_label_text_font_style = 'normal'
plot_nov.title.align = 'center'
plot_nov.legend.location = (170,520)
plot_nov.xaxis.axis_label_text_font_size = '18px'
plot_nov.yaxis.axis_label_text_font_size = '18px'
plot_nov.title.text_font_size = '16px'

# Second legend explaining the marker shapes. It is backed by a hidden scatter
# renderer with one grey marker per shape; each legend item picks its marker
# through `index`.
from bokeh.models import Legend, LegendItem
markers = ['circle','square','triangle']
r = plot_nov.scatter(x=0, y=0, color="grey", size=6, marker=markers)
r.visible = False

shape_legend = Legend(items=[
    LegendItem(label="human harmonized", renderers=[r], index=0),
    LegendItem(label="mouse harmonized", renderers=[r], index=1),
    LegendItem(label="other", renderers=[r], index=2),],
    location=(10,520)           
)
plot_nov.add_layout(shape_legend)

show(plot_nov)
figure_legend(f"Fig. {fig_count}", title=f"Novelty and Size of {library_name} Library Among All Enrichr Libraries", content=f"Scatterplot showing the size and novelty of your library compared with 174 Enrichr libraries. The Library Novelty Statistic ranges from 0 (well-studied) to 3 (highly understudied). The {library_name} library is shown in red.")
fig_count = fig_count + 1

In [None]:
# Bokeh barplot
def make_bok_barplot(dat, col1name, col2name, title, lab1, lab2, tooltips_vert, tooltips_hor):
    """Build a vertical or horizontal Bokeh bar plot from two columns of ``dat``.

    Parameters
    ----------
    dat : DataFrame
        Must contain the columns ``col1name``, ``col2name`` and ``'Novelty'``.
    col1name : str
        Column holding the categories (one bar each).
    col2name : str
        Column holding the bar lengths.
    title : str
        Figure title.
    lab1, lab2 : str
        Axis labels for the category axis and the value axis.
    tooltips_vert, tooltips_hor
        Tooltip definitions used for the vertical / horizontal orientation.
        In the vertical plot the data source exposes the columns ``x``,
        ``y``, ``top`` and ``novelty``; in the horizontal plot ``x``, ``y``
        and ``novelty``.

    Returns
    -------
    The Bokeh figure.

    Raises
    ------
    ValueError
        If the module-level ``orient_bar`` is neither ``'Vertical'`` nor
        ``'Horizontal'``.

    Notes
    -----
    Reads the module-level settings ``orient_bar``, ``bar_color`` and
    ``counts_bar``.
    """
    if orient_bar not in ('Vertical', 'Horizontal'):
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(f"orient_bar must be 'Vertical' or 'Horizontal', got {orient_bar!r}")

    bar_title = title

    if orient_bar == 'Vertical':
        # All columns are in the same row order, and the bars are drawn from
        # this source so that tooltip fields such as @top and @novelty can be
        # resolved. `y` duplicates `top` for the value labels.
        barsource_v = ColumnDataSource(
                dict(
                x = dat[col1name],
                y = dat[col2name],
                top = dat[col2name],
                novelty = dat['Novelty'],
            )
        )
        bokbar = figure(x_range=dat[col1name], plot_height=350, title=bar_title, toolbar_location='below', tooltips=tooltips_vert, x_axis_label=lab1, y_axis_label=lab2)
        bokbar.vbar(x='x', top='top', width=.5, color=bar_color, hover_alpha=.7, source=barsource_v)
        bokbar.xaxis.major_label_orientation = math.pi/5
        bokbar.xgrid.grid_line_color = None
        bokbar.y_range.start = 0

        if counts_bar:
            # Print each bar's value at its top.
            labels = LabelSet(x='x', y='y', text='y', level='annotation',
                x_offset=-7, y_offset=0, source=barsource_v, render_mode='canvas', text_font_size = '11px')

            bokbar.add_layout(labels)

    if orient_bar == 'Horizontal':
        # Every column is reversed together, so rows stay aligned while the
        # first row of `dat` ends up at the top of the plot.
        barsource_h = ColumnDataSource(
                dict(
                x = dat[col2name][::-1],
                y = dat[col1name][::-1],
                novelty = dat['Novelty'][::-1],
            )
        )
        bokbar = figure(y_range = dat[col1name][::-1], plot_height=400, title=bar_title, toolbar_location='below', tooltips=tooltips_hor, x_axis_label=lab2, y_axis_label=lab1)
        bokbar.hbar(y='y',right='x', height=.5, color=bar_color, hover_alpha=.7, source=barsource_h)
        bokbar.xgrid.grid_line_color = None

        if counts_bar:
            # Print each bar's value at its right end.
            labels = LabelSet(x='x', y='y', text='x', level='annotation',
                x_offset=2, y_offset=-6, source=barsource_h, render_mode='canvas', text_font_size = '11px')

            bokbar.add_layout(labels)

    bokbar.xaxis.axis_label_text_font_style = 'normal'
    bokbar.yaxis.axis_label_text_font_style = 'normal'
    bokbar.title.align = 'center'
    
    return bokbar  

In [None]:
# Titles for the two bar plots
bokbar_counts_title = f"{num_bar_genes} Most Frequent Genes in {library_name}"
bokbar_pubs_title = f"{num_bar_genes} Most Studied Genes in {library_name}"

# Bar plot of gene counts; one tooltip definition per orientation
tooltips_vert = [("count", "@top")]
tooltips_hor = [("count", "@x")]

bokbar_counts = make_bok_barplot(
    top_genes, 'Gene', 'Count', bokbar_counts_title,
    'Genes', 'Counts', tooltips_vert, tooltips_hor,
)
show(bokbar_counts)
figure_legend(f"Fig. {fig_count}", title=f"Most Frequent Genes in {library_name}")
fig_count += 1

# Bar plot of publication counts, only drawn when pubhist_dat is not empty
if len(pubhist_dat) > 0:
    tooltips_vert = [("publications", "@top"), ("novelty", "@novelty")]
    tooltips_hor = [("publications", "@x"), ("novelty", "@novelty")]

    bokbar_pubs = make_bok_barplot(
        top_pub_genes, 'Gene', 'Publications', bokbar_pubs_title,
        'Genes', 'Publications', tooltips_vert, tooltips_hor,
    )
    show(bokbar_pubs)
    figure_legend(f"Fig. {fig_count}", title=f"Most Studied Genes in {library_name}")
    fig_count += 1