In [None]:

from appyter import magic
magic.init(lambda _ = globals: _())

# GeneSet Library Set Appyter
This appyter is designed to perform basic statistics, analysis, and visualizations on a Gene Matrix Transpose (.GMT) file. This will allow bioinformatics researchers to analyze relationships between many different gene sets from several gene set libraries.
 To create your own GMT file, please see Enrichr. Enrichr, hosted by the Ma'ayan Laboratory at Mt. Sinai Icahn School of Medicine, is a collection of geneset libraries. 

In [None]:
import numpy as np 
import pandas as pd
import itertools 
import bokeh
import time
import networkx as nx
import pathlib
import scanpy as sc
from IPython.display import display, FileLink, HTML
import anndata
from statsmodels.stats import multitest as mlt
from sklearn.feature_extraction.text import TfidfVectorizer
from maayanlab_bioinformatics.enrichment import crisp
from collections import OrderedDict
from bokeh.palettes import Category20
import statistics as stat
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource, RangeSlider
from bokeh.plotting import figure, show, save, output_file
import os

output_notebook()

In [3]:
%%appyter hide_code


{% do SectionField(name='GMTSubmission', title='1. Submit a GMT file', subtitle = 'Sumbit a GMT (Gene Matrix Transpose file) for analysis.', img = 'bulb.png') %}
{% do SectionField(name = 'Pairwise Similarity Table', title = '2. Pairwise Intersection Table', subtitle = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section.', img = 'bulb.png') %}
{% do SectionField(name = 'Jaccard Similarity Table', title = '3. Jaccard Similarity Table', subtitle = '##TODO', img = 'bulb.png') %}
{% do SectionField(name = 'Intersection Search', title = '4. Gene Intersection Search', subtitle = '###TODO', img = 'bulb.png')%}
{% do SectionField (name = 'UMAP_visualization', title = '5. Scatterplot Visualization', subtitle= 'Visualize relative Geneset similarities on an interactive scatterplot', img = 'bulb.png') %}
{% do SectionField(name = 'GMT Descriptive Statistics', title = '6. Descriptive Statistics', subtitle = '#TODO', img = 'bulb.png') %}



## 0. Submitted Variables

In [4]:
%%appyter code_exec

{% set gs = TabField(
    name='gs_type',
    label='Gene Sets',
    default='Upload',
    choices={'Upload': [
            FileField(
                name='gs',
                label='Gene Set Files',
                default='static/gene_sets_for_breast_cancer.gmt',
                example={
                    'example.gmt': url_for('static', filename = 'gene_sets_for_breast_cancer.gmt')
                }
            ),
        ],},
    
section = 'GMTSubmission',)%}
gs = {{gs.value[0]}}



int_tbl = {{BoolField(name = 'SimilarityTbl', label = 'Intersection Size Table', default = 'true', description = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section. Select \'Yes\' if you would like to generate a Intersection Size Table. Otherwise, select \'No\'', section = 'Pairwise Similarity Table') }}
jaccard_tbl = {{BoolField(name = 'JaccardTbl', label = 'Jaccard Similarity Table', default = 'true', description = '##TODO', section = 'Jaccard Similarity Table') }}


umap = {{ BoolField(name = 'umap', label = 'ScatterPlot Visualization', default = 'true', description = 'Select \'Yes\' if you would like to generate a Scatter Plot. Otherwise, select \'No\'', section = 'UMAP_visualization')}}

umap_num_neighbors = {{ IntField(name = 'nneighbors', label = 'Number of Neighbors', default = 5, min = 1, max = 30, description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_maxdf = {{ ChoiceField(name = 'max_df', label = 'Max df setting', choices = {'0.5': '0.5', '0.75': '.75', '0.9': '.9', '1.0': '1'}, default = '0.5',  description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_mindf = {{ ChoiceField(name = 'min_df', label = 'Min df setting', choices = {'0.1' : '0.1', '0.25' : '0.25', '0.5': '0.5' }, default = '0.25', description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}


```python
gs = 'static/gene_sets_for_breast_cancer.gmt'
int_tbl = True
jaccard_tbl = True
umap = True
umap_num_neighbors = 5
umap_maxdf = 0.5
umap_mindf = 0.25
```

In [5]:
if gs == '' :
    raise Exception('Please upload a GMT File!')

## 1. Process the GMT FILE

In [6]:
%%appyter code_exec

def series_to_list(gene_list):
    ##helper function to convert a gene pd.series to a gene list
    ret_list = []
    for genes in gene_list:
        if type(genes) is str:
            ret_list.append(genes)
        else: ##pd series case
            genes = genes.tolist()        
            ret_list.append(' '.join(genes))
    return ret_list

def load_set(file):
    ''' Load a set of files into pairs of labeled sets
    '''
    lst= []
    path = pathlib.Path(file)
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            parsed_line = line.split('\t')
            term, library, genes = parsed_line[0], parsed_line[1], parsed_line[2:]
            if genes[-1][:-2] == '\n':
                genes[-1] = genes[-1][:-2] ##trim off newline regex '\n'
            lst.append((term,  library, ' '.join(genes)))
    zip_lst = [list(i) for i in zip(*lst)]
    term, library, genes = zip_lst[0], zip_lst[1], zip_lst[2]
    genes = series_to_list(genes)

    df = pd.DataFrame({'Genes': genes, 'Library': library}, index = term)
    return df              

```python
def series_to_list(gene_list):
    ##helper function to convert a gene pd.series to a gene list
    ret_list = []
    for genes in gene_list:
        if type(genes) is str:
            ret_list.append(genes)
        else: ##pd series case
            genes = genes.tolist()
            ret_list.append(' '.join(genes))
    return ret_list
def load_set(file):
    ''' Load a set of files into pairs of labeled sets
    '''
    lst= []
    path = pathlib.Path(file)
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            parsed_line = line.split('\t')
            term, library, genes = parsed_line[0], parsed_line[1], parsed_line[2:]
            if genes[-1][:-2] == '\n':
                genes[-1] = genes[-1][:-2] ##trim off newline regex '\n'
            lst.append((term,  library, ' '.join(genes)))
    zip_lst = [list(i) for i in zip(*lst)]
    term, library, genes = zip_lst[0], zip_lst[1], zip_lst[2]
    genes = series_to_list(genes)
    df = pd.DataFrame({'Genes': genes, 'Library': library}, index = term)
    return df
```

In [7]:

df = load_set(gs)
if df.shape[0] < umap_num_neighbors:
    umap_num_neighbors = int(np.ceil(df.shape[0]/2)) ##arbitrary right now. May want to change based on parameter settings
    print('Number of Neighbors parameter in scatterplot is too large for the submitted dataset. Resetting number of neighbors to '+ str(umap_num_neighbors)+'')
        


In [8]:
def calculate_FET(set1, set2, background = 20000):
    ##inputs: set1, set2 - python sets
    ##output: p-value of the fisher exact test
    res = crisp.fisher_overlap(set1, set2, n_background_entities= background, preserve_overlap=True)
    if res == None:
        return 0
    else:
        return res.pvalue

In [18]:
def get_itertuple(str1, str2):
    ##given two strings (which should be terms in the given gmt), get the tuple back that will index into the pair_df
    ##itertools.combos gives tuples that are alphabetically ordered for strings.
    return (str1, str2) if str1 < str2 else (str2, str1) 

def clean_name(dir_name):
    dir_name = dir_name.replace(' ', '_')
    dir_name = dir_name.replace('/', '_')
    dir_name = dir_name.replace(':', '_')
    dir_name = dir_name.replace('(', '_')
    dir_name = dir_name.replace(')', '_')
    dir_name = dir_name.strip()
    return dir_name


def make_dirs(str1,str2):
    if not os.path.exists(f'Intersection_Sets/{str1}'):
        os.makedirs(f"Intersection_Sets/{str1}")
    return
        
def save_set(str1, str2, intersection_set):
    term1, term2 = clean_name(str1), clean_name(str2)
    term1, term2 = get_itertuple(term1, term2)
    make_dirs(term1, term2)
    ##str1, str2 are terms to save set in system by. geneset is set of the intersection set to save
    series = pd.Series(list(intersection_set))
    full_name = os.path.join(r'Intersection_Sets', term1, term2)
    try:
        
        series.to_csv(full_name+'.csv')
    except:
        term2 = 'file.csv'
        full_name = os.path.join(r'Intersection_Sets', term1, term2)
        series.to_csv(full_name)
    return
    
    
        
    
    
    ##given two strings (representing the terms in the given gmt), create a directory to store intersection lists in
    

In [None]:
def BH_test(pair_df, alpha = .05):
    #benjamini hochberg test
    pvals = pair_df['FET_pval'].tolist()
    sig, corrected_pval = mlt.fdrcorrection(pvals, alpha, method = 'indep')
    print(type(sig))
    pair_df['BH_sig'] = sig
    pair_df['BH_corrected_pval'] = corrected_pval
    return pair_df
    

In [19]:
def series_to_str(el):
    if type(el) == str:
        return el
    else:
        return ' '.join(el.tolist())

def generate_pairs_df(df, background = 20000):
    ##inputs: df - pandas dataframe that is the result of GMT_to_df transformation
    ##output: pair_df - pandas dataframe whose rows are indexed by a tuple/ pair of terms in the set of Gene set 
    # #terms and columns represent calculated set properties between the two sets

    
    os.makedirs("Intersection_Sets", exist_ok = True)
    intersection = []
    in_A_not_B = []
    in_B_not_A = []
    union = []
    jaccard = []
    FET_pval = []

    to_set = lambda el: set(series_to_str(el).split(' '))
    space_counter = lambda str1: str1.count(" ") +1
    terms = list(df.index.values)
    int_df = pd.DataFrame(index = terms, columns = terms)
    jac_df = pd.DataFrame(index = terms, columns = terms)
    pairwise_perms = list(itertools.combinations(terms,2))
    for term1,term2 in pairwise_perms:
        setA, setB = df.loc[term1]['Genes'], df.loc[term2]['Genes']
        set1, set2 = to_set(setA), to_set(setB)
        intersect = set1 & set2
        save_set(term1, term2, intersect)


        union_set = set1 | set2
        intersection.append(' '.join(list(intersect)))
        in_A_not_B.append(' '.join(list(set1 -set2)))
        in_B_not_A.append(' '.join(list(set2 - set1)))
        union.append(' '.join(list(union_set)))
        pval = calculate_FET(set1, set2)
        FET_pval.append(pval)

        int_size = len(intersect)
        uni_size = len(union_set)
        jaccard = "{:.2f}".format(int_size/uni_size)
        term1_c, term2_c = clean_name(term1), clean_name(term2) ##c
        
        jac_df.loc[term1, term2] = jaccard
        jac_df.loc[term2, term1] = jaccard
        
        if int_size != 0:
            int_df.loc[term1, term2] = f'<a href = "localhost:8888/edit/Intersection_Sets/{term1_c}/{term2_c}.csv">{int_size}</a>'
            int_df.loc[term2,term1] =  f'<a href = "localhost:8888/edit/Intersection_Sets/{term1_c}/{term2_c}.csv">{int_size}</a>'
        else:
            int_df.loc[term1,term2] = 0
            int_df.loc[term2,term1] = 0



    pair_df = pd.DataFrame({'Intersection' : intersection, 'A-B' : in_A_not_B, 'B-A' : in_B_not_A, 'Union': union, 'FET_pval': FET_pval}, index = pairwise_perms)
    pair_df['intersect_size'] = pair_df['Intersection'].map(space_counter)
    pair_df['union_size'] = pair_df['Union'].map(space_counter)
    pair_df['Jaccard'] = pair_df['intersect_size'] / pair_df['union_size']
    
    np.fill_diagonal(int_df.values,0)
    np.fill_diagonal(jac_df.values, 0)
    
    

    return pair_df, int_df, jac_df




In [20]:
pair_df, int_df, jac_df = generate_pairs_df(df)

In [None]:
# BH_test(pair_df)

## 2. Pairwise Intersection Analysis


In [43]:
if int_tbl:
    html_string = f'''
    <html>
    <head><title>Intersection Table</title></head>
    <link rel="stylesheet" type="text/css" href="static/df_style.css"/>
    <body>
    <div class = 'int_df'>
    {int_df.to_html(render_links = True, escape = False, classes = 'styled')}
    </div>
    </body>
    </html>.
    '''
    
    
    
    os.makedirs("P.I_matrix", exist_ok = True)
    int_df.to_csv('P.I_matrix/intersection_matrix.csv')
    display(HTML(html_string))
    display(FileLink('P.I_matrix/intersection_matrix.csv', result_html_prefix= str('Download Pairwise Intersection Matrix:   ')))

Unnamed: 0,Integrated breast cancer pathway,Stathmin and breast cancer resistance to antimicrotubule agents,Integrated breast cancer pathway WP1984,Breast cancer pathway WP4262,Breast cancer,NOTCH1 Signaling in Breast Cancer,Genes with Mutations Associated with Breast Cancer,Genes with Mutations Associated with Hereditary Breast and/or Ovarian Cancer Syndrome,Proteins Involved in Breast Cancer Related to ERBB2/VEGFR/Akt Signaling Pathway,Hereditary Breast and Ovarian Cancer Syndrome,Proteins Involved in Breast Cancer Related to ESR1 Signaling Pathway,Proteins Involved in Breast Cancer Related to IGF1R/Akt Signaling Pathway,Proteins Involved in Breast Cancer Related to NOTCH Signaling Pathway,Proteins Involved in Breast Cancer Related to WNT Signaling Pathway,Breast Cancer,WNT Signaling in Breast Cancer,IGF1R/AKT Signaling in Breast Cancer,Proteins with Altered Expression in Breast Cancer,ERBB/VEGFR/Akt Signaling in Breast Cancer,ESR1 Signaling in Breast Cancer,ESR1/ERBB Positive Luminal Breast Cancer,Stathmin and breast cancer resistance to antimicrotubule agents Homo sapiens h stathminPathway,Cancer Stem cell:Breast,Integrated Breast Cancer Pathway WP1984,Breast cancer pathway WP4262.1,Integrated Breast Cancer Pathway Homo sapiens WP1984,IRF1-19129219-H3396 breast cancer cells-human,breast cancer cell,Control Nrde2-Depleted Breast Cancer GSE119827 1,Multi-Treatment Models Breast Cancer GSE136823 1,Multi-Treatment Models Breast Cancer GSE136823 2,Multi-Treatment Models Breast Cancer GSE136823 3,Multi-Treatment Models Breast Cancer GSE136823 4,Multi-Treatment Models Breast Cancer GSE136823 5,Multi-Treatment Models Breast Cancer GSE136823 6,Multi-Treatment Models Breast Cancer GSE136823 7,Differentially Breast Cancer Diabetes GSE150586 1,Control Nrde2-Depleted Breast Cancer GSE119827 1.1,Multi-Treatment Models Breast Cancer GSE136823 1.1,Multi-Treatment Models Breast Cancer GSE136823 2.1,Multi-Treatment Models Breast Cancer GSE136823 3.1,Multi-Treatment Models Breast Cancer GSE136823 4.1,Multi-Treatment Models Breast Cancer GSE136823 5.1,Multi-Treatment Models Breast Cancer GSE136823 6.1,Multi-Treatment Models Breast Cancer GSE136823 7.1,Differentially Breast Cancer Diabetes GSE150586 1.1,Breast cancer DOID-1612 mouse GSE24594 sample 616,Breast Cancer DOID-1612 human GSE34925 sample 478,Breast Cancer C0006142 human GSE2429 sample 148,Breast Cancer C0006142 human GSE1379 sample 392,breast cancer DOID-1612 human GSE9574 sample 448,breast cancer DOID-1612 human GSE14943 sample 504,breast cancer DOID-1612 human GSE26910 sample 602,Breast Cancer C0006142 human GSE1378 sample 52,sporadic breast cancer DOID-8029 human GSE3744 sample 979,Breast Cancer C0006142 human GSE2155 sample 39,breast cancer DOID-1612 human GSE3744 sample 978,Breast Cancer C0006142 mouse GSE2528 sample 28,Breast Cancer C0006142 human GSE3744 sample 24,Breast Cancer C0006142 rat GSE1872 sample 63,Breast Cancer C0006142 human GSE1379 sample 392.1,Breast Cancer C0006142 human GSE2429 sample 148.1,breast cancer DOID-1612 human GSE14943 sample 504.1,breast cancer DOID-1612 human GSE26910 sample 602.1,Breast Cancer C0006142 human GSE1378 sample 52.1,sporadic breast cancer DOID-8029 human GSE3744 sample 979.1,Breast Cancer C0006142 human GSE2155 sample 39.1,breast cancer DOID-1612 human GSE3744 sample 978.1,Breast Cancer C0006142 mouse GSE2528 sample 28.1,Breast Cancer C0006142 human GSE3744 sample 24.1,Breast Cancer C0006142 rat GSE1872 sample 63.1,Breast Cancer DOID-1612 human GSE34925 sample 478.1,breast cancer DOID-1612 human GSE9574 sample 448.1,Breast cancer DOID-1612 mouse GSE24594 sample 616.1,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:41,estradiol human MCF-7 breast cancer cells GDS3283 ligand:43,estradiol human MCF-7 breast cancer cells GDS3283 ligand:42,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shIL13RA2) GSE57677 ligand:243,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:40,estradiol human MCF-7 breast cancer cells GDS3105 ligand:38,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:39,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shSCR) GSE57677 ligand:242,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:41.1,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shSCR) GSE57677 ligand:242.1,estradiol human MCF-7 breast cancer cells GDS3283 ligand:42.1,estradiol human MCF-7 breast cancer cells GDS3283 ligand:43.1,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shIL13RA2) GSE57677 ligand:243.1,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:39.1,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:40.1,estradiol human MCF-7 breast cancer cells GDS3105 ligand:38.1
Integrated breast cancer pathway,0,0,146,29,25,12,4,6,23,16,25,20,12,10,32,9,17,9,19,21,33,0,1,146,29,148,1,2,5,6,4,17,11,14,18,11,4,5,6,4,17,11,14,18,11,4,8,7,5,2,7,10,4,4,8,9,6,3,7,6,2,5,10,4,4,8,9,6,3,7,6,7,7,8,6,12,3,2,5,6,5,0,6,0,3,12,2,5,5,6
Stathmin and breast cancer resistance to antimicrotubule agents,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,24,0,0,0,0,0,0,3,0,0,3,3,5,4,3,0,3,0,0,3,3,5,4,3,0,2,1,2,0,2,1,0,0,2,1,2,2,2,6,0,2,1,0,0,2,1,2,2,2,6,1,2,2,3,1,1,0,6,2,5,1,3,1,1,1,0,5,6,2
Integrated breast cancer pathway WP1984,146,0,0,30,25,12,4,6,23,16,25,20,12,10,33,9,17,9,19,21,34,0,1,151,30,147,1,2,6,7,4,17,11,14,18,11,5,6,7,4,17,11,14,18,11,5,8,7,5,2,8,11,4,4,8,10,6,3,7,5,2,5,11,4,4,8,10,6,3,7,5,7,8,8,7,12,3,2,5,6,6,0,7,0,3,12,2,6,5,6
Breast cancer pathway WP4262,29,0,30,0,139,14,6,6,27,14,27,24,12,18,45,17,22,13,24,22,41,0,1,30,154,29,1,3,8,9,8,11,7,11,9,9,9,8,9,8,11,7,11,9,9,9,7,4,5,5,8,4,6,5,9,11,10,7,14,4,5,5,4,6,5,9,11,10,7,14,4,4,8,7,6,13,11,1,3,9,5,1,6,1,11,13,1,5,3,9
Breast cancer,25,0,25,139,0,15,3,6,27,10,27,24,13,18,41,17,22,13,24,22,36,0,1,25,139,25,1,3,9,9,9,10,7,10,8,9,9,9,9,9,10,7,10,8,9,9,7,4,5,5,8,4,6,5,9,12,10,7,14,4,5,5,4,6,5,9,12,10,7,14,4,4,8,7,6,13,11,2,3,9,5,1,6,1,11,13,2,5,3,9
NOTCH1 Signaling in Breast Cancer,12,1,12,14,15,0,0,0,16,12,14,12,24,13,25,15,7,13,9,9,14,1,1,12,14,12,0,0,3,3,3,4,3,3,3,4,3,3,3,3,4,3,3,3,4,3,5,3,8,2,3,1,4,2,4,7,4,3,5,6,2,8,1,4,2,4,7,4,3,5,6,3,3,5,2,5,4,1,2,2,2,0,2,0,4,5,1,2,2,2
Genes with Mutations Associated with Breast Cancer,4,0,4,6,3,0,0,5,1,8,1,1,0,0,9,0,0,0,0,0,9,0,0,4,6,4,1,0,0,0,0,4,3,2,4,3,0,0,0,0,4,3,2,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,1,0,0,0,0,2,0,1,1,0
Genes with Mutations Associated with Hereditary Breast and/or Ovarian Cancer Syndrome,6,0,6,6,6,0,5,0,4,3,4,4,0,1,7,1,3,1,3,3,7,0,0,6,6,6,0,0,0,0,0,2,2,1,2,2,0,0,0,0,2,2,1,2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Proteins Involved in Breast Cancer Related to ERBB2/VEGFR/Akt Signaling Pathway,23,1,23,27,27,16,1,4,0,5,47,45,18,18,42,16,25,19,40,25,43,1,1,23,27,23,0,0,3,8,8,9,7,6,8,7,6,3,8,8,9,7,6,8,7,6,6,5,8,2,9,1,9,5,10,11,8,5,9,8,2,8,1,9,5,10,11,8,5,9,8,5,9,6,4,11,10,2,2,5,2,2,4,2,10,11,2,2,2,5
Hereditary Breast and Ovarian Cancer Syndrome,16,0,16,14,10,12,8,3,5,0,10,5,5,3,27,3,5,1,5,12,28,0,0,16,14,16,2,1,2,3,2,6,7,3,5,6,3,2,3,2,6,7,3,5,6,3,2,1,1,0,3,0,2,0,1,4,1,0,1,0,0,1,0,2,0,1,4,1,0,1,0,1,3,2,2,4,2,0,2,1,2,0,2,0,2,4,0,2,2,1


In [None]:
# term_lst = list(set(list(itertools.chain.from_iterable(list(pair_df.index)))))
# term_lst_indexer = np.arange(len(term_lst))
# G = nx.Graph()
# G.add_nodes_from(term_lst)
# BH_sig_pairs = pair_df.iloc[np.where(pair_df['BH_sig']== True)[0]].index
# G.add_edges_from(BH_sig_pairs)

# nx.draw(G, with_labels = True)


## 3. Jaccard Similarity Matrix

In [None]:
if jaccard_tbl:
#     display(jac_df.head())
    jac_df.to_csv('P.I_matrix/jaccard_matrix.csv')
    display(FileLink('P.I_matrix/jaccard_matrix.csv', result_html_prefix= str('Download Jaccard Similarity Matrix:   ')))
    

## 4. Term Intersection Search

#### In this section, the user can submit library terms (at least 2) they would like to search the intersection of. The user can submit as many terms names as they would like, allowing analysis of the intersections of many sets.






## 5. ScatterPlot Visualization

In [None]:
class NoResults(Exception):
    pass


class APIFailure(Exception):
    pass


class NotValidFile(Exception):
    pass


class UMAP_Visualization:

    def __init__(self, query_set=[], gene_libraries=[], sig_value=.05, gmt_files=[], gmt_df = []):
        self.query_set = [gene.strip() for gene in query_set]
        self.gene_libraries = gene_libraries
        self.significant_value = sig_value
        self.term_library_map = {}
        self.dataset = OrderedDict()
        self.dataset.update(self.process_gmt_df(gmt_df))
        
        
    def process_gmt_df(self, gmt_df):
        if gmt_df == []:
            return OrderedDict() ##return the empty Dictionary when no gmt passed in
        else:
            gmt_df = gmt_df[0]
            self.term_library_map.update(pd.Series(gmt_df['Library'].values,index=gmt_df.index.values).to_dict())
            return OrderedDict(pd.Series(gmt_df['Genes'].values,index=gmt_df.index.values).to_dict())
            
   

    def process_scatterplot(self, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
        libdict = self.dataset
        print("\tTF-IDF vectorizing gene set data...")
        # computes tdfidf score--look this up
        vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
        X = vec.fit_transform(libdict.values())
        print(X.shape)
        adata = anndata.AnnData(X)
        adata.obs.index = libdict.keys()

        print("\tPerforming Leiden clustering...")
        # the n_neighbors and min_dist parameters can be altered
        sc.pp.neighbors(adata, n_neighbors=nneighbors)
        sc.tl.leiden(adata, resolution=1.0)
        sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

        new_order = adata.obs.sort_values(by='leiden').index.tolist()
        adata = adata[new_order, :]
        adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

        df = pd.DataFrame(adata.obsm['X_umap'])
        df.columns = ['x', 'y']

        df['cluster'] = adata.obs['leiden'].values
        df['term'] = adata.obs.index
        df['genes'] = [libdict[l] for l in df['term']]
        df['library'] = [self.term_library_map[l] for l in df['term']]

        return df

    def get_scatter_colors(self, df):
        clusters = pd.unique(df['library']).tolist()
        colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
        color_mapper = {clusters[i]: colors[i % 20]
                        for i in range(len(clusters))}
        return color_mapper

    # def get_marker_mapper(self, df):
    #     markers = ["circle", "square", "triangle",
    #                "hex", "inverted_triangle", "diamond"]
    #     libs = pd.unique(df['library']).tolist()
    #     marker_mapper = {libs[i]: markers[i] for i in range(len(libs))}
    #     return marker_mapper

    def get_scatterplot(self, scatterdf):
        df = scatterdf.copy()
        color_mapper = self.get_scatter_colors(df)
        # marker_mapper = self.get_marker_mapper(df)
        df['color'] = df['library'].apply(lambda x: color_mapper[x])
        # df['marker'] = df['library'].apply(lambda x: marker_mapper[x])

        # range_slider = RangeSlider("title = Adjust x-axis",
        #                            start=0,
        #                            end=10,
        #                            step=1)

        tooltips = [
            ("Gene Set", "@gene_set"),
            ("Cluster", "@label"),
            ("Library", "@library")
        ]

        hover_emb = HoverTool(tooltips=tooltips)
        tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

        plot_emb = figure(
            width=900,
            height=700,
            tools=tools_emb
        )

        source = ColumnDataSource(
            data=dict(
                x=df['x'],
                y=df['y'],
                gene_set=df['term'],
                colors=df['color'],
                label=df['cluster'],
                library=df['library'],
                # markers=df['marker']

            )
        )

        # hide axis labels and grid lines
        plot_emb.xaxis.major_tick_line_color = None
        plot_emb.xaxis.minor_tick_line_color = None
        plot_emb.yaxis.major_tick_line_color = None
        plot_emb.yaxis.minor_tick_line_color = None
        plot_emb.xaxis.major_label_text_font_size = '0pt'
        plot_emb.yaxis.major_label_text_font_size = '0pt'

        plot_emb.output_backend = "svg"

        plot_emb.xaxis.axis_label = "UMAP_1"
        plot_emb.yaxis.axis_label = "UMAP_2"

        s = plot_emb.scatter(
            'x',
            'y',
            size=4,
            source=source,
            color='colors',
            legend_group='label',
            # marker='markers'
        )

        plot_emb.add_layout(plot_emb.legend[0], 'right')

        return plot_emb


In [None]:
%%appyter code_eval
if umap:
    umap = UMAP_Visualization(gmt_df = [df])
    umap_df = umap.process_scatterplot(maxdf = umap_maxdf, mindf = umap_mindf, nneighbors = umap_num_neighbors)
    fig = umap.get_scatterplot(umap_df)
    show(fig)

## Descriptive Statistics of GMT

In [None]:
def get_GMT_stats(pair_df):
    ##input: df
    
    ##gene stats
    geneset_lst = pair_df['Genes'].to_list()
    geneset_lst = [l.split(' ') for l in geneset_lst]
    gene_count = {}
    geneset_size = []

    for gene_set in geneset_lst:
        geneset_size.append(len(gene_set))
        for gene in gene_set:
            if gene in gene_count:
                gene_count[gene] +=1
            else:
                gene_count[gene] =1
                
    avg_geneset_size = stat.mean(geneset_size)
    std_geneset_size = stat.stdev(geneset_size)
#     print(avg_geneset_size)
#     print(std_geneset_size)
    hist, edges = np.histogram(list(gene_count.values()))
    p = figure(width=700, height=500, toolbar_location= None,
       title="Count of Gene Occurence", 
        x_axis_label = 'Sets', 
        y_axis_label  = 'Genes' )
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
     fill_color="skyblue", line_color="white")

    show(p)

    hist, edges = np.histogram(geneset_size)
    
    p = figure(width=700, height=500, toolbar_location= None,
           title="Histogram of Gene Set Size", 
            x_axis_label = 'Genes in Gene Set', 
            y_axis_label  = 'Gene Sets' )
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
         fill_color="skyblue", line_color="white")
    

    show(p)

In [None]:
get_GMT_stats(df)