In [1]:

# Set up the appyter magics used by the `%%appyter` cells below; the callable
# handed to `init` returns this notebook's globals() when it is invoked.
from appyter import magic
magic.init(lambda _ = globals: _())

# GeneSet Library Set Appyter
This appyter computes basic statistics, analyses, and visualizations for a Gene Matrix Transposed (.gmt) file, allowing bioinformatics researchers to analyze the relationships between many different gene sets drawn from several gene set libraries.
 To create your own GMT file, please see Enrichr. Enrichr, hosted by the Ma'ayan Laboratory at the Icahn School of Medicine at Mount Sinai, is a collection of gene set libraries.

In [39]:
import numpy as np 
import pandas as pd
import itertools 
import bokeh
import time
import networkx as nx
import pathlib
import scanpy as sc
from IPython.display import display, FileLink, HTML
import anndata
from statsmodels.stats import multitest as mlt
from sklearn.feature_extraction.text import TfidfVectorizer
from maayanlab_bioinformatics.enrichment import crisp
from collections import OrderedDict
from bokeh.palettes import Category20
import statistics as stat
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource, RangeSlider, Circle, MultiLine, Range1d
from bokeh.plotting import figure, show, save, output_file, from_networkx
import os
output_notebook()

In [3]:
%%appyter hide_code


{# Form sections of the appyter; the fields declared in the next cell attach to these sections through their `section` argument. #}
{% do SectionField(name='GMTSubmission', title='1. Submit a GMT file', subtitle = 'Submit a GMT (Gene Matrix Transpose file) for analysis.', img = 'bulb.png') %}
{% do SectionField(name = 'Pairwise Similarity Table', title = '2. Pairwise Intersection Table', subtitle = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section.', img = 'bulb.png') %}
{% do SectionField(name = 'Jaccard Similarity Table', title = '3. Jaccard Similarity Table', subtitle = '##TODO', img = 'bulb.png') %}
{% do SectionField(name = 'Intersection Search', title = '4. Gene Intersection Search', subtitle = '###TODO', img = 'bulb.png')%}
{% do SectionField (name = 'UMAP_visualization', title = '5. Scatterplot Visualization', subtitle= 'Visualize relative Geneset similarities on an interactive scatterplot', img = 'bulb.png') %}
{% do SectionField(name = 'GMT Descriptive Statistics', title = '6. Descriptive Statistics', subtitle = '#TODO', img = 'bulb.png') %}



## 0. Submitted Variables

In [6]:
%%appyter code_exec

# Gene set upload: a tab field with a single 'Upload' tab whose file input defaults to the bundled example GMT.
{% set gs = TabField(
    name='gs_type',
    label='Gene Sets',
    default='Upload',
    choices={'Upload': [
            FileField(
                name='gs',
                label='Gene Set Files',
                default='static/Geneshot_PainGenes.gmt',
                example={
                    'example.gmt': url_for('static', filename = 'Geneshot_PainGenes.gmt')
                }
            ),
        ],},
    
section = 'GMTSubmission',)%}
# Path of the submitted GMT file (the first value of the tab field above).
gs = {{gs.value[0]}}



# Switches for the two optional tables.
int_tbl = {{BoolField(name = 'SimilarityTbl', label = 'Intersection Size Table', default = 'true', description = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section. Select \'Yes\' if you would like to generate a Intersection Size Table. Otherwise, select \'No\'', section = 'Pairwise Similarity Table') }}
jaccard_tbl = {{BoolField(name = 'JaccardTbl', label = 'Jaccard Similarity Table', default = 'true', description = '##TODO', section = 'Jaccard Similarity Table') }}


# Scatterplot switch, followed by its neighbour count and the max_df / min_df values later passed to the TF-IDF vectorizer.
umap = {{ BoolField(name = 'umap', label = 'ScatterPlot Visualization', default = 'true', description = 'Select \'Yes\' if you would like to generate a Scatter Plot. Otherwise, select \'No\'', section = 'UMAP_visualization')}}

umap_num_neighbors = {{ IntField(name = 'nneighbors', label = 'Number of Neighbors', default = 5, min = 1, max = 30, description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_maxdf = {{ ChoiceField(name = 'max_df', label = 'Max df setting', choices = {'0.5': '0.5', '0.75': '.75', '0.9': '.9', '1.0': '1'}, default = '0.5',  description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_mindf = {{ ChoiceField(name = 'min_df', label = 'Min df setting', choices = {'0.1' : '0.1', '0.25' : '0.25', '0.5': '0.5' }, default = '0.25', description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}


```python
gs = 'static/Geneshot_PainGenes.gmt'
int_tbl = True
jaccard_tbl = True
umap = True
umap_num_neighbors = 5
umap_maxdf = 0.5
umap_mindf = 0.25
```

In [7]:
##validate the submitted path before any parsing is attempted
if not gs:
    raise Exception('Please upload a GMT File!')
##check the real file extension: taking the text after the first '.' rejects valid
##paths such as './data/sets.gmt' or 'my.sets.gmt' and raises IndexError when the
##name contains no dot at all. The comparison ignores case, so '.GMT' is accepted too.
if pathlib.Path(gs).suffix.lower() != '.gmt':
    raise Exception('Invalid File, Please upload a GMT File')

## 1. Process the GMT FILE

In [8]:
%%appyter code_exec

def series_to_list(gene_list):
    ##helper: normalise every entry of gene_list to one space-separated gene string
    ##str entries are kept as they are; any other entry must offer tolist()
    ##(the pd.Series case) and its items are joined with single spaces
    return [
        entry if type(entry) is str else ' '.join(entry.tolist())
        for entry in gene_list
    ]

def load_set(file):
    ''' Parse a GMT file into a DataFrame with one row per gene set.

    Every non-blank line is split on tabs: the first field becomes the row
    label (term), the second field is stored in the 'Library' column and all
    remaining non-empty fields are joined with single spaces into 'Genes'.

    Parameters:
        file: path (str or path-like) of the GMT file to read.

    Returns:
        pd.DataFrame indexed by term with the columns 'Genes' and 'Library'.

    Raises:
        ValueError: if a non-blank line has fewer than two tab-separated
            fields, or if the file holds no gene sets at all.
    '''
    terms, libraries, gene_strings = [], [], []
    with open(pathlib.Path(file)) as f:
        for line_number, raw_line in enumerate(f, start = 1):
            ##remove the line terminator up front; the previous slice comparison
            ##never matched, so the last gene of every line kept its newline
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                continue ##tolerate blank lines, e.g. at the end of the file
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(f'Line {line_number} of {file} is not a valid GMT record: expected tab-separated term, library and genes.')
            terms.append(fields[0])
            libraries.append(fields[1])
            ##skip empty fields left behind by trailing or doubled tabs
            gene_strings.append(' '.join(gene.strip() for gene in fields[2:] if gene.strip()))
    if not terms:
        raise ValueError(f'No gene sets found in {file}.')

    ##the gene entries are already plain strings, so no further conversion is needed
    df = pd.DataFrame({'Genes': gene_strings, 'Library': libraries}, index = terms)
    return df

```python
def series_to_list(gene_list):
    ##helper function to convert a gene pd.series to a gene list
    ret_list = []
    for genes in gene_list:
        if type(genes) is str:
            ret_list.append(genes)
        else: ##pd series case
            genes = genes.tolist()
            ret_list.append(' '.join(genes))
    return ret_list
def load_set(file):
    ''' Load a set of files into pairs of labeled sets
    '''
    lst= []
    path = pathlib.Path(file)
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            parsed_line = line.split('\t')
            term, library, genes = parsed_line[0], parsed_line[1], parsed_line[2:]
            if genes[-1][:-2] == '\n':
                genes[-1] = genes[-1][:-2] ##trim off newline regex '\n'
            lst.append((term,  library, ' '.join(genes)))
    zip_lst = [list(i) for i in zip(*lst)]
    term, library, genes = zip_lst[0], zip_lst[1], zip_lst[2]
    genes = series_to_list(genes)
    df = pd.DataFrame({'Genes': genes, 'Library': library}, index = term)
    return df
```

In [9]:

##load the submitted GMT file, then shrink the scatterplot neighbour count when the
##file holds fewer gene sets than the requested number of neighbours
df = load_set(gs)
if umap_num_neighbors > len(df.index):
    ##half the number of gene sets, rounded up (arbitrary for now, may change with the parameter settings)
    umap_num_neighbors = int(np.ceil(len(df.index) / 2))
    print(f'Number of Neighbors parameter in scatterplot is too large for the submitted dataset. Resetting number of neighbors to {umap_num_neighbors}')
        


Number of Neighbors parameter in scatterplot is too large for the submitted dataset. Resetting number of neighbors to 2


In [10]:
def calculate_FET(set1, set2, background = 20000):
    ##inputs: set1, set2 - python sets
    ##        background - number of background entities handed to the fisher exact test
    ##output: p-value of the fisher exact test, or 1.0 when the test yields no result
    res = crisp.fisher_overlap(set1, set2, n_background_entities= background, preserve_overlap=True)
    if res is None:
        ##a missing result must not look significant: the previous fallback of 0 is the
        ##smallest possible p-value and would pass any multiple-testing threshold
        ##NOTE(review): fisher_overlap appears to return None when the sets share no members - confirm
        return 1.0
    return res.pvalue

In [11]:
def get_itertuple(str1, str2):
    ##put two strings (terms of the given gmt) into a fixed order:
    ##the one that compares smaller comes first, so (a, b) and (b, a) give the same tuple
    if str1 < str2:
        return (str1, str2)
    return (str2, str1)

def clean_name(dir_name):
    ##make a term usable as a file / directory name:
    ##spaces, '/', ':', '(' and ')' become '_', surrounding whitespace is stripped and
    ##names longer than 60 characters are cut down to their first 30 and last 30
    ##characters joined by '...'
    for unwanted in ' /:()':
        dir_name = dir_name.replace(unwanted, '_')
    cleaned = dir_name.strip()
    if len(cleaned) <= 60:
        return cleaned
    return cleaned[:30] + '...' + cleaned[-30:]


def make_dirs(str1,str2):
    ##create the directory Intersection_Sets/<str1> if it does not exist yet
    ##str2 is accepted for call compatibility only and is not used here
    ##makedirs with exist_ok also creates a missing Intersection_Sets parent and
    ##removes the gap between the former exists() check and mkdir()
    os.makedirs(os.path.join('Intersection_Sets', str1), exist_ok = True)
    return
        
def save_set(str1, str2, intersection_set):
    ##str1, str2 are terms to save the set in the system by. intersection_set is the set of shared genes to save
    ##the file is written to Intersection_Sets/<first term>/<second term>.csv, using the
    ##cleaned term names in the order given by get_itertuple
    term1, term2 = get_itertuple(clean_name(str1), clean_name(str2))
    make_dirs(term1, term2)
    ##sort the genes so the saved file does not depend on set iteration order
    series = pd.Series(sorted(intersection_set))
    full_name = os.path.join(r'Intersection_Sets', term1, term2)
    try:
        series.to_csv(full_name+'.csv')
    except OSError:
        ##only file-system failures (e.g. a name the OS rejects) are reported and skipped;
        ##any other error is allowed to surface instead of being silently swallowed
        print(f' File {full_name} could not be saved. Please consider renaming the terms to fit file naming requirements.')
    return
    
    
        
    
    
    ##given two strings (representing the terms in the given gmt), create a directory to store intersection lists in
    

In [12]:
def BH_test(pair_df, alpha = .05):
    #benjamini hochberg test
    #input: pair_df: pairwise dataframe with a 'FET_pval' column
    #input: alpha: a priori significance level
    #output: the same pair_df (it is modified in place and returned) with two new columns, 'BH_sig' - a boolean
    ##column where True implies significant overlap and 'BH_corrected_pval' - a pvalue adjusted for multiple
    ##hypothesis testing
    pvals = pair_df['FET_pval'].tolist()
    sig, corrected_pval = mlt.fdrcorrection(pvals, alpha, method = 'indep')
    pair_df['BH_sig'] = sig
    pair_df['BH_corrected_pval'] = corrected_pval
    return pair_df
    

In [27]:
def series_to_str(el):
    ##a str is handed back untouched; anything else must offer tolist() (e.g. a pd.Series
    ##of genes) and is flattened into one space-separated string
    return el if type(el) == str else ' '.join(el.tolist())

def _intersection_cell_html(term1_c, term2_c, int_size):
    ##HTML badge for one cell of the intersection table: shows the intersection size and links to the saved gene list
    ##NOTE(review): the link target is hard-coded to localhost:8888 and carries no URL scheme - confirm it resolves where the notebook is served
    return f'<div class = "df-wrap" style = "border: 1px solid; font-weight:bold; background-color: Floralwhite; height: 1.5em; width: 21px; border-radius: 4px; color: black; float: right; text-align: center"><a style = "text-decoration: none; color: black;" href = "localhost:8888/edit/Intersection_Sets/{term1_c}/{term2_c}.csv">{int_size}</a></div>'


def generate_pairs_df(df, background = 20000):
    ##inputs: df - pandas dataframe of gene sets (index: term, column 'Genes': space-separated genes)
    ##        background - number of background entities passed on to the fisher exact test
    ##outputs: pair_df - pandas dataframe whose rows are indexed by a tuple/ pair of terms and whose columns
    ##                   hold the calculated set properties between the two sets
    ##         int_df  - term x term table of intersection sizes (HTML badges linking to the saved gene list,
    ##                   plain 0 for empty intersections and on the diagonal)
    ##         jac_df  - term x term table of Jaccard indices formatted to two decimals (0 on the diagonal)
    ##side effect: every pairwise intersection is written below Intersection_Sets/ through save_set

    os.makedirs("Intersection_Sets", exist_ok = True)
    intersection = []
    in_A_not_B = []
    in_B_not_A = []
    union = []
    FET_pval = []
    intersect_size = []
    union_size = []
    jaccard_index = []

    ##split on any whitespace so stray newlines or doubled spaces never become part of a gene
    to_set = lambda el: set(series_to_str(el).split())
    terms = list(df.index.values)
    int_df = pd.DataFrame(index = terms, columns = terms)
    jac_df = pd.DataFrame(index = terms, columns = terms)
    pairwise_perms = list(itertools.combinations(terms,2))
    for term1,term2 in pairwise_perms:
        set1, set2 = to_set(df.loc[term1, 'Genes']), to_set(df.loc[term2, 'Genes'])
        intersect = set1 & set2
        union_set = set1 | set2
        save_set(term1, term2, intersect)

        ##sorted output keeps the stored gene strings reproducible between runs
        intersection.append(' '.join(sorted(intersect)))
        in_A_not_B.append(' '.join(sorted(set1 - set2)))
        in_B_not_A.append(' '.join(sorted(set2 - set1)))
        union.append(' '.join(sorted(union_set)))
        ##hand the caller's background size to the test instead of silently using the default
        FET_pval.append(calculate_FET(set1, set2, background))

        ##sizes come from the sets themselves; counting spaces in the joined string
        ##reported an empty intersection as size 1
        int_size = len(intersect)
        uni_size = len(union_set)
        jaccard = int_size/uni_size if uni_size else 0.0
        intersect_size.append(int_size)
        union_size.append(uni_size)
        jaccard_index.append(jaccard)

        jaccard_str = "{:.2f}".format(jaccard)
        jac_df.loc[term1, term2] = jaccard_str
        jac_df.loc[term2, term1] = jaccard_str

        if int_size != 0:
            ##clean and reorder the terms to the appropriate directory mapping
            term1_c, term2_c = get_itertuple(clean_name(term1), clean_name(term2))
            cell = _intersection_cell_html(term1_c, term2_c, int_size)
        else:
            cell = 0
        int_df.loc[term1, term2] = cell
        int_df.loc[term2, term1] = cell

    pair_df = pd.DataFrame({'Intersection' : intersection, 'A-B' : in_A_not_B, 'B-A' : in_B_not_A, 'Union': union, 'FET_pval': FET_pval,
                            'intersect_size': intersect_size, 'union_size': union_size, 'Jaccard': jaccard_index}, index = pairwise_perms)

    ##zero the diagonals cell by cell; this does not rely on DataFrame.values being a writable view
    for position in range(len(terms)):
        int_df.iat[position, position] = 0
        jac_df.iat[position, position] = 0

    return pair_df, int_df, jac_df




In [14]:
##build the pairwise comparison table together with the intersection-size and Jaccard matrices
pair_df, int_df, jac_df = generate_pairs_df(df)

In [15]:
##add the Benjamini-Hochberg columns 'BH_sig' and 'BH_corrected_pval' to the pairwise table
pair_df = BH_test(pair_df)

<class 'numpy.ndarray'>


## 2. Pairwise Intersection Analysis


In [16]:
if int_tbl:
    ##the cells of int_df are HTML badges (size + link), so the table is rendered unescaped
    ##(the former Styler call was dropped: its result was never used, so it had no effect)
    html_string = f'''
    <html>
    <head><title>Intersection Table</title></head>
    <body style = "text-align: center">
    {int_df.to_html(render_links = True, escape = False)}
    </body>
    </html>
    '''

    os.makedirs("P.I_matrix", exist_ok = True)
    ##strip the HTML markup first so the downloadable matrix holds plain intersection sizes
    int_df.replace(r'<[^>]*>', '', regex = True).astype(int).to_csv('P.I_matrix/intersection_matrix.csv')
    display(HTML(html_string))
    display(FileLink('P.I_matrix/intersection_matrix.csv', result_html_prefix= str('Download Pairwise Intersection Matrix:   ')))

Unnamed: 0,A_Geneshot_PainGenes_GeneRIF_AssociatedGenes,B_Geneshot_PainGenes_AutoRIF_AssociatedGenes,C_Geneshot_PainGenes_GeneRIF_PredictedGenes_AutoRIF-CoOccurrence
A_Geneshot_PainGenes_GeneRIF_AssociatedGenes,0,757,152
B_Geneshot_PainGenes_AutoRIF_AssociatedGenes,757,0,126
C_Geneshot_PainGenes_GeneRIF_PredictedGenes_AutoRIF-CoOccurrence,152,126,0


In [44]:
# G.clear()
# term_lst = list(df.index)
# term_lst_indexer = range(len(term_lst))
# term_map = dict(zip(term_lst, term_lst_indexer))

# index_mapper = lambda x,y: (term_map[x], term_map[y])

# title = 'Fisher Exact Test Significance Network'


# HOVER_TOOLTIPS = [("Library", "@index")]

# #Create a plot — set dimensions, toolbar, and title
# plot = figure(tooltips = HOVER_TOOLTIPS,
#               tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
#             x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1), title=title)

# G = nx.Graph()
# BH_sig_pairs = list(pair_df.iloc[np.where(pair_df['BH_sig']== True)[0]].index)
# BH_sig_pairs = [index_mapper(term1, term2) for (term1, term2) in BH_sig_pairs]

# G.add_edges_from(BH_sig_pairs)

# graph = from_networkx(G, nx.spring_layout)
# graph.node_renderer.glyph = Circle(size=15, fill_color='skyblue')
# graph.edge_renderer.glyph = MultiLine(line_alpha = 0.5, line_width = 1)
# plot.renderers.append(graph)
# show(plot)




In [None]:



# nx.draw(G, with_labels = True)

## 3. Jaccard Similarity Matrix

In [18]:
if jaccard_tbl:
    ##create the output directory here as well: the intersection-table cell that also creates it
    ##is skipped when int_tbl is False
    os.makedirs("P.I_matrix", exist_ok = True)
    jac_df.to_csv('P.I_matrix/jaccard_matrix.csv')
    display(FileLink('P.I_matrix/jaccard_matrix.csv', result_html_prefix= str('Download Jaccard Similarity Matrix:   ')))
    

## 4. Term Intersection Search

#### In this section, the user can submit two or more library terms whose intersection they would like to search. Any number of term names can be submitted, allowing the intersection of many sets to be analyzed.






## 5. ScatterPlot Visualization

In [43]:
class NoResults(Exception):
    """Exception type named for a 'no results' condition; not raised by the cells shown in this notebook."""


class APIFailure(Exception):
    """Exception type named for a failed API call; not raised by the cells shown in this notebook."""


class NotValidFile(Exception):
    """Exception type named for an invalid input file; not raised by the cells shown in this notebook."""


class UMAP_Visualization:
    """TF-IDF / UMAP embedding of gene sets plus an interactive bokeh scatterplot.

    Attributes:
        query_set: stripped entries of the ``query_set`` argument.
        gene_libraries: the ``gene_libraries`` argument (empty list if omitted).
        significant_value: the ``sig_value`` argument, stored as given.
        term_library_map: dict mapping each term to its 'Library' value.
        dataset: OrderedDict mapping each term to its gene string.

    NOTE(review): ``gmt_files`` is accepted but never read, and ``query_set``,
    ``gene_libraries`` and ``significant_value`` are stored but not used by
    any method of this class.
    """

    def __init__(self, query_set=None, gene_libraries=None, sig_value=.05, gmt_files=None, gmt_df=None):
        """Store the settings and load the gene sets found in ``gmt_df``.

        ``gmt_df`` is either a one-element list holding the gene set DataFrame
        (the form used by this notebook) or the DataFrame itself. The list
        arguments default to None instead of ``[]`` so that no mutable default
        is shared between instances.
        """
        self.query_set = [gene.strip() for gene in ([] if query_set is None else query_set)]
        self.gene_libraries = [] if gene_libraries is None else gene_libraries
        self.significant_value = sig_value
        self.term_library_map = {}
        self.dataset = OrderedDict()
        self.dataset.update(self.process_gmt_df(gmt_df))

    def process_gmt_df(self, gmt_df):
        """Return an OrderedDict of term -> gene string and fill ``term_library_map``.

        Args:
            gmt_df: None, an empty list, a list whose first element is a
                DataFrame with 'Genes' and 'Library' columns indexed by term,
                or such a DataFrame itself.

        Returns:
            OrderedDict mapping term to gene string; empty when no table was given.
        """
        if isinstance(gmt_df, pd.DataFrame):
            # a bare DataFrame used to fail in the `== []` comparison
            frame = gmt_df
        elif gmt_df is None or len(gmt_df) == 0:
            return OrderedDict()  # return the empty dictionary when no gmt was passed in
        else:
            frame = gmt_df[0]
        self.term_library_map.update(pd.Series(frame['Library'].values, index=frame.index.values).to_dict())
        return OrderedDict(pd.Series(frame['Genes'].values, index=frame.index.values).to_dict())

    def process_scatterplot(self, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
        """Vectorize, cluster and embed the loaded gene sets.

        The gene strings are TF-IDF vectorized, a neighbour graph is built with
        scanpy, Leiden clusters are computed and a UMAP layout is calculated
        with a fixed random_state of 42.

        Args:
            nneighbors: ``n_neighbors`` for ``sc.pp.neighbors``.
            mindist: ``min_dist`` for ``sc.tl.umap``.
            spread: ``spread`` for ``sc.tl.umap``.
            maxdf: ``max_df`` for ``TfidfVectorizer``.
            mindf: ``min_df`` for ``TfidfVectorizer``.

        Returns:
            DataFrame with the columns 'x', 'y', 'cluster', 'term', 'genes'
            and 'library', ordered by Leiden cluster.

        NOTE(review): TfidfVectorizer is used with its default tokenizer, which
        lowercases the text and keeps only word tokens of two or more
        characters, so gene symbols containing '-' are split. Confirm this is
        intended.
        """
        libdict = self.dataset
        print("\tTF-IDF vectorizing gene set data...")
        vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
        X = vec.fit_transform(libdict.values())
        print(X.shape)
        adata = anndata.AnnData(X)
        adata.obs.index = list(libdict.keys())

        print("\tPerforming Leiden clustering...")
        # the n_neighbors and min_dist parameters can be altered
        sc.pp.neighbors(adata, n_neighbors=nneighbors)
        sc.tl.leiden(adata, resolution=1.0)
        sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

        new_order = adata.obs.sort_values(by='leiden').index.tolist()
        # copy the reordered selection so that the assignment below writes to a
        # real AnnData object rather than to a view of the original one
        adata = adata[new_order, :].copy()
        adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

        df = pd.DataFrame(adata.obsm['X_umap'])
        df.columns = ['x', 'y']

        df['cluster'] = adata.obs['leiden'].values
        df['term'] = adata.obs.index
        df['genes'] = [libdict[l] for l in df['term']]
        df['library'] = [self.term_library_map[l] for l in df['term']]

        return df

    def get_scatter_colors(self, df):
        """Map every distinct value of ``df['library']`` to a Category20 colour.

        The palette is reordered (even positions first, then odd positions) and
        reused cyclically when there are more libraries than colours.
        """
        libraries = pd.unique(df['library']).tolist()
        colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
        return {library: colors[i % len(colors)] for i, library in enumerate(libraries)}

    def get_scatterplot(self, scatterdf):
        """Build the bokeh scatterplot for the output of ``process_scatterplot``.

        Args:
            scatterdf: DataFrame with the columns 'x', 'y', 'term', 'library'
                and 'cluster'; it is copied, not modified.

        Returns:
            The bokeh figure: points coloured by library, a hover tool showing
            gene set, cluster and library, and the legend placed to the right.
        """
        df = scatterdf.copy()
        color_mapper = self.get_scatter_colors(df)
        df['color'] = df['library'].map(color_mapper)

        tooltips = [
            ("Gene Set", "@gene_set"),
            ("Cluster", "@cluster"),
            ("Library", "@library")
        ]

        hover_emb = HoverTool(tooltips=tooltips)
        tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

        plot_emb = figure(
            width=900,
            height=700,
            tools=tools_emb
        )

        source = ColumnDataSource(
            data=dict(
                x=df['x'],
                y=df['y'],
                gene_set=df['term'],
                colors=df['color'],
                label=df['library'],
                library=df['library'],
                cluster=df['cluster']
            )
        )

        # hide axis ticks and tick labels
        plot_emb.xaxis.major_tick_line_color = None
        plot_emb.xaxis.minor_tick_line_color = None
        plot_emb.yaxis.major_tick_line_color = None
        plot_emb.yaxis.minor_tick_line_color = None
        plot_emb.xaxis.major_label_text_font_size = '0pt'
        plot_emb.yaxis.major_label_text_font_size = '0pt'

        plot_emb.output_backend = "svg"

        plot_emb.xaxis.axis_label = "UMAP_1"
        plot_emb.yaxis.axis_label = "UMAP_2"

        plot_emb.scatter(
            'x',
            'y',
            size=4,
            source=source,
            color='colors',
            legend_group='label',
        )

        # move the legend out of the plotting area
        plot_emb.add_layout(plot_emb.legend[0], 'right')

        return plot_emb


In [20]:
%%appyter code_eval
##draw the scatterplot when it was requested in the form
##NOTE(review): `umap` is rebound below from the boolean switch to the UMAP_Visualization instance
if umap:
    umap = UMAP_Visualization(gmt_df = [df])
    umap_df = umap.process_scatterplot(maxdf = umap_maxdf, mindf = umap_mindf, nneighbors = umap_num_neighbors)
    fig = umap.get_scatterplot(umap_df)
    show(fig)

```python
if umap:
    umap = UMAP_Visualization(gmt_df = [df])
    umap_df = umap.process_scatterplot(maxdf = umap_maxdf, mindf = umap_mindf, nneighbors = umap_num_neighbors)
    fig = umap.get_scatterplot(umap_df)
    show(fig)
```

	TF-IDF vectorizing gene set data...
(3, 1580)
	Performing Leiden clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.




## 6. Descriptive Statistics of GMT

In [21]:
def _show_histogram(values, title, x_label, y_label):
    ##bin values with numpy's default histogram settings and show the bars as a bokeh figure
    hist, edges = np.histogram(values)
    p = figure(width=700, height=500, toolbar_location= None,
               title=title,
               x_axis_label = x_label,
               y_axis_label  = y_label )
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="skyblue", line_color="white")
    show(p)


def get_GMT_stats(pair_df):
    ##input: pair_df - despite its name this is the gene set dataframe; only its 'Genes' column
    ##       (space-separated gene strings, one row per gene set) is read
    ##output: None. Two histograms are shown: how often each gene occurs across the gene sets
    ##        and how large the gene sets are
    ##the former mean / stdev computation was removed: its results were never used and
    ##stat.stdev raised for a file with fewer than two gene sets

    ##split on any whitespace so stray newlines or doubled spaces never count as genes
    geneset_lst = [genes.split() for genes in pair_df['Genes'].to_list()]
    gene_count = {}
    geneset_size = []

    for gene_set in geneset_lst:
        geneset_size.append(len(gene_set))
        for gene in gene_set:
            gene_count[gene] = gene_count.get(gene, 0) + 1

    _show_histogram(list(gene_count.values()), "Count of Gene Occurrence", 'Sets', 'Genes')
    _show_histogram(geneset_size, "Histogram of Gene Set Size", 'Genes in Gene Set', 'Gene Sets')

In [22]:
##show the gene-occurrence and gene-set-size histograms for the submitted GMT
get_GMT_stats(df)