In [14]:
import urllib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import numpy as np
import umap
import itertools

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider, Range1d
from bokeh.layouts import column
from bokeh.palettes import all_palettes

output_notebook()

In [15]:
# define Enrichr library names here if you're using Enrichr libraries
all_libraries = ['ChEA_2016']

In [16]:
# open Enrichr library from online
def get_Enrichr_library(library_index):
    # processes library data
    raw_library_data = []
    library_data = []

    # open Enrichr library from online
    with urllib.request.urlopen('https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName=' + all_libraries[library_index]) as f:
        for line in f.readlines():
                raw_library_data.append(line.decode("utf-8").split("\t\t"))

    #OR
    #locally upload library
    '''
    with open('filepath', 'r') as f:
        for line in f.readlines():
           raw_library_data.append(line.split("\t\t"))
    '''

    name = []
    gene_list = []

    for i in range(len(raw_library_data)):
        name += [raw_library_data[i][0]]
        raw_genes = raw_library_data[i][1].replace('\t', ' ')
        gene_list += [raw_genes[:-1]]

    library_data = [list(a) for a in zip(name, gene_list)]
    
    return library_data

In [17]:
library_data = get_Enrichr_library(0)

df = pd.DataFrame(data = library_data, columns = ['Name', 'Genes'])

gene_list = df['Genes']

tfidf_vectorizer = TfidfVectorizer(
    min_df=1, # says it will ignore genes that fall in under 1 gene set
    max_df=0.85, # says it will ignore genes that fall in over 85% of the gene sets
    max_features = 10000, # I think this means it only looks at 10000 genes but I'm not totally sure
    ngram_range=(1, 1) # says it will only look at individual genes (because order doesn't matter in our case)
)
print(gene_list)
tfidf = tfidf_vectorizer.fit_transform(gene_list)

# Save the feature names for later to create topic summaries
tfidf_fn = tfidf_vectorizer.get_feature_names()

# plot using UMAP after tfidf
reduce = umap.UMAP()
reduce.fit(tfidf)
embedding = reduce.transform(tfidf)

embedding = pd.DataFrame(embedding, columns=['x','y'])

source1 = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            alpha = [0.7] * embedding.shape[0],
            size = [7] * embedding.shape[0], # can make size of the circles smaller by reducing the 7 here
            gene_set = df['Name']
        )
    )

# just a print check here to make sure it's working correctly (feel free to delete)
print('There are ', embedding.shape[0], ' gene sets in this visualization')

hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
    </div>
    """)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset']
plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle('x', 'y', size='size', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source1, name="df")

show(plot_emb)

0      CBR1 OR1A1 HTR1B LAPTM5 ADAM21 HPSE2 SELE CSF2...
1      MCTS2 BCL11B H13 ACYP2 FOXP1 HIST1H4A HIST1H4B...
2      EED NSUN5 UGT1A1 GABRA5 NCOA5 1700030C10RIK 31...
3      IGFBP2 GM973 2610017I09RIK REV1 TCFAP2B 181003...
4      TRANSLOCASE TIMM8B RGD1563216 CD3EAP RT1-CE5 H...
                             ...                        
640    FNBP4 HEXIM1 FNBP1 DHDH RC3H1 CACNA2D4 BTBD10 ...
641    HEXIM2 NIPSNAP3A CASP8AP2 FNBP4 HEXIM1 4930420...
642    HEXIM2 HEXIM1 FNBP4 FNBP1 BTBD19 BAD CACNA2D4 ...
643    HEXIM2 HEXIM1 FNBP1 BAD ANKFY1 BTBD16 CACNA2D4...
644    BAD DHDH SCD TFR2 BAX L1TD1 HCN4 PRKAA2 C5ORF5...
Name: Genes, Length: 645, dtype: object
There are  645  gene sets in this visualization


In [18]:
# Create and run NMF model

n_comp = 6 # this is the number of groups you are splitting your gene set into

nmf = NMF(
    n_components=n_comp, 
    max_iter=100000,
    alpha=0.0
)

W = nmf.fit_transform(tfidf)
H = nmf.components_
nmf_embedding = nmf.transform(tfidf)

In [19]:
reducer = umap.UMAP()
reducer.fit(W)
embedding = reducer.transform(W)

embedding = umap.UMAP().fit_transform(tfidf.todense())
embedding = pd.DataFrame(embedding, columns=['x','y'])

# this will color by the groups created during NMF
# it's useful to look at what the model considers to be a "group"
# can use it help adjust the number of components in the NMF model
embedding['hue'] = nmf_embedding.argmax(axis = 1)
my_colors = [all_palettes['Category20'][20][i] for i in embedding.hue]

source = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            alpha = [0.7] * embedding.shape[0],
            size = [7] * embedding.shape[0],
            gene_set = df['Name'],
            colors = my_colors
        )
    )

print(embedding.shape[0])

hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
    </div>
    """)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle('x', 'y', size='size', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, fill_color='colors', name="df")

show(plot_emb)

645


In [20]:
# this will print out the most common genes in each group
# not super useful but kind of interesting
n_topics = n_comp
n_top_words = 15

print("Topics found via NMF:")
for topic_idx, topic in enumerate(nmf.components_):
    print("\nTopic {}:".format(topic_idx+1))
    print(" ".join(['[{}]'.format(tfidf_fn[i]) for i in topic.argsort()[:-n_top_words - 1:-1]]))
print()

Topics found via NMF:

Topic 1:
[h2] [pisd] [cfl1] [mnt] [atf4] [per1] [phf12] [wibg] [sf1] [mtmr4] [ndufs7] [ing1] [ehd1] [snar] [tob2]

Topic 2:
[c4bp] [ps1] [epb4] [mir181b] [mir181a] [4921521f21rik] [2900092d14rik] [1500015o10rik] [1700029f09rik] [hsfy2] [pinc] [aox4] [1700066m21rik] [mtap2] [9130024f11rik]

Topic 3:
[nkx2] [h2] [nkx6] [zic1] [foxf1a] [foxf2] [alx3] [dmrt3] [hoxb3] [en2] [wnt1] [hmx1] [tlx3] [hoxd9] [hoxa13]

Topic 4:
[rcc1] [snhg3] [c1orf93] [tprg1l] [loc100128003] [kiaa0754] [c1orf113] [hes4] [mst1p2] [flj37453] [c1orf86] [epb41] [loc100133612] [cdk11a] [hspc157]

Topic 5:
[mir] [mmu] [phc1] [rnf12] [pou5f1] [lrrc2] [sox2] [tdgf1] [rif1] [porcn] [dppa3] [ifitm1] [zfp57] [fbxo15] [2310003c23rik]

Topic 6:
[hla] [odz2] [auts2] [mctp2] [ext1] [robo1] [znf608] [tns3] [dlg2] [pik3r1] [gramd3] [cd180] [irs2] [dapk1] [myo10]

