In [70]:
# Standard library
import itertools
import urllib
import urllib.request

# Third-party
import numpy as np
import pandas as pd
import sklearn.datasets
import umap
from maayanlab_bioinformatics.enrichment import enrich_crisp
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider, Range1d
from bokeh.layouts import column
from bokeh.palettes import all_palettes
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [71]:
# Enrichr library names; get_Enrichr_library() selects one of them by index.
all_libraries = ['ChEA_2016']
# Example of an inline gene list, kept for quick experiments without geneList.txt:
#genes = ['TP53', 'TNF', 'EGFR', 'GKN1', 'HADHA', 'APOE', 'ESR1', 'VEGFA', 'TGFB1', 'PREPL', 'TIA1', 'TPO', 'TTN', 'SATB2', 'CHPF', 'MALL', 'MIPIP', 'NUPL1', 'IL6', 'PDIA3', 'CTNNB1', 'SLC39A1', 'DTNA','SLC1A1', 'GALNT2', 'HIST2H2AC', 'CD63']

# Read the query genes, one symbol per line. The context manager closes the
# file even if reading fails part-way through.
with open('geneList.txt', 'r') as open_gene_list_file:
    lines = open_gene_list_file.readlines()

# Normalise to upper case and drop blank lines so that an empty string is
# never submitted as a gene symbol.
genes = [x.strip().upper() for x in lines if x.strip()]

# p-value cutoff below which a gene set is highlighted in the enrichment plot.
significance_value = 0.05

In [72]:
# download an Enrichr gene-set library from the web
def get_Enrichr_library(library_index):
    """Download one Enrichr gene-set library and return it as [name, genes] pairs.

    Parameters
    ----------
    library_index : int
        Index into the module-level ``all_libraries`` list of the library to
        download.

    Returns
    -------
    list of [str, str]
        One entry per gene set: the term name, followed by its genes joined
        with single spaces (no leading or trailing whitespace).

    Raises
    ------
    IndexError
        If ``library_index`` is out of range for ``all_libraries``.
    urllib.error.URLError
        If the library cannot be downloaded.
    """
    url = ('https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName='
           + all_libraries[library_index])

    library_data = []
    with urllib.request.urlopen(url) as f:
        for raw_line in f:
            # Each record is "<term name>\t\t<gene>\t<gene>...".
            name, separator, raw_genes = raw_line.decode("utf-8").partition("\t\t")
            if not separator:
                # Blank or malformed line: there is no gene list to index into.
                continue
            # Collapse tabs, trailing tabs and any line ending (\n or \r\n)
            # into single spaces so no empty gene token is left behind.
            library_data.append([name, " ".join(raw_genes.split())])

    return library_data

In [73]:
# Download the first configured library and tabulate it, one row per gene set.
library_data = get_Enrichr_library(0)
df = pd.DataFrame(library_data, columns=['Name', 'Genes'])
gene_list = df['Genes']

# Treat every gene set as a "document" of gene symbols and weight the symbols
# with TF-IDF (unigrams only; terms present in more than 85% of sets are dropped).
tfidf_vectorizer = TfidfVectorizer(min_df=1, max_df=0.85, max_features=10000, ngram_range=(1, 1))
tfidf = tfidf_vectorizer.fit_transform(gene_list)

# Vocabulary, kept so the NMF components can be labelled later on.
tfidf_fn = tfidf_vectorizer.get_feature_names()

# Project the TF-IDF matrix with UMAP and plot the resulting 2-column embedding.
reduce = umap.UMAP()
reduce.fit(tfidf)
embedding = pd.DataFrame(reduce.transform(tfidf), columns=['x', 'y'])

n_points = embedding.shape[0]
source1 = ColumnDataSource(data={
    'x': embedding.x,
    'y': embedding.y,
    'alpha': [0.7] * n_points,
    'size': [7] * n_points,
    'gene_set': df['Name'],
})

print(n_points)

# The hover tool only reacts to the glyph named "df" and shows its gene-set name.
hover_tooltips = """
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
    </div>
    """
hover_emb = HoverTool(names=["df"], tooltips=hover_tooltips)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset']

plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle(
    'x', 'y',
    size='size',
    alpha='alpha',
    line_alpha=0,
    line_width=0.01,
    source=source1,
    name="df",
)

show(plot_emb)

645


In [74]:
# Factorise the TF-IDF matrix with NMF.

n_comp = 8  # number of NMF components

nmf = NMF(n_components=n_comp, max_iter=100000, alpha=0.0)

# W is the transformed data returned by the fit; H holds the fitted components.
W = nmf.fit_transform(tfidf)
H = nmf.components_

# Transform the same matrix again with the already-fitted model.
nmf_embedding = nmf.transform(tfidf)

In [75]:
# enrichment analysis
def get_library_iter(library_data):
    """Yield ``(term, gene_set)`` pairs from a list of ``[name, genes]`` entries.

    Parameters
    ----------
    library_data : iterable of sequences
        Each member holds the term name at index 0 and a whitespace-separated
        string of gene symbols at index 1.

    Yields
    ------
    tuple of (str, list of str)
        The term name and its gene symbols. Leading, trailing or repeated
        whitespace never produces an empty-string gene, and an empty gene
        string yields an empty list.
    """
    for member in library_data:
        term, genes = member[0], member[1]
        # split() with no argument drops empty tokens, unlike split(' ').
        yield term, genes.split()

def get_enrichment_results(genes, library_data):
    """Run ``enrich_crisp`` for ``genes`` against a library, ordered by p-value.

    Parameters
    ----------
    genes : list of str
        Query gene symbols.
    library_data : list of [str, str]
        Library entries in the form accepted by ``get_library_iter``.

    Returns
    -------
    list
        The results produced by ``enrich_crisp``, sorted in ascending order of
        ``result[1].pvalue``.
    """
    library_iter = get_library_iter(library_data)
    # NOTE(review): 20000 and True are passed positionally; presumably the
    # background size and an overlap flag -- confirm against enrich_crisp.
    results = list(enrich_crisp(genes, library_iter, 20000, True))
    results.sort(key=lambda r: r[1].pvalue)
    return results

def get_pvalue(row, unzipped_results, all_results):
    """Return the enrichment p-value for the gene set named in ``row``.

    Parameters
    ----------
    row : mapping
        Must provide the gene-set name under the key ``'Name'``.
    unzipped_results : sequence
        ``list(zip(*all_results))``; element 0 holds the term names in the
        same order as ``all_results``. Empty when there are no results.
    all_results : sequence
        Enrichment results; ``all_results[i][1].pvalue`` is the p-value of
        the i-th term.

    Returns
    -------
    The p-value of the first result whose term equals ``row['Name']``, or 1
    when the term has no result (including when there are no results at all).
    """
    if not unzipped_results:
        # No enrichment results: there is no name column to search.
        return 1

    terms = list(unzipped_results[0])
    try:
        # Single scan; index() returns the first match.
        index = terms.index(row['Name'])
    except ValueError:
        return 1
    return all_results[index][1].pvalue

In [76]:
# call UMAP

# UMAP projection of the NMF output W. It is kept under its own name so that
# it is not discarded when `embedding` is assigned below.
reducer = umap.UMAP()
reducer.fit(W)
nmf_umap_embedding = reducer.transform(W)

# Embedding used by the plots: UMAP of the dense TF-IDF matrix. toarray()
# yields a plain ndarray; todense() would return an np.matrix, which newer
# scikit-learn input validation rejects.
embedding = umap.UMAP().fit_transform(tfidf.toarray())
embedding = pd.DataFrame(embedding, columns=['x','y'])

In [77]:
# Run the enrichment and split the (term, result) pairs into parallel columns.
all_results = get_enrichment_results(genes, library_data)
unzipped_results = list(zip(*all_results))

# Attach each gene set's p-value to the dataframe.
df['p value'] = df.apply(
    lambda row: get_pvalue(row, unzipped_results, all_results),
    axis=1,
)

# Black for gene sets below the significance cutoff, the first Category20
# palette colour for all others.
significant_color = '#000000'
default_color = all_palettes['Category20'][20][0]
my_colors = [
    significant_color if p_value < significance_value else default_color
    for p_value in df['p value']
]

source = ColumnDataSource(data={
    'x': embedding.x,
    'y': embedding.y,
    'gene_set': df['Name'],
    'p_value': df['p value'],
    'colors': my_colors,
})

# Hover shows the gene-set name and its p-value for the glyph named "df".
hover_tooltips = """
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
            <span style="font-size: 12px; font-weight: bold;">p-value:</span>
            <span style="font-size: 12px">@p_value</span>
        </div>
    </div>
    """
hover_emb = HoverTool(names=["df"], tooltips=hover_tooltips)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset']

plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle(
    'x', 'y',
    size=7,
    alpha=0.7,
    line_alpha=0,
    line_width=0.01,
    source=source,
    fill_color='colors',
    name="df",
)

show(plot_emb)

In [78]:
# Print, for every NMF component, its n_top_words highest-weighted vocabulary
# terms. Exploratory output only: interesting, but not used further here.
n_topics = n_comp
n_top_words = 15

print("Topics found via NMF:")
for topic_idx, topic in enumerate(nmf.components_):
    # Indices of the largest weights, highest first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_terms = ['[{}]'.format(tfidf_fn[i]) for i in top_indices]
    print("\nTopic {}:".format(topic_idx+1))
    print(" ".join(top_terms))
print()

Topics found via NMF:

Topic 1:
[h2] [sep] [pisd] [ps1] [kdm6b] [malat1] [arid3a] [neat1] [mir142] [foxo3] [mir21] [mnt] [myo1c] [stk11] [bahcc1]

Topic 2:
[c4bp] [epb4] [ps1] [mir181b] [mir181a] [2900092d14rik] [4921521f21rik] [pinc] [hsfy2] [1500015o10rik] [aox4] [1700029f09rik] [mtap2] [9430031j16rik] [1700066m21rik]

Topic 3:
[nkx2] [h2] [nkx6] [zic1] [foxf1a] [foxf2] [alx3] [dmrt3] [hoxb3] [hmx1] [en2] [hoxd9] [wnt1] [tlx1] [otp]

Topic 4:
[rcc1] [snhg3] [tprg1l] [c1orf93] [kiaa0754] [loc100128003] [cdk11a] [loc100133612] [epb41] [c1orf113] [mst1p2] [c1orf63] [hes4] [znf362] [flj37453]

Topic 5:
[mir] [mmu] [phc1] [rnf12] [pou5f1] [lrrc2] [sox2] [tdgf1] [porcn] [rif1] [dppa3] [ifitm1] [zfp57] [2310003c23rik] [cd38]

Topic 6:
[odz2] [mctp2] [auts2] [robo1] [gramd3] [dlg2] [znf608] [ext1] [pdzrn3] [kcnma1] [lphn2] [cd180] [odz3] [efna5] [wwox]

Topic 7:
[tbp] [ppp1r10] [rps18] [pex12] [vps52] [sart3] [ccnt1] [dnaja3] [rpo1] [dpagt1] [mrpl24] [bat3] [mrps18b] [brd8] [prpf31]

Topic 8