# T-SNE representations of topic models derived from text-based corpora

We use the core functionality from TechKnAcq's MALLET model, in addition to the code from [this excellent blog post](https://shuaiw.github.io/2016/12/22/topic-modeling-and-tsne-visualzation.html) (from https://shuaiw.github.io/).

This notebook provides the base mapping from which we specialize our application.

In [1]:
import sys
import numpy as np

sys.path.append("../lib")
from mallet import Mallet
from techknacq.corpus import Corpus
from techknacq.conceptgraph import ConceptGraph

# Locations of the MALLET executable and of this run's data.
# All data lives under a single working directory, so the remaining paths are
# derived from it instead of repeating the full absolute path three times.
MALLET_PATH = '/usr/local/bin/mallet'
work_dir = '/Users/Gully/Documents/Projects/2_active/bigDataU/work/2017-05-18-google_books'
corpus_dir = work_dir + '/corpus4'
mallet_dir = work_dir + '/mallet4'
# Passed as `prefix=` to Mallet(...); presumably selects the files named
# "2d094-*" inside mallet_dir -- confirm against the Mallet wrapper.
prefix = mallet_dir + '/2d094-'

Instantiate the techknacq-tk elements.

In [2]:
# Create the concept graph, load the corpus found in corpus_dir and add its
# documents to the graph.
cg = ConceptGraph()
corpus = Corpus(corpus_dir)
cg.add_docs(corpus)
# NOTE(review): `norm` is not used in this cell and the same import is repeated
# at the top of the next cell; it belongs in the notebook's import cell.
from numpy.linalg import norm
# Wrap the MALLET run identified by `prefix`. Judging by this cell's output,
# construction loads the key, word-topic and document-topic composition files.
m = Mallet(MALLET_PATH, mallet_dir, prefix=prefix)

Read 19612 documents.
Adding documents to concept graph.
Read 200 topics.
Loading key file.
Loading word-topic file.
Loading document-topic composition file.


In [3]:
from numpy.linalg import norm

# Build the document-by-topic matrix from the per-topic document lists.
#
# Each m.topic_doc[t] is iterated as a sequence of tuples; element 0 is used as
# the document id and element 1 as that document's value for topic t.
# NOTE(review): rows are matched to documents purely by position, taking the
# ids from topic 0. This assumes every topic lists the documents in the same
# order -- confirm against the Mallet wrapper.
doc_list = [d_tuple[0] for d_tuple in m.topic_doc[0]]
td = [[d_tuple[1] for d_tuple in d_in_t_list] for d_in_t_list in m.topic_doc]

TD_raw = np.asarray(td)        # one row per topic
DT_raw = TD_raw.transpose()    # one row per document

n_docs = DT_raw.shape[0]
n_topics = DT_raw.shape[1]

# L1-normalise every document row. A row whose L1 norm is zero is left as all
# zeros instead of being divided by zero, which would put NaNs into DT.
L1_norm = norm(DT_raw, axis=1, ord=1)
row_totals = L1_norm.reshape(n_docs, 1)
DT = np.divide(
    DT_raw,
    row_totals,
    out=np.zeros(DT_raw.shape, dtype=float),
    where=row_totals != 0,
)

In [4]:
import pickle
from sklearn.manifold import TSNE

# File in the MALLET output directory where the t-SNE result is pickled.
tsne_lda_pkl_path = "{}/tsne_lda.pkl".format(mallet_dir)

In [6]:
# Fit a 2-D t-SNE embedding of the normalised document-topic matrix.
#   angle  -- a value close to 1 sacrifices accuracy for speed
#   init   -- PCA initialization usually leads to better results
tsne_model = TSNE(
    n_components=2,
    init='pca',
    angle=0.99,
    random_state=0,
    verbose=1,
)
tsne_lda = tsne_model.fit_transform(DT)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 19611
[t-SNE] Computed conditional probabilities for sample 2000 / 19611
[t-SNE] Computed conditional probabilities for sample 3000 / 19611
[t-SNE] Computed conditional probabilities for sample 4000 / 19611
[t-SNE] Computed conditional probabilities for sample 5000 / 19611
[t-SNE] Computed conditional probabilities for sample 6000 / 19611
[t-SNE] Computed conditional probabilities for sample 7000 / 19611
[t-SNE] Computed conditional probabilities for sample 8000 / 19611
[t-SNE] Computed conditional probabilities for sample 9000 / 19611
[t-SNE] Computed conditional probabilities for sample 10000 / 19611
[t-SNE] Computed conditional probabilities for sample 11000 / 19611
[t-SNE] Computed conditional probabilities for sample 12000 / 19611
[t-SNE] Computed conditional probabilities for sample 13000 / 19611
[t-SNE] Computed conditional probabilities 

In [12]:
# Save the t-SNE result. What is pickled is `tsne_lda`, the embedding returned
# by fit_transform, not the TSNE estimator itself.
# The `with` block closes the file even if pickle.dump raises.

with open(tsne_lda_pkl_path, 'wb') as tsne_lda_pkl_file:
    pickle.dump(tsne_lda, tsne_lda_pkl_file)

In [5]:
# Load the previously pickled t-SNE embedding back into `tsne_lda`.
# NOTE: pickle.load can execute arbitrary code, so only load a file that this
# notebook (or another trusted source) wrote.
# The `with` block closes the file even if pickle.load raises.

with open(tsne_lda_pkl_path, 'rb') as tsne_lda_pkl_file:
    tsne_lda = pickle.load(tsne_lda_pkl_file)


# Code to create the HTML display

In [17]:
import bokeh.plotting as bp
from bokeh.models import PanTool, BoxZoomTool, WheelZoomTool, ResetTool
from bokeh.models import ColumnDataSource, Range1d, HoverTool, TapTool, OpenURL
import random

# One random '#RRGGBB' colour per topic index.
# NOTE(review): colormap is later indexed by topic id, so N_COLORS must be at
# least the number of topics; 200 matches the "Read 200 topics." output above.
N_COLORS = 200

# A dedicated, seeded generator makes the colours identical on every run and
# leaves the global `random` state untouched.
color_rng = random.Random(0)

colors = [
    '#%02X%02X%02X' % tuple(color_rng.randint(0, 255) for _ in range(3))
    for _ in range(N_COLORS)
]

colormap = np.array(colors)
print(len(colormap))

200


In [9]:
from collections import defaultdict
import math

#
# Provides HTML code for a single topic signature based on greyscale coding
# for each word
#
def topic_signature_html(m, t_tuple, n_words, colormap, global_min=None, global_max=None):
    """Render one topic as an HTML fragment: a coloured marker plus its top words.

    The topic's `n_words` highest-weighted words are each emitted as a
    ``<span>`` whose grey level encodes the word's weight on a per-topic scale
    (darker = heavier) and whose ``title`` attribute carries a percentage.

    Args:
        m: object exposing ``m.topics[t_id]`` as a mapping of word -> weight.
        t_tuple: ``(topic_id, topic_share)``; the share is printed to two
            significant figures after the topic id.
        n_words: maximum number of words to include.
        colormap: sequence indexed by topic id giving the marker colour string.
        global_min: optional lower bound of a weight range shared across topics.
        global_max: optional upper bound of that range. When both bounds are
            given (0 is a valid bound), the ``title`` percentage is computed
            against this range; otherwise the per-topic scale is used.

    Returns:
        str: the HTML fragment. If the topic has no words, only the marker and
        topic header are returned.
    """
    t_id = t_tuple[0]
    t_percent = t_tuple[1]
    color = colormap[t_id]

    def float_to_greyscale(f):
        # 0.0 -> white (#ffffff), 1.0 -> black (#000000). The value is clamped
        # to [0, 1] first so that the result is always a valid 6-digit colour.
        level = 255 - int(min(max(f, 0.0), 1.0) * 255)
        return '#%02x%02x%02x' % (level, level, level)

    # Sorted once, heaviest first; the loop below relies on this order.
    word_weights = sorted(
        m.topics[t_id].items(), key=lambda kv: kv[1], reverse=True
        )[:n_words]

    t_percent_2sf = '%s' % float('%.2g' % t_percent)

    ret = '<emph><font color="'+color+'">&#x25A0; </font>'+str(t_id)+' ('+t_percent_2sf+'): </emph>'

    if not word_weights:
        return ret

    vals = [weight for (_, weight) in word_weights]
    val_max = max(vals)
    # Floor of the per-topic scale.
    # NOTE(review): sqrt(min / 2) is below min(vals) only when min(vals) > 0.5;
    # for smaller weights the floor exceeds the minimum, which is why the grey
    # level is clamped above. Confirm the intended scale of m.topics weights.
    val_min = math.sqrt(min(vals) / 2)
    val_diff = float(val_max - val_min)

    # `is not None` rather than truthiness, so that a bound of 0 is honoured.
    use_global = global_min is not None and global_max is not None
    global_diff = float(global_max - global_min) if use_global else 0.0

    for (word, weight) in word_weights:
        # A degenerate (zero-width) scale renders every word at full darkness
        # instead of dividing by zero.
        p = float(weight - val_min) / val_diff if val_diff else 1.0

        if use_global and global_diff:
            q = float(weight - global_min) / global_diff
        else:
            q = p

        ret += '<span style="color:%s" title="%s%% relevant">%s</span>\n' % (
                float_to_greyscale(p), int(q * 100), word.replace('_', '&nbsp;'))

    return ret

In [18]:
#
# Given the document id, this function lists its top n_topics topics 
#
import operator
from IPython.core.display import display, HTML
from tqdm import tqdm

def document_signature_html(doc_id, DT, m, doc_list, n_topics, n_words, colormap):
    """Build the HTML card for one document: title, authors and top topics.

    Args:
        doc_id: row index of the document in `DT` (and position in `doc_list`).
        DT: document-by-topic matrix; row `doc_id` holds the topic weights.
        m: topic model object, passed through to topic_signature_html.
        doc_list: sequence mapping row index -> key used to look the document
            up in the notebook-level `corpus`.
        n_topics: maximum number of top-weighted topics to list.
        n_words: number of words shown per topic.
        colormap: per-topic colours, passed through to topic_signature_html.

    Returns:
        str: an HTML ``<p>`` fragment.
    """
    # Local import keeps this definition self-contained within its cell.
    from html import escape

    top_topics = sorted(
            enumerate(DT[doc_id]), reverse=True, key=operator.itemgetter(1)
            )[:n_topics]

    # NOTE: `corpus` is the notebook-level Corpus object, not a parameter.
    doc = corpus[doc_list[doc_id]]

    # Title and authors are free text from the corpus and the result is rendered
    # as raw HTML, so escape them to keep stray '<' or '&' from breaking markup.
    html_signature = '<p><b>' + escape(doc.title) + '</b></br>'
    html_signature += '<i>' + ', '.join(escape(author) for author in doc.authors) + '</i>'
    # The document URL is not embedded here; the plot opens it via its tap tool.
    html_signature += '</br>'
    # Iterate over the topics actually available; there may be fewer than
    # n_topics, in which case indexing by range(n_topics) would fail.
    html_signature += '</br>'.join(
        topic_signature_html(m, topic, n_words, colormap) for topic in top_topics
    )
    html_signature += '</p>'

    return html_signature

# Pre-render the HTML card of every document (top 5 topics, 10 words each),
# with a progress bar.
html_signatures = [
    document_signature_html(row, DT, m, doc_list, 5, 10, colormap)
    for row in tqdm(range(n_docs))
]

# Show the first card inline as a quick visual check.
display(HTML(html_signatures[0]))

  0%|          | 0/19611 [00:00<?, ?it/s]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [11]:
# Collect the URL of each document, in the same row order as DT.
doc_count = DT.shape[0]
doc_urls = []
for row in range(doc_count):
    doc_urls.append(corpus[doc_list[row]].url)

In [12]:
# Index of the highest-weighted topic for each document (one entry per DT row).
topic_keys = list(DT.argmax(axis=1))

In [13]:
title = 'Data Science Google Books Visualization'
num_example = len(DT)

# Hovering over a point shows that document's pre-rendered HTML card; {safe}
# makes Bokeh insert the markup unescaped.
hover_template = """
    <div>
        <span>
            @html_signatures{safe}
        </span>
    </div>
    """
hover = HoverTool(tooltips=hover_template)

# Navigation tools; clicking a point opens the document's URL.
pan = PanTool()
boxzoom = BoxZoomTool()
wheelzoom = WheelZoomTool()
resetzoom = ResetTool()
tap = TapTool(callback=OpenURL(url="@doc_urls"))

# One entry per document: t-SNE coordinates, the colour of the document's
# dominant topic, its hover card and its URL.
point_data = {
    "x": tsne_lda[:, 0],
    "y": tsne_lda[:, 1],
    "color": colormap[topic_keys][:num_example],
    "html_signatures": html_signatures,
    "doc_urls": doc_urls,
}
cds = ColumnDataSource(point_data)

figure_options = dict(
    plot_width=1400,
    plot_height=1100,
    title=title,
    tools=[pan, boxzoom, wheelzoom, resetzoom, hover, tap],
    active_drag=pan,
    active_scroll=wheelzoom,
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)
plot_lda = bp.figure(**figure_options)

plot_lda.scatter('x', 'y', color='color', source=cds)


In [14]:
from bokeh.io import output_file, show, save

# Direct Bokeh's output to an HTML file in the MALLET output directory, then
# display the plot.
scatterplot_path = mallet_dir + '/scatterplot1.html'
output_file(
    scatterplot_path,
    title='Data Science Google Books Visualization',
    mode='cdn',
    root_dir=None,
)
show(plot_lda)