In [1]:
import pyLDAvis
import pandas as pd
import json
import numpy as np

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 250)
# np.set_printoptions(suppress=True)

In [2]:
# Locations of the exported LDA artefacts consumed by the loading cell.
# NOTE(review): these are absolute, machine-specific paths; adjust them before running elsewhere.
phi_topic_term_dists_file = "/opt/0.imaginea/rpx/spark-lda-vis/phi/part-00000"
# Only referenced by a commented-out read further down; theta itself is loaded from
# every "part*" file of the theta directory via glob.
theta_doc_topics_dists_file = "/opt/0.imaginea/rpx/spark-lda-vis/theta/part-00000"
# doc_length_file = "/tmp/lda-vis/doc-length/part-00000"
vocab_file = "/opt/0.imaginea/rpx/spark-lda-vis/vocab/part-00000"
# NOTE(review): the directory is spelled "ter-freq" -- confirm this matches the exporter's output name.
term_freq_file = "/opt/0.imaginea/rpx/spark-lda-vis/ter-freq/part-00000"

### Reference http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb

In [4]:
import glob

# Theta is spread over several "part*" files: read each one and stack them.
# The file list is sorted so that the document order is the same on every run
# (glob returns matches in arbitrary order).
path = '/opt/0.imaginea/rpx/spark-lda-vis/theta'
thetaFiles = sorted(glob.glob(path + "/part*"))
list_ = []
for file_ in thetaFiles:
    df = pd.read_csv(file_, index_col=None, header=None)
    list_.append(df)
theta_with_size = pd.concat(list_)
# Column 0 is used as the document length below; keep only rows where it is positive.
theta_with_size = theta_with_size[theta_with_size[0] > 0]
phi = pd.read_csv(phi_topic_term_dists_file, header=None)
# theta_with_size = pd.read_csv(theta_doc_topics_dists_file, header=None)
# `.ix` was removed in pandas 1.0. The columns are the default 0..n integer labels
# (header=None), so positional `.iloc` selects exactly the same data.
theta = theta_with_size.iloc[:, 1:]      # every column after the first: doc-topic values
doc_length = theta_with_size.iloc[:, 0]  # first column: document length
vocab = pd.read_csv(vocab_file, header=None)
term_freq = pd.read_csv(term_freq_file, header=None)

In [5]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


In [6]:
# Keyword arguments for pyLDAvis.prepare, converted to plain Python lists.
data = {}
data['topic_term_dists'] = phi.values.tolist()
data['doc_topic_dists'] = theta.values.tolist()
data['doc_lengths'] = doc_length.astype("int64").values.tolist()
# vocab / term_freq are read as one-column frames, so their nested row lists are flattened.
data['vocab'] = flatten(vocab.values.tolist())
data['term_frequency'] = flatten(term_freq.astype("int64").values.tolist())

# Sanity check of the two matrix shapes.
print('Topic-Term shape: %s' % (np.shape(data['topic_term_dists']),))
print('Doc-Topic shape: %s' % (np.shape(data['doc_topic_dists']),))

Topic-Term shape: (50, 36887)
Doc-Topic shape: (1000, 50)


In [7]:
lda_vis_data = pyLDAvis.prepare(**data)

In [8]:
pyLDAvis.enable_notebook()

In [9]:
# pyLDAvis.prepare(mds='mmds', **data)

In [10]:
pyLDAvis.save_json(lda_vis_data, "/tmp/ldavis.json")

In [11]:
pyLDAvis.save_html(lda_vis_data, "/tmp/ldavis.html")

In [12]:
pyLDAvis.urls

<module 'pyLDAvis.urls' from '/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py'>

In [14]:
# !cat /home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py

# Reference exploration for Scala Porting
**Only for developers**

# LDA - Latent Dirichlet allocation
- https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

K(integer) = number of topics (e.g. 50)  
V(integer) = number of words in the vocabulary (e.g. 50,000 or 1,000,000)  
M(integer) = number of documents  
$N_{d=1\dots M}$ (integer) = number of words in document d  
N(integer) = total number of words in all documents; sum of all $N_{d}$ values, i.e. $N=\sum _{d=1}^{M}N_{d}$  
Z = N-dimension vector of integers between 1 and K = identity of topic of all words in all documents  
W = N-dimension vector of integers between 1 and V = identity of all words in all documents  

# Dirichlet distribution
- https://en.wikipedia.org/wiki/Dirichlet_distribution

In [16]:
from pyLDAvis import *
from pyLDAvis._prepare import *


In [17]:
# Wrap the raw lists in labelled pandas objects using pyLDAvis' private helpers.
ldavis_prepare = pyLDAvis._prepare

# [K x V]
topic_term_dists = ldavis_prepare._df_with_names(data['topic_term_dists'], 'topic', 'term')
# [M x K]
doc_topic_dists = ldavis_prepare._df_with_names(data['doc_topic_dists'], 'doc', 'topic')
# [V]
term_frequency = ldavis_prepare._series_with_name(data['term_frequency'], 'term_frequency')
# [M]
doc_lengths = ldavis_prepare._series_with_name(data['doc_lengths'], 'doc_length')
# [V]
vocab = ldavis_prepare._series_with_name(data['vocab'], 'vocab')

In [18]:
topic_term_dists.shape#[K x V]

(50, 36887)

In [19]:
doc_topic_dists.shape#[M x K]

(1000, 50)

In [20]:
term_frequency.shape#[V]

(36887,)

In [21]:
doc_lengths.shape#[M]

(1000,)

In [22]:
vocab.shape#[V]

(36887,)

In [33]:
# Per-topic frequency: weight each document's topic distribution by the document's
# length, then add the weighted rows up over all documents.
topic_freq  = (doc_topic_dists.T * doc_lengths).T.sum(axis=0) # elementwise multiplication, then sum over all rows (documents)
print(topic_freq.shape)    # [K x M] * [M] -> [K x M]; .T -> [M x K]; sum(axis=0) -> [K,]
# doc_topic_dists.T
# topic_freq
# Display the length-weighted doc-topic table before the sum ([M x K]).
(doc_topic_dists.T * doc_lengths).T

(50,)


topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
0,0.019806,0.019538,0.019513,1245.119074,0.019608,0.019586,0.019525,0.020281,0.019583,0.019795,0.019664,0.019513,0.019547,0.019513,0.019513,0.019607,0.019513,0.019597,0.019600,1286.813190,0.019620,0.019702,0.019708,0.019678,0.019550,0.019538,0.019525,0.019525,0.019707,0.019570,0.019513,0.019513,0.019550,334.145049,0.019629,0.019525,0.021146,0.019525,0.019596,0.019642,0.019861,0.019550,0.019525,0.019538,0.019513,0.019537,0.019513,0.019513,0.019548,0.019525
1,0.019806,0.019538,0.019513,0.021697,0.019609,0.019586,0.019525,0.020281,0.019583,0.019795,0.019665,0.019513,0.019548,0.019513,0.019513,0.019608,0.019513,0.019597,0.019600,2253.645224,0.019620,0.019703,0.019708,0.019678,0.019551,0.019538,0.019525,0.019525,0.019708,0.019570,0.019513,0.019513,0.019550,692.410385,0.019629,0.019525,0.021146,0.019525,0.019597,0.019642,0.019861,0.019551,0.019525,0.019538,0.019513,0.019538,0.019513,0.019513,0.019548,0.019525
2,0.019812,74.617215,0.019518,0.021702,0.019614,0.019591,0.019530,0.020287,0.019588,0.019800,0.019670,0.019518,0.019553,0.019518,0.019518,0.019613,0.019518,0.019602,0.019605,724.541384,0.019625,0.019708,0.019713,0.019683,0.019556,0.019543,0.019530,0.019530,0.019713,0.019575,0.019518,0.019518,0.019555,9355.223654,0.019634,0.019530,2738.713806,0.019530,0.019602,0.019647,0.019866,0.019556,0.019530,0.019543,0.019518,0.019543,0.019518,0.019518,0.019553,0.019530
3,0.019810,0.019541,0.019516,2206.869688,0.019612,0.019589,0.019529,374.731157,0.019586,0.019799,0.019668,0.019516,0.019551,0.019516,0.019516,0.019611,0.019516,0.019601,0.019604,557.797360,0.019623,0.019706,0.019711,0.019681,0.019554,0.019541,0.019529,0.019529,0.019711,0.019573,0.019516,0.019516,0.019554,0.021305,0.019632,0.019529,2895.699073,0.019529,0.019600,0.019645,0.019865,0.019554,0.019529,0.019541,0.019516,0.019541,0.019516,0.019516,0.019551,0.019529
4,0.019791,0.019523,0.019498,535.497039,0.019593,0.019571,0.019510,0.020266,0.019568,15.565705,0.019649,0.019498,0.019532,0.019498,0.019498,0.019593,0.019498,0.019582,0.019585,0.021610,0.019605,0.019687,0.019693,0.019663,0.019535,0.019523,0.019510,0.019510,0.019692,0.019555,0.019498,0.019498,0.019535,0.021284,0.019614,0.019510,251.283785,0.019510,0.019581,0.019627,87.749350,0.019536,0.019510,0.019523,0.019498,0.019523,0.019498,0.019498,0.019533,0.019510
5,0.019801,0.019533,0.019508,856.717932,0.019603,0.019581,0.019520,0.020276,0.019578,0.019790,0.019659,0.019508,0.019542,0.019508,0.019508,0.019602,0.019508,0.019592,0.019595,307.483524,0.019614,0.019697,0.019703,0.019672,0.019545,0.019532,0.019520,0.019520,0.019702,0.019564,0.019508,0.019508,0.019545,122.673631,0.019624,0.019520,338.223609,0.019520,0.019591,0.019637,0.019856,0.019545,0.019520,0.019533,0.019508,0.019532,0.019508,0.019508,0.019543,0.019520
6,0.019809,0.019540,0.019515,0.021699,0.019611,0.019588,0.019527,0.020284,0.019585,0.019797,0.019667,0.019515,0.019550,0.019515,0.019515,0.019610,0.019515,0.019600,0.019602,0.021629,0.019622,0.019705,0.019710,0.019680,0.019553,0.019540,0.019527,0.019527,0.019710,0.019572,0.019515,0.019515,0.019552,4360.033876,0.019631,0.019527,0.021148,0.019527,0.019599,0.019644,0.019864,0.019553,0.019528,0.019540,0.019515,0.019540,0.019515,0.019515,0.019550,0.019527
7,0.019800,0.019532,0.019507,978.724230,0.019603,0.019580,0.019519,0.020275,0.019577,0.019789,0.019659,0.019507,0.019542,0.019507,0.019507,0.019602,0.019507,0.019591,0.019594,318.008207,0.019614,0.019697,0.019702,0.019672,0.019545,0.019532,0.019519,0.019519,0.019702,0.019564,0.019507,0.019507,0.019544,0.021294,0.019623,0.019519,241.344996,0.019519,0.019591,0.019636,0.019855,0.019545,0.019519,0.019532,0.019507,0.019532,0.019507,0.019507,0.019542,0.019519
8,0.019797,0.019528,0.019503,714.214357,0.019599,0.019576,0.019516,0.020271,0.019573,22.763734,0.019655,0.019503,0.019538,0.019503,0.019503,0.019598,0.019503,0.019588,0.019591,0.021616,0.019610,0.019693,0.019698,0.019668,0.019541,0.019528,0.019516,0.019516,0.019698,0.019560,0.019503,0.019503,0.019540,0.021290,0.019619,0.019516,444.097689,0.019515,0.019587,0.019632,0.019851,0.019541,0.019516,0.019528,0.019503,0.019528,0.019503,0.019503,0.019538,0.019516
9,0.019801,0.019532,0.019507,0.021690,0.019603,0.019580,0.019520,0.020275,0.019577,0.019789,0.019659,0.019507,0.019542,0.019507,0.019507,0.019602,0.019507,0.019592,0.019594,0.021621,0.019614,0.019697,0.019702,0.019672,0.019545,0.019532,0.019520,0.019519,0.019702,0.019564,0.019507,0.019507,0.019544,1557.034270,0.019623,0.019519,0.021139,0.019519,0.019591,0.019636,0.019855,0.019545,0.019520,0.019532,0.019507,0.019532,0.019507,0.019507,0.019542,0.019520


In [17]:
# Share of the total topic frequency held by each topic, largest share first.
# (A stray `print()` that only emitted a blank line was removed.)
topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)

In [18]:
# Topic labels ordered by decreasing proportion.
topic_order      = topic_proportion.index
# reorder all data based on new ordering of topics
topic_freq       = topic_freq[topic_order]
# `.ix` was removed in pandas 1.0; the rows are selected by topic label, so `.loc` is the equivalent.
topic_term_dists = topic_term_dists.loc[topic_order]
doc_topic_dists  = doc_topic_dists[topic_order]

In [19]:
topic_order

Int64Index([33, 19,  3, 36,  7, 22, 23, 40,  8,  0, 21,  9, 28, 20, 29, 10, 18,
            39, 34, 48,  4, 12, 15,  5,  1, 38, 41, 43, 17, 24, 32, 25, 45, 42,
             6, 26, 49, 35, 27, 37, 31, 11, 13, 14, 30, 16, 44, 46,  2, 47],
           dtype='int64', name='topic')

In [20]:
term_topic_freq = (topic_term_dists.T * topic_freq).T
# term_topic_freq

In [21]:
term_frequency = np.sum(term_topic_freq, axis=0)
len(term_frequency)

36887

# Scratch check: two identical normalised vectors have an element-wise ratio of 1
# everywhere, so the ratios add up to the number of elements.
a = np.arange(1, 11)
b = a.copy()
print(a)
print(a.sum(axis=0))
pk = a * 1.0 / a.sum()
qk = b * 1.0 / b.sum()
print(pk)
print(qk)
vvv = pk / qk
vvv.sum()

In [22]:
def _find_relevance(log_ttd, log_lift, R, lambda_):
    relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
    return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Stack the ``_find_relevance`` result for every value in ``lambda_seq``, in order."""
    chunks = []
    for lambda_ in lambda_seq:
        chunks.append(_find_relevance(log_ttd, log_lift, R, lambda_))
    return pd.concat(chunks)


In [23]:
R = 30              # number of top rows kept for the default view and passed to the relevance ranking
lambda_step = 0.01  # spacing of the lambda grid built at the end of this cell
n_jobs = -1         # not used anywhere in this cell

# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()

# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
topic_given_term = topic_term_dists / topic_term_dists.sum()
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness

# Order the terms for the "default" view by decreasing saliency.
# `axis=1` is passed by keyword: the positional form `drop('saliency', 1)` is
# deprecated since pandas 1.3 and rejected by pandas 2.x.
default_term_info = (
    pd.DataFrame({'saliency': saliency, 'Term': vocab,
                  'Freq': term_frequency, 'Total': term_frequency,
                  'Category': 'Default'})
    .sort_values(by='saliency', ascending=False)
    .head(R)
    .drop('saliency', axis=1)
)
# Rounding Freq and Total to integer values to match LDAvis code:
default_term_info['Freq'] = np.floor(default_term_info['Freq'])
default_term_info['Total'] = np.floor(default_term_info['Total'])
# The default rows carry descending ranks R..1 in both score columns.
ranks = np.arange(R, 0, -1)
default_term_info['logprob'] = default_term_info['loglift'] = ranks

## compute relevance and top terms for each topic
log_lift = np.log(topic_term_dists / term_proportion)
log_ttd = np.log(topic_term_dists)
lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

def topic_top_term_df(tup):
    """Build the term table for one topic.

    ``tup`` is ``(new_topic_id, (original_topic_id, topic_terms))``. The unique
    values of ``topic_terms`` are used as term labels to look up the vocabulary
    entry, the topic-specific and overall frequencies, and the rounded
    log-probability / log-lift. The ids and selected terms are printed for inspection.
    """
    new_topic_id, topic_row = tup
    original_topic_id, topic_terms = topic_row
    term_ix = topic_terms.unique()

    print('===========')
    print('new_topic_id: ', new_topic_id)
    print('--------')
    print('original_topic_id:' , original_topic_id)
    print('--------')
    print('term_ix: ', term_ix)
    print('-========')

    columns = {
        'Term': vocab[term_ix],
        'Freq': term_topic_freq.loc[original_topic_id, term_ix],
        'Total': term_frequency[term_ix],
        'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
        'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
        'Category': 'Topic%d' % new_topic_id,
    }
    return pd.DataFrame(columns)

# Ranked term labels for every topic and every lambda value.
top_terms = _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq)

# One frame per topic, numbered from 1 in the new topic order, appended after the default rows.
numbered_topics = enumerate(top_terms.T.iterrows(), 1)
topic_dfs = map(topic_top_term_df, numbered_topics)
topic_info = pd.concat([default_term_info, *topic_dfs])

new_topic_id:  1
--------
original_topic_id: 33
--------
term_ix:  [ 130  284  617  371  995 1718   57 1419 3354 1017 1876 1467  401  160  739
  258 1771 1071 1292  513 1223  378 1488 2850 1052  708 3822 2331 2392 2346
 1025 1862   29  142  687   17  281 1216  313  393  108  295   41   68  614
  909  472 1008  373   33   86    4  362   32    2  230  205   81  115  226
  171  131   30   12   51    8  125   20   46   25    3  141   11   50    5
   22    0   27   10   16   13]
new_topic_id:  2
--------
original_topic_id: 19
--------
term_ix:  [ 315 1378  874  229  763 1106 1072  387 2848  167  721 3062 5038   39 2312
  730 2298  460 2540 1359 1031  164 2824 4374  823  670 2213 1639  114   74
  187  592   34  157    9  235  262  448  185  634  352  117   15  288   87
   60  122    6  236  209   49  271   19   25  126   16    0   38    1  238
   10   56    5   11   13    2   12   23    3   51    4]
new_topic_id:  3
--------
original_topic_id: 3
--------
term_ix:  [ 534  805  173   26 1056  

In [24]:
default_term_info

Unnamed: 0_level_0,Category,Freq,Term,Total,logprob,loglift
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
17,Default,1017556000.0,image,1017556000.0,30,30
9,Default,1469294000.0,said,1469294000.0,29,29
26,Default,906685200.0,layer,906685200.0,28,28
62,Default,981184700.0,film,981184700.0,27,27
0,Default,2690259000.0,first,2690259000.0,26,26
15,Default,1372487000.0,signal,1372487000.0,25,25
1,Default,2000025000.0,second,2000025000.0,24,24
14,Default,1223022000.0,portion,1223022000.0,23,23
19,Default,1265586000.0,unit,1265586000.0,22,22
33,Default,1458512000.0,processing,1458512000.0,21,21


In [25]:
# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()

# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
topic_given_term = topic_term_dists / topic_term_dists.sum()
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness

# Order the terms for the "default" view by decreasing saliency.
# `axis=1` is passed by keyword: the positional form `drop('saliency', 1)` is
# deprecated since pandas 1.3 and rejected by pandas 2.x.
default_term_info = (
    pd.DataFrame({'saliency': saliency, 'Term': vocab,
                  'Freq': term_frequency, 'Total': term_frequency,
                  'Category': 'Default'})
    .sort_values(by='saliency', ascending=False)
    .head(R)
    .drop('saliency', axis=1)
)

default_term_info

Unnamed: 0_level_0,Category,Freq,Term,Total
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,Default,1017556000.0,image,1017556000.0
9,Default,1469294000.0,said,1469294000.0
26,Default,906685200.0,layer,906685200.0
62,Default,981184700.0,film,981184700.0
0,Default,2690259000.0,first,2690259000.0
15,Default,1372487000.0,signal,1372487000.0
1,Default,2000025000.0,second,2000025000.0
14,Default,1223022000.0,portion,1223022000.0
19,Default,1265586000.0,unit,1265586000.0
33,Default,1458512000.0,processing,1458512000.0


In [26]:
# def _token_table(topic_info, term_topic_freq, vocab, term_frequency):
# (this cell appears to be the body of that helper run inline -- the signature is kept for reference)
# last, to compute the areas of the circles when a term is highlighted
# we must gather all unique terms that could show up (for every combination
# of topic and value of lambda) and compute its distribution over topics.

# term-topic frequency table of unique terms across all topics and all values of lambda
term_ix = topic_info.index.unique()
term_ix = np.sort(term_ix)

# keep only the columns (terms) that appear in topic_info
top_topic_terms_freq = term_topic_freq[term_ix]
# use the new ordering for the topics
K = len(term_topic_freq)
# relabel the rows 1..K
top_topic_terms_freq.index = range(1, K + 1)
top_topic_terms_freq.index.name = 'Topic'

# we filter to Freq >= 0.5 to avoid sending too much data to the browser
# (unstack yields one row per (term, Topic) pair; the frame is then indexed by term)
token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()}). \
             reset_index().set_index('term'). \
             query('Freq >= 0.5')

# round to whole numbers before normalising
token_table['Freq'] = token_table['Freq'].round()
# look up the vocabulary entry for every row's term label
token_table['Term'] = vocab[token_table.index.values].values
# Normalize token frequencies:
# (divide each rounded frequency by the overall frequency of the same term)
token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]
token_table = token_table.sort_values(by=['Term', 'Topic'])

print()
token_table





Unnamed: 0_level_0,Topic,Freq,Term
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28706,1,4.190326e-01,aacgctcggttgccgccgggcgttttttatt
28706,2,2.285199e-01,aacgctcggttgccgccgggcgttttttatt
28706,3,2.263249e-01,aacgctcggttgccgccgggcgttttttatt
28706,4,9.518710e-02,aacgctcggttgccgccgggcgttttttatt
28706,5,9.185906e-03,aacgctcggttgccgccgggcgttttttatt
28706,6,4.127663e-03,aacgctcggttgccgccgggcgttttttatt
28706,7,2.536182e-03,aacgctcggttgccgccgggcgttttttatt
28706,8,2.050818e-03,aacgctcggttgccgccgggcgttttttatt
28706,9,1.662362e-03,aacgctcggttgccgccgggcgttttttatt
28706,10,1.324020e-03,aacgctcggttgccgccgggcgttttttatt


In [27]:
# def _topic_coordinates(mds, topic_term_dists, topic_proportion):
mds = js_PCoA
K = topic_term_dists.shape[0]
mds_res = mds(topic_term_dists)
assert mds_res.shape == (K, 2)
mds_df = pd.DataFrame({'x': mds_res[:,0], 'y': mds_res[:,1], 'topics': range(1, K + 1), \
                      'cluster': 1, 'Freq': topic_proportion * 100})
# note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
topic_coordinates = mds_df
topic_coordinates

Unnamed: 0_level_0,Freq,cluster,topics,x,y
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
33,32.816826,1,1,0.316896,0.062321
19,25.87269,1,2,0.319832,0.054235
3,25.251788,1,3,0.273008,-0.121139
36,11.866584,1,4,0.173012,-0.00591
7,1.316615,1,5,0.040806,-0.061788
22,0.514008,1,6,0.005979,0.009373
23,0.380038,1,7,0.022765,-0.035769
40,0.296941,1,8,0.014097,-0.028361
8,0.227157,1,9,-0.014173,0.009327
0,0.21051,1,10,0.017032,0.008941


In [28]:
client_topic_order = [x + 1 for x in topic_order]

In [29]:
plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}

In [30]:
class PreparedData(namedtuple('PreparedData', ['topic_coordinates', 'topic_info', 'token_table',
                                               'R', 'lambda_step', 'plot_opts', 'topic_order'])):
    """Immutable bundle of the prepared visualisation inputs with dict / JSON export."""

    def to_dict(self):
        """Return the fields under their export keys; the three frames become column -> list dicts."""
        payload = {}
        frames = (('mdsDat', self.topic_coordinates),
                  ('tinfo', self.topic_info),
                  ('token.table', self.token_table))
        for key, frame in frames:
            payload[key] = frame.to_dict(orient='list')
        payload['R'] = self.R
        payload['lambda.step'] = self.lambda_step
        payload['plot.opts'] = self.plot_opts
        payload['topic.order'] = self.topic_order
        return payload

    def to_json(self):
        """Serialise ``to_dict()`` to a JSON string using ``NumPyEncoder``."""
        payload = self.to_dict()
        return json.dumps(payload, cls=NumPyEncoder)


In [31]:
pp = PreparedData(topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order)

In [32]:
topic_coordinates

Unnamed: 0_level_0,Freq,cluster,topics,x,y
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
33,32.816826,1,1,0.316896,0.062321
19,25.87269,1,2,0.319832,0.054235
3,25.251788,1,3,0.273008,-0.121139
36,11.866584,1,4,0.173012,-0.00591
7,1.316615,1,5,0.040806,-0.061788
22,0.514008,1,6,0.005979,0.009373
23,0.380038,1,7,0.022765,-0.035769
40,0.296941,1,8,0.014097,-0.028361
8,0.227157,1,9,-0.014173,0.009327
0,0.21051,1,10,0.017032,0.008941


In [33]:
# topic_coordinates.to_dict(orient='list')
# topic_info.to_dict(orient='list')
# token_table.to_dict(orient='list')
topic_order

Int64Index([33, 19,  3, 36,  7, 22, 23, 40,  8,  0, 21,  9, 28, 20, 29, 10, 18,
            39, 34, 48,  4, 12, 15,  5,  1, 38, 41, 43, 17, 24, 32, 25, 45, 42,
             6, 26, 49, 35, 27, 37, 31, 11, 13, 14, 30, 16, 44, 46,  2, 47],
           dtype='int64', name='topic')

In [34]:
def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    e1 = entropy(_P, _M)
    e2 = entropy(_Q, _M)
    res = 0.5 * ( e1 + e2 )
#     print('e1 ', e1)
#     print('e2 ', e2)
    print('res ', res)
    return res



In [35]:
# topic_term_dists

In [36]:
ss = pdist(topic_term_dists, metric=_jensen_shannon)
ss.shape

res  0.142724819809
res  0.192271634926
res  0.17144197055
res  0.311275027283
res  0.307439531051
res  0.30184231192
res  0.318109555954
res  0.342420722689
res  0.313680342912
res  0.351117116096
res  0.322018463753
res  0.323910681419
res  0.328637381953
res  0.357673122605
res  0.296279582707
res  0.305319731623
res  0.328713619325
res  0.334987022433
res  0.365105526391
res  0.337941567194
res  0.367432395809
res  0.31614508122
res  0.316448034991
res  0.368232019166
res  0.334620603013
res  0.337715123547
res  0.368291091563
res  0.328583414446
res  0.340049938295
res  0.36369445323
res  0.369473411713
res  0.371492317523
res  0.37015861101
res  0.375394617567
res  0.374576273748
res  0.373119139516
res  0.373234262867
res  0.37725077638
res  0.377245282749
res  0.381176075023
res  0.381067356851
res  0.38106580718
res  0.381033071441
res  0.380905764704
res  0.380754715151
res  0.380983712271
res  0.380856232362
res  0.380912646133
res  0.380857851692
res  0.183588038016
res  0.

(1225,)

In [37]:
print(ss)

[ 0.14272482  0.19227163  0.17144197 ...,  0.00249611  0.00249389
  0.00249419]


In [38]:
ss.shape[0]

d = int(np.ceil(np.sqrt(ss.shape[0] * 2)))

d

50

In [39]:
d * (d - 1) / 2

1225.0

In [40]:
ss[:10]

array([ 0.14272482,  0.19227163,  0.17144197,  0.31127503,  0.30743953,
        0.30184231,  0.31810956,  0.34242072,  0.31368034,  0.35111712])

In [41]:
pair_dists = pyLDAvis._prepare.squareform(ss)

In [42]:
pair_dists[:10]

array([[ 0.        ,  0.14272482,  0.19227163,  0.17144197,  0.31127503,
         0.30743953,  0.30184231,  0.31810956,  0.34242072,  0.31368034,
         0.35111712,  0.32201846,  0.32391068,  0.32863738,  0.35767312,
         0.29627958,  0.30531973,  0.32871362,  0.33498702,  0.36510553,
         0.33794157,  0.3674324 ,  0.31614508,  0.31644803,  0.36823202,
         0.3346206 ,  0.33771512,  0.36829109,  0.32858341,  0.34004994,
         0.36369445,  0.36947341,  0.37149232,  0.37015861,  0.37539462,
         0.37457627,  0.37311914,  0.37323426,  0.37725078,  0.37724528,
         0.38117608,  0.38106736,  0.38106581,  0.38103307,  0.38090576,
         0.38075472,  0.38098371,  0.38085623,  0.38091265,  0.38085785],
       [ 0.14272482,  0.        ,  0.18358804,  0.18123054,  0.29473695,
         0.32463364,  0.31458354,  0.31898588,  0.3372834 ,  0.29534932,
         0.33862619,  0.31831523,  0.31388508,  0.3293859 ,  0.3148644 ,
         0.29814399,  0.30440065,  0.31970556,  0.

In [43]:
pyLDAvis._prepare.js_PCoA(topic_term_dists)

array([[  3.16895758e-01,   6.23210845e-02],
       [  3.19831567e-01,   5.42354526e-02],
       [  2.73008058e-01,  -1.21139016e-01],
       [  1.73011672e-01,  -5.91010009e-03],
       [  4.08056658e-02,  -6.17875176e-02],
       [  5.97881265e-03,   9.37308392e-03],
       [  2.27646894e-02,  -3.57685397e-02],
       [  1.40969889e-02,  -2.83607005e-02],
       [ -1.41727176e-02,   9.32651111e-03],
       [  1.70320192e-02,   8.94087059e-03],
       [ -1.44636448e-02,  -8.57292519e-03],
       [  1.09181503e-02,  -1.96729872e-02],
       [  6.21262721e-03,   6.27915573e-04],
       [ -4.65067777e-03,   9.63580700e-03],
       [ -1.33580323e-02,   1.67752978e-02],
       [  2.01569234e-02,   1.81035095e-02],
       [  1.30062412e-02,   2.12029417e-02],
       [  1.53687464e-03,  -2.33505086e-03],
       [ -8.62434348e-03,   3.87598189e-03],
       [ -3.59291662e-02,   2.42226521e-03],
       [ -7.90893353e-03,  -1.27444441e-03],
       [ -3.66181541e-02,  -1.26033124e-02],
       [  

In [44]:
pair_dists = np.asarray(pair_dists, np.float64)

In [45]:
n = pair_dists.shape[0]
n

50

In [46]:
H = np.eye(n) - np.ones((n, n)) / n
H

array([[ 0.98, -0.02, -0.02, ..., -0.02, -0.02, -0.02],
       [-0.02,  0.98, -0.02, ..., -0.02, -0.02, -0.02],
       [-0.02, -0.02,  0.98, ..., -0.02, -0.02, -0.02],
       ..., 
       [-0.02, -0.02, -0.02, ...,  0.98, -0.02, -0.02],
       [-0.02, -0.02, -0.02, ..., -0.02,  0.98, -0.02],
       [-0.02, -0.02, -0.02, ..., -0.02, -0.02,  0.98]])

In [47]:
B = - H.dot(pair_dists ** 2).dot(H) / 2
B

array([[ 0.10825455,  0.09798507,  0.07841483, ..., -0.01735804,
        -0.01737588, -0.01735813],
       [ 0.09798507,  0.10808597,  0.07996244, ..., -0.01817841,
        -0.01824323, -0.01818234],
       [ 0.07841483,  0.07996244,  0.08554348, ..., -0.01569306,
        -0.01571964, -0.01567451],
       ..., 
       [-0.01735804, -0.01817841, -0.01569306, ...,  0.00208084,
         0.00208137,  0.00207825],
       [-0.01737588, -0.01824323, -0.01571964, ...,  0.00208137,
         0.00208813,  0.0020819 ],
       [-0.01735813, -0.01818234, -0.01567451, ...,  0.00207825,
         0.0020819 ,  0.00208189]])

In [48]:
eigvals, eigvecs = np.linalg.eig(B)
eigvecs

array([[  5.24088329e-01,  -1.35195871e-01,   3.51536276e-01, ...,
          9.66437818e-05,   1.31070067e-04,  -6.96306758e-05],
       [  5.28943626e-01,  -2.46374848e-01,   3.05927427e-01, ...,
         -1.04645972e-05,  -1.16211388e-04,   1.27069988e-04],
       [  4.51506002e-01,  -2.40917933e-01,  -6.83312219e-01, ...,
         -1.74759454e-04,  -6.90337164e-05,   4.13543056e-05],
       ..., 
       [ -8.73177105e-02,  -1.37190101e-01,   2.86782158e-03, ...,
          1.49239758e-01,   1.04181265e-01,  -6.28739790e-01],
       [ -8.75233363e-02,  -1.37590449e-01,   2.73754252e-03, ...,
         -5.33662342e-01,  -4.88363680e-02,  -2.29741783e-01],
       [ -8.73200078e-02,  -1.36877466e-01,   2.63850541e-03, ...,
          1.91195300e-01,   3.51732359e-01,   4.99178034e-01]])

In [49]:
# Positions of the two largest eigenvalues, largest first
# (argsort is ascending, so reverse it and take the first two).
ix = eigvals.argsort()[::-1][:2]
ix

array([0, 2])

In [50]:
eigvals = eigvals[ix]

In [51]:
eigvecs = eigvecs[:, ix]
eigvecs

array([[ 0.52408833,  0.35153628],
       [ 0.52894363,  0.30592743],
       [ 0.451506  , -0.68331222],
       [ 0.28613005, -0.03333727],
       [ 0.0674852 , -0.34852657],
       [ 0.00988788,  0.05287102],
       [ 0.03764868, -0.2017606 ],
       [ 0.02331387, -0.15997499],
       [-0.02343911,  0.05260831],
       [ 0.02816788,  0.05043302],
       [-0.02392026, -0.04835754],
       [ 0.01805665, -0.11096997],
       [ 0.01027456,  0.0035419 ],
       [-0.00769138,  0.05435296],
       [-0.02209177,  0.09462489],
       [ 0.03333591,  0.10211697],
       [ 0.02150997,  0.11960002],
       [ 0.00254171, -0.01317139],
       [-0.01426311,  0.02186336],
       [-0.05942035,  0.01366334],
       [-0.01307995, -0.00718879],
       [-0.06055981, -0.07109185],
       [ 0.01415886,  0.18082004],
       [-0.0140614 ,  0.08593503],
       [-0.06639389, -0.01295573],
       [-0.01581158,  0.06435916],
       [-0.02536701,  0.07447895],
       [-0.06168466, -0.03804466],
       [-0.01000017,

In [52]:
eigvals[np.isclose(eigvals, 0)] = 0
np.any(eigvals < 0)

False

In [53]:
# Zero out any negative eigenvalue together with its eigenvector column.
if (eigvals < 0).any():
    ix_neg = eigvals < 0
    eigvals[ix_neg] = 0.0
    eigvecs[:, ix_neg] = 0.0

In [54]:
eigvals.shape
type(eigvals)

numpy.ndarray

In [55]:
eigvecs.shape

(50, 2)

In [56]:
np.sqrt(eigvals) * eigvecs


array([[  3.16895758e-01,   6.23210845e-02],
       [  3.19831567e-01,   5.42354526e-02],
       [  2.73008058e-01,  -1.21139016e-01],
       [  1.73011672e-01,  -5.91010009e-03],
       [  4.08056658e-02,  -6.17875176e-02],
       [  5.97881265e-03,   9.37308392e-03],
       [  2.27646894e-02,  -3.57685397e-02],
       [  1.40969889e-02,  -2.83607005e-02],
       [ -1.41727176e-02,   9.32651111e-03],
       [  1.70320192e-02,   8.94087059e-03],
       [ -1.44636448e-02,  -8.57292519e-03],
       [  1.09181503e-02,  -1.96729872e-02],
       [  6.21262721e-03,   6.27915573e-04],
       [ -4.65067777e-03,   9.63580700e-03],
       [ -1.33580323e-02,   1.67752978e-02],
       [  2.01569234e-02,   1.81035095e-02],
       [  1.30062412e-02,   2.12029417e-02],
       [  1.53687464e-03,  -2.33505086e-03],
       [ -8.62434348e-03,   3.87598189e-03],
       [ -3.59291662e-02,   2.42226521e-03],
       [ -7.90893353e-03,  -1.27444441e-03],
       [ -3.66181541e-02,  -1.26033124e-02],
       [  

In [57]:
eigvecs * np.sqrt(eigvals)

array([[  3.16895758e-01,   6.23210845e-02],
       [  3.19831567e-01,   5.42354526e-02],
       [  2.73008058e-01,  -1.21139016e-01],
       [  1.73011672e-01,  -5.91010009e-03],
       [  4.08056658e-02,  -6.17875176e-02],
       [  5.97881265e-03,   9.37308392e-03],
       [  2.27646894e-02,  -3.57685397e-02],
       [  1.40969889e-02,  -2.83607005e-02],
       [ -1.41727176e-02,   9.32651111e-03],
       [  1.70320192e-02,   8.94087059e-03],
       [ -1.44636448e-02,  -8.57292519e-03],
       [  1.09181503e-02,  -1.96729872e-02],
       [  6.21262721e-03,   6.27915573e-04],
       [ -4.65067777e-03,   9.63580700e-03],
       [ -1.33580323e-02,   1.67752978e-02],
       [  2.01569234e-02,   1.81035095e-02],
       [  1.30062412e-02,   2.12029417e-02],
       [  1.53687464e-03,  -2.33505086e-03],
       [ -8.62434348e-03,   3.87598189e-03],
       [ -3.59291662e-02,   2.42226521e-03],
       [ -7.90893353e-03,  -1.27444441e-03],
       [ -3.66181541e-02,  -1.26033124e-02],
       [  