# Load Data and Models

In [None]:
import numpy as np
import h5py

In [None]:
# Read the word vocabulary from the HDF5 dataset.  The context manager closes
# the file even if one of the reads or decodes below raises.
# Handy when exploring the file layout: f.name, f.keys(),
# f['Words'].attrs['num_words'], f['Words'].keys().
with h5py.File('/home/nam/codes/vectorlearn/data/dataset.h5', 'r') as f:
    words_grp = f['Words']
    words_freq_info_dset = words_grp['freq_info']
    word_vocab_size = words_freq_info_dset.len()
    # Field 0 of each 'freq_info' row is decoded as UTF-8 text, so that
    # word_vocab[i] is the string for word index i.
    word_vocab = np.array([words_freq_info_dset[i][0].decode('utf-8')
                           for i in xrange(word_vocab_size)])

In [None]:
# Read the label vocabulary and rebuild each label's textual description from
# word indices.  The context manager closes the file even if a read raises.
with h5py.File('/home/nam/codes/vectorlearn/data/dataset.h5', 'r') as f:
    labels_grp = f['Labels']
    label_desc_word_indices_dset = labels_grp['entity_indices']
    label_offset_dset = labels_grp['offset_info']
    label_vocab_size = label_offset_dset.len()

    label_vocab = []
    label_descriptions = []
    for i in xrange(label_vocab_size):
        # Each 'offset_info' row is used as (name, offset, length); fetch it
        # once rather than issuing a separate dataset read per field.
        offset_row = label_offset_dset[i]
        label_vocab.append(offset_row[0].decode('utf-8'))
        offset = long(offset_row[1])
        desc_length = long(offset_row[2])
        # The description is the run of word indices
        # [offset, offset + desc_length) mapped through word_vocab.
        desc_word_indices = label_desc_word_indices_dset[offset:(offset + desc_length)]
        label_descriptions.append(' '.join(word_vocab[desc_word_indices]))

label_vocab = np.array(label_vocab)
label_descriptions = np.array(label_descriptions)

In [None]:
# Sanity check: show the reconstructed description of the first label.
print(label_descriptions[0])

In [None]:
# Load the trained model.
from utils import *

model = load_model(
    '/home/nam/codes/vectorlearn/data/BioASQ_label_desc_small_dataset_model.vectors'
)

In [None]:
# Show the dimensions of the word embedding matrix.
print(model['word_emb'].shape)

In [None]:
# Replace underscores with spaces in every label name, updating the array
# in place.
for idx, raw_name in enumerate(label_vocab):
    label_vocab[idx] = raw_name.replace('_', ' ')
    

# Analysis

In [None]:

import pandas as pd
from bokeh.io import output_notebook, show

In [None]:
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

## 2D projection for word embeddings 

In [None]:
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Run t-SNE: project at most the first 3000 rows of the word embedding
# matrix onto two dimensions.
tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=30,
    early_exaggeration=1,
    learning_rate=100,
    verbose=2,
)
mapped_word_embeddings = tsne.fit_transform(model['word_emb'][:3000, :])

In [None]:
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models import HoverTool, WheelZoomTool

In [None]:
# Plot the 2D word embeddings as text labels and write them to an HTML file.
output_file("/home/nam/Dropbox/word_embeddings.html")
fig = figure(title="word embeddings")

# Put every point in one data source and draw a single text glyph.  Calling
# fig.text() once per word would create a separate renderer for each of the
# (up to 3000) words, which makes the document slow to build and to render.
num_mapped_words = mapped_word_embeddings.shape[0]
word_source = ColumnDataSource(
    data=dict(
        x=mapped_word_embeddings[:, 0],
        y=mapped_word_embeddings[:, 1],
        # Row i of the projection corresponds to word index i.
        name=word_vocab[:num_mapped_words],
    )
)
fig.text('x', 'y', text='name', text_color="blue", text_align="center",
         text_font_size="10pt", source=word_source)

show(fig)

## 2D projection for label embeddings

In [None]:
# Run t-SNE: project at most the first 5000 rows of the label embedding
# matrix onto two dimensions.
tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=30,
    early_exaggeration=1,
    learning_rate=1000,
    verbose=2,
)
mapped_label_embeddings = tsne.fit_transform(model['label_emb'][:5000, :])

In [None]:
# Plot the 2D label embeddings as text labels and write them to an HTML file.
output_file("/home/nam/Dropbox/label_embeddings.html")
fig = figure(title="Seen label embeddings")

# Put every point in one data source and draw a single text glyph.  Calling
# fig.text() once per label would create a separate renderer for each of the
# (up to 5000) labels, which makes the document slow to build and to render.
num_mapped_labels = mapped_label_embeddings.shape[0]
label_source = ColumnDataSource(
    data=dict(
        x=mapped_label_embeddings[:, 0],
        y=mapped_label_embeddings[:, 1],
        # Row i of the projection corresponds to label index i.
        name=label_vocab[:num_mapped_labels],
    )
)
fig.text('x', 'y', text='name', text_color="blue", text_align="center",
         text_font_size="10pt", source=label_source)

show(fig)

## Analysis on the test set

In [None]:
# Total number of labels read from the dataset.
print(len(label_vocab))

In [None]:
# The number of rows in the trained label embedding matrix is taken as the
# number of seen labels.
label_embeddings = model['label_emb']
num_seen_labels = label_embeddings.shape[0]
print(num_seen_labels)

In [None]:
# Labels in the vocabulary beyond the trained embedding rows count as unseen.
print('Number of unseen labels: %d'
      % (len(label_vocab) - label_embeddings.shape[0]))

In [None]:
# Load the vectors inferred for the test set; the cells below read its
# 'test_doc_emb' and 'unseen_label_emb' entries.
test_model = load_inferred_test_instances(
    '/home/nam/codes/vectorlearn/data/BioASQ_label_desc_small_dataset_model.inferred_testset_vectors'
)

In [None]:
# List the entries available in the loaded test-set model.
print(test_model.keys())

In [None]:
# Show the dimensions of the inferred test document embeddings.
print(test_model['test_doc_emb'].shape)

In [None]:
# Show the dimensions of the inferred unseen label embeddings.
print(test_model['unseen_label_emb'].shape)

## Plot of unseen labels 

In [None]:
# Run t-SNE: project all unseen label embeddings onto two dimensions.
tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=30,
    early_exaggeration=1,
    learning_rate=1000,
    verbose=2,
)
mapped_unseen_label_embeddings = tsne.fit_transform(
    test_model['unseen_label_emb']
)

In [None]:
# Plot the unseen label embeddings with hover tooltips and write them to an
# HTML file.
output_file("/home/nam/Dropbox/unseen_label_embeddings.html")

# Tooltip layout: the label name in bold with its description underneath.
hover = HoverTool(
    tooltips="""
        <div style="max-width:300px;">
            <div>
                <span style="font-size: 17px; font-weight: bold; color:SeaGreen; max-width:200px;">@name</span>
            </div>
            <div>
                <span style="font-size: 14px;">@desc</span>
            </div>
        </div>
        """
    )

# Entries of label_vocab / label_descriptions from index num_seen_labels
# onward are paired row-for-row with the projected unseen embeddings.
unseen_source = ColumnDataSource(
    data={
        'x': mapped_unseen_label_embeddings[:, 0],
        'y': mapped_unseen_label_embeddings[:, 1],
        'name': label_vocab[num_seen_labels:],
        'desc': label_descriptions[num_seen_labels:],
    }
)

# TODO: add a second data source for the seen labels and overlay it, e.g.
#   seen_source = ColumnDataSource(data=dict(x=..., y=..., name=..., desc=...))
#   fig.circle('x', 'y', size=10, fill_color='blue', source=seen_source)

fig = figure(title="Unseen label embeddings")
fig.add_tools(hover)

# Draw the label text first, then a hollow red circle marking each point.
fig.text(
    'x', 'y',
    text='name',
    text_color='blue',
    text_align="left",
    text_font_size="10pt",
    source=unseen_source,
)
fig.circle(
    'x', 'y',
    size=8,
    line_color='red',
    fill_color='white',
    fill_alpha=0,
    source=unseen_source,
)

show(fig)