In [113]:
import numpy as np
import sklearn
from sklearn.manifold import TSNE
import cPickle as pickle

import plotly
plotly.__version__
import plotly.plotly as py
import plotly.graph_objs as go

In [114]:
# Load the word pairs with representations.
# Each pickle is opened in a `with` block so its file handle is closed as soon
# as loading finishes instead of staying open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../working_data/housman.pair.glove.pkl', 'rb') as housman_file:
    word_pairs_housman_glove = pickle.load(housman_file)
with open('../working_data/shakes.pair.glove.pkl', 'rb') as shakespeare_file:
    word_pairs_shakespeare_glove = pickle.load(shakespeare_file)

In [115]:
# create a list of words and a np.array of representations from the word tuples
def format_pairs(word_pairs):
    """Build a 2-D representation array and display labels from word-pair tuples.

    Each tuple is read positionally: items 0 and 1 are the two words, items 7
    and 8 are their vector representations (presumably GloVe embeddings, going
    by the names used at the call sites -- confirm against the code that
    writes the pickles).

    Parameters
    ----------
    word_pairs : iterable of tuple
        Tuples with at least nine items, laid out as described above.

    Returns
    -------
    np_representations : np.ndarray
        Plain 2-D array with one row per pair: the first word's vector
        followed by the second word's vector.
    labels : list of str
        ``"<w1> - <w2>"`` for each pair, in the same order as the rows.

    Raises
    ------
    ValueError
        If ``word_pairs`` is empty.
    """
    rows = []
    labels = []
    for wp_tuple in word_pairs:
        w1, w2 = wp_tuple[0], wp_tuple[1]
        # np.atleast_2d turns a 1-D vector into a single (1, d) row, the same
        # shape np.asmatrix produced, but the result stays a plain ndarray
        # (NumPy's documentation no longer recommends np.matrix).
        w1_repr = np.atleast_2d(np.asarray(wp_tuple[7]))
        w2_repr = np.atleast_2d(np.asarray(wp_tuple[8]))
        rows.append(np.concatenate((w1_repr, w2_repr), axis=1))
        labels.append(w1 + " - " + w2)

    if not rows:
        raise ValueError("format_pairs: word_pairs is empty")

    # Join the (1, 2d) rows along axis 0 so the result is (n_pairs, 2d).
    # np.stack would insert a new axis and yield (n_pairs, 1, 2d) for plain
    # arrays, so concatenate is used to get the 2-D layout explicitly.
    np_representations = np.concatenate(rows, axis=0)

    return np_representations, labels

# Turn the Housman and Shakespeare tuples (GloVe representations) into a
# representation array plus a parallel list of labels for each author.
(housman_glove_repr,
 housman_glove_labels) = format_pairs(word_pairs_housman_glove)
(shakespeare_glove_repr,
 shakespeare_glove_labels) = format_pairs(word_pairs_shakespeare_glove)

# Put both authors' rows in one array (Housman first) so that a single
# t-SNE run embeds them in the same 2-D space.
all_repr = np.vstack((housman_glove_repr, shakespeare_glove_repr))
np.set_printoptions(suppress=True)
model = TSNE(n_components=2, random_state=0, verbose=2)
print("==> Running TSNE...")
tsne_word_repr = model.fit_transform(all_repr)
print("==> done.")

# Housman rows come first in all_repr, so the embedding is split back into
# the two authors at the number of Housman labels.
housman_count = len(housman_glove_labels)
housman_glove_repr_tsne = tsne_word_repr[:housman_count]
shakespeare_glove_repr_tsne = tsne_word_repr[housman_count:]

==> Running TSNE...
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 4554
[t-SNE] Computed conditional probabilities for sample 2000 / 4554
[t-SNE] Computed conditional probabilities for sample 3000 / 4554
[t-SNE] Computed conditional probabilities for sample 4000 / 4554
[t-SNE] Computed conditional probabilities for sample 4554 / 4554
[t-SNE] Mean sigma: 2.934353
[t-SNE] Iteration 25: error = 1.6732621, gradient norm = 0.0086232
[t-SNE] Iteration 50: error = 1.6279033, gradient norm = 0.0057335
[t-SNE] Iteration 75: error = 1.4068370, gradient norm = 0.0024847
[t-SNE] Iteration 100: error = 1.3487746, gradient norm = 0.0022395
[t-SNE] Error after 100 iterations with early exaggeration: 1.348775
[t-SNE] Iteration 125: error = 1.2403316, gradient norm = 0.0018607
[t-SNE] Iteration 150: error = 1.2028494, gradient norm = 0.0017208
[t-SNE] Iteration 175: error = 1.1934843, gradient norm = 0.00169

In [119]:
def _marker_trace(points, labels, trace_name):
    """Return a marker-only Scattergl trace for 2-D points.

    Column 0 of ``points`` is used as x and column 1 as y; ``labels`` is
    passed as the per-point text and ``trace_name`` as the trace name.
    """
    return go.Scattergl(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        text=labels,
        name=trace_name,
    )


# One trace per author, built from that author's t-SNE coordinates and labels.
trace_housman_glove = _marker_trace(
    housman_glove_repr_tsne,
    housman_glove_labels,
    "Housman (GloVe,semantic)",
)
trace_shakespeare_glove = _marker_trace(
    shakespeare_glove_repr_tsne,
    shakespeare_glove_labels,
    "Shakespeare (GloVe,semantic)",
)

# Show the words from both authors together in a single figure.
data = [trace_housman_glove, trace_shakespeare_glove]
py.iplot(data, filename='basic-scatter')
