In [133]:
import numpy as np
import sklearn
from sklearn.manifold import TSNE
import cPickle as pickle

import plotly
plotly.__version__
import plotly.plotly as py
import plotly.graph_objs as go
# py.sign_in("IPython.Demo", "1fw3zw2o13")
# py.sign_in("DemoAccount","lr1c37zw81")
import plotly.tools as tls
# Fetch the stored Plotly credentials through plotly.tools and sign in with
# their 'username' and 'api_key' entries, so no key is written in the notebook.
credentials = tls.get_credentials_file()
py.sign_in(credentials['username'], credentials['api_key'])

In [6]:
# load the word pairs with representations
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open('../working_data/housman.pair.glove.pkl', 'rb') as housman_pairs_file:
    word_pairs_housman_glove = pickle.load(housman_pairs_file)
with open('../working_data/shakes.pair.glove.pkl', 'rb') as shakespeare_pairs_file:
    word_pairs_shakespeare_glove = pickle.load(shakespeare_pairs_file)

In [69]:
# create a list of words and a np.array of representations from the word tuples
def format_pairs(word_pairs, operation):
    """Combine the two word vectors of every pair into one representation.

    Parameters
    ----------
    word_pairs : iterable of indexable records
        Each record holds the two words at positions 0 and 1 and their
        vectors at positions 7 and 8.
    operation : str
        'add' sums the two vectors element-wise (label "w1 + w2");
        'concat' joins them along axis 1 (label "w1 : w2").

    Returns
    -------
    (np_representations, labels)
        The ``np.stack`` of the per-pair representations and the list of
        label strings, both in input order.

    Raises
    ------
    ValueError
        If ``operation`` is neither 'add' nor 'concat'. ``np.stack`` also
        raises ValueError when ``word_pairs`` is empty.
    """
    # Validate once, before the loop: an unsupported operation must fail loudly
    # instead of filling the result with None and producing no labels.
    if operation == 'add':
        combine = np.add
        separator = " + "
    elif operation == 'concat':
        def combine(first_repr, second_repr):
            return np.concatenate((first_repr, second_repr), axis=1)
        separator = " : "
    else:
        raise ValueError("unknown operation: %r" % (operation,))

    representations = []
    labels = []
    for wp_tuple in word_pairs:
        # np.asmatrix makes each vector 2-D so that axis=1 concatenation works
        w1_repr = np.asmatrix(wp_tuple[7])
        w2_repr = np.asmatrix(wp_tuple[8])
        representations.append(combine(w1_repr, w2_repr))
        labels.append(wp_tuple[0] + separator + wp_tuple[1])

    np_representations = np.stack(representations)

    return np_representations, labels

# Build the concatenated GloVe representations (and their labels) for the
# rhyme pairs of each author.
(housman_glove_repr,
 housman_glove_labels) = format_pairs(word_pairs_housman_glove, operation='concat')
(shakespeare_glove_repr,
 shakespeare_glove_labels) = format_pairs(word_pairs_shakespeare_glove, operation='concat')

In [22]:
# create a list of random word pairs, that do not appear in our training data
from sets import Set
import random


# Load the vocabulary pickles of both authors.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# Pickles are binary data, so the files are opened with 'rb' (as the pair files
# are), and the `with` blocks close each handle right after loading.
with open('../working_data/shakes.voc.glove.pkl', 'rb') as shakespeare_vocab_file:
    shakespeare_allWords = pickle.load(shakespeare_vocab_file)
with open('../working_data/housman.voc.glove.pkl', 'rb') as housman_vocab_file:
    housman_allWords = pickle.load(housman_vocab_file)

# Map every vocabulary word (entry[0]) to its vector (entry[1], wrapped with
# np.asmatrix). The dict is created here so the cell runs on a fresh kernel.
# Housman entries are written last, so they win for words both authors share.
repr_dict = {}
for vocabulary in (shakespeare_allWords, housman_allWords):
    for w in vocabulary:
        repr_dict[w[0]] = np.asmatrix(w[1])

# The full vocabulary is simply the set of keys collected above.
all_words = Set(repr_dict.keys())
print("Total words: " + str(len(all_words)))

def create_word_set(word_pairs, word_set):
    """Add both words of every pair to ``word_set`` (modified in place).

    ``word_pairs`` yields indexable records whose first two positions hold the
    words; ``word_set`` only needs an ``add`` method. Nothing is returned.
    """
    for pair_record in word_pairs:
        for position in (0, 1):
            word_set.add(pair_record[position])
    
# Gather every word that takes part in a rhyme pair of either author ...
rhyme_words = Set()
for author_word_pairs in (word_pairs_housman_glove, word_pairs_shakespeare_glove):
    create_word_set(author_word_pairs, rhyme_words)
rhyme_word_count = len(rhyme_words)
print("Total rhyme words: " + str(rhyme_word_count))

# ... and keep the remaining vocabulary as the pool of non-rhyme words.
non_rhyme_words = all_words - rhyme_words
non_rhyme_word_count = len(non_rhyme_words)
print("Total non-rhyme words: " + str(non_rhyme_word_count))

# Draw as many random (negative) pairs as there are rhyme (positive) pairs.
positiveNo = len(housman_glove_labels) + len(shakespeare_glove_labels)
negativeNo = 0

random_glove_repr_list = []
random_glove_labels = []

# Known rhyme pairs as (word1, word2) tuples. Membership is tested on the words
# themselves rather than on label strings: the ' - ' labels built below use a
# different separator than the labels returned by format_pairs, so comparing
# label strings could never match. A set also makes each lookup constant-time.
known_rhyme_pairs = set()
for pair_source in (word_pairs_housman_glove, word_pairs_shakespeare_glove):
    for wp_tuple in pair_source:
        known_rhyme_pairs.add((wp_tuple[0], wp_tuple[1]))

# Materialise the candidate pool once instead of on every random.sample call.
# No random seed is set, so the drawn pairs differ between runs.
candidate_words = list(non_rhyme_words)

while negativeNo < positiveNo:
    random_words = random.sample(candidate_words, 2)
    first_word, second_word = random_words
    # Reject the draw if it is a known rhyme pair in either order.
    if ((first_word, second_word) in known_rhyme_pairs
            or (second_word, first_word) in known_rhyme_pairs):
        continue
    random_pair = ' - '.join(random_words)
    random_glove_labels.append(random_pair)
    # Same layout as the 'concat' rhyme representations: both vectors side by side.
    random_repr = np.concatenate((repr_dict[first_word], repr_dict[second_word]), axis=1)
    random_glove_repr_list.append(random_repr)
    negativeNo = negativeNo + 1

random_glove_repr = np.stack(random_glove_repr_list)

# Show a sample of the generated pairs and the shape of the stacked array.
for i in random_glove_labels[:20]:
    print(i)
print(random_glove_repr.shape)

Total words: 6928
Total rhyme words: 2508
Total non-rhyme words: 4420
dressings - wherefore
speech - bristly
thoughts - citizen
supper - bastions
controlled - remission
peers - knowledge
lesson - digression
vestal - perished
north - impurity
falchion - starry
liker - healthful
cinders - leaden
valleys - grecian
storming - withering
falsely - large
shall - advanced
exclaims - forty
alabaster - embittered
sprinkled - planing
spilt - venice
(4554, 600)


In [23]:
# Stack the Housman, Shakespeare and random pair representations row-wise
# (in that order) and project them to two dimensions with t-SNE.
all_repr = np.concatenate(
    [housman_glove_repr, shakespeare_glove_repr, random_glove_repr],
    axis=0
)
np.set_printoptions(suppress=True)
model = TSNE(n_components=2, random_state=0, verbose=2)
print("==> Running TSNE...")
tsne_word_repr = model.fit_transform(all_repr)
print("==> done.")

# Split the embedding back into its three groups; rows keep the order used in
# the concatenation above.
housman_pair_count = len(housman_glove_labels)
housman_glove_repr_tsne = tsne_word_repr[:housman_pair_count]
shakespeare_glove_repr_tsne = tsne_word_repr[housman_pair_count:positiveNo]
random_glove_repr_tsne = tsne_word_repr[positiveNo:]


==> Running TSNE...
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 9108
[t-SNE] Computed conditional probabilities for sample 2000 / 9108
[t-SNE] Computed conditional probabilities for sample 3000 / 9108
[t-SNE] Computed conditional probabilities for sample 4000 / 9108
[t-SNE] Computed conditional probabilities for sample 5000 / 9108
[t-SNE] Computed conditional probabilities for sample 6000 / 9108
[t-SNE] Computed conditional probabilities for sample 7000 / 9108
[t-SNE] Computed conditional probabilities for sample 8000 / 9108
[t-SNE] Computed conditional probabilities for sample 9000 / 9108
[t-SNE] Computed conditional probabilities for sample 9108 / 9108
[t-SNE] Mean sigma: 2.784947
[t-SNE] Iteration 25: error = 1.3723714, gradient norm = 0.0012162
[t-SNE] Iteration 50: error = 1.3709687, gradient norm = 0.0069180
[t-SNE] Iteration 75: error = 1.3311120, gradient norm = 0.0025930
[t-SNE] 

In [125]:
# Scatter trace of the Housman rhyme pairs (blue markers); the pair labels are
# passed as the per-point text.
trace_housman_glove = go.Scattergl(
    name="Housman (GloVe,semantic)",
    mode='markers',
    x=housman_glove_repr_tsne[:, 0],
    y=housman_glove_repr_tsne[:, 1],
    text=housman_glove_labels,
    marker={'color': 'rgba(22, 96, 167, 0.8)'},
)

# Scatter trace of the Shakespeare rhyme pairs (orange markers).
trace_shakespeare_glove = go.Scattergl(
    name="Shakespeare (GloVe,semantic)",
    mode='markers',
    x=shakespeare_glove_repr_tsne[:, 0],
    y=shakespeare_glove_repr_tsne[:, 1],
    text=shakespeare_glove_labels,
    marker={'color': 'rgba(255, 106, 0, 0.8)'},
)

In [134]:
# Figure with the rhyme pairs of both authors in one plot
layout = go.Layout(title='Housman and Shakespeare rhyme pairs, concatenated representations')
data = [trace_housman_glove, trace_shakespeare_glove]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='housman-shakespeare-rhymes-concat')


In [143]:
# Figure with the Housman rhyme pairs only (concatenated representations)
layout = go.Layout(title='Housman rhyme pairs, concatenated representations')
data = [trace_housman_glove]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='housman-rhymes-concat')

In [136]:
# Figure with the Shakespeare rhyme pairs only (concatenated representations)
layout = go.Layout(title='Shakespeare rhyme pairs, concatenated representations')
data = [trace_shakespeare_glove]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='shakespeare-rhymes-concat')

In [137]:
# The first positiveNo rows of the embedding are rhyme pairs of both authors,
# the rest are the random pairs; the labels are joined in the same order.
rhyme_glove_labels = housman_glove_labels + shakespeare_glove_labels
rhyme_glove_repr_tsne = tsne_word_repr[:positiveNo]
random_glove_repr_tsne = tsne_word_repr[positiveNo:]

# Scatter trace of the random word pairs (red markers)
trace_random_glove = go.Scattergl(
    name="random words (GloVe,semantic)",
    mode='markers',
    x=random_glove_repr_tsne[:, 0],
    y=random_glove_repr_tsne[:, 1],
    text=random_glove_labels,
    marker={'color': 'rgba(205, 13, 24, 0.8)'},
)

# Scatter trace of the rhyme pairs of both authors (green markers)
trace_rhyme_glove = go.Scattergl(
    name="rhyme words (GloVe,semantic)",
    mode='markers',
    x=rhyme_glove_repr_tsne[:, 0],
    y=rhyme_glove_repr_tsne[:, 1],
    text=rhyme_glove_labels,
    marker={'color': 'rgba(34, 148, 26, 0.8)'},
)

# Figure comparing the random pairs with the rhyme pairs
layout = go.Layout(title='rhyme pairs vs. non-rhyme pairs, concatenated representations')
data = [trace_random_glove, trace_rhyme_glove]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='rhyme-vs-non_rhyme-concat')


In [73]:
# Collect every vocabulary word together with its vector, in matching order,
# and stack the vectors into one array.
allWords_list = repr_dict.keys()
allWords_repr = [repr_dict[w] for w in allWords_list]
allWords_glove_repr = np.stack(allWords_repr)

# Rhyme-pair representations built by element-wise addition rather than
# concatenation.
(add_housman_glove_repr,
 add_housman_glove_labels) = format_pairs(word_pairs_housman_glove, operation='add')
(add_shakespeare_glove_repr,
 add_shakespeare_glove_labels) = format_pairs(word_pairs_shakespeare_glove, operation='add')


In [56]:
# Stack the added rhyme-pair representations of both authors on top of the
# single-word vectors and project everything to two dimensions with t-SNE.
add_all_repr = np.concatenate(
    [add_housman_glove_repr, add_shakespeare_glove_repr, allWords_glove_repr],
    axis=0
)
print(add_all_repr.shape)
np.set_printoptions(suppress=True)
add_model = TSNE(n_components=2, random_state=0, verbose=2)
print("==> Running TSNE...")
add_tsne_word_repr = add_model.fit_transform(add_all_repr)
print("==> done.")


(11482, 300)
==> Running TSNE...
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 11482
[t-SNE] Computed conditional probabilities for sample 2000 / 11482
[t-SNE] Computed conditional probabilities for sample 3000 / 11482
[t-SNE] Computed conditional probabilities for sample 4000 / 11482
[t-SNE] Computed conditional probabilities for sample 5000 / 11482
[t-SNE] Computed conditional probabilities for sample 6000 / 11482
[t-SNE] Computed conditional probabilities for sample 7000 / 11482
[t-SNE] Computed conditional probabilities for sample 8000 / 11482
[t-SNE] Computed conditional probabilities for sample 9000 / 11482
[t-SNE] Computed conditional probabilities for sample 10000 / 11482
[t-SNE] Computed conditional probabilities for sample 11000 / 11482
[t-SNE] Computed conditional probabilities for sample 11482 / 11482
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 25: error = 1.2048553, gradient

In [80]:
# Split the embedding of the added representations back into its groups:
# Housman rows first, then Shakespeare rows, then the single-word rows.
add_housman_pair_count = len(add_housman_glove_labels)
rhyme_repr_len = add_housman_pair_count + len(add_shakespeare_glove_labels)

add_housman_glove_repr_tsne = add_tsne_word_repr[:add_housman_pair_count]
add_shakespeare_glove_repr_tsne = add_tsne_word_repr[add_housman_pair_count:rhyme_repr_len]

# Rhyme pairs of both authors taken together, with labels in the same order
rhymeWords_add_glove_labels = add_housman_glove_labels + add_shakespeare_glove_labels
rhymeWords_add_glove_tsne = add_tsne_word_repr[:rhyme_repr_len]

# Everything after the rhyme pairs belongs to the single words
allWords_repr_tsne = add_tsne_word_repr[rhyme_repr_len:]

In [113]:
# Scatter trace of the Housman rhyme pairs, added representations (blue)
add_trace_housman_glove = go.Scattergl(
    name="Housman (GloVe,semantic)",
    mode='markers',
    x=add_housman_glove_repr_tsne[:, 0],
    y=add_housman_glove_repr_tsne[:, 1],
    text=add_housman_glove_labels,
    marker={'color': 'rgba(22, 96, 167, 0.8)'},
)

# Scatter trace of the Shakespeare rhyme pairs, added representations (orange)
add_trace_shakespeare_glove = go.Scattergl(
    name="Shakespeare (GloVe,semantic)",
    mode='markers',
    x=add_shakespeare_glove_repr_tsne[:, 0],
    y=add_shakespeare_glove_repr_tsne[:, 1],
    text=add_shakespeare_glove_labels,
    marker={'color': 'rgba(255, 106, 0, 0.8)'},
)

# Scatter trace of the single (simplex) words (red); markers only
trace_allWords_glove = go.Scattergl(
    name="All words in S+H poems (GloVe,semantic)",
    mode='markers',
    x=allWords_repr_tsne[:, 0],
    y=allWords_repr_tsne[:, 1],
    text=allWords_list,
    marker={'color': 'rgba(205, 13, 24, 0.8)'},
)

# Scatter trace of the rhyme pairs of both authors, added representations
# (green); markers only
trace_rhymePairs_add_glove = go.Scattergl(
    name="Rhyme words in S+H poems (GloVe,semantic)",
    mode='markers',
    x=rhymeWords_add_glove_tsne[:, 0],
    y=rhymeWords_add_glove_tsne[:, 1],
    text=rhymeWords_add_glove_labels,
    marker={'color': 'rgba(34, 148, 26, 0.8)'},
)

In [138]:
# Figure comparing the added rhyme-pair representations with the single words
# of both authors.
# NOTE(review): the filename below spells "simpex" (presumably "simplex"); it is
# kept unchanged because renaming would change which hosted plot is written --
# confirm before correcting.
add_layout = go.Layout(
    title='Housman and Shakespeare rhyme pairs, added representations vs. all simplex words in their poems'
)
add_data = [trace_allWords_glove, trace_rhymePairs_add_glove]
fig = go.Figure(layout=add_layout, data=add_data)
py.iplot(fig, filename='rhyme_added-vs-simpex_words')

In [139]:
# Figure with the rhyme pairs of both authors, added representations
add_layout = go.Layout(title='Housman and Shakespeare rhyme pairs, added representations')
add_data = [add_trace_housman_glove, add_trace_shakespeare_glove]
fig = go.Figure(layout=add_layout, data=add_data)
py.iplot(fig, filename='housman-shakespeare-rhymes-added')

In [142]:
# Figure with the Housman rhyme pairs only, added representations
# NOTE(review): the filename below spells "addeed" while the sibling plots use
# "added"; it is kept unchanged because renaming would change which hosted plot
# is written -- confirm before correcting.
add_layout = go.Layout(title='Housman rhyme pairs, added representations')
add_data = [add_trace_housman_glove]
fig = go.Figure(layout=add_layout, data=add_data)
py.iplot(fig, filename='housman-rhymes-addeed')

In [144]:
# Figure with the Shakespeare rhyme pairs only, added representations
add_layout = go.Layout(title='Shakespeare rhyme pairs, added representations')
add_data = [add_trace_shakespeare_glove]
fig = go.Figure(layout=add_layout, data=add_data)
py.iplot(fig, filename='shakespeare-rhymes-added')