In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
import itertools as it
import pickle
import os

from tensorflow.contrib.tensorboard.plugins import projector

from time import time

# Directory that receives the model checkpoints and the TensorBoard
# projector files written further down in this notebook.
LOG_DIR = '/tmp/tensorboard-logs/semantic/'

# Input artefacts. The two .msg files are read with pd.read_msgpack and the
# vocabulary with pickle in the loading cell.
PATH_DATA = '../data/amazon/food/reviews_df.msg'
PATH_ENC_TXT = '../data/amazon/food/reviews_txt_enc_s.msg'
PATH_VOCAB = '../data/amazon/food/vocab.p'

# Column of the reviews frame whose categories are the entities being embedded.
# It must be a pandas categorical: the notebook reads it through `.cat`.
entity_col = 'ProductId'

In [2]:
# Params
word_emb_size = 64  # width of each word embedding vector
entity_emb_size = 16  # width of each entity embedding and of the mapped text vector
n_negs_per_pos = 10  # number of negatives to sample per positive

# Adam optimizer settings (learning rate and the two moment decay rates).
adam_alpha = 0.001
adam_beta1 = 0.9
adam_beta2 = 0.999
l2_emb = 1e-2  # L2 regularization scale for embeddings
l2_map = 1e-2  # L2 regularization scale for mapping matrices
batch_size = 1024  # baked into the placeholder shapes, so every batch must be exactly this size

In [3]:
# Reviews frame, one row per review.
df = pd.read_msgpack(PATH_DATA)
# Encoded review text; iterated below as one sequence of word ids per review.
# Uses the PATH_ENC_TXT constant instead of repeating the literal path.
data_words_enc = pd.read_msgpack(PATH_ENC_TXT)
# NOTE: pickle.load executes arbitrary code from the file; only load a
# vocabulary file that this project produced itself.
with open(PATH_VOCAB, 'rb') as vocab_file:  # close the handle deterministically
    vocab = pickle.load(vocab_file)

In [4]:
# Number of distinct entities = number of categories of the (categorical) entity column.
n_entities = len(df[entity_col].cat.categories)

In [5]:
# L2 regularizers: one for the embedding tables, one for the mapping layer.
with tf.variable_scope('reg'):
    reg_emb = tf.contrib.layers.l2_regularizer(l2_emb)
    reg_map = tf.contrib.layers.l2_regularizer(l2_map)
    
# Trainable embedding tables: one row per vocabulary word and one row per entity.
with tf.variable_scope('emb'):
    word_embs = tf.get_variable(
        name='word',
        shape=(len(vocab), word_emb_size),
        initializer=None,  # use default glorot
        regularizer=reg_emb
    )
    # Note: the entity table is stored under the variable name 'item'.
    entity_embs = tf.get_variable(
        name='item',
        shape=(n_entities, entity_emb_size),
        initializer=None,  # use default glorot
        regularizer=reg_emb
    )
    
# Inputs. ngram_ph is fed a SparseTensorValue by the batch generators below
# (indices = row within the batch, values = word ids); the entity placeholders
# hold one positive and n_negs_per_pos negative entity codes per batch row.
with tf.variable_scope('ph'):
    ngram_ph = tf.sparse_placeholder(tf.int32)
    pos_entity_ph = tf.placeholder(tf.int32, shape=[batch_size, 1])
    neg_entities_ph = tf.placeholder(tf.int32, shape=[batch_size, n_negs_per_pos])

# Embedding lookups: word embeddings are mean-combined per sparse row; entity
# lookups keep the id layout, i.e. [batch_size, 1, entity_emb_size] and
# [batch_size, n_negs_per_pos, entity_emb_size].
with tf.variable_scope('looked'):
    agg_looked_word_emb = tf.nn.embedding_lookup_sparse(
        word_embs, ngram_ph, None, combiner='mean')
    pos_looked_entity_emb = tf.nn.embedding_lookup(entity_embs, pos_entity_ph)
    neg_looked_entities_emb = tf.nn.embedding_lookup(entity_embs, neg_entities_ph)
    
# Map the aggregated word vector into the entity embedding space with a single
# tanh layer so it can be scored against entity embeddings by dot product.
f = tf.contrib.layers.fully_connected(
    inputs=agg_looked_word_emb,
    num_outputs=entity_emb_size,
    activation_fn=tf.nn.tanh,
    # use default xavier/glorot init
    weights_regularizer=reg_map,
    scope='map',
)

In [6]:
# Probability that each mapped text vector f[b] matches its positive entity:
# sigmoid of the dot product, shape [batch_size].
# Squeeze only the singleton id axis so the batch axis can never be dropped.
pos_score = tf.sigmoid(
    tf.reduce_sum(
        tf.multiply(f, tf.squeeze(pos_looked_entity_emb, axis=[1])),
        axis=-1, keep_dims=False),
    name='pos_score')

# Same score against every sampled negative, shape [batch_size, n_negs_per_pos].
# f is broadcast along a new "negatives" axis so that row b of the negatives is
# always scored against f[b]. (Tiling f along axis 0 and reshaping would instead
# pair negative (b, j) with f[(b * n_negs_per_pos + j) % batch_size].)
neg_scores = tf.sigmoid(
    tf.reduce_sum(
        tf.multiply(tf.expand_dims(f, 1), neg_looked_entities_emb),
        axis=-1, keep_dims=False),
    name='neg_scores')

# Negative-sampling objective. The bracketed term is a log-likelihood, which
# has to be maximised; the optimizer minimises, so the loss is its negation.
loss = tf.negative(
    tf.reduce_mean(
        tf.log(pos_score) + tf.reduce_sum(tf.log(1. - neg_scores), axis=-1)),
    name='loss_mnce'
)

# Total objective = data term + all registered L2 regularization terms.
loss_reg = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
loss_tot = loss + loss_reg

In [7]:
# Non-trainable scalar step counter, advanced once per optimizer update.
global_step = tf.get_variable(
    name='global_step',
    shape=[],
    initializer=tf.constant_initializer(0),
    trainable=False,
)

# Adam on the regularized objective, configured from the params cell.
opt = tf.train.AdamOptimizer(
    learning_rate=adam_alpha,
    beta1=adam_beta1,
    beta2=adam_beta2,
)
train_op = opt.minimize(loss_tot, global_step=global_step)

# Preparing the actual data

In [8]:
%%time
# Flatten the encoded reviews into two parallel tuples with one entry per word
# occurrence: the review's position (rows) and the word id (cols).
rows, cols = zip(*(
    (row_id, word_id)
    for row_id, word_ids in enumerate(data_words_enc)
    for word_id in word_ids))
# Column vector of review positions and flat int32 word ids for the whole corpus.
indices = np.asarray(rows).reshape(-1, 1)
vals = np.array(cols, dtype='int32')

CPU times: user 14.7 s, sys: 632 ms, total: 15.3 s
Wall time: 15.3 s


In [9]:
# Integer category code of the entity for every review row, as a numpy array;
# the batch generators index it by review position to get positive entities.
entity_codes = df[entity_col].cat.codes.values

In [10]:
# Boolean review-by-word occurrence matrix (duplicate (row, col) pairs collapse
# to a single True). Built as CSR, matching its name, because the only access
# pattern is row slicing (`data_enc_csr[batch_inds]`): CSR slices rows cheaply
# and `.nonzero()` on the slice comes back grouped by row. CSC would be slow to
# row-slice and would return the entries in column-major order.
data_enc_csr = sp.csr_matrix(
    (np.ones(len(rows)), (rows, cols)),
    shape=(len(data_words_enc), len(vocab)),
    dtype=bool
)

In [65]:
def shit_full_gen():
    """Endlessly yield training feed dicts built from whole documents.

    Every epoch shuffles the review positions and walks them in consecutive
    chunks of exactly ``batch_size`` (the trailing partial chunk is dropped
    because the placeholders have a fixed batch dimension). Each yielded dict
    feeds:

    * ``ngram_ph``: a ``tf.SparseTensorValue`` whose indices are the row within
      the batch and whose values are the ids of the words in that review,
    * ``pos_entity_ph``: the entity code of each review, shape [batch_size, 1],
    * ``neg_entities_ph``: uniformly sampled entity codes,
      shape [batch_size, n_negs_per_pos].
    """
    while True:
        # note: actually we should be uniformly sampling over entities rather than documents
        inds = np.random.permutation(np.arange(len(data_words_enc)))

        for ii in range(0, len(inds)-batch_size+1, batch_size):
            batch_inds = inds[ii:ii+batch_size]

            batch_pos_entity_codes = entity_codes[batch_inds, None]
            # Negatives are drawn uniformly and may coincide with the positive.
            batch_neg_entity_codes = np.random.randint(
                0, n_entities, size=[batch_size, n_negs_per_pos])

            # Note: slicing rows of a sparse matrix is slow unless it is CSR
            batch_words_rows, batch_words_cols = data_enc_csr[batch_inds].nonzero()

            # The sparse lookup groups values by row id and expects the indices
            # in row-major order. nonzero() only guarantees that for a CSR
            # matrix, so sort explicitly (by row, then by word id).
            order = np.lexsort((batch_words_cols, batch_words_rows))
            batch_words_rows = batch_words_rows[order]
            # int32 to match ngram_ph's dtype, as in shit_win_gen
            batch_words_cols = batch_words_cols[order].astype('int32', copy=False)

            # Note: we are supposed to grab a window of words here
            #     instead of the entire doc
            batch_words_sptv = tf.SparseTensorValue(
                batch_words_rows[:, None],
                batch_words_cols,
                dense_shape=[batch_size])

            feed_d = {
                ngram_ph: batch_words_sptv,
                pos_entity_ph: batch_pos_entity_codes,
                neg_entities_ph: batch_neg_entity_codes,
            }

            yield feed_d
        
def get_win(seq, win_size=4):
    """Return a random contiguous window of up to ``win_size`` items of ``seq``.

    The start offset is drawn uniformly from all positions that leave room for
    a full window; a sequence shorter than ``win_size`` is returned whole.
    """
    last_start = max(0, len(seq) - win_size)
    start = np.random.randint(0, last_start + 1)
    return seq[start:start + win_size]
        
def shit_win_gen():
    """Endlessly yield training feed dicts built from random word windows.

    Each epoch reshuffles the review positions and consumes them in full
    batches of ``batch_size``; a trailing partial batch is skipped. For every
    review in a batch one random window of words is drawn with ``get_win``,
    and the windows are packed into a ``tf.SparseTensorValue`` (indices = row
    within the batch, values = int32 word ids). The yielded dict also carries
    the reviews' own entity codes as positives and uniformly sampled entity
    codes as negatives.
    """
    # note: actually we should be uniformly sampling over entities rather than documents
    while True:
        shuffled = np.random.permutation(np.arange(len(data_words_enc)))
        last_start = len(shuffled) - batch_size

        for start in range(0, last_start + 1, batch_size):
            doc_ids = shuffled[start:start + batch_size]

            positives = entity_codes[doc_ids, None]
            negatives = np.random.randint(
                0, n_entities, size=[batch_size, n_negs_per_pos])

            # One random window per review, then flatten to (batch row, word id) pairs.
            windows = data_words_enc.iloc[doc_ids].map(get_win)
            pairs = (
                (row, word)
                for row, window in enumerate(windows)
                for word in window)
            row_ids, word_ids = zip(*pairs)

            sparse_words = tf.SparseTensorValue(
                np.array(row_ids)[:, None],
                np.array(word_ids, dtype='int32'),
                dense_shape=[batch_size])

            yield {
                ngram_ph: sparse_words,
                pos_entity_ph: positives,
                neg_entities_ph: negatives,
            }

In [66]:
# Batch source for training: the windowed generator. It is infinite and
# stateful, so every next(gen) anywhere in the notebook consumes a batch.
gen = shit_win_gen()

In [67]:
proj_config = projector.ProjectorConfig()

# The metadata file is written before any FileWriter exists, so make sure the
# log directory is there; otherwise to_csv fails on a fresh machine.
os.makedirs(LOG_DIR, exist_ok=True)
word_metadata_path = os.path.join(LOG_DIR, 'word_metadata.tsv')

# Register the word embedding table with the projector and point it at the
# per-row labels written below.
word_proj = proj_config.embeddings.add()
word_proj.tensor_name = word_embs.name
word_proj.metadata_path = word_metadata_path

# single column meta does not have header
# Each line is the text form of an (index, word) pair, in vocabulary order.
pd.Series(list(enumerate(vocab))).to_csv(word_metadata_path, sep='\t', index=False, header=False)

summary_writer = tf.summary.FileWriter(LOG_DIR)

In [69]:
max_steps = 100000
log_every = 1000  # steps between progress prints / checkpoints

print(f'Approx # epochs: {max_steps*batch_size/len(df)}')

gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(LOG_DIR)
ckpt_path = os.path.join(LOG_DIR, "model.ckpt")

# The projector config does not change during training, so it is written once
# here instead of at every checkpoint.
projector.visualize_embeddings(summary_writer, proj_config)

try:
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as sess:
        sess.run(tf.global_variables_initializer())
        tic = time()
        for step in range(max_steps):
            feed = next(gen)
            sess.run(train_op, feed_dict=feed)

            if (step % log_every) == 0:
                # Seconds spent since the previous report.
                toc = time() - tic
                print(step, toc)
                tic = time()

                saver.save(sess, ckpt_path, step)

        # Checkpoint the final state too; the periodic saves stop short of
        # the last steps of the run.
        saver.save(sess, ckpt_path, max_steps)
finally:
    # Flush and release the event file even if training is interrupted.
    summary_writer.close()
    

Approx # epochs: 180.13770683291875
0 0.04048943519592285
1000 14.889153957366943
2000 14.643097400665283
3000 14.770846843719482
4000 15.402669191360474
5000 14.724296569824219
6000 14.788334369659424
7000 14.598264217376709
8000 14.37799334526062
9000 14.463687419891357
10000 14.65018343925476
11000 14.393420457839966
12000 14.730337619781494
13000 14.991628646850586
14000 14.717162370681763
15000 14.440537929534912
16000 14.651312351226807
17000 14.377591848373413
18000 14.44415283203125
19000 14.670193910598755
20000 14.617869853973389
21000 14.42133355140686
22000 14.773431062698364
23000 14.458477020263672
24000 14.347233057022095
25000 14.587358713150024
26000 14.323538541793823
27000 14.316344976425171
28000 14.552619934082031
29000 14.405521392822266
30000 14.19519853591919
31000 14.659892797470093
32000 14.527189493179321
33000 14.35249137878418
34000 14.695753574371338
35000 14.4535973072052
36000 14.4223792552948
37000 14.64562702178955
38000 14.465288400650024
39000 14.200

# SCRAP

In [15]:
# `stop` is not defined anywhere, so evaluating it raises NameError (see the
# output below). Presumably a deliberate guard that halts "Run All" before the
# scrap cells - TODO confirm intent.
stop

NameError: name 'stop' is not defined

In [87]:
# Scrap: rewrites the projector word-metadata TSV; same call as in the
# projector setup cell above.
pd.Series(list(enumerate(vocab))).to_csv(os.path.join(LOG_DIR, 'word_metadata.tsv'), sep='\t', index=False, header=False)

In [70]:
# Scrap: pull one feed dict to inspect its layout. This consumes a batch from
# the same generator the training loop uses.
next(gen)

{<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7fb8e652f518>: SparseTensorValue(indices=array([[   0],
       [   0],
       [   0],
       ..., 
       [1023],
       [1023],
       [1023]]), values=array([17970, 13654, 28557, ..., 21822, 24132, 31921], dtype=int32), dense_shape=[1024]),
 <tf.Tensor 'ph/Placeholder_3:0' shape=(1024, 1) dtype=int32>: array([[24607],
        [ 5927],
        [69858],
        ..., 
        [47834],
        [40309],
        [71170]], dtype=int32),
 <tf.Tensor 'ph/Placeholder_4:0' shape=(1024, 10) dtype=int32>: array([[56344, 44833, 25306, ...,  6432,  7376, 70702],
        [26528, 69461, 31499, ..., 24419, 23698, 13890],
        [63025,  5350, 33143, ..., 18762,  2815, 32479],
        ..., 
        [55719, 72111, 71627, ..., 63892, 39498, 38823],
        [ 5972, 59571, 71122, ..., 16453, 68166,  5105],
        [  264, 66604, 48647, ..., 25273,  9870, 47331]])}

In [81]:
# Scrap: decode the first three word ids of the sampled batch shown above back
# to vocabulary entries.
np.array(vocab)[[17970, 13654, 28557]]

array(['littl', 'generous', 'sweeter'],
      dtype='<U81')

In [71]:
# Scrap. NOTE(review): 24607 is the first positive *entity code* of the batch
# shown above, but iloc treats it as a *row position*, so this row is not
# necessarily a review of that entity. The following cell selects by entity
# code instead.
df.iloc[24607]

ProductId                                                        B0047E2I5U
UserId                                                       A1EMMC2NCSXPSW
ProfileName                                        Debra D. Laflen "gr8skn"
HelpfulnessNumerator                                                      2
HelpfulnessDenominator                                                    2
Score                                                                     5
Time                                                             1281052800
Summary                                                      great dog food
Text                      My dog Denali loves the Ziwi Peak food.  It is...
Name: 24608, dtype: object

In [83]:
# Scrap: texts of all reviews whose entity code is 24607 (the positive entity
# of the first row of the sampled batch shown above).
df.iloc[np.where(entity_codes == 24607)[0]].Text.values

array([ 'Love these beans! The chocolate coating is creamy and smooth, and a little more generous than most. It has a sweeter taste than the coating on some other varieties (sweet, but not cloying). That balances well with the sharp flavor of the espresso bean. The appearance of these is pleasing also. There is an even distribution of dark chocolate, white chocolate, and a pretty speckled milk-on-white that conceals a milk chocolate underlayer.<br /><br />I like the presentation of Dilettante Chocolates; on the label there are two short paragraphs about the history and practices of the company that incorporate that information charmingly and with a concise flair that is very modern.<br /><br />And the price for this somewhat decadent purchase, three pounds of chocolate covered espresso beans, is terrific even with shipping. Service was very good too. I received my order within three business days of placing it.',
       'Seattle Gourmet Foods is a company I found through Amazon.com The

In [19]:
# Scrap: show the word embedding variable (name, shape, dtype).
word_embs

<tf.Variable 'emb/word:0' shape=(32768, 64) dtype=float32_ref>