In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
import itertools as it
import pickle
import os
from sklearn.model_selection import ShuffleSplit

from tensorflow.contrib.tensorboard.plugins import projector

from time import time

from importlib import reload
import sys
if '..' not in sys.path:
    sys.path.append('..')
from sertf import core
# Re-import the project module so that edits to sertf/core.py are picked up
# without restarting the kernel.
reload(core)

# Directory that receives the model checkpoints, the projector metadata file
# and the summary event files written by the cells below.
LOG_DIR = '/tmp/tensorboard-logs/semantic/'

# Input files read by the loading cell: two msgpack-serialized pandas objects
# and a pickled vocabulary.
PATH_DATA = '../data/flickr/Flickr8k_text/df.msg'
PATH_ENC_TXT = '../data/flickr/Flickr8k_text/txt_enc_s.msg'
PATH_VOCAB = '../data/flickr/Flickr8k_text/vocab.p'

# Categorical column of `df` whose categories are the entities of the model.
entity_col = 'img'

# Seed value for random number generators.
SEED = 322

In [2]:
# Load the main table and the encoded text that the training generator consumes.
# NOTE: pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0, so
# this cell needs an older pandas to read the stored files.
df = pd.read_msgpack(PATH_DATA)
data_words_enc = pd.read_msgpack(PATH_ENC_TXT)

# SECURITY: unpickling runs arbitrary code embedded in the file; only load a
# vocabulary file that this project produced itself.
# The context manager closes the file handle, which the bare open() call leaked.
with open(PATH_VOCAB, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [3]:
# The number of entities is the number of categories of the entity column.
entity_categories = df[entity_col].cat.categories
n_entities = len(entity_categories)

In [4]:
# Integer category code of the entity column for every row of df, as an array.
entity_accessor = df[entity_col].cat
entity_codes = entity_accessor.codes.values

# Split

In [5]:
# Nothing special about holding out entire images or stratification:
# a plain random row split with 10% of the rows held out.
# The split is seeded with SEED so the same rows are held out on every
# Restart & Run All; only the first split is consumed, hence n_splits=1.
ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(df)
tsplit_inds, vsplit_inds = next(ss)

In [6]:
# Build the model from the vocabulary and the number of entities.
# NOTE(review): if core.Model adds its ops to TensorFlow's default graph,
# re-running this cell would create a second set of variables (with suffixed
# names) next to the first; consider tf.reset_default_graph() beforehand — confirm
# against sertf/core.py.
model = core.Model(vocab, n_entities)

In [12]:
# Batch size handed to the generator; also used by the training cell to
# estimate the number of epochs.
batch_size = 1024

# Rows of the encoded text that belong to the training split.
train_words_enc = data_words_enc.iloc[tsplit_inds]

# Generator that the training loop pulls feed dicts from via next().
train_gen = core.win_gen(
    train_words_enc, entity_codes, n_entities,
    model.n_negs_per_pos, model.ph_d,
    batch_size=batch_size,
)

In [9]:
# The metadata file below is written before any FileWriter has had a chance to
# create LOG_DIR, so make sure the directory exists (fails on a fresh machine
# otherwise).
os.makedirs(LOG_DIR, exist_ok=True)
word_metadata_path = os.path.join(LOG_DIR, 'word_metadata.tsv')

# Projector configuration: one embedding entry for the word embedding tensor,
# pointing at the metadata file written below.
proj_config = projector.ProjectorConfig()

word_proj = proj_config.embeddings.add()
word_proj.tensor_name = model.emb_d['word'].name
word_proj.metadata_path = word_metadata_path

# single column meta does not have header
pd.Series(list(enumerate(vocab))).to_csv(word_metadata_path, sep='\t', index=False, header=False)

summary_writer = tf.summary.FileWriter(LOG_DIR)

In [14]:
%%time
max_steps = 1000

print(f'Approx # epochs: {max_steps*batch_size/len(df)}')

gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(LOG_DIR)

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as sess:
    sess.run(tf.global_variables_initializer())
    tic = time()
    for step in range(max_steps):
        feed = next(train_gen)
        sess.run(model.train_op, feed_dict=feed)
    
        if (step%100) == 0:
            toc = time() - tic
            print(step, toc)
            tic = time()

            saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), step)
            projector.visualize_embeddings(summary_writer, proj_config)

Approx # epochs: 25.308947108255065
0 0.06989073753356934
100 2.8891751766204834
200 2.7198870182037354
300 2.550952434539795
400 2.710646629333496
500 2.6968531608581543
600 2.8775994777679443
700 2.8592147827148438
800 2.787973642349243
900 2.7136423587799072
CPU times: user 20.1 s, sys: 1.32 s, total: 21.4 s
Wall time: 27.8 s
