In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
## Help Python find our packages
import sys
sys.path.append('..')

import json
import numpy as np
import matplotlib.pyplot as plt
from embeddings.randomwalk_embedder import *

# Randomness
import random as rn
import tensorflow as tf
import os

In [32]:
path = "./mock_metapaths.txt"
data = json.load(open(path, "r", encoding="utf8"))

In [53]:
lens = [len(path) for path in data.keys()]
np.min(lens)

1

In [57]:
def parse_meta_paths(json, min_size = 5, seperator = " | "):
        walk_list = []
        available_nodes = set()
        for meta_paths in json.keys():
            node_ids = [int(id) for id in meta_paths.split(seperator)]
            if (len(node_ids) < min_size):
                continue
            walk_list.append(node_ids)
            available_nodes |= set(node_ids)
        return walk_list, available_nodes

In [69]:
pre_walk_list, pre_available_nodes = parse_meta_paths(data)

In [70]:
available_nodes = range(len(pre_available_nodes))

In [71]:
id_mapping = dict(zip(pre_available_nodes, available_nodes))

In [72]:
walk_list = []
for mp in pre_walk_list:
    path = []
    for n in mp:
        path.append(id_mapping[n])
    walk_list.append(path)

In [83]:
batch_generator = ShortWalkBatchGenerator(walk_list, available_nodes)
embedded_nodes_size = len(batch_generator.available_nodes)

batch_size = 128
embedding_vector_size = 5  # Dimension of the embedding vector.
num_skips = 2         # How many times to reuse a walk to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 5     # Random set of words to evaluate similarity on.
valid_examples = random.sample(batch_generator.available_nodes, valid_size)
num_sampled = 10    # Number of negative examples to sample. (relevant for NCE loss)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_context = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Look up embeddings for inputs.
  embeddings = tf.Variable(
      tf.random_uniform([embedded_nodes_size, embedding_vector_size], -1.0, 1.0))
  embed = tf.nn.embedding_lookup(embeddings, train_inputs)

  # Construct the variables for the softmax
  weights = tf.Variable(
      tf.truncated_normal([embedding_vector_size, embedded_nodes_size],
                          stddev=1.0 / math.sqrt(embedding_vector_size)))
  biases = tf.Variable(tf.zeros([embedded_nodes_size]))
  hidden_out = tf.transpose(tf.matmul(tf.transpose(weights), tf.transpose(embed))) + biases

  # convert train_context to a one-hot format
  train_one_hot = tf.one_hot(train_context, embedded_nodes_size)

  cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, labels=train_one_hot))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.global_variables_initializer()
  # Finally, create out saver.
  saver = tf.train.Saver()


def run(graph, num_steps):
    with tf.Session(graph=graph) as session:
      # We must initialize all variables before we use them.
      init.run()
      print('Initialized')

      average_loss = 0
      for step in range(num_steps):
        batch_inputs, batch_context = batch_generator.generate_batch(batch_size, num_skips)
#         print(batch_inputs)
#         print(batch_context)
        feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, cross_entropy], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
          if step > 0:
            average_loss /= 2000
          # The average loss is an estimate of the loss over the last 2000 batches.
          print('Average loss at step ', step, ': ', average_loss)
          average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
          sim = similarity.eval()
          saver.save(session, "./embedding.chkpt", global_step=step)
          index = 0
          for valid_word in valid_examples:
            top_k = 8  # number of nearest neighbors
            nearest = (-sim[index, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
              close_word = nearest[k]
              log_str = '%s %s,' % (log_str, close_word)
            print(log_str)
            index += 1
      return normalized_embeddings.eval()

# num_steps = 100
# softmax_start_time = dt.datetime.now()
# run(graph, num_steps=num_steps)
# softmax_end_time = dt.datetime.now()
# print("Softmax method took {} minutes to run 100 iterations".format((softmax_end_time-softmax_start_time).total_seconds()))

with graph.as_default():

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([embedded_nodes_size, embedding_vector_size],
                            stddev=1.0 / math.sqrt(embedding_vector_size)))
    nce_biases = tf.Variable(tf.zeros([embedded_nodes_size]))

    nce_loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_context,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=embedded_nodes_size))

    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(nce_loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

num_steps = 50000
nce_start_time = dt.datetime.now()
embedding = run(graph, num_steps)
nce_end_time = dt.datetime.now()
print("NCE method took {} seconds to run 100 iterations".format((nce_end_time-nce_start_time).total_seconds()))

Initialized
Average loss at step  0 :  2.672363758087158
Nearest to 2: 10, 4, 1, 6, 12, 8, 11, 0,
Nearest to 3: 9, 1, 5, 10, 6, 12, 0, 7,
Nearest to 8: 0, 7, 4, 11, 2, 5, 10, 9,
Nearest to 12: 6, 1, 11, 10, 2, 5, 4, 7,
Nearest to 1: 6, 3, 10, 12, 2, 9, 11, 5,
NCE method took 0.432715 seconds to run 100 iterations


In [79]:
test

array([[-0.42824027,  0.47200668,  0.29335976, -0.49752355, -0.5101276 ],
       [-0.3904592 ,  0.23337731,  0.36747465, -0.17365555, -0.79239047],
       [-0.43053728, -0.13237327,  0.6927135 , -0.31631583, -0.46605518],
       [-0.10324816,  0.17196442,  0.8304917 , -0.0554521 , -0.51669794],
       [-0.3717345 , -0.04965382,  0.8213299 , -0.2679959 , -0.33607048],
       [-0.3914409 ,  0.2498257 ,  0.683063  ,  0.1386725 , -0.54640275],
       [-0.8267579 ,  0.34916762,  0.23643248,  0.06228579,  0.3671149 ],
       [-0.69211215,  0.61120576,  0.19822869,  0.13105878,  0.30155814],
       [-0.5232188 , -0.18998626,  0.5434246 ,  0.46480379, -0.42284077],
       [ 0.22923012,  0.6550616 ,  0.61426634, -0.364336  , -0.09101606],
       [-0.47140205,  0.5522466 ,  0.4377025 , -0.52546996,  0.0714256 ],
       [-0.50748545,  0.78297555,  0.20342244,  0.24139987,  0.17249148],
       [ 0.410743  ,  0.4041864 ,  0.04118409,  0.65856653, -0.48220062]],
      dtype=float32)