In [1]:
from __future__ import print_function
import numpy as np
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from sklearn.linear_model import LogisticRegression
from graph import *
import walker
import os, sys, time, pickle
from collections import Counter
import tensorflow as tf

## Generate random walk

In [2]:
input_file = '/home/jehyuk/Documents/OpenNE/data/cora/cora_edgelist.txt'
walker_file = '/home/jehyuk/Documents/OpenNE/tmp/cora_walker.obj'

In [3]:
def make_rw(input_file, walker_file):
    graph = Graph()
    graph.read_adjlist(input_file)
    rw_gen = walker.BasicWalker(graph, workers = 4)
    walks = rw_gen.simulate_walks(num_walks = 5, walk_length = 5)
    with open(walker_file, 'wb') as f:
        pickle.dump(walks, f)
    return walks

In [4]:
def read_rw(rw_file):
    with open(rw_file, 'rb') as f:
        rw_data = pickle.load(f)
    return rw_data

In [5]:
def create_lookup_tables(total_data):
    word_counts = Counter(total_data)
    int_to_vocab = dict()
    vocab_to_int = dict()
    for key in word_counts.keys():
        int_to_vocab[int(key)] = key
    for int_key in range(max([int(x) for x in word_counts.keys()])):
         if int_key not in int_to_vocab.keys():
            int_to_vocab[int_key] = str(int_key)
    for key in int_to_vocab.keys():
        vocab_to_int[str(key)] = key
    return vocab_to_int, int_to_vocab

In [6]:
def utils_get_batches(int_text, batch_size, seq_length):
    n_batches = int(len(int_text) / (batch_size * seq_length))
    
    xdata = np.array(int_text[:n_batches * batch_size * seq_length])
    ydata = np.array(int_text[1:n_batches * batch_size * seq_length + 1])
    
    x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)
    
    return list(zip(x_batches, y_batches))
    

In [7]:
rw_data = read_rw(walker_file)
total_data = list()
for i in range(len(rw_data)):
    total_data.extend((x) for x in rw_data[i])

In [8]:
vocab_to_int, int_to_vocab = create_lookup_tables(total_data)
int_words = [vocab_to_int[word] for word in total_data]

## Subsampling

In [9]:
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)
freqs          = {word: count/total_count for word, count in word_counts.items()}
p_drop        = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
train_words = [word for word in int_words if random.random() < (1-p_drop[word])]

From Mikolov et al.:

"Since the more distant words are usually less related to the current word than those close to it, we give less weight to the distant words by sampling less from those words in our training examples... If we choose $C = 5$, for each training word we will select randomly a number $R$ in range $&lt; 1; C &gt;$, and then use $R$ words from history and $R$ words from the future of the current word as correct labels."

In [10]:
def get_target(words, idx, window_size = 5):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx-R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])
    
    return list(target_words)

In [11]:
def get_batches(words, batch_size, window_size=5):
    n_batches = len(words) // batch_size
    
    words = words[:n_batches * batch_size]
    for idx in range(0, len(words), batch_size):
        x = list()
        y = list()
        batch = words[idx: idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x] * len(batch_y))
        yield x, y

## Building the graph

In [12]:
n_vocab = len(int_to_vocab)
embed_dim = 50
negative_samples = 100
max_epochs = 1000
batch_size = 100
window_size = 5

valid_size=4
valid_window = 100

In [13]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [14]:
train_graph = tf.Graph()

with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    embedding_matrix = tf.Variable(tf.random_uniform((n_vocab, embed_dim), -1, 1))
    embedded = tf.nn.embedding_lookup(embedding_matrix, inputs)
    # For negative sampling
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, embed_dim)))
    softmax_b = tf.Variable(tf.zeros(n_vocab), name = 'softmax_bias')
    loss = tf.nn.sampled_softmax_loss(weights = softmax_w, 
                                      biases = softmax_b, 
                                      labels = labels, 
                                      inputs=embedded, 
                                      num_sampled=negative_samples, 
                                      num_classes=n_vocab)
    cost = tf.reduce_mean(loss)
    opt = tf.train.AdamOptimizer().minimize(cost)
    
    saver = tf.train.Saver()
    
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, random.sample(range(1000,1000+valid_window), valid_size //2))
    valid_dataset    = tf.constant(valid_examples, dtype=tf.int32)

    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_matrix), 1, keep_dims=True))
    normalized_embedding_matrix = embedding_matrix / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding_matrix, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding_matrix))
      
    
    
with tf.Session(graph = train_graph, config=sess_config) as sess:
    _iter = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(1,max_epochs+1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()
        for x, y in batches:
            feed_dict = {inputs: x, labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, opt], feed_dict = feed_dict)
            loss += train_loss
            _iter+=1
        if (epoch + 1) % 50 == 0:
            print("Epoch {} / {}, Iterations: {}, Avg trn loss: {:.4f}, {:.4f} s/batch".format(epoch+1, max_epochs, _iter, loss/100, (time.time()-start)/100))
            sim = similarity.eval()
            for i in range(valid_size):
                valid_node = int_to_vocab[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i,:]).argsort()[1:top_k+1]
                log = 'Nearest to %s: ' % valid_node
                for k in range(top_k):
                    close_word = int_to_vocab[nearest[k]]
                    log = "%s %s, " %(log, close_word)
                print(log)

            loss = 0
            start = time.time()

            save_path = saver.save(sess, 'checkpoints/embedded.ckpt')
            embed_mat = sess.run(normalized_embedding_matrix)
            print()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 50 / 1000, Iterations: 5195, Avg trn loss: 287.7568, 0.0027 s/batch
Nearest to 69:  670,  1820,  59,  2217,  628,  938,  1796,  385, 
Nearest to 3:  392,  2631,  977,  2556,  2573,  509,  915,  2374, 
Nearest to 1014:  867,  495,  1715,  2171,  2500,  2705,  2673,  148, 
Nearest to 1023:  2273,  1671,  1467,  2586,  470,  1618,  1617,  1474, 

Epoch 100 / 1000, Iterations: 10495, Avg trn loss: 153.1273, 0.0033 s/batch
Nearest to 69:  911,  670,  1583,  675,  165,  2600,  938,  2214, 
Nearest to 3:  2544,  430,  1543,  1028,  855,  1702,  1336,  2116, 
Nearest to 1014:  2375,  495,  2171,  1258,  867,  486,  869,  90, 
Nearest to 1023:  1467,  1657,  2586,  1474,  2290,  641,  859,  725, 

Epoch 150 / 1000, Iterations: 15795, Avg trn loss: 104.8127, 0.0034 s/batch
Nearest to 69:  911,  1583,  2600,  1532,  165,  670,  2294,  1594, 
Nearest to 3:  2544,  430,  1028,  1543,  1688,  855,  774,  1702, 
Nearest to

In [15]:
graph = Graph()
graph.read_adjlist(input_file)

In [16]:
x_26 = [x for x in graph.G.neighbors('26')]
x_171 = [x for x in graph.G.neighbors('171')]

In [17]:
print(x_26)
print(x_171)

['2454', '2455', '123', '99', '122']
['790', '1548', '775']
