In [2]:
import tensorflow as tf
import numpy as np
import math
import collections

In [None]:
# Dedicated graph that every later `with graph.as_default():` cell adds ops to.
graph = tf.Graph()
# Session bound to that graph; used for initialisation, training, evaluation
# and saving, and closed in the notebook's last cell.
sess = tf.Session(graph=graph)

## Initialize parameters for the model

In [3]:
# Number of training examples (rows) fed to the graph per step.
batch_size = 5
# Number of leading context words in front of each target word.
window_size = 5
# Number of rows in the word-embedding table; also caps the dictionary size.
vocabulary_size = 50000
# Passed to sampled_softmax_loss as the number of sampled classes.
n_neg_samples = 64
# Learning rate handed to the Adagrad optimizer.
learning_rate = 1
# NOTE(review): not referenced by any cell visible in this notebook.
n_steps = 100000
# Number of full passes over the corpus made by the training loop.
epochs = 3
# Length of each word vector.
embedding_size_w = 500
# Length of each document vector.
embedding_size_d = 500

## Loading cases from files

In [38]:
import json
# Read the corpus. Every entry of the JSON array is indexed as
# case[0] (kept in `tids`) and case[1] (kept in `docs`).
# The `with` block guarantees the file handle is closed even if parsing fails.
with open('all_cases.txt', 'r', encoding='utf-8') as case_file:
    cases = json.load(case_file)

tids = [case[0] for case in cases]
docs = [case[1] for case in cases]
# Number of documents; sizes the document-embedding table later on.
n_documents = len(docs)


[108020655,
 24842375,
 151688871,
 45282825,
 28356393,
 127469387,
 40022871,
 147645378,
 81716407,
 26708139,
 155387249,
 172711925,
 44023977,
 48200887,
 10208674,
 55464165,
 62613390,
 450208,
 141946357,
 121065383,
 24996160,
 194456776,
 10360544,
 117326869,
 134702363,
 31130861,
 172391921,
 99889750,
 137915996,
 191444115,
 193411084,
 140942667,
 111528149,
 100395953,
 61813699,
 168133848,
 116061333,
 163043474,
 121803967,
 138240228,
 76170404,
 33506004,
 11173178,
 100876905,
 73277952,
 157970699,
 173766255,
 184952977,
 197111815,
 196830604,
 95736196,
 97324292,
 93113065,
 169314364,
 184652855,
 37105745,
 50969089,
 62585894,
 49195391,
 81824666,
 186220869,
 154534967,
 104758337,
 142324899,
 110769474,
 52058443,
 196414438,
 186721652,
 112380434,
 128978673,
 184204505,
 69845940,
 69606174,
 187316464,
 79551830,
 53352430,
 125525442,
 178877453,
 187475778,
 106416990,
 101733098,
 180680303,
 84105948,
 114237665,
 86636408,
 129750439,
 14095

## Define inputs to the computation graph

In [None]:
with graph.as_default():
    # One row per example: columns 0..window_size-1 are looked up in the word
    # embedding table and the last column in the document embedding table.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, window_size + 1])
    # One label id per example (the batch generator stores the window's last word here).
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

## Initializing TensorFlow variables for training

In [None]:
with graph.as_default():
    # Trainable lookup tables: one row per vocabulary word and one row per
    # document, both initialised uniformly between -1 and 1.
    word_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size_w], -1.0, 1.0),
        name='word_embeddings')
    doc_embeddings = tf.Variable(
        tf.random_uniform([n_documents, embedding_size_d], -1.0, 1.0),
        name='doc_embeddings')

    # Width of the concatenation of `window_size` word vectors and one doc vector.
    combined_vector_length = embedding_size_d + window_size * embedding_size_w

    # Softmax parameters: one weight row and one bias per vocabulary word.
    weights = tf.Variable(
        tf.truncated_normal(
            [vocabulary_size, combined_vector_length],
            stddev=1.0 / math.sqrt(combined_vector_length)),
        name='weights')
    biases = tf.Variable(tf.zeros([vocabulary_size]), name='biases')

    # Column `position` of the batch holds the word id at that window position;
    # look each column up separately so the vectors can be concatenated.
    embedding_columns = [
        tf.nn.embedding_lookup(word_embeddings, train_dataset[:, position])
        for position in range(window_size)
    ]

    # The last column of the batch holds the document id.
    input_doc_embeddings = tf.nn.embedding_lookup(doc_embeddings, train_dataset[:, window_size])
    embedding_columns.append(input_doc_embeddings)
    # NOTE(review): (axis, values) is the argument order this notebook was
    # written against; it is left untouched here.
    input_matrix = tf.concat(1, embedding_columns)

    # Arguments are passed by keyword so the call does not depend on the
    # positional order of `inputs` and `labels`.
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        inputs=input_matrix,
        labels=train_labels,
        num_sampled=n_neg_samples,
        num_classes=vocabulary_size)

    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

    # Row-wise L2 norms, kept as a column so the division broadcasts per row.
    norm_w = tf.sqrt(tf.reduce_sum(tf.square(word_embeddings), 1, keep_dims=True))
    normalized_word_embeddings = word_embeddings / norm_w

    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embeddings), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embeddings / norm_d

    init = tf.global_variables_initializer()

    # Checkpoints cover exactly the four trainable variables created above.
    saver = tf.train.Saver([word_embeddings, doc_embeddings, weights, biases])

## Building dictionaries from documents

In [None]:
def _build_dictionaries(docs, vocab_size=None):
    '''
    Flatten the documents into one token stream and index the tokens.

    Parameters
    ----------
    docs: sequence of documents; each document is treated as a sequence of
        tokens (it is measured with len() and flattened with extend()).
    vocab_size: maximum number of dictionary entries, including the 'UNK'
        entry at index 0. Defaults to the module-level `vocabulary_size`.

    Returns
    -------
    doc_ids: for every token of the flattened stream, the position of the
        document it came from.
    word_ids: for every token of the flattened stream, its dictionary index;
        tokens outside the dictionary get 0 (the 'UNK' index).
    dictionary: {token: index}, with 'UNK' at 0 and the remaining tokens in
        descending frequency order.
    reverse_dictionary: {index: token}, the inverse of `dictionary`.
    '''
    if vocab_size is None:
        vocab_size = vocabulary_size

    words = []    # tokens of all documents, in order
    doc_ids = []  # document position for each entry of `words`
    for doc_index, doc in enumerate(docs):
        doc_ids.extend([doc_index] * len(doc))
        words.extend(doc)

    dictionary = {'UNK': 0}
    for word, _ in collections.Counter(words).most_common(vocab_size - 1):
        # A corpus token spelled 'UNK' must not be re-indexed: doing so would
        # leave index 0 unused and hand the next token an index already taken.
        if word not in dictionary:
            dictionary[word] = len(dictionary)

    word_ids = [dictionary.get(word, 0) for word in words]
    reverse_dictionary = {index: word for word, index in dictionary.items()}

    return doc_ids, word_ids, dictionary, reverse_dictionary


In [None]:
# Index the whole corpus once; doc_ids/word_ids feed the batch generator and
# the two dictionaries are written to disk by save().
doc_ids, word_ids, dictionary, reverse_dictionary = _build_dictionaries(docs)

## Generate batch input for TensorFlow graph

In [None]:
from itertools import compress
# Module-level state shared by successive generate_batch_pvdm() calls.
span = window_size + 1  # context words plus the target word
data_index = 0          # position of the next token to read from word_ids
# Buffers reused (overwritten in place) for every batch handed to the graph.
batch = np.ndarray(shape=(batch_size, span), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
# Sliding windows over the token stream and over the matching document ids.
buffer = collections.deque(maxlen=span)
buffer_doc = collections.deque(maxlen=span)
def generate_batch_pvdm(doc_ids, word_ids, batch_size, window_size):
    '''
    Produce the next PV-DM (Distributed Memory Model of Paragraph Vectors)
    mini-batch by sliding a window over the token stream.

    The cursor, the sliding windows and the output arrays live in the
    module-level names data_index, buffer, buffer_doc, batch, labels and span,
    so consecutive calls continue where the previous one stopped.

    Parameters
    ----------
    doc_ids: list of document indices, one per token
    word_ids: list of word indices
    batch_size: number of examples to write before returning
    window_size: not read by this function; the window width comes from the
        module-level `span`

    Returns
    -------
    (batch, labels): the shared module-level arrays. Each batch row holds the
        leading words of a window followed by the document id; the matching
        label is the window's last word. Only windows lying inside a single
        document are emitted.
    (None, None): when the end of word_ids is reached before the batch is
        full. Rows written so far are abandoned and the cursor is reset to 0,
        so the next call starts a new pass.
    '''
    global data_index, batch, labels, span, buffer, buffer_doc

    # At the start of a pass, fill both windows with the first `span` tokens.
    if data_index == 0:
        for _ in range(span):
            buffer.append(word_ids[data_index])
            buffer_doc.append(doc_ids[data_index])
            data_index += 1

    total_words = len(word_ids)
    filled = 0
    while filled < batch_size:
        if data_index == total_words:
            # Corpus exhausted: rewind for the next pass and signal the caller.
            data_index = 0
            return None, None
        # Skip windows that straddle a document boundary.
        if len(set(buffer_doc)) == 1:
            context = list(buffer)[:span - 1]  # every word except the target
            batch[filled, :] = context + [buffer_doc[-1]]
            labels[filled, 0] = buffer[-1]
            filled += 1
        # Advance both windows by one token.
        buffer.append(word_ids[data_index])
        buffer_doc.append(doc_ids[data_index])
        data_index += 1
    return batch, labels


In [None]:
with sess.as_default():
    # Run the initializer op created in the graph-building cell.
    sess.run(init)

## Training word and document vectors

In [None]:
# Running sum of per-batch mean losses since the last report.
average_loss = 0

for epoch in range(epochs):
    print('Doing epoch: {}'.format(epoch))
    step = 0
    while True:
        batch_data, batch_labels = generate_batch_pvdm(
            doc_ids, word_ids, batch_size, window_size)
        # (None, None) means the corpus is exhausted: this epoch is finished.
        if batch_data is None:
            break
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        # One optimisation step; `l` holds the loss values for this batch.
        op, l = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += np.mean(l)
        if step > 0 and step % 2000 == 0:
            average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
        if step % 2000 == 0:
            # Restart the running sum (this also discards the step-0 batch).
            average_loss = 0
        step += 1
    # Not read by any cell visible in this notebook.
    docs_done = False




In [None]:
with sess.as_default():
    # Prints the whole (un-normalised) document-embedding variable.
    # NOTE(review): this only works while `doc_embeddings` is still the
    # tf.Variable; a later cell rebinds that name to the result of sess.run().
    print(doc_embeddings.eval())

In [None]:
# Evaluate the row-normalised embedding tensors in the session.
# NOTE(review): this rebinds `word_embeddings` and `doc_embeddings` from the
# tf.Variable objects to the values returned by sess.run(), so earlier cells
# that use them as variables cannot be re-run after this cell.
word_embeddings = sess.run(normalized_word_embeddings)
doc_embeddings = sess.run(normalized_doc_embeddings)

## Saving model onto disk

In [25]:
import os
import json

# Directory that holds the checkpoint and both dictionary files.
path = 'server-model'
def save():
    '''
    Write the trained model to `path`.

    Saves the variables tracked by `saver` from `sess` under the name
    'trained-model', then dumps `dictionary` to model_dict.json and
    `reverse_dictionary` to model_rdict.json (UTF-8, non-ASCII kept as-is).
    The target directory is created when it does not exist yet.

    Note: JSON object keys are strings, so the integer keys of
    `reverse_dictionary` are written as strings.
    '''
    # open() does not create missing directories, so make sure it exists first.
    os.makedirs(path, exist_ok=True)
    saver.save(sess, os.path.join(path, 'trained-model'))
    with open(os.path.join(path, 'model_dict.json'), 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, ensure_ascii=False)
    with open(os.path.join(path, 'model_rdict.json'), 'w', encoding='utf-8') as f:
        json.dump(reverse_dictionary, f, ensure_ascii=False)
        
def restore(session, return_reverse=False):
    '''
    Load the dictionaries from `path` and restore the checkpoint into `session`.

    Parameters
    ----------
    session: session passed to `saver.restore` together with the
        'trained-model' checkpoint path.
    return_reverse: when True, also return the reverse dictionary with its
        keys converted back to int (JSON stores object keys as strings).

    Returns
    -------
    dictionary, or (dictionary, reverse_dictionary) when `return_reverse`
    is True.
    '''
    with open(os.path.join(path, 'model_dict.json'), 'r', encoding='utf-8') as f:
        dictionary = json.load(f)
    # Always read the reverse dictionary, as before, so a missing or corrupt
    # file is still reported even when the caller does not ask for it.
    with open(os.path.join(path, 'model_rdict.json'), 'r', encoding='utf-8') as f:
        reverse_dictionary = json.load(f)
    saver.restore(session, os.path.join(path, 'trained-model'))
    if return_reverse:
        reverse_dictionary = {int(index): word
                              for index, word in reverse_dictionary.items()}
        return dictionary, reverse_dictionary
    return dictionary

In [None]:
# Persist the checkpoint and both dictionaries under `path`.
save()

In [None]:
# Close the session created at the top of the notebook.
sess.close()