In [1]:
import tensorflow as tf
import numpy as np
import math
import collections

# Finding relevant cases based on user query

In [2]:
test_graph = tf.Graph()
test_session = tf.Session(graph=test_graph)

In [3]:
import re

def cleanhtml(raw_html):
    #raw_html.decode('unicode_escape').encode('ascii','ignore')
    cleanr = re.compile('<.*?>|[\W\d]|_')
    cleantext = re.sub(cleanr, ' ', raw_html)
    cleantext = cleantext.lower()
    return cleantext

## Preprocessing user input for model

In [4]:
import nltk
input_file = open('input.txt', 'r', encoding='utf-8')
para = input_file.read()
y = cleanhtml(para)
tokens = nltk.word_tokenize(y)
n_documents = 11555
print(tokens)

['harassed', 'by', 'husband', 'and', 'family', 'for', 'money', 'jewellery', 'dowry', 'beaten', 'by', 'husband', 'and', 'in', 'laws', 'threatened', 'to', 'kill', 'thrown', 'out', 'of', 'house', 'if', 'dowry', 'is', 'not', 'given', 'tried', 'to', 'poison', 'and', 'commit', 'suicide', 'i', 'suffer', 'from', 'depression', 'because', 'excessive', 'dowry', 'demands']


## Specifying inputs to TensorFlow Graph

In [5]:
batch_size = 5
window_size = 5
vocabulary_size = 50000
n_neg_samples = 64
learning_rate = 1
n_steps = 100000
epochs = 3
#Length of word vector
embedding_size_w = 500
#Length of document vector
embedding_size_d = 500

In [6]:
with test_graph.as_default():
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, window_size + 1])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [7]:
with test_graph.as_default():
    word_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size_w], -1.0, 1.0), name='word_embeddings')
    doc_embeddings = tf.Variable(
        tf.random_uniform([n_documents, embedding_size_w], -1.0, 1.0), name='doc_embeddings')
    doc_embedding = tf.Variable(
        tf.random_uniform([1, embedding_size_d], -1.0, 1.0), name='doc_embedding')

    combined_vector_length = embedding_size_d + window_size * embedding_size_w

    # Softmax weights
    weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, combined_vector_length],
                            stddev=1.0 / math.sqrt(combined_vector_length)), name='weights')
    # Softmax biases
    biases = tf.Variable(tf.zeros([vocabulary_size]), name='biases')
    
    # Concatenating doc vector and word vectors
    combined_vector_length = embedding_size_d + window_size * embedding_size_w

    input_matrix = []
    for j in range(window_size):
        # Find word embeddings for 'batch_size' number of words at window index j
        input_word_embeddings = tf.nn.embedding_lookup(word_embeddings, train_dataset[:, j])
        input_matrix.append(input_word_embeddings)

    input_doc_embedding = tf.nn.embedding_lookup(doc_embedding, train_dataset[:, window_size])
    input_matrix.append(input_doc_embedding)
    input_matrix = tf.concat(1, input_matrix)

    doc_loss = tf.nn.sampled_softmax_loss(weights, biases, input_matrix, 
                                      train_labels, n_neg_samples, vocabulary_size)

    doc_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(doc_loss, var_list=[doc_embedding])

    doc_init = tf.global_variables_initializer()

    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embedding), 1, keep_dims=True))
    normalized_doc_embedding = doc_embedding / norm_d
    
    saver = tf.train.Saver([word_embeddings, doc_embeddings, weights, biases])

In [8]:
with test_session.as_default():
    doc_init.run()

## Restoring model from disk

In [9]:
import os
import json

path = 'server-model'
def save():
    saver.save(sess, os.path.join(path, 'trained-model'))
    with open(os.path.join(path, 'model_dict.json'), 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, ensure_ascii=False)
    with open(os.path.join(path, 'model_rdict.json'), 'w', encoding='utf-8') as f:
        json.dump(reverse_dictionary, f, ensure_ascii=False)
        
def restore(session):
    with open(os.path.join(path, 'model_dict.json'), 'r', encoding='utf-8') as f:
        dictionary = json.load(f)
    with open(os.path.join(path, 'model_rdict.json'), 'r', encoding='utf-8') as f:
        reverse_dictionary = json.load(f)
    saver.restore(session, os.path.join(path, 'trained-model'))
    return dictionary

In [10]:
with test_graph.as_default():
    dictionary = restore(test_session)

In [11]:
with test_session.as_default(), test_graph.as_default():
    print(doc_embeddings.eval())

[[  1.17180037  -0.54881263  -0.39782715 ...,   1.80713189   2.05496287
    1.47726083]
 [ -0.77837354   0.94513148 -10.13407326 ...,   2.66784477   2.93940616
    4.32101774]
 [  1.43049073  -0.76193863  -3.26454353 ...,   3.75134826   4.48116827
   -1.14625895]
 ..., 
 [ -0.87669206   0.45652223  -0.919384   ...,   0.90272999  -0.25631523
   -0.81226397]
 [  0.24927497   0.44542861  -0.1495347  ...,  -0.84609509  -0.4349165
   -0.53321362]
 [  0.47091246  -0.28414917   0.81335592 ...,  -0.19377565   0.74336004
    0.22409534]]


### Obtain document vector for user provided paragraph

In [12]:
doc_word_ids = []
for word in tokens:
    if word in dictionary:
        index = dictionary[word]
    else:
        index = 0  # dictionary['UNK']
    doc_word_ids.append(index)
print(doc_word_ids)

[6939, 9, 830, 5, 500, 11, 776, 9620, 1348, 7544, 9, 830, 5, 4, 600, 3797, 3, 3835, 4604, 77, 2, 294, 62, 1348, 8, 15, 149, 1382, 3, 4184, 5, 1885, 2111, 76, 1754, 30, 10567, 296, 3100, 1348, 2840]


In [13]:
from itertools import compress
data_index = 0
batch = np.ndarray(shape=(batch_size, window_size + 1), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = window_size + 1
buffer = collections.deque(maxlen=span) # used for collecting word_ids[data_index] in the sliding window
def generate_batch_pvdm_doc(word_ids, batch_size, window_size):
    '''
    Batch generator for PV-DM (Distributed Memory Model of Paragraph Vectors).
    batch should be a shape of (batch_size, window_size+1)
    Parameters
    ----------
    doc_ids: list of document indices 
    word_ids: list of word indices
    batch_size: number of words in each mini-batch
    window_size: number of leading words before the target word 
    '''
    global data_index
    global batch
    global labels
    global span
    global buffer
    # collect the first window of words
    if (data_index == 0):
        for _ in range(span):
            buffer.append(doc_word_ids[data_index])
            data_index = (data_index + 1)
    mask = [1] * span
    mask[-1] = 0 
    i = 0
    doc_id = 0
    while i < batch_size:
        if (data_index == len(doc_word_ids)):
            data_index = 0
            return None, None
        # all leading words and the doc_id
        batch[i, :] = list(compress(buffer, mask)) + [doc_id]
        labels[i, 0] = buffer[-1] # the last word at end of the sliding window
        i += 1
        # move the sliding window  
        buffer.append(doc_word_ids[data_index])
        data_index += 1            
    return batch, labels

## Inferring the input document vector

In [14]:
average_loss = 0
step = 0
for epoch in range(5):
    while(True):
        batch_data, batch_labels = generate_batch_pvdm_doc(doc_word_ids,
            batch_size, window_size)
        if batch_data is None:
            print('broke')
            break
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        op, l = test_session.run([doc_optimizer, doc_loss], feed_dict=feed_dict)
        loss = np.mean(l)
        if step > 0:
            average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, loss))
            average_loss = 0
        step += 1

Average loss at step 1: 1608.716553
Average loss at step 2: 1227.205811
Average loss at step 3: 804.639404
Average loss at step 4: 2009.318726
Average loss at step 5: 2072.885986
Average loss at step 6: 1245.071533
broke
Average loss at step 7: 972.971497
Average loss at step 8: 670.461060
Average loss at step 9: 753.789673
Average loss at step 10: 369.410156
Average loss at step 11: 2188.986816
Average loss at step 12: 1862.519775
Average loss at step 13: 584.724304
broke
Average loss at step 14: 718.570190
Average loss at step 15: 1036.047241
Average loss at step 16: 835.010559
Average loss at step 17: 292.137543
Average loss at step 18: 1959.807861
Average loss at step 19: 1774.422607
Average loss at step 20: 312.335144
broke
Average loss at step 21: 447.456696
Average loss at step 22: 695.523376
Average loss at step 23: 829.688110
Average loss at step 24: 362.546722
Average loss at step 25: 1690.319702
Average loss at step 26: 1676.359741
Average loss at step 27: 524.598999
broke
A

In [15]:
with test_session.as_default(), test_graph.as_default():
    word_embeddings = word_embeddings.eval()
    doc_embeddings = doc_embeddings.eval()
    doc_embedding = doc_embedding.eval()

In [16]:
test_session.close()

In [17]:
doc_embedding

array([[ -1.61235058e+00,  -1.98829699e+00,   3.81818116e-01,
         -2.14010096e+00,   1.92247617e+00,  -9.25120711e-01,
         -7.59949636e+00,   3.27577496e+00,   2.48243308e+00,
          7.33018219e-01,   1.95946205e+00,   1.48673797e+00,
          1.27801239e+00,   3.30408788e+00,   7.62083352e-01,
         -3.53002250e-01,   8.19922745e-01,   3.97603154e+00,
         -3.81781667e-01,  -5.04135418e+00,   5.94559073e-01,
         -9.91142035e-01,   4.76521778e+00,   3.02548027e+00,
         -2.15528488e+00,  -2.75961328e+00,  -2.07385087e+00,
          2.38021874e+00,   5.56441069e-01,  -1.25326324e+00,
         -7.40746856e-01,  -2.63129687e+00,   2.05743217e+00,
         -3.51938844e+00,  -1.86649859e+00,  -2.19924021e+00,
         -8.93701375e-01,  -1.61902761e+00,   2.06132218e-01,
         -4.96853352e+00,   1.72255778e+00,   2.16816711e+00,
          2.62566400e+00,  -6.53638542e-01,  -5.82914972e+00,
         -4.76629162e+00,   2.09511232e+00,  -5.55043340e-01,
        

## Loading cases from files

In [18]:
import json
tids = []
case_file = open('all_cases.txt', 'r', encoding='utf-8')
cases = json.load(case_file)
i = 0
for case in cases:
    tids.append(case[0])
n_documents = len(docs)

NameError: name 'docs' is not defined

In [19]:
import numpy as np
from scipy import spatial
with test_session.as_default():
    cosines = []
    for i in range(len(doc_embeddings)):
        cosines.append(spatial.distance.cosine(doc_embedding, doc_embeddings[i]))
    #ind = np.argpartition(cosines, 4)
    cos = np.array(cosines)
    ind = cos.argsort()[:10]
    print(tids[ind[0]])
    for i in ind:
        print('{} : {}'.format(i, tids[i]))

12564942
649 : 12564942
63 : 142324899
420 : 149450971
8950 : 488831
8880 : 1926347
795 : 107667489
9075 : 72187374
132 : 172153811
6 : 40022871
632 : 41002179


In [None]:
tids[184]