In [2]:
import numpy as np
import random
from random import shuffle
import os
import gensim
import re
import pickle
import operator
import math
import sys
from copy import deepcopy
from collections import Counter
import string
import fileinput
from nltk.tokenize import sent_tokenize, word_tokenize 
import nltk
import tensorflow as tf

In [3]:
# Load the vocabulary lookups and padding lengths stored in 'training_vars.pickle'.
# The pickle must unpack to exactly five objects, bound in the order shown below.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a pickle that comes from a source you trust.
with open('training_vars.pickle', 'rb') as f: 
    word2id_dict, id2word_dict, vocab, abstract_pad_len, title_pad_len = pickle.load(f)

In [4]:
# Number of vocabulary entries; build_model uses it to size the embedding
# table and the output projection.
vocab_size = len(vocab)

In [5]:
def normalize_text(s):
    """Lower-case ``s``, strip punctuation except '.', and make each '.' its own token.

    Steps, in order: lower-case the text, delete every character found in
    ``string.punctuation`` other than the full stop, surround each full stop
    with spaces, then collapse all whitespace runs into single spaces.

    Articles ("a", "an", "the") are kept: no article-removal step is part of
    this pipeline.

    Args:
        s: Text to normalise (a ``str``).

    Returns:
        The normalised text, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    # The full stop survives so that it can be split out as a token below.
    stripped_marks = set(string.punctuation) - {'.'}

    lowered = s.lower()
    without_punc = ''.join(ch for ch in lowered if ch not in stripped_marks)
    spaced_stops = without_punc.replace('.', ' . ')

    # split()/join collapses the doubled spaces introduced around '.' as well
    # as any tabs or newlines in the input.
    return ' '.join(spaced_stops.split())

In [6]:
def convert_word2id(word):
    """Return the vocabulary id of ``word``, or the id of 'UNK' if it is unknown.

    Args:
        word: Token to look up in the module-level ``word2id_dict``.

    Returns:
        ``word2id_dict[word]`` when the word is present, otherwise
        ``word2id_dict['UNK']``.

    Raises:
        KeyError: If ``word`` is unknown and 'UNK' itself is missing from
            ``word2id_dict``.
    """
    try:
        return word2id_dict[word]
    except KeyError:
        # Only a genuine vocabulary miss falls back to UNK; any other failure
        # (e.g. an unhashable argument, KeyboardInterrupt) now propagates
        # instead of being silently mapped to the unknown token.
        return word2id_dict['UNK']

In [7]:
def apply_padding(s, pad_len):
    """Return a copy of ``s`` that is right-padded or truncated to ``pad_len`` items.

    Args:
        s: Sequence of token ids (a list); it is not modified.
        pad_len: Target length of the result.

    Returns:
        A new list: ``s`` followed by as many ``word2id_dict['PAD']`` ids as
        are needed to reach ``pad_len``, or the first ``pad_len`` items of
        ``s`` when it is already long enough.
    """
    pad_id = word2id_dict['PAD']
    shortfall = pad_len - len(s)

    if shortfall > 0:
        # Too short: copy the input and append the missing PAD ids.
        return s[:] + [pad_id] * shortfall

    # Exact fit or too long: the slice is a plain copy or a truncation.
    return s[:pad_len]

In [8]:
def tokenize_seq(s, pad_len):
    """Turn raw text into a list of exactly ``pad_len`` token ids.

    The text is normalised with ``normalize_text``, split on whitespace, each
    word is mapped through ``convert_word2id``, and the resulting id list is
    padded or truncated by ``apply_padding``.

    Args:
        s: Raw input text.
        pad_len: Length of the returned id list.

    Returns:
        The value returned by ``apply_padding`` for the id list.
    """
    words = normalize_text(s).split()
    word_ids = list(map(convert_word2id, words))
    return apply_padding(word_ids, pad_len)

In [9]:
# Dedicated TensorFlow graph; build_model adds all of its ops to this graph.
# NOTE(review): a session that runs these ops presumably has to be created
# with graph=graph -- confirm where the session is constructed.
graph = tf.Graph()

In [10]:
def build_model(n_hidden, word_dim, learning_rate, gpu_device=0):
    """Construct the LSTM encoder-decoder ops inside the module-level ``graph``.

    The handles other cells need are published as module globals rather than
    returned: the placeholders ``passage``, ``summary``, ``dropout``,
    ``batch_size`` and ``use_prev``, the per-step prediction list
    ``generated_tokens``, and ``saver``. The loss, the accuracy and the Adam
    training op are built as well but stay local to this function.

    Statement order matters here: variables are created with auto-generated
    names, so reordering variable creation would presumably change the names
    a checkpoint is matched against.

    Args:
        n_hidden: Unit count of the encoder and decoder LSTM cells; also the
            first dimension of the output projection matrix.
        word_dim: Width of each row of the embedding table.
        learning_rate: Learning rate handed to ``tf.train.AdamOptimizer``.
        gpu_device: Index used to form the ``/gpu:<index>`` device string.

    Returns:
        None.
    """
    
    # NOTE(review): this resets the *default* graph, whereas every op below is
    # added to the module-level `graph`. A second call to build_model would
    # presumably add a second copy of all ops/variables to `graph` -- confirm.
    tf.reset_default_graph()
    gpu_device_name = '/gpu:{}'.format(gpu_device)
    
    with graph.as_default(), tf.device(gpu_device_name):
        
        # Token ids of the input text: one row per example, abstract_pad_len columns.
        global passage 
        passage = tf.placeholder(tf.int32,[None, abstract_pad_len])
        # Token ids of the target text: one row per example, title_pad_len columns.
        global summary 
        summary = tf.placeholder(tf.int64,[None, title_pad_len])
        
        # Fed to DropoutWrapper as output_keep_prob, i.e. a keep probability.
        global dropout 
        dropout = tf.placeholder(tf.float32)
        # Used to size the encoder's zero state.
        global batch_size
        batch_size = tf.placeholder(tf.int32)
        global use_prev 
        use_prev = tf.placeholder(tf.bool)

        # Split both id matrices into one tensor per time step.
        encoder_inputs = tf.unstack(passage, axis=1)
        decoder_inputs = tf.unstack(summary, axis=1)
        # Shift the target right by one step: an all-zero 'GO' column first,
        # last target column dropped.
        decoder_inputs = [tf.zeros_like(encoder_inputs[0], dtype=tf.int64, name='GO')] + decoder_inputs[:-1]
        
        with tf.variable_scope('embedding'):
            embedding = tf.Variable(tf.truncated_normal(shape=[vocab_size, word_dim], stddev=1e-4))
            #embedding = tf.constant(glove_embedding_matrix, name='embeddings', dtype=tf.float32)

            emb_enc_inputs = [tf.nn.embedding_lookup(embedding, x)
                              for x in encoder_inputs]
            # NOTE(review): emb_dec_inputs is built but never read again in
            # this function (the decoder loop below does not consume it).
            emb_dec_inputs = [tf.nn.embedding_lookup(embedding, x)
                              for x in decoder_inputs]  
        
        with tf.variable_scope('encoder'):
            enc_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
            enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=dropout)
            #enc_cell = tf.nn.rnn_cell.MultiRNNCell([enc_cell]*n_hidden_layers)

            enc_state = enc_cell.zero_state(batch_size, dtype=tf.float32)

            # Unroll the encoder by hand, one cell call per input position;
            # reuse_variables() makes later steps share the first step's weights.
            for i in range(abstract_pad_len):
                h, enc_state = enc_cell(emb_enc_inputs[i], enc_state)
                tf.get_variable_scope().reuse_variables()
                
        # Output projection from a decoder hidden state to vocabulary logits.
        w = tf.Variable(tf.truncated_normal(shape=[n_hidden, vocab_size], stddev=1e-4))
        # NOTE(review): w_t is not used anywhere else in this function.
        w_t = tf.transpose(w)
        b = tf.Variable(tf.truncated_normal(shape=[vocab_size], stddev=1e-4))

        with tf.variable_scope('decoder'):

            dec_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=dropout)
            #dec_cell = tf.nn.rnn_cell.OutputProjectionWrapper(dec_cell, vocab_size)

            # The decoder starts from the encoder's final state.
            dec_state =  enc_state
            
            dec_h_states = []
            
            for i in range(title_pad_len):
                # NOTE(review): this is a Python-level `if` on a placeholder
                # object, evaluated once while the graph is being built -- it
                # is not a runtime switch on the value fed for `use_prev`.
                # Under TF 1.x `tensor == True` is presumably a plain object
                # comparison that yields False, in which case every step takes
                # the else branch and the decoder input is always zeros.
                # Confirm against the training code before changing this
                # (tf.cond would be the runtime equivalent), because the
                # restored checkpoint was trained with whatever this built.
                if use_prev == True and i>0:
                    prev_ids = tf.argmax(tf.nn.softmax(tf.matmul(h, w) + b), axis=1)
                    prev_word = tf.nn.embedding_lookup(embedding, prev_ids)
                else: 
                    # All-zero input shaped like one encoder-step embedding.
                    prev_word = tf.zeros_like(emb_enc_inputs[0], dtype=tf.float32, name='GO')

                h, dec_state = dec_cell(prev_word, dec_state)
                tf.get_variable_scope().reuse_variables()
                dec_h_states.append(h)
                    
        with tf.variable_scope('dense_output'):
            # One tensor of predicted token ids per decoder step.
            global generated_tokens 
            generated_tokens = []
            output_logits = []
            
            for h in dec_h_states:
                logits = tf.matmul(h, w) + b
                probs = tf.nn.softmax(logits)
                pred_ids = tf.argmax(probs, axis=1)
                
                output_logits.append(logits)
                generated_tokens.append(pred_ids)
        
        # Despite the scope name, this is a full softmax cross-entropy averaged
        # over the batch and then over the decoder steps.
        with tf.variable_scope('sampled_loss'):
            # NOTE(review): the labels are the right-shifted decoder inputs
            # (GO followed by summary[:-1]), not the unshifted summary columns
            # -- confirm this is what training intended.
            labels = decoder_inputs
            decoder_loss = 0.0
            
            for i, logits in enumerate(output_logits):
                # Keyword arguments: TF >= 1.0 rejects positional logits/labels
                # for this op, and the keyword form is also valid on TF 0.12.
                step_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels[i], logits=logits)
                decoder_loss += tf.reduce_mean(step_loss)
            loss = decoder_loss / float(title_pad_len)
        
        with tf.variable_scope('accuracy'):
            # Same shifted labels as the loss above.
            labels = decoder_inputs
            accuracy = 0
            
            # Fraction of matching token ids, averaged over batch and steps.
            for i, token_id in enumerate(generated_tokens):
                accuracy += tf.reduce_mean(tf.cast(tf.equal(token_id, labels[i]), tf.float32))
            accuracy = accuracy / float(title_pad_len)
        
        # Training op; kept local, so it is not reachable from other cells.
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
        
        # Saver over every global variable that exists at this point.
        global saver 
        saver = tf.train.Saver(tf.global_variables())

In [11]:
def init_session(sess, checkpoint_path='weights/seq2seq_weights_iter--4000.ckpt',
                 n_hidden=150, word_dim=50, learning_rate=0.001, gpu_device=0):
    """Build the model graph and load checkpointed weights into ``sess``.

    Calling ``init_session(sess)`` with no further arguments behaves exactly
    as before; the keyword parameters only expose values that used to be
    hard-coded in the body.

    Args:
        sess: TensorFlow session to restore the variables into.
        checkpoint_path: Checkpoint file handed to ``saver.restore``.
        n_hidden: LSTM unit count passed to ``build_model``.
        word_dim: Embedding width passed to ``build_model``.
        learning_rate: Optimizer learning rate passed to ``build_model``.
        gpu_device: GPU index passed to ``build_model``.

    Returns:
        The same ``sess`` object that was passed in.

    Note:
        The variable shapes created by ``build_model`` presumably have to
        match the ones stored in the checkpoint, so ``n_hidden`` and
        ``word_dim`` should only be changed together with ``checkpoint_path``.
    """
    build_model(n_hidden, word_dim, learning_rate, gpu_device)

    # No variable initializer is run here: variable values come from the
    # checkpoint restore below (the saver is created by build_model).
    saver.restore(sess, checkpoint_path)
    return sess

In [12]:
def decode_line(sess, sentence):
    """Run the model on one piece of text and return the generated words.

    Args:
        sess: TensorFlow session in which the model ops can be run.
        sentence: Raw input text; it is tokenised and padded/truncated to
            ``abstract_pad_len`` ids by ``tokenize_seq``.

    Returns:
        A single string: the word for each entry of ``generated_tokens``
        (looked up in ``id2word_dict``), joined with spaces. Nothing is
        trimmed from the result.

    Raises:
        KeyError: If a predicted id is not a key of ``id2word_dict``.
    """
    token_ids = tokenize_seq(sentence, abstract_pad_len)

    feed = {
        passage: [token_ids],   # a batch holding this one example
        dropout: 1.0,           # fed as a keep probability, so nothing is dropped
        batch_size: 1,
        use_prev: True,
    }

    # generated_tokens holds predicted token ids (not logits): one array per
    # decoder step, each with one entry per batch example.
    step_token_ids = sess.run(generated_tokens, feed_dict=feed)

    # The batch has a single example, so index 0 selects its token at each step.
    return " ".join(id2word_dict[step[0]] for step in step_token_ids)