In [1]:
import numpy as np
import random
from random import shuffle
import os
import gensim
import re
import pickle
import tensorflow as tf
import operator
import math
import sys
from copy import deepcopy
from collections import Counter
import string
import fileinput
from nltk.tokenize import sent_tokenize, word_tokenize 
import nltk

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import split, explode
from pyspark.sql import functions as func
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, DenseVector, VectorUDT

from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.python.framework import ops
from tensorflow.models.rnn.translate import seq2seq_model

In [2]:
data_dir = '/media/ai2-rey/data_disk/data_sets/'

In [3]:
with open(data_dir+'abstracts.txt', 'r') as f:
    abstracts = [line.strip() for line in f]
    abstracts = abstracts[:1000000]

In [4]:
with open(data_dir+'titles.txt','r') as f:
    titles = [line.strip() for line in f]
    titles = titles[:1000000]

In [5]:
l = len(abstracts)
train_abstracts = abstracts[:int(0.9*l)]
train_titles = titles[:int(0.9*l)]

test_abstracts = abstracts[int(0.9*l):]
test_titles = titles[int(0.9*l):]

In [6]:
del abstracts
del titles

In [7]:
UNKNOWN = "*UNKNOWN*"
PAD = "*PAD*"

def generate_embedding_files(filename):
    embeddings = {}
    for line in open(filename):
        parts = line.split()
        embeddings[parts[0]] = list(map(float, parts[1:]))
    embedding_size = len(list(embeddings.values())[0])
    embeddings[UNKNOWN] = [0.0 for _ in range(embedding_size)]
    embeddings[PAD] = [1.0 for _ in range(embedding_size)]
    
    words = embeddings.keys()
    embedding_matrix = np.array([embeddings[word] for word in list(embeddings.keys())])
    return words, embedding_matrix

glove_words, glove_embedding_matrix = generate_embedding_files('/media/ai2-rey/data_disk/data_sets/glove.6B/glove.6B.50d.txt')
glove_vocab_lookup = {word: i for i, word  in enumerate(glove_words)}
glove_vocab_size, glove_embedding_size= glove_embedding_matrix.shape

In [8]:
class TextSum_Dataset:
  
    def __init__(self, abstract, title, word2id_dict=None, 
                 id2word_dict=None, use_glove = True, 
                 max_vocab_size=None, autopad_abstract='max', 
                 autopad_title ='max', abstract_pad_len=None, title_pad_len = None):

        assert autopad_abstract in {'min', 'max', 'avg'}
        assert autopad_title in {'min', 'max', 'avg'}
     
        self.abstract = abstract
        self.title = title
        
        self.max_vocab_size = max_vocab_size
        
        self.abstract_pad_len = abstract_pad_len
        self.title_pad_len = title_pad_len
        self.__autopad_abstract = autopad_abstract
        self.__autopad_title = autopad_title
        
        self.vocab = set()
        self.word_counter = dict()
        self.use_glove = use_glove
        self.word2id_dict = word2id_dict
        self.id2word_dict = id2word_dict
        
        self.abstract_ids = None
        self.title_ids = None
        self.abstract_lens = None
        self.titles_lens = None
        
        self.__parse_data()
        if self.word2id_dict is None: self.__create_word2id_dict() 
        self.__numericize_data()

    def __normalize_text(self, s):
        def remove_articles(text):
            return re.sub(r'\b(a|an|the)\b', ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            exclude.discard('.')
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        def add_space(text):
            return text.replace('.', ' . ')

        return white_space_fix(add_space(remove_punc(lower(s))))

    
    def __update_word_counter(self, sequence):
        """ Update word_counter with counts for words in a sentence
        
        Args:
            sequence (list<str>) : list of words in a sequence
        
        """
        for word in sequence:
            self.word_counter[word] = self.word_counter.get(word, 0) + 1
            
    def __create_vocab(self):
        """ Create set of most frequent unique words found in the training data """
        
        if self.max_vocab_size == None:
            self.vocab = set(self.word_counter.keys())
        else:
            self.vocab = set(sorted(self.word_counter, key=self.word_counter.get, reverse=True)[:self.max_vocab_size])
        
    def __shuffle_data(self, data):
        random.shuffle(data)
        return list(zip(*data))
    
    def __parse_data(self):
        
        abstracts = []
        titles = []
        
        for idx in range(len(self.abstract)):
            a = self.__normalize_text(self.abstract[idx])
            t = self.__normalize_text(self.title[idx])
            
            self.__update_word_counter(a.split())
            self.__update_word_counter(t.split())
            
            abstracts.append(a)
            titles.append(t)
            
        self.abstract = abstracts
        self.title = titles
        
        del abstracts
        del titles
        
        self.__create_vocab()
        
        shuffle = list(zip(self.abstract, self.title))        
        self.abstract, self.title = self.__shuffle_data(shuffle)
    
    def __create_word2id_dict(self):
        
        if self.word2id_dict == None:
            self.word2id_dict = dict()
            self.id2word_dict = dict()
            misc_tokens = ['PAD', 'UNK']

            for i, token in enumerate(misc_tokens):
                self.word2id_dict[token] = i

            for word in self.vocab:
                self.word2id_dict[word] = len(self.word2id_dict)

            self.vocab |= set(misc_tokens)
            
            word2id = glove_vocab_lookup if self.use_glove else self.word2id_dict
            self.id2word_dict = dict(zip(word2id.values(), word2id.keys()))
            
            self.num_tokens = len(self.word2id_dict)
    
    def __convert_word2id(self, word):
        
        try:
            word_id = glove_vocab_lookup[word] if self.use_glove else self.word2id_dict[word]
        except:
            word_id = glove_vocab_lookup[UNKNOWN] if self.use_glove else self.word2id_dict['UNK']

        return word_id
    
    def __apply_padding(self, s, pad_len):
        sequence = s[:]
        
        PAD_sym = glove_vocab_lookup[PAD] if self.use_glove else self.word2id_dict['PAD'] 
        
        if len(sequence) < pad_len:
            sequence += [PAD_sym for i in range(pad_len - len(sequence))]
        elif len(sequence) > pad_len:
            sequence = sequence[:pad_len]
        else:
            pass
        return sequence
        
    def __get_seq_length_stats(self, sequences):
        max_len = 0
        min_len = 100000
        avg_len = 0
        for sequence in sequences:
            max_len = max(max_len, len(sequence))
            min_len = min(min_len, len(sequence))
            avg_len += len(sequence)
        avg_len = int(float(avg_len) / len(sequences))
        return min_len, max_len, avg_len

    def __get_max_sequence_lengths(self, abstract_ids, title_ids):
        min_abstract_len, max_abstract_len, avg_abstract_len = self.__get_seq_length_stats(abstract_ids)
        min_title_len, max_title_len, avg_title_len = self.__get_seq_length_stats(title_ids)

        if self.abstract_pad_len == None:
            if self.__autopad_abstract != None:
                if self.__autopad_abstract == 'min':
                    self.abstract_pad_len = min_abstract_len
                elif self.__autopad_abstract == 'max':
                    self.abstract_pad_len = max_abstract_len
                elif self.__autopad_abstract == 'avg':
                    self.abstract_pad_len = avg_abstract_len
            else:
                self.abstract_pad_len = avg_abstract_len
                
        if self.title_pad_len == None:
            if self.__autopad_title != None:
                if self.__autopad_title == 'min':
                    self.title_pad_len = min_title_len
                elif self.__autopad_title == 'max':
                    self.title_pad_len = max_title_len
                elif self.__autopad_title == 'avg':
                    self.title_pad_len = avg_title_len
            else:
                self.title_pad_len = avg_title_len     
    
    def __tokenize_sentences(self, abstract_ids, title_ids):
        """ Tokenizes sentences.
        :param raw: dict returned from load_babi
        :param word_table: WordTable
        :return:
        """
        abstract_tokens = []
        title_tokens = []
        
        for i in range(len(abstract_ids)):
            a_tkn_ids = self.__apply_padding(abstract_ids[i], self.abstract_pad_len)
            abstract_tokens.append(a_tkn_ids)
            t_tkn_ids = self.__apply_padding(title_ids[i], self.title_pad_len)
            title_tokens.append(t_tkn_ids)
            
        return abstract_tokens, title_tokens
        
    def __convert_text2ids(self, abstract, title):
        """ Tokenizes sentences.
        :param raw: dict returned from load_babi
        :param word_table: WordTable
        :return:
        """
        abstract_ids = []
        title_ids = []
        
        for i in range(len(abstract)):
            a_ids = [self.__convert_word2id(word) for word in abstract[i].split()]
            abstract_ids.append(a_ids)
            t_ids = [self.__convert_word2id(word) for word in title[i].split()]
            title_ids.append(t_ids) 
        
        return abstract_ids, title_ids
          
    
    def __numericize_data(self):
        a, t = self.__convert_text2ids(self.abstract, self.title)
        
        self.abstract_lens = [len(i) for i in a]
        self.titles_lens = [len(i) for i in t]
        
        self.__get_max_sequence_lengths(a,t)
        
        self.abstract_ids, self.title_ids = self.__tokenize_sentences(a,t)
        del a,t

In [9]:
train = TextSum_Dataset(train_abstracts, train_titles, max_vocab_size=50000, abstract_pad_len=200, title_pad_len=20, use_glove=False)

In [10]:
test = TextSum_Dataset(test_abstracts, test_titles, train.word2id_dict, train.id2word_dict, 
                      abstract_pad_len = train.abstract_pad_len, title_pad_len = train.title_pad_len, use_glove=False)

In [11]:
len(train.word2id_dict)

50002

In [12]:
# Saving the objects:
with open('training_vars.pickle', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([train.word2id_dict, train.id2word_dict, train.vocab, train.abstract_pad_len, train.title_pad_len], f)

In [13]:
del train_abstracts, train_titles, test_abstracts, test_titles

In [14]:
class DataIterator:  
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.data_iterator = self.make_random_iter()
        
    def next_batch(self):
        try:
            idxs = next(self.data_iterator)
        except StopIteration:
            self.data_iterator = self.make_random_iter()
            idxs = next(self.data_iterator)
            
        batch = [self.data[i] for i in idxs]
        batch_idxs = [idx for idx in idxs]
        return batch, batch_idxs

    def make_random_iter(self):
        splits = np.arange(self.batch_size, len(self.data), self.batch_size)
        it = np.split(np.random.permutation(range(len(self.data))), splits)[:-1]
        return iter(it)

In [15]:
train_ids = list(zip(train.abstract_ids, train.title_ids))
test_ids = list(zip(test.abstract_ids, test.title_ids))

In [16]:
train_data_iter = DataIterator(train_ids,128)
test_data_iter = DataIterator(test_ids,128)
deploy_data_iter = DataIterator(train_ids,1)

In [17]:
def prepare_model_dir():
    model_dir = os.getcwd() + '/'
    if not os.path.exists(model_dir + 'weights'):
        os.makedirs(model_dir + 'weights')
    return model_dir

In [18]:
def write_to_log(string,filename):
#     print(string)
    with open(filename,'a') as write_file:
        write_file.write(string + '\n')

In [19]:
def loop_function(prev, embedding, w, b, update_embedding=True):
    prev = tf.nn.xw_plus_b(prev, w, b)
    prev_symbol = tf.argmax(prev,1) #maybe 0 if not transpose
    emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = tf.stop_gradient(emb_prev)
    return 

In [33]:
abstract_max_len = train.abstract_pad_len
title_max_len = train.title_pad_len

n_hidden = 150 
n_hidden_layers = 2
word_dim = glove_embedding_size
vocab_size = len(train.vocab)

learning_rate = 0.001
train_iters = 100000
keep_prob = 0.5

display_step = 200
val_interval = 1000
save_weights_interval = 1000
deploy_interval = 1000


device = '/gpu:0'

In [40]:
model_dir = prepare_model_dir()
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [41]:
passage = tf.placeholder(tf.int32,[None, abstract_max_len])
#passage_lens = tf.placeholder(tf.int32,[None])
summary = tf.placeholder(tf.int64,[None, title_max_len])

dropout = tf.placeholder(tf.float32)
batch_size = tf.placeholder(tf.int32)
use_prev = tf.placeholder(tf.bool)

encoder_inputs = tf.unstack(passage, axis=1)
labels = tf.unstack(summary, axis=1)
decoder_inputs = [tf.zeros_like(labels[0], dtype=tf.int64, name='GO')] + labels[:-1]

with tf.variable_scope('embedding'):
    embedding = tf.Variable(tf.truncated_normal(shape=[vocab_size, word_dim], stddev=1e-4))
    #embedding = tf.constant(glove_embedding_matrix, name='embeddings', dtype=tf.float32)
    
    emb_enc_inputs = [tf.nn.embedding_lookup(embedding, x)
                      for x in encoder_inputs]
    emb_dec_inputs = [tf.nn.embedding_lookup(embedding, x)
                      for x in decoder_inputs]

In [42]:
with tf.variable_scope('encoder'):
    enc_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
    enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=dropout)
    #enc_cell = tf.nn.rnn_cell.MultiRNNCell([enc_cell]*n_hidden_layers)

    enc_state = enc_cell.zero_state(batch_size, dtype=tf.float32)
    h_states = []
    c_states = []
    
    for i in range(abstract_max_len):
        h, enc_state = enc_cell(emb_enc_inputs[i], enc_state)
        tf.get_variable_scope().reuse_variables()
#         h_states.append(enc_state.h)
#         c_states.append(enc_state.c)
    
#     h_states = tf.transpose(tf.stack(h_states),[1,0,2])
#     h_indices = tf.range(tf.shape(h_states)[0])*tf.shape(h_states)[1]+(passage_lens-1)
#     h_thought = tf.gather(tf.reshape(h_states,[-1,n_hidden]), h_indices)
    
#     c_states = tf.transpose(tf.stack(c_states),[1,0,2])
#     c_indices = tf.range(tf.shape(c_states)[0])*tf.shape(c_states)[1]+(passage_lens-1)
#     c_thought = tf.gather(tf.reshape(c_states,[-1,n_hidden]), c_indices)

w = tf.Variable(tf.truncated_normal(shape=[n_hidden, vocab_size], stddev=1e-4))
w_t = tf.transpose(w)
b = tf.Variable(tf.truncated_normal(shape=[vocab_size], stddev=1e-4))

with tf.variable_scope('decoder'):

    dec_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
    dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=dropout)
    #dec_cell = tf.nn.rnn_cell.OutputProjectionWrapper(dec_cell, vocab_size)

    dec_state =  enc_state

    dec_h_states = []
    
    for i in range(title_max_len):
        if use_prev == True and i>0:
            prev_ids = tf.argmax(tf.nn.softmax(tf.matmul(h, w) + b), axis=1)
            prev_word = tf.nn.embedding_lookup(embedding, prev_ids)
        else: 
            prev_word = emb_dec_inputs[i]

        h, dec_state = dec_cell(prev_word, dec_state)
        tf.get_variable_scope().reuse_variables()
        dec_h_states.append(h)

with tf.variable_scope('dense_output'):
    generated_tokens = []
    output_logits = []
    
    for h_state in dec_h_states:
        logits = tf.matmul(h_state, w) + b
        probs = tf.nn.softmax(logits)
        pred_ids = tf.argmax(probs, axis=1)

        output_logits.append(logits)
        generated_tokens.append(pred_ids)

with tf.variable_scope('sampled_loss'):
    labels = labels
    decoder_loss = 0.0

    for i, logits in enumerate(output_logits):
        step_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels[i])
        decoder_loss += tf.reduce_mean(step_loss)
    loss = decoder_loss / float(title_max_len)

with tf.variable_scope('accuracy'):
    labels = decoder_inputs
    accuracy = 0

    for i, token_id in enumerate(generated_tokens):
        accuracy += tf.reduce_mean(tf.cast(tf.equal(token_id, labels[i]), tf.float32))
    accuracy = accuracy / float(title_max_len)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

In [43]:
sess.run(tf.global_variables_initializer())

In [44]:
log_file = model_dir + 'seq2seq_log.txt'
saver = tf.train.Saver(max_to_keep=100)
log=open(log_file,'w')
log.close()

In [None]:
for train_iter in range(train_iters):
    train_iter +=1
    train_batch, _ = train_data_iter.next_batch()
    train_abstract, train_title = zip(*train_batch)
    if train_iter==1:
        init_feed_dict = {passage:train_abstract,
                          summary: train_title,
                          dropout: 1.0,
                          batch_size: train_data_iter.batch_size,
                          use_prev: False}
        
        init_loss, init_acc = tuple(sess.run([loss, accuracy], 
                                            feed_dict=init_feed_dict))
        log_output = ("Iter {}, accuracy = {:.4f}, loss = {:.6f}").format(train_iter, init_acc, init_loss)
        
        print(log_output)
        write_to_log(log_output, log_file)
    
    train_feed_dict = {passage:train_abstract, 
                       summary: train_title,
                       dropout: keep_prob,
                       batch_size: train_data_iter.batch_size,
                       use_prev: False}
    
    sess.run(optimizer, feed_dict=train_feed_dict)
    
    if train_iter % display_step ==0:
        disp_feed_dict = {passage:train_abstract,
                          summary: train_title,
                          dropout: 1.0,
                          batch_size: train_data_iter.batch_size,
                          use_prev: False
                         }
        
        train_loss, train_acc = tuple(sess.run([loss, accuracy], 
                                               feed_dict=disp_feed_dict))
        
        log_output = ("Iter {}, accuracy = {:.4f}, loss = {:.6f}").format(train_iter, train_acc, train_loss)
        print(log_output)
        write_to_log(log_output, log_file)
    
    if test_data_iter !=None and train_iter% val_interval==0:
        test_batch, _ = test_data_iter.next_batch()
        test_abstract, test_title = zip(*test_batch)
        test_feed_dict = {passage:test_abstract,
                          summary: test_title,
                          dropout: 1.0,
                          batch_size: test_data_iter.batch_size,
                          use_prev: True}
        
        test_loss, test_acc = tuple(sess.run([loss, accuracy], 
                                             feed_dict=test_feed_dict))
        
        log_output = ("Iter {}, VALIDATION -- accuracy = {:.4f}, loss = {:.6f}").format(train_iter, test_acc, test_loss)
        print(log_output)
        write_to_log(log_output, log_file)
    
    if deploy_data_iter !=None and train_iter % deploy_interval==0:
        deploy_batch, _ = deploy_data_iter.next_batch()
        deploy_abstract, deploy_title = zip(*deploy_batch)
        
        model_pred = sess.run(generated_tokens, feed_dict={passage: deploy_abstract,
                                                           summary: deploy_title,
                                                           dropout: 1.0,
                                                           batch_size: deploy_data_iter.batch_size, 
                                                           use_prev: True})
        
        label_ids = deploy_title[0]
        pred_ids = [x[0] for x in model_pred]
        log_output = ''
        if train.id2word_dict != None:
            log_output = ('PASSAGE\n{}\nModel Pred = {}\nGround Truth = {}\n'
                          .format(' '.join([train.id2word_dict[idx] for idx in deploy_abstract[0]]),
                                  ' '.join([train.id2word_dict[idx] for idx in pred_ids]),
                                  ' '.join([train.id2word_dict[idx] for idx in label_ids]))
                         )
        else:
            log_output = ('Model Pred = {}, Ground Truth = {}'
                          .format(pred_ids, label_ids)
                         )
        print(log_output)
        write_to_log(log_output, log_file)
    
    if train_iter% save_weights_interval ==0:
        weights_dir = model_dir + "weights/seq2seq_weights_iter--{}.ckpt".format(train_iter)
        save_path = saver.save(sess,weights_dir)
        save_string = "Model saved in file: {}".format(save_path)
        print(save_string)
        write_to_log(save_string, log_file)

Iter 1, accuracy = 0.2512, loss = 4.643216
