In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict
from tensorflow.python.layers import core as layers_core
from tensorflow.python.client import timeline
import time
from DataLoader import GloVe
from TextPreprocess.sequences import Sequences
from TextPreprocess.Tokenizer.RegExp import tokenize
import Utils.pandas_helper as ph

In [2]:
# Seed NumPy's global RNG so NumPy-driven randomness in later cells is repeatable.
np.random.seed(1234)
# Width of each word vector; also selects which GloVe file is loaded.
WORD_DIM = 300
# Rows of the embedding table: the 400000 loaded GloVe rows plus the three
# special symbols (<UNK>, <PAD>, <GO>) that Embedding adds.
WORD_COUNT = 400000+3

In [3]:
from sklearn.model_selection import train_test_split

# Read the Quora question-pair table and hold out 20% of its rows as the
# 'test' split; the remaining rows form the 'train' split.
df = file.read('data/Quora/train.csv')
data = dict(zip(('train', 'test'), train_test_split(df, test_size=0.2)))

In [4]:
# Load the GloVe file whose vector width matches WORD_DIM. The result is used
# below as a symbol -> vector dict (it is handed straight to Embedding).
glove = GloVe.load2('./data/GloVe/glove.6B.{}d.txt'.format(WORD_DIM))

Start: Loading Glove Model
End: Loaded 400000 rows.


In [5]:
# emb: dict mapping each symbol to a fixed-length list of floats (its vector).
# Embedding builds two index mappings over it: index -> symbol and symbol -> index.

class Embedding:
    """Symbol <-> index <-> vector lookup built on top of an embedding dict.

    Args:
        emb: dict mapping each symbol to a sequence of floats; every vector is
            expected to have the same length. The dict is modified in place:
            the special symbols '<UNK>', '<PAD>' and '<GO>' are added
            (overwriting entries with those names if they already exist).
        verbose: when True, print a short summary after construction.

    Raises:
        ValueError: if `emb` is empty, since the dimension cannot be inferred.
    """

    def __init__(self, emb, verbose = False):
        if len(emb) == 0:
            raise ValueError('emb must contain at least one symbol')

        self.emb = emb
        # Peek at one vector to learn the dimension without materialising
        # the complete key list.
        self.dim = len(next(iter(self.emb.values())))
        self.emb['<UNK>'] = [0. for i in range(self.dim)]
        self.emb['<PAD>'] = [1. for i in range(self.dim)]
        self.emb['<GO>'] = [-1. for i in range(self.dim)]

        self.build_dicts()

        if verbose:
            self.describe()

    def describe(self):
        """Print the dimension, the symbol count and the first few index/symbol pairs."""
        print('Embedding Dimension: {}'.format(self.dim))
        print('Embedding Symbols: {}'.format(len(self.emb)))
        # Never preview more entries than exist; small tables have fewer than 10.
        preview = min(10, len(self.idx2Sym))
        print('Index to symbol: {}'.format([(i, self.idx2Sym[i]) for i in range(preview)]))

    def getIndex(self, symbol):
        """Return the index of `symbol`, or the index of '<UNK>' when it is unknown."""
        if symbol in self.sym2Idx:
            return self.sym2Idx[symbol]
        return self.sym2Idx['<UNK>']

    def getEmb(self, symbol):
        """Return the vector of `symbol`, or the '<UNK>' vector when it is unknown."""
        return self.emb[self.idx2Sym[self.getIndex(symbol)]]

    def getSymbols(self, indices):
        """Map an iterable of indices back to their symbols."""
        return [self.idx2Sym[idx] for idx in indices]

    def getNumpyArray(self):
        """Return all vectors as one array whose row i is the vector of index i."""
        return np.array([self.emb[self.idx2Sym[idx]] for idx in range(len(self.emb))])

    def build_dicts(self):
        """(Re)build `sym2Idx` and `idx2Sym` from the current key order of `emb`."""
        self.sym2Idx = {key: index for index, key in enumerate(self.emb.keys())}
        self.idx2Sym = {index: key for key, index in self.sym2Idx.items()}

# Wrap the loaded GloVe table; verbose=True prints the dimension, the symbol
# count and the first few index/symbol pairs.
glove_emb = Embedding(glove, verbose=True)

Embedding Dimension: 300
Embedding Symbols: 400003
Index to symbol: [(0, 'eltingville'), (1, 'wadih'), (2, 'facchetti'), (3, 'niger'), (4, 'coomaraswamy'), (5, 'minister-delegate'), (6, 'lemgo'), (7, 'accelerants'), (8, '-------------------------'), (9, 'colebatch')]


In [6]:
def preprocess(string):
    """Tokenize `string` and return the embedding index of each lower-cased token."""
    indices = []
    for token in tokenize(string):
        indices.append(glove_emb.getIndex(token.lower()))
    return indices

def preprocessLabels(val):
    """One-hot encode a binary label: 0 -> [1., 0.], anything else -> [0., 1.]."""
    if val == 0:
        return [1., 0.]
    return [0., 1.]

In [7]:
# Turns an iterable of symbol sequences into padded batches
from functools import lru_cache

class Batcher:
    """Serves padded mini-batches from a collection of variable-length sequences.

    Args:
        sequences: sized iterable of sequences (each supporting `len`).
        verbose: when True, print/plot a summary on construction and announce
            the end of every pass in `generator`.
    """

    def __init__(self, sequences, verbose = False):
        self.seqs = sequences
        self.verbose = verbose
        self.size = len(self.seqs)
        self.seq_lens = [len(seq) for seq in self.seqs]
        # Stored on the instance instead of using functools.lru_cache on the
        # methods: a method-level lru_cache keys on `self` and therefore keeps
        # every Batcher (and all of its sequences) alive indefinitely.
        self._max_length = max(self.seq_lens, default=0)

        if self.verbose:
            self.describe()

    def max_length(self):
        """Return the length of the longest sequence (0 when there are none)."""
        return self._max_length

    def longgest_sequence(self):
        """Return the first sequence of maximal length, or None when there are none."""
        longest = self._max_length
        return next((seq for seq in self.seqs if len(seq) == longest), None)

    def describe(self):
        """Print the size and longest length, and plot a histogram of sequence lengths."""
        print('Size: {}'.format(self.size))
        print("Longest sequence length: {}".format(self.max_length()))
        bin_width = max(1, self.max_length() // 30)
        plt.hist(self.seq_lens, range(0, self.max_length() + bin_width, bin_width))
        plt.title('Sequence length distribution')
        plt.show()

    def batchPadding(self, batch, padding_symbol):
        """Return a (len(batch), longest record) array, right-padded with `padding_symbol`."""
        width = max((len(record) for record in batch), default=0)
        result = np.full((len(batch), width), padding_symbol)
        for row, record in enumerate(batch):
            result[row, :len(record)] = record
        return result

    def batchMask(self, batch):
        """Return a float array shaped like the padded batch: 1.0 on real items, 0.0 on padding."""
        width = max((len(record) for record in batch), default=0)
        result = np.full((len(batch), width), 0.0)
        for row, record in enumerate(batch):
            result[row, :len(record)] = 1.0
        return result

    # Same length within the batch, stuffed with padding symbol
    def generator(self, padding_symbol, batch_size=None, epouch=-1):
        """Yield `(padded_batch, lengths, mask)` tuples of `batch_size` sequences.

        Args:
            padding_symbol: value used to fill positions past a sequence's end.
            batch_size: sequences per batch; None means the whole collection.
            epouch: number of passes over the data; a negative value loops forever.

        A partially filled batch is carried over into the next pass rather than
        yielded short, so after the last pass any leftover sequences are dropped.
        """
        if batch_size is None:
            batch_size = self.size
        train = []
        length = []
        while epouch != 0:
            for seq in self.seqs:
                train.append(list(seq))
                length.append(len(seq))
                if len(train) == batch_size:
                    yield self.batchPadding(train, padding_symbol), length, self.batchMask(train)
                    train = []
                    length = []
            epouch -= 1
            if self.verbose:
                print('epouch done...')
                
                

class Batcher2:
    """Serves fixed-size batches of single items, each wrapped in a one-element list.

    Args:
        sequences: sized iterable of items.
        verbose: when True, announce the end of every pass in `generator`
            (the same convention Batcher uses).
    """

    def __init__(self, sequences, verbose = False):
        self.seqs = sequences
        self.verbose = verbose
        self.size = len(self.seqs)

    def generator(self, batch_size=32, epouch=-1):
        """Yield lists of `batch_size` one-element lists, e.g. [[a], [b], ...].

        Args:
            batch_size: items per batch; None means the whole collection.
            epouch: number of passes over the data; a negative value loops forever.

        A partially filled batch is carried over into the next pass rather than
        yielded short, so after the last pass any leftover items are dropped.
        """
        if batch_size is None:
            batch_size = self.size
        train = []
        while epouch != 0:
            for sym in self.seqs:
                train.append([sym])
                if len(train) == batch_size:
                    yield train
                    train = []
            epouch -= 1
            # Honour the constructor's `verbose` flag instead of always printing.
            if self.verbose:
                print('epouch done...')

In [8]:
# For each split: both question columns become lists of embedding indices
# (via preprocess) and the duplicate flag is pulled out as float32.
q1 = {
    split: data[split]['question1'].astype(str).apply(preprocess)
    for split in ['train', 'test']
}
q2 = {
    split: data[split]['question2'].astype(str).apply(preprocess)
    for split in ['train', 'test']
}
label = {
    split: data[split]['is_duplicate'].astype('float32')
    for split in ['train', 'test']
}

In [9]:
# One batch source per split: padding batchers for the two question columns
# and a plain item batcher for the labels.
q1_batcher = {split: Batcher(q1[split]) for split in ['train', 'test']}
q2_batcher = {split: Batcher(q2[split]) for split in ['train', 'test']}
label_batcher = {split: Batcher2(label[split]) for split in ['train', 'test']}

In [23]:
# Embedding table as one NumPy array; row i holds the vector of symbol index i.
EMBEDDING = glove_emb.getNumpyArray()

# Output channels of the width-2 convolution in simple_encoder.
LV1_DIM = 10
# NOTE(review): not referenced by any other cell visible in this notebook.
LV2_STEP = 1
# Number of GRU units in simple_encoder, i.e. the size of each question encoding.
LV2_DIM = 20

In [24]:
#IN (batch, time, dim)
def simple_dynamic_rnn(cell, inputs, lengths):
    """Run `cell` over `inputs` and return each row's output at its last valid step.

    Args:
        cell: RNN cell providing `zero_state` and `output_size`.
        inputs: float32 tensor laid out as (batch, time, dim).
        lengths: int32 tensor with one entry per batch row giving the number
            of valid time steps in that row.

    Returns:
        Tensor of shape (batch, cell.output_size).
    """
    batch_size = tf.shape(inputs)[0]
    step_size = tf.shape(inputs)[1]

    outputs, _ = tf.nn.dynamic_rnn(
        cell,
        inputs,
        initial_state = cell.zero_state(batch_size, dtype=tf.float32),
        dtype = tf.float32,
        sequence_length = lengths
    )

    # Clamp to step 0 for rows whose length is 0 (or negative). Without the
    # clamp such a row produces flat index b * step_size - 1, which silently
    # reads the previous row's final step (or index -1 for the first row).
    # Per the dynamic_rnn documentation, outputs past a row's sequence_length
    # are zeros, so step 0 of an empty row is a zero vector.
    last_step = tf.maximum(lengths - 1, 0)

    # Flatten (batch, time, units) to (batch * time, units) so a single gather
    # can pick one time step per row.
    flat_indices = tf.range(0, batch_size) * step_size + last_step
    flat_outputs = tf.reshape(outputs, [-1, cell.output_size])
    return tf.gather(flat_outputs, flat_indices)
#OUT (batch, dim)

#IN (batch, time, dim)
def simple_encoder(inputs, input_lengths, reuse=None):
    """Encode embedded sentences with a width-2 convolution followed by a GRU.

    Args:
        inputs: float32 tensor laid out as (batch, time, WORD_DIM).
        input_lengths: int32 tensor with the number of real tokens per row.
        reuse: pass True on the second and later calls so that both the
            convolution filter and the GRU weights are shared with the first
            call; None creates the variables.

    Returns:
        Tensor of shape (batch, LV2_DIM): the GRU output at each row's last
        valid convolution step.

    Note:
        The filter used to be created with tf.Variable, which ignores
        variable_scope reuse, so every call got its own filter while the GRU
        was shared. It is now a shared tf.get_variable named 'filter_2';
        checkpoints written by the previous version will not restore.
    """
    with tf.variable_scope('Level_1_Conv', reuse=reuse):
        #
        # Conv layer does not support dynamic length ;/
        #
        filter_2 = tf.get_variable(
            'filter_2',
            shape=[2, WORD_DIM, LV1_DIM],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(-1, 1)
        )

        #IN (batch, time, dim)
        conv_2 = tf.nn.conv1d(
            value = inputs,
            filters = filter_2,
            stride = 1,
            padding = 'VALID',
            use_cudnn_on_gpu=True,
            data_format=None,
            name='Conv_Witdh_2'
        )
        #OUT (batch, time-1, dim)

    with tf.variable_scope('Level_2_RNN'):

        cell = tf.contrib.rnn.GRUCell(
            num_units = LV2_DIM,
            input_size=None,
            activation=tf.tanh,
            reuse = reuse
        )

        # Shows the static shape of the convolution output while the graph is built.
        print(conv_2)

        # A width-2 'VALID' convolution yields one step fewer than its input,
        # so every row's valid length shrinks by one as well.
        rnn_output_2 = simple_dynamic_rnn(
            cell = cell,
            inputs = conv_2,
            lengths = input_lengths-1
        )

    return rnn_output_2

    
    
#OUT (batch, dim)

In [25]:
# Build the whole model in its own graph: two question inputs -> shared
# embedding lookup -> one encoder call per question -> single sigmoid output.
graph = tf.Graph()
with graph.as_default():
    
    with tf.variable_scope("Inputs"):
    
        #IN
        # One word-index placeholder per question (Q1, Q2).
        inputs = [tf.placeholder(tf.int32, (None, None), name = "Q{}_Word_Indices".format(i+1)) for i in range(2)]
        #OUT: (batch, time) int32
        
        # NOTE(review): batch_size and steps are not used by any other code
        # visible in this notebook.
        batch_size = [tf.shape(inputs[i], name= "Q{}_Batch_Size".format(i+1))[0] for i in range(2)]
        steps = [tf.shape(inputs[i], name= "Q{}_Steps".format(i+1))[1] for i in range(2)]
        
        #IN
        # `(None)` is plain None (not a 1-tuple), so the static shape of these
        # placeholders is left unspecified.
        input_lengths = [tf.placeholder(tf.int32, (None), name = "Q{}_Lengths".format(i+1)) for i in range(2)]
        #OUT: (batch) int32
        
        # Target labels, one column per row.
        truth = tf.placeholder(tf.float32, (None, 1), name = "labels")
        
    with tf.variable_scope("Embedding"):
        
        # Non-trainable table that starts as zeros; it is filled by running
        # `embedding_init` with the table fed through `embedding_placeholder`.
        embeddings = tf.Variable(tf.constant(0.0, shape=[WORD_COUNT, WORD_DIM]), trainable=False, name='embeddings', dtype=tf.float32)
        embedding_placeholder = tf.placeholder(tf.float32, [WORD_COUNT, WORD_DIM])
        embedding_init = embeddings.assign(embedding_placeholder)

        encoder_inputs = [tf.nn.embedding_lookup(embeddings, inputs[i]) for i in range(2)] 
        #OUT: (batch, time, dim) float32
        
    #IN: (batch, time, dim)
    with tf.variable_scope("Encoder"):
        # The second call passes reuse=True to simple_encoder.
        encoder_outputs = [simple_encoder(
                encoder_inputs[i], 
                input_lengths[i],
                reuse = None if i == 0 else True
            ) for i in range(2)]
    #OUT: (batch, dim)

    with tf.variable_scope("Final_Prediction"):
        # Concatenate both question encodings along the feature axis.
        final_input = tf.concat([encoder_outputs[0], encoder_outputs[1]], 1)
        
        final_weights = tf.Variable(tf.random_uniform([LV2_DIM * 2, 1], -1, 1), name='weights')
        final_bias = tf.Variable(tf.constant(0.0, shape=[1]), name="bias")
        
        # Single sigmoid unit on top of the concatenated encodings.
        predict = tf.nn.sigmoid(final_input @ final_weights + final_bias)
        
    loss = tf.reduce_mean(tf.contrib.keras.losses.binary_crossentropy(truth, predict))
    acc = tf.reduce_mean(tf.contrib.keras.metrics.binary_accuracy(truth, predict))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    # Only trainable variables are checkpointed; `embeddings` is created with
    # trainable=False, so it is not part of this saver and has to be loaded
    # through `embedding_init` in every session.
    saver = tf.train.Saver(tf.trainable_variables())

Tensor("Encoder/Level_1_Conv/Conv_Witdh_2/Squeeze:0", shape=(?, ?, 10), dtype=float32)
Tensor("Encoder/Level_1_Conv_1/Conv_Witdh_2/Squeeze:0", shape=(?, ?, 10), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [26]:
#show_graph(graph.as_graph_def())

In [27]:
# Number of loop iterations used by the training and evaluation cells.
num_steps = 2000000
# Checkpoint path passed to saver.save / saver.restore.
MODEL = './model/q.ckpt'

In [None]:
BATCH_SIZE = 128

# Batch streams per split. The three streams of a split stay aligned only as
# long as they are always advanced together.
q1_gen = {
    split: q1_batcher[split].generator(glove_emb.getIndex('<PAD>'), batch_size=BATCH_SIZE)
    for split in ['train', 'test']
}
q2_gen = {
    split: q2_batcher[split].generator(glove_emb.getIndex('<PAD>'), batch_size=BATCH_SIZE)
    for split in ['train', 'test']
}
label_gen = {
    split: label_batcher[split].generator(batch_size=BATCH_SIZE)
    for split in ['train', 'test']
}

In [None]:
DEBUG_SIZE = 1000   # report, evaluate and checkpoint once per this many training steps
EVAL_BATCHES = 10   # number of test batches averaged at each report


def next_feed_dict(split):
    """Advance the q1/q2/label streams of `split` together and build a feed dict."""
    q1_batch, q1_lengths, _ = next(q1_gen[split])
    q2_batch, q2_lengths, _ = next(q2_gen[split])
    labels = next(label_gen[split])
    return {
        inputs[0]: q1_batch,
        inputs[1]: q2_batch,
        input_lengths[0]: q1_lengths,
        input_lengths[1]: q2_lengths,
        truth: labels
    }


with tf.Session(graph = graph) as session:
    # Training always starts from freshly initialised variables. To resume,
    # call saver.restore(session, MODEL) after this initializer; the saver
    # only covers the trainable variables, so everything else still needs it.
    session.run(tf.global_variables_initializer())
    print('Restarting training...')

    # The embedding table is not trainable and not checkpointed; load it from
    # the array that was already built once instead of rebuilding it here.
    session.run(embedding_init, feed_dict={embedding_placeholder: EMBEDDING})

    average_loss = 0
    start = time.time()

    for step in range(num_steps):

        _, loss_val = session.run([optimizer, loss], feed_dict=next_feed_dict('train'))
        average_loss += loss_val

        # Act after every DEBUG_SIZE *completed* steps. Testing `step` itself
        # made the first window average DEBUG_SIZE + 1 losses and wrote a
        # checkpoint of the untrained model at step 0.
        steps_done = step + 1
        if steps_done % DEBUG_SIZE != 0:
            continue

        elapsed = time.time() - start
        print('Total time for {0} steps: {1:.2f}s, each step: {2:.2f}s'.format(DEBUG_SIZE, elapsed, elapsed / DEBUG_SIZE))
        print('Average mean loss at step ', steps_done, ': ', average_loss / DEBUG_SIZE)
        average_loss = 0

        avg_loss_val = 0
        avg_acc_val = 0
        for _ in range(EVAL_BATCHES):
            loss_val, acc_val = session.run([loss, acc], feed_dict=next_feed_dict('test'))
            avg_loss_val += loss_val
            avg_acc_val += acc_val

        print('Testing Set {0} batch loss: {1}, acc {2}:'.format(EVAL_BATCHES, avg_loss_val / EVAL_BATCHES, avg_acc_val / EVAL_BATCHES))

        save_path = saver.save(session, MODEL)
        print("Model saved in file: %s" % save_path)

        # Restart the timer last so evaluation and checkpointing are not
        # counted as training time in the next report.
        start = time.time()


In [None]:
DEBUG_SIZE = 100   # report the running mean loss once per this many test batches
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    # `saver` was built from tf.trainable_variables() only, so the frozen
    # embedding table is not in the checkpoint. In a new session it must be
    # loaded explicitly, otherwise evaluating `loss` hits an uninitialized
    # variable.
    session.run(embedding_init, feed_dict={embedding_placeholder: EMBEDDING})
    print('Restored model...')

    average_loss = 0
    start = time.time()

    # NOTE(review): the test streams loop forever, so this runs num_steps
    # batches and revisits the test split many times — confirm that is intended.
    for step in range(num_steps):

        test_q1, test_q1_lengths, _ = next(q1_gen['test'])
        test_q2, test_q2_lengths, _ = next(q2_gen['test'])
        test_label = next(label_gen['test'])

        feed_dict = {
            inputs[0]: test_q1,
            inputs[1]: test_q2,
            input_lengths[0]: test_q1_lengths,
            input_lengths[1]: test_q2_lengths,
            truth: test_label
        }

        loss_val = session.run(loss, feed_dict=feed_dict)
        average_loss += loss_val

        # Report after every DEBUG_SIZE *completed* batches so that each
        # window averages exactly DEBUG_SIZE losses (testing `step` itself put
        # DEBUG_SIZE + 1 losses into the first window).
        steps_done = step + 1
        if steps_done % DEBUG_SIZE == 0:
            elapsed = time.time() - start
            print('Total time for {0} steps: {1:.2f}s, each step: {2:.2f}s'.format(DEBUG_SIZE, elapsed, elapsed / DEBUG_SIZE))
            print('Average mean loss at step ', steps_done, ': ', average_loss / DEBUG_SIZE)
            average_loss = 0
            start = time.time()