In [1]:
import tensorflow as tf
from read_data import read_data, get_squad_data_filter, update_config
from tensorflow.contrib.rnn.python.ops.rnn_cell import _linear
import flag as fg
import os
import json
import numpy as np
from pprint import pprint
from functools import reduce
from operator import mul
from my.tensorflow import get_initializer

config = fg.main(_)
config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))

assert config.load or config.mode == 'train', "config.load must be True if not training"
if not config.load and os.path.exists(config.out_dir):
    shutil.rmtree(config.out_dir)

config.save_dir = os.path.join(config.out_dir, "save")
config.log_dir = os.path.join(config.out_dir, "log")
config.eval_dir = os.path.join(config.out_dir, "eval")
config.answer_dir = os.path.join(config.out_dir, "answer")
if not os.path.exists(config.out_dir):
    os.makedirs(config.out_dir)
if not os.path.exists(config.save_dir):
    os.mkdir(config.save_dir)
if not os.path.exists(config.log_dir):
    os.mkdir(config.log_dir)
if not os.path.exists(config.answer_dir):
    os.mkdir(config.answer_dir)
if not os.path.exists(config.eval_dir):
    os.mkdir(config.eval_dir)

In [14]:
VERY_BIG_NUMBER = 1e30
VERY_SMALL_NUMBER = 1e-30
VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER

In [3]:
data_filter = get_squad_data_filter(config)

train_data = read_data(config, 'train', False, data_filter=data_filter)
dev_data = read_data(config, 'dev', False, data_filter=data_filter)

update_config(config, [train_data, dev_data])

word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
word2idx_dict = train_data.shared['word2idx']

idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                    else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
                    for idx in range(config.word_vocab_size)])
config.emb_mat = emb_mat

# pprint(config.__flags, indent=2)

Loaded 87507/87599 examples from train
Loaded 10544/10570 examples from dev


In [18]:
# Context and Ques Parameters
N = config.batch_size
M = config.max_num_sents
JX = config.max_sent_size
JQ = config.max_ques_size
VW = config.word_vocab_size
VC = config.char_vocab_size
W = config.max_word_size
d =  config.hidden_size
dc = config.char_emb_size
dw = config.word_emb_size
dco = config.char_out_size

In [4]:
# Placeholders

x = tf.placeholder('int32', [N, None, None], name='x')
cx = tf.placeholder('int32', [N, None, None, W], name='cx')
x_mask = tf.placeholder('bool', [N, None, None], name='x_mask')
q = tf.placeholder('int32', [N, None], name='q')
cq = tf.placeholder('int32', [N, None, W], name='cq')
q_mask = tf.placeholder('bool', [N, None], name='q_mask')
y1 = tf.placeholder('bool', [N, None, None], name='y1')
y2 = tf.placeholder('bool', [N, None, None], name='y2')
is_train = tf.placeholder('bool', [], name='is_train')
new_emb_mat = tf.placeholder('float', [None, config.word_emb_size], name='new_emb_mat')

global_step = tf.get_variable('global_step', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False)

In [5]:
def get_initializer(matrix):
    def _initializer(shape, dtype=None, partition_info=None, **kwargs): return matrix
    return _initializer


def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None):
    with tf.name_scope(name or "dropout"):
        if keep_prob < 1.0:
            d = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed)
            out = tf.cond(is_train, lambda: d, lambda: x)
            return out
        return x

def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None):
    with tf.variable_scope(scope or "conv1d"):
        num_channels = in_.get_shape()[-1]
        filter_ = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float')
        bias = tf.get_variable("bias", shape=[filter_size], dtype='float')
        strides = [1, 1, 1, 1]
        if is_train is not None and keep_prob < 1.0:
            in_ = dropout(in_, keep_prob, is_train)
        xxc = tf.nn.conv2d(in_, filter_, strides, padding) + bias  # [N*M, JX, W/filter_stride, d]
        out = tf.reduce_max(tf.nn.relu(xxc), 2)  # [-1, JX, d]
        return out

def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None):
    with tf.variable_scope(scope or "multi_conv1d"):
        assert len(filter_sizes) == len(heights)
        outs = []
        for filter_size, height in zip(filter_sizes, heights):
            if filter_size == 0:
                continue
            out = conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob, scope="conv1d_{}".format(height))
            outs.append(out)
        concat_out = tf.concat(outs, 2)
        return concat_out
    
def flatten(tensor, keep):
    fixed_shape = tensor.get_shape().as_list()
    start = len(fixed_shape) - keep
    left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)])
    out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start, len(fixed_shape))]
    flat = tf.reshape(tensor, out_shape)
    return flat

def reconstruct(tensor, ref, keep):
    ref_shape = ref.get_shape().as_list()
    tensor_shape = tensor.get_shape().as_list()
    ref_stop = len(ref_shape) - keep
    tensor_start = len(tensor_shape) - keep
    pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
    keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))]
    # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))]
    # keep_shape = tensor.get_shape().as_list()[-keep:]
    target_shape = pre_shape + keep_shape
    out = tf.reshape(tensor, target_shape)
    return out

def linear(args, output_size, scope=None, is_train=None, input_keep_prob=1.0):
    
    flat_args = [flatten(arg, 1) for arg in args]
    shape = args[0].get_shape().as_list()
    input_size = shape[-1]
    
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
                     for arg in flat_args]
        
    with tf.variable_scope(scope or "linear"):
        W = tf.get_variable("W", [output_size, input_size], dtype=args[0].dtype)
        b = tf.get_variable("b", [output_size], dtype=args[0].dtype)

        flat_out = tf.matmul(flat_args[0], tf.transpose(W)) + b
        out = reconstruct(flat_out, args[0], 1)

        return out

def highway_layer(arg, scope=None, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "highway_layer"):
        d = arg.get_shape()[-1]
        trans = linear([arg], d, scope='trans', input_keep_prob=input_keep_prob, is_train=is_train)
        trans = tf.nn.relu(trans)
        gate = linear([arg], d, scope='gate', input_keep_prob=input_keep_prob, is_train=is_train)
        gate = tf.nn.sigmoid(gate)
        out = gate * trans + (1 - gate) * arg
        return out


def highway_network(arg, num_layers, input_keep_prob=1.0, is_train=None):
    prev = arg
    cur = None
    for layer_idx in range(num_layers):
        cur = highway_layer(prev, scope="layer_{}".format(layer_idx), 
                            input_keep_prob=input_keep_prob, is_train=is_train)
        prev = cur
    return cur

def mask(val, mask, name=None):
    if name is None:
        name = 'mask'
    return tf.mul(val, tf.cast(mask, 'float'), name=name)


def exp_mask(val, mask, name=None):
    """Give very negative number to unmasked elements in val.
    For example, [-3, -2, 10], [True, True, False] -> [-3, -2, -1e9].
    Typically, this effectively masks in exponential space (e.g. softmax)
    Args:
        val: values to be masked
        mask: masking boolean tensor, same shape as tensor
        name: name for output tensor
    Returns:
        Same shape as val, where some elements are very small (exponentially zero)
    """
    if name is None:
        name = "exp_mask"
    return tf.add(val, (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER, name=name)

def softmax(logits, mask=None, scope=None):
    with tf.name_scope(scope or "Softmax"):
        if mask is not None:
            logits = exp_mask(logits, mask)
        flat_logits = flatten(logits, 1)
        flat_out = tf.nn.softmax(flat_logits)
        out = reconstruct(flat_out, logits, 1)

        return out

In [6]:
with tf.variable_scope("embedding_layer"):
    if config.use_char_emb:
        with tf.variable_scope("char"):

            char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
    
            Acx = tf.nn.embedding_lookup(char_emb_mat, cx)  # [N, M, JX, W, dc]
            Acq = tf.nn.embedding_lookup(char_emb_mat, cq)  # [N, JQ, W, dc]
            Acx = tf.reshape(Acx, [-1, JX, W, dc])
            Acq = tf.reshape(Acq, [-1, JQ, W, dc])
            
            filter_sizes = list(map(int, config.out_channel_dims.split(',')))
            heights = list(map(int, config.filter_heights.split(',')))
            
            with tf.variable_scope("conv"):
                xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", is_train, config.keep_prob, scope="xx")
                tf.get_variable_scope().reuse_variables()
                qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", is_train, config.keep_prob, scope="xx")

                xx = tf.reshape(xx, [-1, M, JX, dco])
                qq = tf.reshape(qq, [-1, JQ, dco])
            
            
    if config.use_word_emb:
        with tf.name_scope("word"):
            
            if config.mode == 'train':
                word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
            else:
                word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
            
            if config.use_glove_for_unk:
                word_emb_mat = tf.concat([word_emb_mat, new_emb_mat], 0)

            Ax = tf.nn.embedding_lookup(word_emb_mat, x)  # [N, M, JX, d]
            Aq = tf.nn.embedding_lookup(word_emb_mat, q)  # [N, JQ, d]
            
        if config.use_char_emb:
            xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
            qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
        else:
            xx = Ax
            qq = Aq

In [7]:
with tf.variable_scope("highway_network_layer"):
    xx = highway_network(xx, config.highway_num_layers, is_train=is_train)
    tf.get_variable_scope().reuse_variables()
    qq = highway_network(qq, config.highway_num_layers, is_train=is_train)

In [8]:
x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

flat_len_q = None if q_len is None else tf.cast(flatten(q_len, 0), 'int64')
flat_len_x = None if x_len is None else tf.cast(flatten(x_len, 0), 'int64')

with tf.variable_scope("contextual_layer"):
    cell=tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True);

    flat_qq = flatten(qq, 2)  
    (flat_fwu_outputs, flat_bwu_outputs), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, flat_qq, sequence_length=flat_len_q, dtype='float', scope='lstm')
    fw_u = reconstruct(flat_fwu_outputs, qq, 2)
    bw_u = reconstruct(flat_bwu_outputs, qq, 2)
    u = tf.concat([fw_u, bw_u], 2)

    tf.get_variable_scope().reuse_variables()
    
    flat_xx = flatten(xx, 2)  
    (flat_fwh_outputs, flat_bwh_outputs), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, flat_xx, sequence_length=flat_len_x, dtype='float', scope='lstm')
    fw_h = reconstruct(flat_fwh_outputs, xx, 2)
    bw_h = reconstruct(flat_bwh_outputs, xx, 2)
    h = tf.concat([fw_h, bw_h], 3)

In [9]:
with tf.variable_scope("attention_layer"):
    h_aug = tf.tile(tf.expand_dims(h, 3), [1, 1, 1, JQ, 1])
    u_aug = tf.tile(tf.expand_dims(tf.expand_dims(u, 1), 1), [1, M, JX, 1, 1])
    h_mask_aug = tf.tile(tf.expand_dims(x_mask, 3), [1, 1, 1, JQ])
    u_mask_aug = tf.tile(tf.expand_dims(tf.expand_dims(q_mask, 1), 1), [1, M, JX, 1])
    hu_mask = h_mask_aug & u_mask_aug

    h_u = h_aug * u_aug

    with tf.variable_scope("similarity"):
        sim = linear([tf.concat([h_aug, u_aug, h_u], -1)], 1, is_train=is_train, scope="sim")
        sim = tf.squeeze(sim, [len(sim.get_shape().as_list())-1])
        sim = exp_mask(sim, hu_mask)

    with tf.variable_scope("context_2_query"):
        a = softmax(sim)
        rank_u = len(u_aug.get_shape().as_list())
        u_a = tf.reduce_sum(tf.expand_dims(a, -1) * u_aug, rank_u-2)

    with tf.variable_scope("query_2_context"):
        b = softmax(tf.reduce_max(sim, 3))
        rank_h = len(h.get_shape().as_list())
        h_a = tf.reduce_sum(tf.expand_dims(b, -1) * h, rank_h-2)
        h_a = tf.tile(tf.expand_dims(h_a, 2), [1, 1, JX, 1])
    
    with tf.variable_scope("final"):
        g = tf.concat([h, u_a, h * u_a, h * h_a], 3)
            

In [10]:
with tf.variable_scope("modeling_layer"):
    flat_g = flatten(g, 2)  
    cell1 = tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True);
    (flat_fw_g0_outputs, flat_bw_g0_outputs), _ =tf.nn.bidirectional_dynamic_rnn(cell1, cell1, flat_g, sequence_length=flat_len_x, dtype='float', scope='g0')
    fw_g0 = reconstruct(flat_fw_g0_outputs, g, 2)
    bw_g0 = reconstruct(flat_bw_g0_outputs, g, 2)

    g0 = tf.concat([fw_g0, bw_g0], 3)

    flat_g0 = flatten(g0, 2)
    cell2 = tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True);

    (flat_fw_g1_outputs, flat_bw_g1_outputs), _ =tf.nn.bidirectional_dynamic_rnn(cell2, cell2, flat_g0, sequence_length=flat_len_x, dtype='float', scope='g1')
    fw_g1 = reconstruct(flat_fw_g1_outputs, g0, 2)
    bw_g1 = reconstruct(flat_bw_g1_outputs, g0, 2)

    g1 = tf.concat([fw_g1, bw_g1], 3)

In [11]:
with tf.variable_scope("output_layer"):
    logits1 = linear([tf.concat([g1, g], -1)], 1, input_keep_prob=config.input_keep_prob, is_train=is_train, scope="logits1")
    logits1 = tf.squeeze(logits1, [len(logits1.get_shape().as_list())-1])
    logits1 = exp_mask(logits1, x_mask)
    
    a = softmax(tf.reshape(logits1, [N, M * JX]))
    g1_reshaped = tf.reshape(g1, [N, M * JX, 2 * d])
    rank_g1 = len(g1_reshaped.get_shape().as_list())
    a1i = tf.reduce_sum(tf.expand_dims(a, -1) * g1_reshaped, rank_g1-2)
    a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
    
    g2_input = tf.concat([g, g1, a1i, g1 * a1i], 3)
    flat_input = flatten(g2_input, 2)  
    cell = tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True);
    (flat_fw_g2_outputs, flat_bw_g2_outputs), _ =tf.nn.bidirectional_dynamic_rnn(cell, cell, flat_input, sequence_length=flat_len_x, dtype='float', scope='g2')
    fw_g2 = reconstruct(flat_fw_g2_outputs, g, 2)
    bw_g2 = reconstruct(flat_bw_g2_outputs, g, 2)

    g2 = tf.concat([fw_g2, bw_g2], 3)
    
    logits2 = linear([tf.concat([g2, g], -1)], 1, input_keep_prob=config.input_keep_prob, is_train=is_train, scope="logits2")
    logits2 = tf.squeeze(logits2, [len(logits2.get_shape().as_list())-1])
    logits2 = exp_mask(logits2, x_mask)
    
    logits1 = tf.reshape(logits1, [-1, M * JX])
    flat_yp1 = tf.nn.softmax(logits1) 
    yp1 = tf.reshape(flat_yp1, [-1, M, JX])
    logits2 = tf.reshape(logits2, [-1, M * JX])
    flat_yp2 = tf.nn.softmax(logits2)
    yp2 = tf.reshape(flat_yp2, [-1, M, JX])

In [12]:
#Loss 
loss_mask = tf.reduce_max(tf.cast(q_mask, 'float'), 1)
losses = tf.nn.softmax_cross_entropy_with_logits(logits1, tf.cast(tf.reshape(y1, [-1, M * JX]), 'float'))
ce_loss = tf.reduce_mean(loss_mask * losses)
ce_loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits2, tf.cast(tf.reshape(y2, [-1, M * JX]), 'float')))
tf.add_to_collection('losses', ce_loss)
tf.add_to_collection("losses", ce_loss2)

loss = tf.add_n(tf.get_collection('losses'), name='loss')


ValueError: Only call `softmax_cross_entropy_with_logits` with named arguments (labels=..., logits=..., ...)