In [1]:
import dynet_config
import os
import random
import time

# Configure DyNet in its own cell, ahead of the `import dynet` in the next cell:
# memory budget `mem` (presumably megabytes -- confirm against the DyNet docs),
# automatic batching switched on, and one GPU requested.
dynet_config.set(mem='11000', autobatch=1, requested_gpus=1)

In [2]:
import dynet as dy
import numpy as np

import itertools

from baseline_load_data import load_questions, VOCAB_SIZE
from contextlib import contextmanager

In [3]:
@contextmanager
def parameters(*params):
    """Expose DyNet parameters as expressions for the duration of a ``with`` block.

    Every positional argument is passed through ``dy.parameter`` and the
    results are yielded as one tuple, in the same order, so the caller can
    unpack them: ``with parameters(W, b) as (W_expr, b_expr): ...``.
    """
    expressions = tuple(dy.parameter(param) for param in params)
    yield expressions

In [4]:
# Load the training and development questions with `load_questions`
# (the `.tok.json` suffix suggests the files are already tokenized -- confirm).
train_set = load_questions('./data/train.tok.json')
dev_set = load_questions('./data/dev.tok.json')

In [5]:
# Single parameter collection that every parameter and RNN builder in the
# following cells is registered in, plus the Adam trainer attached to it.
model = dy.ParameterCollection()
trainer = dy.AdamTrainer(model)

In [6]:
# Model hyper-parameters shared by the cells below.
NUM_LAYERS = 2        # number of layers passed to every RNN builder
EMBED_SIZE = 256      # width of the token and operation lookup embeddings
HIDDEN_SIZE = 256     # hidden size passed to the encoder, hRNN and qRNN
ATTENTION_SIZE = 256  # size of the attention projection space

In [7]:
# Lookup table with one EMBED_SIZE-dimensional row per vocabulary id.
embeds = model.add_lookup_parameters((VOCAB_SIZE, EMBED_SIZE))

In [8]:
# Bidirectional LSTM encoder over the embedded question tokens.
encodeRNN = dy.BiRNNBuilder(NUM_LAYERS, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder)
# h, q refer to Figure 3 of DeepMind's paper. We don't need r, because we always
# put intermediate values into memory.
# NOTE: only BiRNNBuilder takes a builder class as its last argument; the plain
# LSTMBuilder signature is (layers, input_dim, hidden_dim, model).
# hRNN consumes [attention context, q, embedding of the produced value].
hRNN = dy.LSTMBuilder(NUM_LAYERS, HIDDEN_SIZE + HIDDEN_SIZE + EMBED_SIZE, HIDDEN_SIZE, model)
# qRNN consumes [previous state (h or q), embedding of the operation/argument].
qRNN = dy.LSTMBuilder(NUM_LAYERS, HIDDEN_SIZE + EMBED_SIZE, HIDDEN_SIZE, model)
# Learned first input that kicks off hRNN before any operation was generated.
h_init_input = model.add_parameters((HIDDEN_SIZE + HIDDEN_SIZE + EMBED_SIZE))

In [10]:
# Attention parameters.
# att_W_ctx projects the encoded context (applied in `loss`), att_W_h projects
# the current state h, and att_b turns each projected column into one score
# (both are applied in `calc_attention`).
att_W_ctx = model.add_parameters((ATTENTION_SIZE, HIDDEN_SIZE))
att_W_h = model.add_parameters((ATTENTION_SIZE, HIDDEN_SIZE))
att_b = model.add_parameters((1, ATTENTION_SIZE))

def calc_attention(ctx_matrix, ctx_att, h):
    """Attend over the columns of ``ctx_matrix`` given the current state ``h``.

    Args:
        ctx_matrix: matrix whose columns are the encoded context vectors.
        ctx_att: the context already projected with ``att_W_ctx``
            (``loss`` computes it once per question).
        h: current state vector, projected here with ``att_W_h``.

    Returns:
        ``(ctx_mixture, att_p)``: the attention-weighted combination of the
        columns of ``ctx_matrix`` and the attention weights themselves.
    """
    with parameters(att_W_h, att_b) as (state_proj, score_vec):
        # Add the projected state to every projected context column.
        hidden = dy.tanh(dy.colwise_add(ctx_att, state_proj * h))
        # One score per column, turned into a column vector for the softmax.
        scores = dy.transpose(score_vec * hidden)
        weights = dy.softmax(scores)
        mixture = ctx_matrix * weights
    return mixture, weights

In [None]:
# Parameters for predicting the operation.
# NOTE(review): `operations_list` and `argsNum` are not defined in the visible
# cells -- they must be in scope before this cell runs.
NUM_OPERATIONS = len(operations_list)
# One embedding per operation; `loss` feeds it to qRNN.
opt_embeds = model.add_lookup_parameters((NUM_OPERATIONS, EMBED_SIZE))
# `argsNum[func]` for every operation, in `operations_list` order
# (presumably the number of arguments each operation takes -- confirm).
opt_args = [argsNum[func] for func in operations_list]
# Output layer mapping a HIDDEN_SIZE state to one score per operation.
opt_W = model.add_parameters((NUM_OPERATIONS, HIDDEN_SIZE))
opt_b = model.add_parameters((NUM_OPERATIONS))

def operation_softmax(h, opt_id):
    """Return the probability distribution over operations for state ``h``.

    Args:
        h: state vector fed to the ``opt_W`` / ``opt_b`` output layer.
        opt_id: accepted for the caller's convenience; the computation
            does not use it.

    Returns:
        Softmax over the ``NUM_OPERATIONS`` operation scores.
    """
    with parameters(opt_W, opt_b) as (weight, bias):
        logits = weight * h + bias
        return dy.softmax(logits)
        

In [None]:
# Parameters for choosing where an argument comes from.
NUM_ARGS_SOURCE = 3 # INPUT, MEM, VOC
# Index of each source inside the distribution returned by `argsource_softmax`.
INPUT_SOURCE = 0
MEMORY_SOURCE = 1
VOC_SOURCE = 2
# Output layer mapping a HIDDEN_SIZE state to one score per source.
arg_W = model.add_parameters((NUM_ARGS_SOURCE, HIDDEN_SIZE))
arg_b = model.add_parameters((NUM_ARGS_SOURCE))

def argsource_softmax(h):
    """Return the probability of each argument source for state ``h``.

    The result has ``NUM_ARGS_SOURCE`` entries, indexed by ``INPUT_SOURCE``,
    ``MEMORY_SOURCE`` and ``VOC_SOURCE``.
    """
    with parameters(arg_W, arg_b) as (weight, bias):
        source_scores = weight * h + bias
        return dy.softmax(source_scores)

In [None]:
# Parameters of the copy-from-input softmax.
# Note from the paper:
#      apply a linear projection from [uij, qij] into a fixed-size vector,
#      followed by a tanh and a linear projection into a single value.
# NOTE(review): the projection input is declared as HIDDEN_SIZE + EMBED_SIZE,
# while `copyinput_softmax` concatenates an encoder output with q; the sizes
# only agree as long as EMBED_SIZE == HIDDEN_SIZE -- confirm the intent.
FIXED_SIZE = 128
proj_input_W = model.add_parameters((FIXED_SIZE, HIDDEN_SIZE + EMBED_SIZE))
proj_input_b = model.add_parameters((FIXED_SIZE))
# Second projection: fixed-size vector -> single score.
proj_input_SW = model.add_parameters((1, FIXED_SIZE))
proj_input_Sb = model.add_parameters((1))

def copyinput_softmax(x, ctx_seq, q):
    """Return the probability of copying each input position as the argument.

    For every position the encoded token is concatenated with ``q``, projected
    to ``FIXED_SIZE``, passed through ``tanh`` and projected to a single score;
    the scores are then normalised with a softmax.

    Args:
        x: the input sequence; only its length is used, one score is produced
            per element.
        ctx_seq: encoded input, indexable by position (same length as ``x``).
        q: current argument-decoder state vector.

    Returns:
        A DyNet vector expression with one probability per input position.
    """
    with parameters(proj_input_W, proj_input_b, proj_input_SW, proj_input_Sb) as (proj_W, proj_b, proj_SW, proj_Sb):
        scores = [
            proj_SW * dy.tanh(proj_W * dy.concatenate([ctx_seq[idx], q]) + proj_b) + proj_Sb
            for idx in range(len(x))
        ]
        # Join the per-position scores inside the computation graph.
        # dy.inputTensor expects plain numbers and would not keep the scores
        # connected to the parameters.
        probs = dy.softmax(dy.concatenate(scores))
        return probs

In [None]:
# Memory of intermediate values and parameters of the copy-from-memory softmax.
# `mem_key` lists the stored values in insertion order (it defines the index
# used by the softmax); `mem_dict` maps each stored value to the state
# expression saved for it. Both are filled by `loss`.
mem_key = []
mem_dict = {}
FIXED_SIZE = 128

# Same two-step scoring network as for copy-from-input, with its own weights.
proj_mem_W = model.add_parameters((FIXED_SIZE, HIDDEN_SIZE + EMBED_SIZE))
proj_mem_b = model.add_parameters((FIXED_SIZE))
proj_mem_SW = model.add_parameters((1, FIXED_SIZE))
proj_mem_Sb = model.add_parameters((1))

def copymem_softmax(q):
    """Return the probability of copying each memory entry as the argument.

    Entries are scored in ``mem_key`` order, so index ``i`` of the result
    belongs to ``mem_key[i]``. Each stored state ``mem_dict[key]`` is
    concatenated with ``q``, projected to ``FIXED_SIZE``, passed through
    ``tanh`` and projected to a single score before the softmax.

    Args:
        q: current argument-decoder state vector.

    Returns:
        A DyNet vector expression with one probability per memory entry.

    Raises:
        ValueError: if the memory is empty (there is nothing to normalise).
    """
    if not mem_key:
        raise ValueError("copymem_softmax called with an empty memory")
    with parameters(proj_mem_W, proj_mem_b, proj_mem_SW, proj_mem_Sb) as (proj_W, proj_b, proj_SW, proj_Sb):
        # mem_dict is keyed by the stored value itself, not by its position.
        scores = [
            proj_SW * dy.tanh(proj_W * dy.concatenate([mem_dict[key], q]) + proj_b) + proj_Sb
            for key in mem_key
        ]
        # Join the scores inside the computation graph. dy.inputTensor expects
        # plain numbers and would not keep the scores connected to the parameters.
        probs = dy.softmax(dy.concatenate(scores))
        return probs

In [None]:
# Parameters of the vocabulary softmax (argument generated from the vocabulary).
# Note the weight layout: HIDDEN_SIZE rows, VOCAB_SIZE columns.
vocbsf_W = model.add_parameters((HIDDEN_SIZE, VOCAB_SIZE))
vocbsf_b = model.add_parameters((VOCAB_SIZE))

def copyvocb_softmax(q):
    """Return the probability of each vocabulary entry being the argument.

    Args:
        q: current argument-decoder state; assumed to be a HIDDEN_SIZE column
            vector (``loss`` passes an LSTM output).

    Returns:
        A DyNet vector expression with ``VOCAB_SIZE`` probabilities.
    """
    with parameters(vocbsf_W, vocbsf_b) as (vocb_W, vocb_b):
        # vocbsf_W is stored as (HIDDEN_SIZE, VOCAB_SIZE); transpose it so the
        # product with the column vector q yields one score per vocabulary id.
        probs = dy.softmax(dy.transpose(vocb_W) * q + vocb_b)
        return probs
        

In [12]:
def loss(ori_x, y):
    """Build the training loss expression for one question.

    The question is encoded once; then, for every group in ``y``, the loss of
    predicting the operation and of predicting each of its arguments (from the
    input, the memory or the vocabulary) is accumulated.

    Args:
        ori_x: token ids of the question. Must support ``in`` and ``.index``
            (e.g. a list).
        y: iterable of groups ``(opt_id, x1, x2, ..., xk, v)``: the operation
            id, its arguments and the intermediate value it produces.

    Returns:
        A DyNet expression: the sum of all operation and argument losses.

    Side effects:
        Resets the module-level memory (``mem_key`` / ``mem_dict``) and then
        refills it with the intermediate values of this question.

    NOTE(review): relies on ``tok2id``, which is not defined in the visible
    cells -- make sure it is in scope before calling.
    """
    # The memory holds intermediate values of the current question only;
    # entries (and their expressions) from an earlier call must not leak in.
    del mem_key[:]
    mem_dict.clear()

    x = [embeds[tid] for tid in ori_x]
    ctx_seq = encodeRNN.transduce(x)
    ctx_matrix = dy.concatenate_cols(ctx_seq)
    with parameters(att_W_ctx, h_init_input) as (W, init_input):
        # Project the context once; calc_attention reuses it for every group.
        ctx_att = W * ctx_matrix
        current_h_state = hRNN.initial_state().add_input(init_input)
    h = current_h_state.output()
    losses = []

    # group = (opt_id, x1, x2, ..., xk, v)
    for group in y:
        # select operation
        opt_id = group[0]
        op_probs = operation_softmax(h, opt_id)
        # operation_softmax already returns probabilities, so take the plain
        # negative log-probability (pickneglogsoftmax would softmax them again).
        losses.append(-dy.log(dy.pick(op_probs, opt_id)))

        # select arguments for the operation
        # h initialises qRNN for the argument generation
        current_q_state = qRNN.initial_state().add_input(dy.concatenate([h, opt_embeds[opt_id]]))
        q = current_q_state.output()
        for cur_arg in group[1:-1]:
            # for each argument there are multiple possible sources
            q = current_q_state.output()
            src_probs = argsource_softmax(q)
            cur_arg_id = tok2id[str(cur_arg)]
            # The vocabulary distribution is always needed (VOC_SOURCE), so
            # build it once per argument instead of once per fallback.
            voc_probs = copyvocb_softmax(q)

            # for each source, the probability of the target argument
            for src in range(NUM_ARGS_SOURCE):
                if src == INPUT_SOURCE and cur_arg_id in ori_x:
                    # Arg from input
                    target_idx = ori_x.index(cur_arg_id)
                    probs = copyinput_softmax(x, ctx_seq, q)
                elif src == MEMORY_SOURCE and cur_arg in mem_dict:
                    # Arg from memory
                    target_idx = mem_key.index(cur_arg)
                    probs = copymem_softmax(q)
                else:
                    # Arg from vocabulary.
                    # NOTE(review): a source that cannot provide the argument
                    # also ends up here and is scored with the vocabulary
                    # distribution -- confirm this fallback is intended.
                    target_idx = cur_arg_id
                    probs = voc_probs
                nll = -dy.log(dy.pick(probs, target_idx))
                losses.append(src_probs[src] * nll)

            # Advance the q state once per argument (not once per source).
            current_q_state = current_q_state.add_input(dy.concatenate([q, embeds[cur_arg_id]]))

        # put the intermediate value into memory
        v = group[-1]
        if v not in mem_dict:
            # Keep mem_key free of duplicates so every stored value has
            # exactly one slot in the copy-from-memory softmax.
            mem_key.append(v)
        mem_dict[v] = h

        # Update the h state for the next group.
        # NOTE(review): q is the state read before the last argument was fed
        # to qRNN -- confirm whether the final q state should be used instead.
        v_embed = embeds[tok2id[str(v)]]
        ctx_mixture, _ = calc_attention(ctx_matrix, ctx_att, h)
        current_h_state = current_h_state.add_input(dy.concatenate([ctx_mixture, q, v_embed]))
        h = current_h_state.output()

    return dy.esum(losses)
    
