In [None]:
from __future__ import print_function
from __future__ import division

import numpy as np
import sys,os
# Flush stdout eagerly so progress messages show up as soon as they are printed.
# Python 2 accepts a fully unbuffered text stream. Python 3 rejects buffering=0
# in text mode with a ValueError (and the failed open would close the
# descriptor), so there we only ask for line buffering, and only when the
# current stdout object offers reconfigure() (plain TextIOWrapper on 3.7+).
if sys.version_info[0] < 3:
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
elif hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(line_buffering=True)

import utils
import csv

import keras.activations as activations
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, TimeDistributed, BatchNormalization
from keras.layers.merge import concatenate, add, multiply
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.core import Activation, Dense, Dropout, Flatten, Lambda, Permute, RepeatVector
from keras.layers.recurrent import GRU, LSTM

from keras import backend as K
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

In [None]:
# Configure the TensorFlow session used by Keras: GPU memory is allocated
# on demand (allow_growth) rather than with a fixed per-process fraction.
# The options object has its own name because a later cell defines a function
# called `config`; binding that name here would clobber the function whenever
# this cell is re-run after it.
session_config = tf.ConfigProto()
#session_config.gpu_options.per_process_gpu_memory_fraction = 0.3
session_config.gpu_options.allow_growth = True
set_session(tf.Session(config=session_config))

In [None]:
def config():
    """Assemble the hyperparameter dictionary for one run.

    Returns:
        A 3-tuple ``(c, ps, h)``: the configuration dict and the two values
        produced by ``utils.hash_params(c)``. The ``__main__`` cell formats
        ``h`` with ``%x`` to build the run id and prints ``ps`` next to it.
    """
    c = {
        # embedding params
        'emb': 'Glove',
        'embdim': 300,
        'inp_e_dropout': 1/2,

        # training hyperparams
        'opt': 'adadelta',
        'batch_size': 160,
        'epochs': 16,
        'patience': 3,

        # sentences shorter than 'pad' words are padded with 0
        'pad': 60,

        # rnn model
        'dropoutfix_inp': 0,
        'dropoutfix_rec': 0,
        'dropout': 1/2,
        'l2reg': 1e-4,

        'rnnbidi': True,
        'rnn': GRU,
        'rnnbidi_mode': add,
        'rnnact': 'tanh',
        'rnninit': 'glorot_uniform',
        'sdim': 1,              # RNN width as a multiple of the input width

        # cnn model
        'cnn_dropout': 1/2,
        'pool_layer': MaxPooling1D,
        'cnnact': 'relu',
        'cnninit': 'glorot_uniform',
        'cdim': 2,              # also listed with the attention settings; same value
        'pact': 'tanh',

        # projection layer
        'pdim': 1/2,            # width multiplier applied by every projection layer
        'p_layers': 1,
        'p_dropout': 1/2,
        'p_init': 'glorot_uniform',

        # attention model
        'adim': 1/2,
        'cfiltlen': 3,

        # mlp scoring function
        'Ddim': 1,              # hidden width as a multiple of N (0 = no hidden layer)
    }

    ps, h = utils.hash_params(c)
    return c, ps, h

In [None]:
# Shared notebook state. Everything starts as None and is filled in later by
# the __main__ cell (conf, emb) and by load_data() (everything else).
conf = None       # hyperparameter dict returned by config()
emb = None        # embedding object (utils.GloVe in the __main__ cell)
vocab = None      # vocabulary built from the training set
inp_tr = None     # model input dicts for the train / validation / test splits
inp_val = None
inp_test = None
y_train = None    # declared with the others: load_data() assigns it, fit_model() reads it
y_val = None
y_test = None

In [None]:
def ranknet(y_true, y_pred):
    """Bipartite ranking surrogate loss.

    The prediction enters the margin with a positive sign where y_true is 1
    and with a negative sign where y_true is 0; the loss is
    log(1 + exp(-margin)) averaged over the last axis.
    """
    positive_part = y_true * y_pred
    negative_part = (1-y_true) * y_pred
    margin = positive_part - negative_part
    return K.mean(K.log(1. + K.exp(-margin)), axis=-1)

In [None]:
'''
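The dataset is a CSV file whose header row names the columns
qtext (question), label (integer) and atext (candidate sentence).
Each question is repeated on one row per candidate sentence, as follows.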

question1, label, sentence1 
question1, label, sentence2 
         
         ...
                                         
question2, label, sentence1 
question2, label, sentence2 
          
         ...
          
questionN, label, sentenceM 
'''

def load_data_from_file(dsfile):
    """Load a question/sentence dataset from a CSV file.

    The file needs a header row with the columns ``qtext``, ``label`` and
    ``atext``. Texts are tokenized by splitting on single spaces.

    Args:
        dsfile: path of the CSV file (UTF-8 encoded).

    Returns:
        A tuple ``(q, sents, labels)`` of three parallel lists: tokenized
        questions, tokenized sentences and integer labels.

    Raises:
        KeyError: if one of the required columns is missing.
        ValueError: if a label cannot be parsed as an integer.
    """
    q = []       # tokenized questions
    sents = []   # tokenized sentences
    labels = []  # integer labels

    # Python 3: the csv module expects files opened with newline='', and the
    # encoding is pinned to UTF-8 to match the explicit utf8 decoding done on
    # the Python 2 path (otherwise the platform's locale encoding is used).
    if sys.version_info[0] >= 3:
        open_kwargs = {'newline': '', 'encoding': 'utf8'}
    else:
        open_kwargs = {}

    with open(dsfile, **open_kwargs) as f:
        for row in csv.DictReader(f):
            labels.append(int(row['label']))
            qtext = row['qtext']
            stext = row['atext']
            if isinstance(qtext, bytes):
                # Python 2: csv yields byte strings, decode them to unicode
                qtext = qtext.decode('utf8')
                stext = stext.decode('utf8')

            q.append(qtext.split(' '))
            sents.append(stext.split(' '))

    return (q, sents, labels)
    
def make_model_inputs(qi, si, f01, f10, q, sents, y):
    """Bundle the arrays of one dataset split into a dict keyed by argument name.

    The keys 'qi', 'si', 'f01' and 'f10' match the names of the Input layers
    declared in embedding(); 'q', 'sents' and 'y' carry the raw tokens and
    labels alongside them.
    """
    keys = ('qi', 'si', 'f01', 'f10', 'q', 'sents', 'y')
    values = (qi, si, f01, f10, q, sents, y)
    return dict(zip(keys, values))

def load_set(fname, vocab=None, iseval=False):
    """Load one dataset split and turn it into model inputs.

    Args:
        fname: path of the CSV file (see load_data_from_file).
        vocab: vocabulary to vectorize with. Required when ``iseval`` is True;
            ignored otherwise, because a fresh vocabulary is built from the
            loaded questions and sentences.
        iseval: True for validation/test splits, which reuse an existing
            vocabulary.

    Returns:
        ``(inp, y)`` when ``iseval`` is True, ``(inp, y, vocab)`` otherwise,
        where ``inp`` is the dict built by make_model_inputs.

    Raises:
        ValueError: if ``iseval`` is True but no vocabulary was supplied.
    """
    # Fail fast, before any file I/O, instead of hitting an AttributeError on
    # None further down.
    if iseval and vocab is None:
        raise ValueError('load_set(iseval=True) requires the vocabulary built from the training set')

    q, sents, y = load_data_from_file(fname)
    if not iseval:
        vocab = utils.Vocabulary(q + sents)

    pad = conf['pad']

    # token-id matrices for questions and sentences, padded to `pad`
    qi = vocab.vectorize(q, pad=pad)
    si = vocab.vectorize(sents, pad=pad)
    # per-token flag features for the pair (semantics defined in utils.sentence_flags)
    f01, f10 = utils.sentence_flags(q, sents, pad)

    inp = make_model_inputs(qi, si, f01, f10, q, sents, y)
    if iseval:
        return (inp, y)
    else:
        return (inp, y, vocab)
    
def load_data(trainf, valf, testf):
    """Load the train, validation and test files into the notebook globals.

    The vocabulary is built from the training file and then reused to
    vectorize the validation and test files.
    """
    global vocab, inp_tr, inp_val, inp_test, y_train, y_val, y_test

    train_split = load_set(trainf, iseval=False)
    inp_tr, y_train, vocab = train_split

    val_split = load_set(valf, vocab=vocab, iseval=True)
    inp_val, y_val = val_split

    test_split = load_set(testf, vocab=vocab, iseval=True)
    inp_test, y_test = test_split

In [None]:
def embedding():
    '''
    Declare the model inputs (vectorized sentences and per-token flags) and
    build the embedded sequences with dropout applied.

    Returns a 3-tuple (N, input_nodes, emb_outputs):
      N            -- width of every token vector, emb.N + utils.flagsdim
      input_nodes  -- the Input layers [qi, si, f01, f10]
      emb_outputs  -- [emb_qi, emb_si], embedding concatenated with the flags
    '''
    pad = conf['pad']
    rate = conf['inp_e_dropout']
    flags_dim = utils.flagsdim

    # token ids of the question / sentence, and their flag features
    input_qi = Input(name='qi', shape=(pad,), dtype='int32')
    input_si = Input(name='si', shape=(pad,), dtype='int32')
    input_f01 = Input(name='f01', shape=(pad, flags_dim))
    input_f10 = Input(name='f10', shape=(pad, flags_dim))

    N = emb.N + flags_dim
    # one embedding matrix shared by both sides, initialised from `emb`
    shared_embedding = Embedding(name='emb', input_dim=vocab.size(), input_length=pad,
                                output_dim=emb.N, mask_zero=True,
                                weights=[vocab.embmatrix(emb)], trainable=True)

    def embed_with_flags(token_ids, flags):
        # noise_shape=(N,): the dropout mask is drawn once per feature and
        # broadcast over the remaining axes
        joined = concatenate([shared_embedding(token_ids), flags])
        return Dropout(rate, noise_shape=(N,))(joined)

    emb_qi = embed_with_flags(input_qi, input_f01)
    emb_si = embed_with_flags(input_si, input_f10)

    return N, [input_qi, input_si, input_f01, input_f10], [emb_qi, emb_si]

In [None]:
def projection_layer(inputs, input_size):
    """Project both sentence vectors through a stack of shared dense layers.

    Each of the ``conf['p_layers']`` layers applies a linear Dense layer
    (weights shared by the two inputs), a BatchNormalization (one per input)
    and the ``conf['pact']`` activation, scaling the width by ``conf['pdim']``.
    Dropout with rate ``conf['p_dropout']`` is applied to the final vectors.

    Args:
        inputs: pair ``[question_vector, sentence_vector]``.
        input_size: width of the incoming vectors.

    Returns:
        ``(qi_proj, si_proj)``, the projected question and sentence vectors.
    """
    # Start from the raw inputs so that p_layers == 0 degrades to "dropout
    # only" instead of failing on unassigned names.
    qi_proj = inputs[0]
    si_proj = inputs[1]
    for p_i in range(conf['p_layers']):
        input_size = int(input_size * conf['pdim'])
        # `units` is the Keras 2 name of the argument formerly called output_dim
        shared_dense = Dense(name='pdeep%d'%(p_i), units=input_size,
                activation='linear', kernel_initializer=conf['p_init'], kernel_regularizer=l2(conf['l2reg']))
        qi_proj = Activation(conf['pact'])(BatchNormalization()(shared_dense(qi_proj)))
        si_proj = Activation(conf['pact'])(BatchNormalization()(shared_dense(si_proj)))

    dropout = conf['p_dropout']
    qi_proj = Dropout(dropout, noise_shape=(input_size,))(qi_proj)
    si_proj = Dropout(dropout, noise_shape=(input_size,))(si_proj)

    return qi_proj, si_proj

In [None]:
def avg_model(input_nodes, N, pfx=''):
    """Averaging sentence model.

    Every token vector goes through a shared tanh Dense layer of width
    int(N), the results are averaged over axis 1, and the two averages are
    passed through projection_layer.

    Returns the list [question_vector, sentence_vector].
    """
    width = int(N)
    token_dense = Dense(width, activation='tanh', name='wproj'+pfx)
    # a separate TimeDistributed wrapper per side, both around the same Dense
    projected = [TimeDistributed(token_dense)(sequence)
                 for sequence in (input_nodes[0], input_nodes[1])]

    # mean over axis 1; the output shape drops that axis
    mean_layer = Lambda(name='bow'+pfx, function=lambda x: K.mean(x, axis=1), output_shape=lambda shape:(shape[0],) + shape[2:])
    averaged = [mean_layer(sequence) for sequence in projected]

    qi_avg, si_avg = projection_layer(averaged, width)
    return [qi_avg, si_avg]

In [None]:
def cnn_model(input_nodes, N, pfx=''):
    """Convolutional sentence model: cnnsum_input followed by projection_layer.

    Args:
        input_nodes: pair of embedded sequences [question, sentence].
        N: width of each token vector.
        pfx: prefix for the names of the convolution/pool/flatten layers, so
            that the model can be instantiated more than once in one graph.

    Returns:
        The list [question_vector, sentence_vector].
    """
    # The CNN has its own dropout setting in the configuration; fall back to
    # the generic 'dropout' entry when it is absent.
    dropout = conf['cnn_dropout'] if 'cnn_dropout' in conf else conf['dropout']
    qi_cnn, si_cnn, nc = cnnsum_input(N, conf['pad'], dropout=dropout,
                                l2reg=conf['l2reg'], cnninit=conf['cnninit'], cnnact=conf['cnnact'],
                                inputs=input_nodes, pfx=pfx)

    qi_cnn, si_cnn = projection_layer([qi_cnn, si_cnn], nc)

    return [qi_cnn, si_cnn]

def cnnsum_input(N, pad, dropout=3/4, l2reg=1e-4, cnninit='glorot_uniform', cnnact='relu',
        cdim={1: 1/2, 2: 1/2, 3: 1/2, 4: 1/2, 5: 1/2, 6: 1/2, 7: 1/2}, inputs=None, pfx=''):
    """Multi-width convolution + max-pooling encoder shared by both inputs.

    For every ``filter_length: fraction`` entry of ``cdim`` a shared
    convolution with ``int(N*fraction)`` filters is applied to both sequences,
    followed by BatchNormalization, the ``cnnact`` activation, a max-pool over
    ``pad - filter_length + 1`` steps and a Flatten. The per-width results are
    concatenated and passed through Dropout.

    Args:
        N: width of each token vector.
        pad: length of the (padded) input sequences.
        dropout: dropout rate applied to the concatenated features.
        l2reg: L2 regularization factor of the convolution kernels.
        cnninit: kernel initializer of the convolutions.
        cnnact: activation applied after batch normalization.
        cdim: dict mapping filter length to filter count as a fraction of N
            (read only, never modified).
        inputs: pair of sequences [question, sentence].
        pfx: prefix for the layer names.

    Returns:
        ``(qi_cnn, si_cnn, tot_len)`` where ``tot_len`` is the total number of
        filters, i.e. the width of the two returned vectors.
    """
    qi_cnn_res_list = []
    si_cnn_res_list = []
    tot_len = 0
    for fl, cd in cdim.items():
        nb_filter = int(N*cd)
        # use the `pad` argument (not the global configuration); input_shape
        # excludes the batch axis
        shared_conv = Convolution1D(name=pfx+'conv%d'%(fl), input_shape=(pad, N),
                    kernel_size=fl, filters=nb_filter, activation='linear',
                    kernel_regularizer=l2(l2reg), kernel_initializer=cnninit)
        qi_cnn_one = Activation(cnnact)(BatchNormalization()(shared_conv(inputs[0])))
        si_cnn_one = Activation(cnnact)(BatchNormalization()(shared_conv(inputs[1])))

        # with the default 'valid' padding the convolution yields pad-fl+1
        # steps, so this pool reduces each filter to a single value
        pool = MaxPooling1D(pool_size=int(pad-fl+1), name=pfx+'pool%d'%(fl))
        qi_pool_one = pool(qi_cnn_one)
        si_pool_one = pool(si_cnn_one)

        flatten = Flatten(name=pfx+'flatten%d'%(fl))
        qi_out_one = flatten(qi_pool_one)
        si_out_one = flatten(si_pool_one)

        qi_cnn_res_list.append(qi_out_one)
        si_cnn_res_list.append(si_out_one)

        tot_len += nb_filter

    qi_cnn = Dropout(dropout, noise_shape=(tot_len,))(concatenate(qi_cnn_res_list))
    si_cnn = Dropout(dropout, noise_shape=(tot_len,))(concatenate(si_cnn_res_list))

    return (qi_cnn, si_cnn, tot_len)

In [None]:
def rnn_model(input_nodes, N, pfx=''):
    """Recurrent sentence model: rnn_input followed by projection_layer.

    All RNN settings are taken from the global configuration. Returns the
    list [question_vector, sentence_vector].
    """
    rnn_settings = dict(
        pfx=pfx,
        dropout=conf['dropout'],
        dropoutfix_inp=conf['dropoutfix_inp'],
        dropoutfix_rec=conf['dropoutfix_rec'],
        sdim=conf['sdim'],
        rnnbidi_mode=conf['rnnbidi_mode'],
        rnn=conf['rnn'],
        rnnact=conf['rnnact'],
        rnninit=conf['rnninit'],
        inputs=input_nodes,
    )
    encoded_q, encoded_s, width = rnn_input(N, **rnn_settings)

    qi_rnn, si_rnn = projection_layer([encoded_q, encoded_s], width)
    return [qi_rnn, si_rnn]

def rnn_input(N, dropout=3/4, dropoutfix_inp=0, dropoutfix_rec=0,
              sdim=2, rnn=GRU, rnnact='tanh', rnninit='glorot_uniform', rnnbidi_mode=add,
              inputs=None, pfx=''):
    """Bidirectional recurrent encoder shared by both inputs.

    A forward and a backward RNN (each shared between question and sentence)
    produce one vector per sequence; each is batch-normalized and passed
    through ``rnnact``, the two directions are merged with ``rnnbidi_mode``
    and Dropout is applied.

    Args:
        N: width of each token vector.
        dropout: dropout rate applied to the merged vector.
        dropoutfix_inp: input dropout inside the RNN.
        dropoutfix_rec: recurrent dropout inside the RNN.
        sdim: width of the encoder output as a multiple of N.
        rnn: recurrent layer class (e.g. GRU, LSTM).
        rnnact: activation applied after batch normalization.
        rnninit: kernel initializer of the RNNs.
        rnnbidi_mode: merge function applied to [forward, backward] (e.g.
            ``add`` or ``concatenate``); the string 'concat' is accepted as an
            alias of ``concatenate``.
        inputs: pair of sequences [question, sentence].
        pfx: suffix appended to the RNN layer names.

    Returns:
        ``(qi_rnn, si_rnn, width)`` where ``width`` is the size of the two
        returned vectors.
    """
    # The mode is normally a merge function, so detect concatenation by
    # identity; the string spelling is mapped to the function first.
    if rnnbidi_mode == 'concat':
        rnnbidi_mode = concatenate
    is_concat = rnnbidi_mode is concatenate
    if is_concat:
        # each direction gets half the width so the concatenation is N*sdim wide
        sdim = sdim / 2.0
    units = int(N*sdim)
    out_dim = 2 * units if is_concat else units

    # input_shape excludes the batch axis
    shared_rnn_f = rnn(units, kernel_initializer=rnninit, input_shape=(conf['pad'], N),
                       activation='linear', return_sequences=False, dropout=dropoutfix_inp,
                       recurrent_dropout=dropoutfix_rec, name='rnnf'+pfx)
    shared_rnn_b = rnn(units, kernel_initializer=rnninit, input_shape=(conf['pad'], N),
                       activation='linear', return_sequences=False, dropout=dropoutfix_inp,
                       recurrent_dropout=dropoutfix_rec, go_backwards=True, name='rnnb'+pfx)
    qi_rnn_f = Activation(rnnact)(BatchNormalization()(shared_rnn_f(inputs[0])))
    si_rnn_f = Activation(rnnact)(BatchNormalization()(shared_rnn_f(inputs[1])))

    qi_rnn_b = Activation(rnnact)(BatchNormalization()(shared_rnn_b(inputs[0])))
    si_rnn_b = Activation(rnnact)(BatchNormalization()(shared_rnn_b(inputs[1])))

    qi_rnn = Dropout(dropout, noise_shape=(out_dim,))(rnnbidi_mode([qi_rnn_f, qi_rnn_b]))
    si_rnn = Dropout(dropout, noise_shape=(out_dim,))(rnnbidi_mode([si_rnn_f, si_rnn_b]))

    return (qi_rnn, si_rnn, out_dim)

In [None]:
def mlp_ptscorer(inputs, Ddim, N, l2reg, pfx='out', oact='sigmoid', extra_inp=None):
    """ Element-wise features from the pair fed to an MLP.

    The element-wise sum and product of the two input vectors (plus any
    tensors in ``extra_inp``) are concatenated and passed through optional
    tanh hidden layers and a final single-unit layer.

    Args:
        inputs: pair of vectors [question_vector, sentence_vector].
        Ddim: hidden layer width(s) as multiples of N: 0 for no hidden layer,
            a scalar for one layer, a list/tuple for several layers.
        N: base width the Ddim multipliers refer to.
        l2reg: L2 regularization factor of all Dense kernels.
        pfx: prefix for the layer names.
        oact: activation of the output unit.
        extra_inp: optional list of additional tensors appended to the MLP
            input.

    Returns:
        The output tensor of the scoring unit.
    """
    sum_vec = add(inputs)
    mul_vec = multiply(inputs)

    # extra_inp defaults to None (no shared mutable default) and is copied
    mlp_input = concatenate([sum_vec, mul_vec] + list(extra_inp or []))

    # Ddim may be either 0 (no hidden layer), scalar (single hidden layer) or
    # list/tuple (multiple hidden layers)
    if Ddim == 0:
        hidden_dims = []
    elif isinstance(Ddim, (list, tuple)):
        hidden_dims = list(Ddim)
    else:
        hidden_dims = [Ddim]

    for i, D in enumerate(hidden_dims):
        shared_dense = Dense(int(N*D), kernel_regularizer=l2(l2reg),
                             activation='tanh', name=pfx+'hdn%d'%(i))
        mlp_input = shared_dense(mlp_input)

    shared_dense = Dense(1, kernel_regularizer=l2(l2reg), activation=oact, name=pfx+'mlp')
    mlp_out = shared_dense(mlp_input)

    return mlp_out

In [None]:
def build_model():
    """Build and compile the answer sentence selection model.

    Pipeline: embedding() -> cnn_model -> mlp_ptscorer with a sigmoid output,
    compiled with the ranknet loss and the optimizer named in conf['opt'].
    """
    # input embedding
    N, model_inputs, embedded_pair = embedding()

    # answer sentence selection: encode both sides, then score the pair
    encoded_pair = cnn_model(embedded_pair, N, pfx='S')
    score = mlp_ptscorer(encoded_pair, conf['Ddim'], N,
                         conf['l2reg'], pfx='outS', oact='sigmoid')

    model = Model(inputs=model_inputs, outputs=score)
    model.compile(loss=ranknet, optimizer=conf['opt'])
    return model

In [None]:
def train_and_eval(runid):
    """Build, train and evaluate a model.

    Two weight files are written: 'weights-<runid>-bestval.h5' (checkpoint
    kept by the training callbacks) and 'weights-<runid>-final.h5' (state
    after the last epoch). Evaluation uses the best-validation weights.

    Args:
        runid: identifier used in the weight file names.

    Returns:
        The tuple produced by eval(): results for the validation and test sets.
    """
    bestval_weights = 'weights-'+runid+'-bestval.h5'
    final_weights = 'weights-'+runid+'-final.h5'

    print('Model')
    model = build_model()
    # summary() prints by itself and returns None, so it is not wrapped in print()
    model.summary()

    print('Training')
    fit_model(model, weightsf=bestval_weights)
    model.save_weights(final_weights, overwrite=True)
    model.load_weights(bestval_weights)

    print('Predict&Eval (best val epoch)')
    res = eval(model)
    return res

In [None]:
def fit_model(model, **kwargs):
    """Train ``model`` on the globally loaded training set.

    Args:
        model: compiled Keras model.
        **kwargs: must contain ``weightsf``, the path of the best-validation
            checkpoint; any remaining keyword arguments are forwarded to
            ``model.fit`` and override the defaults set here.

    Returns:
        The value returned by ``model.fit``.

    Raises:
        KeyError: if ``weightsf`` is not supplied.
    """
    callbacks = fit_callbacks(kwargs.pop('weightsf'))

    fit_args = dict(validation_data=(inp_val, y_val),
                    callbacks=callbacks,
                    epochs=conf['epochs'],
                    # use the configured batch size rather than the Keras default
                    batch_size=conf['batch_size'])
    fit_args.update(kwargs)

    return model.fit(inp_tr, y=y_train, **fit_args)

def fit_callbacks(weightsf):
    """Create the training callbacks, in the order they are handed to Keras.

    Both the checkpoint and the early stopping monitor the 'mrr' metric and
    maximize it; presumably utils.AnsSelCB is what records that metric on the
    validation set -- verify against utils.
    """
    answer_selection_cb = utils.AnsSelCB(inp_val['q'], inp_val['sents'], y_val, inp_val)
    checkpoint_cb = ModelCheckpoint(weightsf, save_best_only=True, monitor='mrr', mode='max')
    early_stopping_cb = EarlyStopping(monitor='mrr', mode='max', patience=conf['patience'])
    return [answer_selection_cb, checkpoint_cb, early_stopping_cb]

In [None]:
def eval(model):
    """Score ``model`` on the validation and test sets.

    Returns a 2-tuple (validation_result, test_result); an entry is None when
    the corresponding dataset has not been loaded. Note that this function
    shadows the builtin ``eval`` in this notebook.
    """
    def score(inp):
        if inp is None:
            return None
        pred = model.predict(inp)
        return utils.eval_QA(pred, inp['q'], inp['y'], MAP=False)

    return tuple(score(inp) for inp in [inp_val, inp_test])

In [None]:
# Entry point: build the configuration, load embeddings and data, then train
# and evaluate one model.
if __name__ == "__main__":
    trainf = 'data/train-all.csv' 
    valf = 'data/dev.csv'
    testf = 'data/test.csv'
    params = []
    
    conf, ps, h = config()

    if conf['emb'] == 'Glove':
        print('GloVe')
        emb = utils.GloVe(N=conf['embdim'])
    elif emb is None:
        # Without an embedding object the model cannot be built; stop here
        # with a clear message instead of failing later inside embedding().
        raise ValueError("unsupported embedding %r and no 'emb' object set" % (conf['emb'],))

    print('Dataset')
    load_data(trainf,valf,testf)
    # the run id is derived from the hash of the hyperparameters
    runid = 'Model-%x' % (h)
    print('RunID: %s  (%s)' % (runid, ps))
    train_and_eval(runid)