In [0]:
from __future__ import absolute_import
from __future__ import print_function

from sklearn import metrics
from itertools import chain
from six.moves import range, reduce
import re

import tensorflow as tf
from tensorflow import keras

import numpy as np

from google.colab import files

import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import cm

In [0]:
# ############################ data utils

def load_task(train_file, only_supporting=False, test_count=11):
    '''Split raw lines into a train and a test portion and parse both.

    The last `test_count` lines are held out as the test set; all lines
    before them form the training set. Both parts are parsed with
    `parse_stories`.

    Args:
        train_file: sequence of raw lines (it is sliced and measured with
            `len`, so a list such as the result of `readlines()` works).
        only_supporting: forwarded unchanged to `parse_stories`.
        test_count: number of trailing lines to hold out; values <= 0 mean
            "no test split".

    Returns:
        Tuple `(train_data, test_data)` of parsed records.
    '''
    # Use an explicit split index: slicing with `-test_count` is wrong for
    # test_count == 0, since `lines[:-0]` is empty and `lines[-0:]` is all.
    held_out = max(0, test_count)
    split_at = max(0, len(train_file) - held_out)
    train_data = parse_stories(train_file[:split_at], only_supporting)
    test_data = parse_stories(train_file[split_at:], only_supporting)
    return train_data, test_data


def load_task3(train_file, only_supporting=False, test_count=10):
    '''Keep only binary rules, split them into train/test and parse both.

    A line is kept when the part after `##` consists of exactly three
    space-separated fields. The last `test_count` kept lines form the test
    set. Both parts are parsed with `parse_stories3`.

    Args:
        train_file: iterable of raw lines containing exactly one `##`.
        only_supporting: forwarded unchanged to `parse_stories3`.
        test_count: number of trailing kept lines to hold out; values <= 0
            mean "no test split".

    Returns:
        Tuple `(train_data, test_data)` of parsed records.
    '''
    # filter for binary rules
    train2_data = []
    for line in train_file:
        line = line.replace('\n', '')
        _, qna = line.split('##')
        if len(qna.split(' ')) == 3:
            train2_data.append(line)

    # Explicit split index: `lines[:-0]` would be empty and `lines[-0:]`
    # would be everything, so negative slicing is wrong for test_count == 0.
    held_out = max(0, test_count)
    split_at = max(0, len(train2_data) - held_out)
    train_data = parse_stories3(train2_data[:split_at], only_supporting)
    test_data = parse_stories3(train2_data[split_at:], only_supporting)
    print('lens', len(train2_data), '#', len(train_data), '#', len(test_data))
    return train_data, test_data

def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    Runs of non-word characters are kept as tokens (after stripping
    surrounding whitespace); pure-whitespace pieces are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # The separator group must not be optional: a pattern that can match the
    # empty string makes re.split (Python 3.7+) split between every character
    # and emit None for the unmatched group, which then breaks `.strip()`.
    pieces = re.split(r'(\W+)', sent)
    return [piece.strip() for piece in pieces if piece and piece.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse `fact##relation arg1 arg2` lines into (story, query, answer) records.

    Example input line:
        In circle O , diameter AB is perpendicular to chord CD at E .##perpendicular AB CD

    For every line whose part after `##` has exactly three space-separated
    fields, one record is produced:
        story  -- a list holding one sentence: the tokenized fact with a
                  trailing '.' or '?' token removed
        query  -- the tokenized first field (the relation name)
        answer -- the remaining two fields as a list
    Other lines are skipped.

    Args:
        lines: iterable of raw lines, each containing exactly one `##`.
        only_supporting: accepted for signature compatibility; not used.

    Returns:
        List of `(story, query, answer)` tuples.
    '''
    data = []
    for line in lines:
        line = line.replace('\n', '')
        fact1, qna = line.split('##')
        qna = qna.split(' ')
        if len(qna) != 3:
            continue

        fact1 = tokenize(fact1)
        # Guard against an empty fact before looking at its last token.
        if fact1 and fact1[-1] in ('?', '.'):
            fact1 = fact1[:-1]

        # Tokenize the relation name directly. Joining a *string* with ' '
        # would insert a space between every character and turn the query
        # into a list of single letters.
        q1 = tokenize(qna[0])
        a1 = qna[1:]
        data.append(([fact1], q1, a1))

    return data


def parse_stories3(lines, only_supporting=False):
    '''Parse `fact##relation arg1 arg2` lines into flat (story, query, answer) records.

    Example input line:
        In circle O , diameter AB is perpendicular to chord CD at E .##perpendicular AB CD

    For every line whose part after `##` has exactly three space-separated
    fields, one record is produced:
        story  -- flat list of the fact's tokens, with a trailing '.' or '?'
                  token removed
        query  -- one-element list holding the relation name
        answer -- two-element list holding the two arguments
    Other lines are skipped.

    Args:
        lines: iterable of raw lines, each containing exactly one `##`.
        only_supporting: accepted for signature compatibility; not used.

    Returns:
        List of `(story, query, answer)` tuples.
    '''
    data = []
    for line in lines:
        line = line.replace('\n', '')
        fact1, qna = line.split('##')
        qna = qna.split(' ')
        # Skip non-binary rules before doing any tokenization work.
        if len(qna) != 3:
            continue

        story = tokenize(fact1)
        # Guard against an empty fact before looking at its last token.
        if story and story[-1] in ('?', '.'):
            story = story[:-1]

        q, a1, a2 = qna
        data.append((story, [q], [a1, a2]))
    return data


def vectorize_datas(data, word_idx, sentence_size, memory_size):
    """
    Vectorize flat token stories, queries and the two answer positions.

    Each story is a flat list of tokens. Only its last `memory_size` tokens
    are kept. Every kept token becomes one memory row `[word id, time id]`,
    where the time id is `len(word_idx) - memory_size` at the position of the
    query token and increases/decreases by one per step away from it.
    Stories shorter than `memory_size` are padded with `[0, 0]` rows.

    Queries are mapped through `word_idx` and padded to `sentence_size` with
    `len(word_idx) - memory_size`. A `_on` inside the first query token is
    removed before lookup.

    Records are skipped when either answer token or any of the first two
    query tokens is not among the kept story tokens.

    Args:
        data: iterable of `(story, query, answer)`; `answer` has two tokens.
        word_idx: mapping token -> integer id.
        sentence_size: length the query is padded to.
        memory_size: number of memory rows per story.

    Returns:
        `(S, Q, A1, A2)`: `S` is a list of memories, `Q` an array of query
        ids, `A1`/`A2` arrays of one-hot vectors of length `memory_size`
        marking the story position of the first/second answer token.
    """
    S = []
    Q = []
    A1 = []
    A2 = []
    time_base = len(word_idx) - memory_size

    for story, query, answer in data:
        # str.replace returns a new string, so the result has to be kept;
        # build a new list so the caller's record is not mutated.
        query = [query[0].replace('_on', '')] + list(query[1:])

        # Truncate to the most recent tokens *before* computing positions, so
        # the query/answer indices refer to the rows that are actually kept.
        story = story[::-1][:memory_size][::-1]

        if ((answer[0] not in story) or (answer[1] not in story) or (query[0] not in story)
                or (len(query) > 1 and query[1] not in story)):
            continue

        q_position = story.index(query[0])
        # Time id is relative to the query token's position.
        ss = [[word_idx[token], time_base + (pos - q_position)]
              for pos, token in enumerate(story)]
        for _ in range(max(0, memory_size - len(ss))):
            ss.append([0, 0])

        lq = max(0, sentence_size - len(query))
        q = [word_idx[w] for w in query] + [time_base] * lq

        y1 = np.zeros(memory_size)
        y1[story.index(answer[0])] = 1

        y2 = np.zeros(memory_size)
        y2[story.index(answer[1])] = 1

        S.append(ss)
        Q.append(q)
        A1.append(y1)
        A2.append(y2)
    return S, np.array(Q), np.array(A1), np.array(A2)


def vectorize_datas_for_predict(data, word_idx, sentence_size, memory_size):
    """
    Vectorize flat token stories and queries for prediction.

    Each story is a flat list of tokens. Only its last `memory_size` tokens
    are kept. Every kept token becomes one memory row `[word id, time id]`,
    where the time id is `len(word_idx) - memory_size` at the position of the
    query token and increases/decreases by one per step away from it.
    Stories shorter than `memory_size` are padded with `[0, 0]` rows.

    Queries are mapped through `word_idx` and padded to `sentence_size` with
    `len(word_idx) - memory_size`.

    Unless `answer[0] == 'point'`, a record is skipped when `answer[0]` or
    one of the first two query tokens is not among the kept story tokens.

    The answer vector is a placeholder: length `memory_size` with the first
    two entries set to 1.

    Args:
        data: iterable of `(story, query, answer)` records.
        word_idx: mapping token -> integer id.
        sentence_size: length the query is padded to.
        memory_size: number of memory rows per story (must be >= 2 for the
            placeholder answer vector).

    Returns:
        `(S, Q, A)`: list of memories, array of query ids, array of
        placeholder answer vectors; one entry per non-skipped record.
    """
    S = []
    Q = []
    A = []
    time_base = len(word_idx) - memory_size

    for story, query, answer in data:
        # take only the most recent tokens that fit in memory; do this first
        # so the query position below refers to the rows that are kept
        story = story[::-1][:memory_size][::-1]

        if ((answer[0] != 'point') and (
                        (answer[0] not in story) or (query[0] not in story) or (
                        len(query) > 1 and query[1] not in story))):
            continue

        q_position = story.index(query[0])

        # The last entry of each row is the time 'word', counted relative to
        # the query token's position.
        ss = [[word_idx[token], time_base + (pos - q_position)]
              for pos, token in enumerate(story)]

        # pad to memory_size
        for _ in range(max(0, memory_size - len(ss))):
            ss.append([0, 0])

        lq = max(0, sentence_size - len(query))
        q = [word_idx[w] for w in query] + [time_base] * lq

        y = np.zeros(memory_size)
        y[0] = 1
        y[1] = 1

        # Collect inside the loop so every processed record is returned,
        # not only the last one.
        S.append(ss)
        Q.append(q)
        A.append(y)

    return S, np.array(Q), np.array(A)


def vectorize_data(story, query, answer, word_idx, sentence_size, memory_size, S=None, Q=None, A=None):
    """
    Vectorize a single flat token story with its query.

    Only the last `memory_size` story tokens are kept. Every kept token
    becomes one memory row `[word id, 0, time id]`, where the time id is
    `len(word_idx) - memory_size` at the position of the query token and
    increases/decreases by one per step away from it. Missing rows are
    padded with `[0, 0, 0]`.

    Args:
        story: flat list of tokens; must contain `query[0]`.
        query: list of query tokens.
        answer: list of answers; only its emptiness is inspected.
        word_idx: mapping token -> integer id.
        sentence_size: length the query is padded to.
        memory_size: number of memory rows.
        S, Q, A: ignored; kept only for signature compatibility.

    Returns:
        `(S, Q, A)`, each a one-element list: the memory, the padded query
        ids, and a placeholder answer vector (length `memory_size` with
        entry 0 set when `answer` is non-empty, otherwise all zeros of
        length `len(word_idx) + 1`).

    Raises:
        ValueError: if `query[0]` is not among the kept story tokens.
    """
    time_base = len(word_idx) - memory_size

    # Truncate first so the query position refers to the rows that are kept.
    story = story[::-1][:memory_size][::-1]
    q_position = story.index(query[0])

    ss = [[word_idx[token], 0, time_base + (pos - q_position)]
          for pos, token in enumerate(story)]
    for _ in range(max(0, memory_size - len(ss))):
        ss.append([0, 0, 0])

    lq = max(0, sentence_size - len(query))
    q = [word_idx[w] for w in query] + [time_base] * lq

    if answer:
        y = np.zeros(memory_size)
        y[0] = 1
    else:
        y = np.zeros(len(word_idx) + 1)  # 0 is reserved for nil word

    return [ss], [q], [y]


def vectorize_question(story, query, answer, word_idx, sentence_size, memory_size, S=None, Q=None, A=None):
    """
    Map a query to word ids and pad it with 0 up to `sentence_size`.

    Args:
        query: list of query tokens; each must be a key of `word_idx`.
        word_idx: mapping token -> integer id.
        sentence_size: minimum length of the returned id list; longer
            queries are not truncated.
        story, answer, memory_size, S, Q, A: ignored; kept only for
            signature compatibility.

    Returns:
        A one-element list containing the padded list of word ids.
    """
    padding = [0] * max(0, sentence_size - len(query))
    return [[word_idx[w] for w in query] + padding]


def vectorize_datas_for_multiple(data, word_idx, sentence_size, memory_size):
    """
    Vectorize multi-sentence stories, queries and multi-word answers.

    Every sentence is mapped to word ids and right-padded with 0 up to
    `sentence_size`. Only the last `memory_size` sentences are kept, and the
    last slot of each kept sentence is overwritten with a time id. Stories
    with fewer sentences are padded with all-zero rows.

    Queries are mapped to word ids and padded to `sentence_size` with
    `len(word_idx) - memory_size`.

    Each answer word becomes a one-hot vector of length `len(word_idx) + 1`.

    Returns:
        `(S, Q, A)` as numpy arrays.
    """
    vocab_len = len(word_idx)
    memories, queries, answers = [], [], []

    for story, query, answer in data:
        rows = []
        for sentence in story:
            ids = [word_idx[w] for w in sentence]
            # A negative repeat count yields an empty list, so over-long
            # sentences are left unpadded.
            ids = ids + [0] * (sentence_size - len(sentence))
            rows.append(ids)

        # keep the most recent sentences that fit in memory
        rows = rows[::-1][:memory_size][::-1]

        # last slot of every sentence carries the time id
        kept = len(rows)
        for offset, row in enumerate(rows):
            row[-1] = vocab_len - memory_size - offset + kept

        # pad the story up to memory_size with empty memories
        while len(rows) < memory_size:
            rows.append([0] * sentence_size)

        query_ids = [word_idx[w] for w in query]
        query_ids += [vocab_len - memory_size] * max(0, sentence_size - len(query))

        one_hots = []
        for word in answer:
            one_hot = np.zeros(vocab_len + 1)  # 0 is reserved for nil word
            one_hot[word_idx[word]] = 1
            one_hots.append(one_hot)

        memories.append(rows)
        queries.append(query_ids)
        answers.append(one_hots)

    return np.array(memories), np.array(queries), np.array(answers)


def position_encoding(sentence_size, embedding_size):
    """
    Position Encoding described in section 4.1 [1]

    Builds a float32 matrix of shape (sentence_size, embedding_size). The
    row belonging to the last sentence position is forced to 1.0.
    """
    # Centered 1-based indices along the embedding and the sentence axis.
    emb_offsets = np.arange(1, embedding_size + 1) - (embedding_size + 1) / 2
    pos_offsets = np.arange(1, sentence_size + 1) - (sentence_size + 1) / 2

    # Outer product gives the (embedding_size, sentence_size) table at once.
    table = np.outer(emb_offsets, pos_offsets).astype(np.float32)
    table = 1 + 4 * table / embedding_size / sentence_size

    # The last sentence slot carries the time word and is left unweighted.
    table[:, -1] = 1.0
    return table.T


def zero_nil_slot(t, name=None):
    """
    Overwrites the nil_slot (first row) of the input Tensor with zeros.

    The nil_slot is a dummy slot and should not be trained and influence
    the training algorithm.

    Args:
        t: 2-D tensor (or value convertible to one); row 0 is the nil slot.
        name: optional name for the resulting op.

    Returns:
        A tensor equal to `t` except that its first row is all zeros.
    """
    # tf.name_scope(name, default_name, values) replaces the deprecated
    # tf.op_scope(values, name, default_name) spelling.
    with tf.name_scope(name, "zero_nil_slot", [t]) as name:
        t = tf.convert_to_tensor(t, name="t")
        s = tf.shape(t)[1]
        z = tf.zeros(tf.stack([1, s]))
        # Zero row followed by every row of `t` after the first.
        return tf.concat(axis=0, values=[z, tf.slice(t, [1, 0], [-1, -1])], name=name)


def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2].

    The input Tensor `t` should be a gradient.

    The output will be `t` + gaussian noise.

    0.001 was said to be a good fixed value for memory networks [2].

    Args:
        t: gradient tensor (or value convertible to one).
        stddev: standard deviation of the added gaussian noise.
        name: optional name for the resulting op.

    Returns:
        A tensor with the same shape as `t`.
    """
    # tf.name_scope(name, default_name, values) replaces the deprecated
    # tf.op_scope(values, name, default_name) spelling.
    with tf.name_scope(name, "add_gradient_noise", [t, stddev]) as name:
        t = tf.convert_to_tensor(t, name="t")
        gn = tf.random_normal(tf.shape(t), stddev=stddev)
        return tf.add(t, gn, name=name)

# ###############################################################################


In [0]:
# Keyword table: one `keyword##integer` entry per line.
with open('keywords.txt', 'r') as keywords_file:
    keywords = keywords_file.readlines()
keyword_dict = {}
for entry in keywords:
    entry_fields = entry.split('##')
    keyword_dict[entry_fields[0]] = int(entry_fields[1])

# Evaluation sentences, one per line, with the newline removed.
with open('evaluate_data_file.txt', 'r') as evaluate_data_file:
    evaluate_data = [data.replace('\n', '') for data in evaluate_data_file.readlines()]

# Evaluation rules in `key##relation arg1 arg2` form; only rules whose part
# after `##` has exactly three space-separated fields are kept.
with open('evaluate_rule_file.txt', 'r') as evaluate_rule_file:
    evaluate_rules = [data.replace('\n', '') for data in evaluate_rule_file.readlines()
                      if len(data.split('##')[1].split(' ')) == 3]

# Group the rule bodies by the text in front of `##`.
ruleKeys = list(set(rule.split('##')[0] for rule in evaluate_rules))
rule_dict = {key: list() for key in ruleKeys}
for rule in evaluate_rules:
    rule_fields = rule.split('##')
    rule_dict[rule_fields[0]].append(rule_fields[1].replace('\n', ''))

In [0]:
class MemN2N(object):
    """End-To-End Memory Network."""

    def __init__(self, batch_size, vocab_size, sentence_size, memory_size, embedding_size,
                 hops=3,
                 max_grad_norm=40.0,
                 nonlin=None,
                 initializer=tf.random_normal_initializer(stddev=0.1),
                 encoding=position_encoding,
                 session=tf.Session(),
                 name='MemN2N'):
        """Creates an End-To-End Memory Network

        Args:
            batch_size: The size of the batch.

            vocab_size: The size of the vocabulary (should include the nil word). The nil word
            one-hot encoding should be 0.

            sentence_size: The max size of a sentence in the data. All sentences should be padded
            to this length. If padding is required it should be done with nil one-hot encoding (0).

            memory_size: The max size of the memory. Since Tensorflow currently does not support jagged arrays
            all memories must be padded to this length. If padding is required, the extra memories should be
            empty memories; memories filled with the nil word ([0, 0, 0, ......, 0]).

            embedding_size: The size of the word embedding.

            hops: The number of hops. A hop consists of reading and addressing a memory slot.
            Defaults to `3`.

            max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`.

            nonlin: Non-linearity. Defaults to `None`.

            initializer: Weight initializer. Defaults to `tf.random_normal_initializer(stddev=0.1)`.

            optimizer: Optimizer algorithm used for SGD. Defaults to `tf.train.AdamOptimizer(learning_rate=1e-2)`.

            encoding: A function returning a 2D Tensor (sentence_size, embedding_size). Defaults to `position_encoding`.

            session: Tensorflow Session the model is run with. Defaults to `tf.Session()`.

            name: Name of the End-To-End Memory Network. Defaults to `MemN2N`.
        """

        self._batch_size = batch_size
        self._vocab_size = vocab_size
        self._sentence_size = sentence_size
        self._memory_size = memory_size
        self._embedding_size = embedding_size
        self._hops = hops
        self._max_grad_norm = max_grad_norm
        self._nonlin = nonlin
        self._init = initializer
        self._name = name

        self._build_inputs()
        self._build_vars()

        self._opt = tf.train.AdagradOptimizer(learning_rate=self._lr)

        self._encoding = tf.constant(encoding(self._sentence_size, self._embedding_size), name="encoding")

        logits, att = self._inference_att_2_child(self._stories, self._queries)  # (batch_size, vocab_size)

        cross_entropy1 = tf.nn.softmax_cross_entropy_with_logits(logits=att[-2],
                                                                 labels=tf.cast(self._answers1, tf.float32),
                                                                 name="cross_entropy")
        cross_entropy_sum1 = tf.reduce_sum(cross_entropy1, name="cross_entropy_sum")

        cross_entropy2 = tf.nn.softmax_cross_entropy_with_logits(logits=att[-1],
                                                                 labels=tf.cast(self._answers2, tf.float32),
                                                                 name="cross_entropy")
        cross_entropy_sum2 = tf.reduce_sum(cross_entropy2, name="cross_entropy_sum")
        loss_op1 = cross_entropy_sum1 + cross_entropy_sum2
        grads_and_vars1 = self._opt.compute_gradients(loss_op1)
        grads_and_vars_temp = []
        for grad, var in grads_and_vars1:
            if grad is not None:
                grads_and_vars_temp.append([grad, var])
            else:
                grads_and_vars_temp.append([tf.zeros_like(var), var])

        grads_and_vars1 = grads_and_vars_temp

        grads_and_vars1 = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g, v in grads_and_vars1]
        grads_and_vars1 = [(add_gradient_noise(g), v) for g, v in grads_and_vars1]
        nil_grads_and_vars1 = []
        for g, v in grads_and_vars1:
            if v.name in self._nil_vars:
                nil_grads_and_vars1.append((zero_nil_slot(g), v))
            else:
                nil_grads_and_vars1.append((g, v))
        train_op1 = self._opt.apply_gradients(nil_grads_and_vars1, name="train_op1")

        predict_out = logits
        predict_op1 = tf.argmax(att[-1], 1, name="predict_op")
        predict_op2 = tf.argmax(att[-2], 1, name="predict_op")

        predict_proba_op = tf.nn.softmax(logits, name="predict_proba_op")
        predict_log_proba_op = tf.log(predict_proba_op, name="predict_log_proba_op")
        predict_att = tf.nn.softmax(att, name="predict_att")

        # assign ops
        self.loss_op1 = loss_op1
        self.predict_op1 = predict_op1
        self.predict_op2 = predict_op2
        self.predict_out = predict_out
        self.predict_att = predict_att

        self.predict_proba_op = predict_proba_op
        self.predict_log_proba_op = predict_log_proba_op

        self.train_op1 = train_op1

        init_op = tf.global_variables_initializer()
        self._sess = session
        self._sess.run(init_op)

    def _build_inputs(self):
        self._stories = tf.placeholder(tf.int32, [None, self._memory_size, self._sentence_size], name="stories")
        self._queries = tf.placeholder(tf.int32, [None, self._sentence_size], name="queries")
        self._answers1 = tf.placeholder(tf.int32, [None, self._memory_size], name="answers1")
        self._answers2 = tf.placeholder(tf.int32, [None, self._memory_size], name="answers2")
        self._lr = tf.placeholder(tf.float32, [], name="learning_rate")

    def _build_vars(self):
        with tf.variable_scope(self._name):
            nil_word_slot = tf.zeros([1, self._embedding_size])
            A = tf.concat(axis=0, values=[nil_word_slot, self._init([self._vocab_size - 1, self._embedding_size])])
            C = tf.concat(axis=0, values=[nil_word_slot, self._init([self._vocab_size - 1, self._embedding_size])])

            Q_emb = tf.concat(axis=0, values=[nil_word_slot, self._init([self._vocab_size - 1, self._embedding_size])])

            self.A_1 = tf.Variable(A, name="A")

            self.Q_1 = tf.Variable(Q_emb, name="Q_emb")

            self.C = []

            for hopn in range(self._hops - 1):
                with tf.variable_scope('hop_{}'.format(hopn)):
                    self.C.append(tf.Variable(C, name="C"))

        self._nil_vars = set([self.A_1.name] + [x.name for x in self.C])

    def _inference_att(self, stories, queries):
        with tf.variable_scope(self._name):
            # Use A_1 for thee question embedding as per Adjacent Weight Sharing
            q_emb = tf.nn.embedding_lookup(self.Q_1, queries)
            print('q_emb:', q_emb)
            print('self._encoding', self._encoding)
            u_0 = tf.reduce_sum(q_emb * self._encoding, 1)
            print('u_0:', u_0)
            u = [u_0]

            att_v = []
            for hopn in range(self._hops - 1):
                if hopn == 0:
                    m_emb_A = tf.nn.embedding_lookup(self.A_1, stories)
                    print('m_emb_A', m_emb_A)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)
                    print('m_A', m_A)


                else:
                    with tf.variable_scope('hop_{}'.format(hopn - 1)):
                        m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 1], stories)
                        m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                print('u_temp', u_temp)
                print('dotted', dotted)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                print('probs', probs)
                att_v.append(probs)

                probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])
                print('probs_temp', probs_temp)
                with tf.variable_scope('hop_{}'.format(hopn)):
                    m_emb_C = tf.nn.embedding_lookup(self.C[hopn], stories)
                    print('m_emb_C', m_emb_C)
                m_C = tf.reduce_sum(m_emb_C * self._encoding, 2)
                print('m_C', m_C)

                c_temp = tf.transpose(m_C, [0, 2, 1])
                print('c_temp', c_temp)
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)
                print('o_k', o_k)

                # Dont use projection layer for adj weight sharing
                # u_k = tf.matmul(u[-1], self.H) + o_k

                u_k = u[-1] + o_k
                print('u_k', u_k)

                # nonlinearity
                if self._nonlin:
                    u_k = nonlin(u_k)

                print('u_k', u_k)
                u.append(u_k)

            # Use last C for output (transposed)
            with tf.variable_scope('hop_{}'.format(self._hops)):
                #                 return dotted
                if hopn == 0:
                    m_emb_A = tf.nn.embedding_lookup(self.A_1, stories)
                    print('m_emb_A', m_emb_A)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)
                    print('m_A', m_A)


                else:
                    with tf.variable_scope('hop_{}'.format(hopn - 1)):
                        m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 1], stories)
                        m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                print('u_temp', u_temp)
                print('dotted', dotted)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                print('probs', probs)
                att_v.append(probs)
                return probs, tf.stack(att_v)

    def _inference_att_1_child(self, stories, queries):
        with tf.variable_scope(self._name):
            # Use A_1 for thee question embedding as per Adjacent Weight Sharing
            q_emb = tf.nn.embedding_lookup(self.Q_1, queries)
            print('q_emb:', q_emb)
            print('self._encoding', self._encoding)
            u_0 = tf.reduce_sum(q_emb * self._encoding, 1)
            print('u_0:', u_0)
            u = [u_0]

            att_v = []
            for hopn in range(self._hops - 2):
                if hopn == 0:
                    m_emb_A = tf.nn.embedding_lookup(self.A_1, stories)
                    print('m_emb_A', m_emb_A)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)
                    print('m_A', m_A)


                else:
                    with tf.variable_scope('hop_{}'.format(hopn - 2)):
                        m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 2], stories)
                        m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                print('u_temp', u_temp)
                print('dotted', dotted)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                print('probs', probs)
                att_v.append(probs)

                probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])
                print('probs_temp', probs_temp)
                with tf.variable_scope('hop_{}'.format(hopn)):
                    m_emb_C = tf.nn.embedding_lookup(self.C[hopn], stories)
                    print('m_emb_C', m_emb_C)
                m_C = tf.reduce_sum(m_emb_C * self._encoding, 2)
                print('m_C', m_C)

                c_temp = tf.transpose(m_C, [0, 2, 1])
                print('c_temp', c_temp)
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)
                print('o_k', o_k)

                # Dont use projection layer for adj weight sharing
                # u_k = tf.matmul(u[-1], self.H) + o_k

                u_k = u[-1] + o_k
                print('u_k', u_k)

                # nonlinearity
                if self._nonlin:
                    u_k = nonlin(u_k)

                print('u_k', u_k)
                u.append(u_k)

            # Use last C for output (transposed)
            with tf.variable_scope('hop_{}'.format(self._hops - 1)):
                #                 return dotted
                if hopn == 0:
                    m_emb_A = tf.nn.embedding_lookup(self.A_1, stories)
                    print('m_emb_A', m_emb_A)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)
                    print('m_A', m_A)


                else:
                    with tf.variable_scope('hop_{}'.format(hopn - 2)):
                        m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 2], stories)
                        m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                print('u_temp', u_temp)
                print('dotted', dotted)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                print('probs', probs)
                att_v.append(probs)
                #                 print('att', att)
                #                 print('soft', att_v1)
                return probs, tf.stack(att_v)

    def _inference_att_2_child(self, stories, queries):
        with tf.variable_scope(self._name):
            # Use A_1 for thee question embedding as per Adjacent Weight Sharing
            q_emb = tf.nn.embedding_lookup(self.Q_1, queries)
            u_0 = tf.reduce_sum(q_emb * self._encoding, 1)
            u = [u_0]

            att = []
            for hopn in range(self._hops - 2):
                if hopn == 0:
                    m_emb_A = tf.nn.embedding_lookup(self.A_1, stories)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)


                else:
                    with tf.variable_scope('hop_{}'.format(hopn - 1)):
                        m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 1], stories)
                        m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                # probs_temp = tf.nn.softmax(dotted)
                att.append(probs)
                # att_v1.append(probs_temp)

                probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])
                with tf.variable_scope('hop_{}'.format(hopn)):
                    m_emb_C = tf.nn.embedding_lookup(self.C[hopn], stories)
                m_C = tf.reduce_sum(m_emb_C * self._encoding, 2)

                c_temp = tf.transpose(m_C, [0, 2, 1])
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)

                # Dont use projection layer for adj weight sharing
                # u_k = tf.matmul(u[-1], self.H) + o_k

                u_k = u[-1] + o_k

                # nonlinearity
                if self._nonlin:
                    u_k = nonlin(u_k)

                u.append(u_k)

            # hop before last hop
            hopn = hopn +1
            with tf.variable_scope('hop_{}'.format(hopn - 1)):
                m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 1], stories)
                m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

            # hack to get around no reduce_dot
            u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
            dotted = tf.reduce_sum(m_A * u_temp, 2)

            # Calculate probabilities
            probs = tf.nn.softmax(dotted)
            # probs_temp = tf.nn.softmax(dotted)
            att.append(probs)
            # att_v1.append(probs_temp)

            probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])
            with tf.variable_scope('hop_{}'.format(hopn)):
                m_emb_C = tf.nn.embedding_lookup(self.C[hopn], stories)
            m_C = tf.reduce_sum(m_emb_C * self._encoding, 2)
            c_temp = tf.transpose(m_C, [0, 2, 1])
            o_k = tf.reduce_sum(c_temp * probs_temp, 2)

            # Dont use projection layer for adj weight sharing
            # u_k = tf.matmul(u[-1], self.H) + o_k

            u_k = u[-1] + o_k

            # nonlinearity
            if self._nonlin:
                u_k = nonlin(u_k)

            u.append(u_k)

            hopn = hopn + 1
            # Use last C for output (transposed)
            # last hop
            with tf.variable_scope('hop_{}'.format(self._hops - 1)):
                with tf.variable_scope('hop_{}'.format(hopn - 1)):
                    m_emb_A = tf.nn.embedding_lookup(self.C[hopn - 1], stories)
                    m_A = tf.reduce_sum(m_emb_A * self._encoding, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                att.append(probs)
                return probs, tf.stack(att)

    def batch_fit(self, stories, queries, answers1, answers2, learning_rate):
        """Runs the training algorithm over the passed batch

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)
            answers: Tensor (None, vocab_size)

        Returns:
            loss: floating-point number, the loss computed for the batch
        """
        feed_dict = {self._stories: stories, self._queries: queries,
                     self._answers1: answers1, self._answers2: answers2, self._lr: learning_rate}
        loss1, _ = self._sess.run([self.loss_op1, self.train_op1],
                                            feed_dict=feed_dict)
        return loss1

    def predict(self, stories, queries):
        """Predicts answers as one-hot encoding.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)

        Returns:
            answers: Tensor (None, vocab_size)
        """

        feed_dict = {self._stories: stories, self._queries: queries}
        att = self._sess.run(self.predict_att, feed_dict=feed_dict)
        return self._sess.run([self.predict_op1, self.predict_op2], feed_dict=feed_dict), att

    # edit
    def predict_out(self, stories, queries):
        """Predicts answers as one-hot encoding.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)

        Returns:
            answers: Tensor (None, vocab_size)
        """
        feed_dict = {self._stories: stories, self._queries: queries}
        return self._sess.run(self.predict_out, feed_dict=feed_dict)

    def predict_proba(self, stories, queries):
        """Predicts probabilities of answers.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)

        Returns:
            answers: Tensor (None, vocab_size)
        """
        feed_dict = {self._stories: stories, self._queries: queries}
        return self._sess.run(self.predict_proba_op, feed_dict=feed_dict)

    def predict_log_proba(self, stories, queries):
        """Evaluate the answer log-probability op for a batch.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)
        Returns:
            answers: Tensor (None, vocab_size)
        """
        session = self._sess
        inputs = {self._queries: queries, self._stories: stories}
        log_probabilities = session.run(self.predict_log_proba_op, feed_dict=inputs)
        return log_probabilities

    def predict_att(self, stories, queries):
        """Evaluate ``self.predict_att`` for a batch of stories and queries.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)
        Returns:
            Whatever the session returns for the ``predict_att`` fetch.

        NOTE(review): ``predict`` fetches ``self.predict_att`` as an op, so
        an instance attribute of that name presumably exists and hides this
        method on instances; without it the fetch would be this bound
        method. Confirm against ``__init__``.
        """
        session = self._sess
        inputs = {self._queries: queries, self._stories: stories}
        attention = session.run(self.predict_att, feed_dict=inputs)
        return attention


In [0]:
print("Started Task:", 1)

# Default hyper-parameters for a single run; the grid-search cell rebinds
# several of these names per configuration.
learning_rate = 0.001  # 0.1
anneal_rate = 10
anneal_stop_epoch = 100  # 100
max_grad_norm = 40.0
evaluation_interval = 10
batch_size = 10
hops = 5
epochs = 100
embedding_size = 100
memory_size = 40
task_id = 1
random_state = 10

# Read the raw corpus; the context manager closes the handle even if reading
# fails (the bare open(...).readlines() left it open).
with open('unary_binary', 'r') as corpus_file:
    train_data2 = corpus_file.readlines()

train2, test2 = load_task3(train_data2)

train_data = None
data2 = train2 + test2

# Vocabulary = words of every story and query, the keywords, and every token
# that appears in the evaluation sentences and evaluation rules.
vocab = sorted(reduce(lambda x, y: x | y, (set(list(s) + q) for s, q, a in data2)))
vocab.extend(keyword_dict.keys())
for extra_lines in (evaluate_data, evaluate_rules):
    for p in extra_lines:
        vocab.extend(tokenize(p.replace('\n', '')))

# Sort the de-duplicated vocabulary so that word ids do not depend on set
# iteration order (string hashing may differ between interpreter runs).
vocab = sorted(set(vocab))
edata = None
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

max_story_size = max(map(len, (s for s, _, _ in data2)))
mean_story_size = int(np.mean([len(s) for s, _, _ in data2]))
sentence_size = 1
query_size = max(map(len, (q for _, q, _ in data2)))
memory_size = min(memory_size, max_story_size)

print('word_ids', len(word_idx))
# Add one entry per time token. The stored value is the token string itself,
# not an integer id; the entries enlarge len(word_idx), from which
# vocab_size is derived below.
for i in range(memory_size * 2):
    word_idx['time{}'.format(i + 1)] = 'time{}'.format(i + 1)
print('word_ids2', len(word_idx))

vocab_size = len(word_idx) + 1  # +1 for nil word
sentence_size = max(query_size, sentence_size)  # for the position
sentence_size += 1  # +1 for time words and to accommodate two worded queries

print("Longest sentence length", sentence_size)
print("Longest story length", max_story_size)
print("Average story length", mean_story_size)

trainS2, trainQ2, trainA21, trainA22 = vectorize_datas(train2, word_idx, sentence_size, memory_size)
S2 = Q2 = A21 = A22 = None
testS2, testQ2, testA1, testA2 = vectorize_datas(test2, word_idx, sentence_size, memory_size)

trainS2shape = np.array(trainS2).shape
testS2shape = np.array(testS2).shape
print("Training set shape", trainS2shape)

# params
n_train = trainS2shape[0]
n_test = testS2shape[0]

print("Training Size", n_train)

print("Training labels shape", trainA21.shape)
print("Training labels shape", trainA22.shape)

# Flatten the answer arrays to (n, memory_size) once, then take the argmax of
# each row as the integer label.
trainA21 = trainA21.reshape(-1, memory_size)
trainA22 = trainA22.reshape(-1, memory_size)
train_labels21 = np.argmax(trainA21, axis=1)
train_labels22 = np.argmax(trainA22, axis=1)

test_labels21 = np.array(np.argmax(testA1.reshape(-1, memory_size), axis=1))
test_labels22 = np.array(np.argmax(testA2.reshape(-1, memory_size), axis=1))

In [0]:
tf.set_random_seed(random_state)

# Hyper-parameter grid: every combination of the lists below is trained and
# evaluated, one CSV row per configuration.
configs = list()
epochss = [50, 70, 80, 100]
lrs = [0.001, 0.005, 0.01]
batch_sizes = [10, 20, 30, 50]
embedding_sizes = [30, 40, 50, 100]
hop_counts = [3, 4, 5]
max_grad_norm = 40.0

for eps in epochss:
    for lr in lrs:
        for bs in batch_sizes:
            for es in embedding_sizes:
                for hp in hop_counts:
                    configs.append([eps, lr, bs, es, hp, max_grad_norm])

sess2 = None
# The context manager guarantees the results file is flushed and closed even
# if a configuration raises part-way through the search.
with open("config_binary_results.txt", "w+") as config_binary_results:
    for config in configs:
        # These names are module-level on purpose: predictLong/evaluate read
        # `hops` and `model2` for the configuration currently being trained.
        epochs, learning_rate, batch_size, embedding_size, hops, max_grad_norm = config

        # Consecutive full batches. The upper bound is inclusive of the last
        # full batch, which the previous zip-of-ranges form dropped whenever
        # n_train was an exact multiple of batch_size.
        batches = [(start, start + batch_size)
                   for start in range(0, n_train - batch_size + 1, batch_size)]
        batches2 = list(batches)

        model = None
        val_acc_list = list()
        train_acc_list = list()
        ep_list = list()

        val_acc_list21 = list()
        train_acc_list21 = list()
        ep_list21 = list()
        lr_list1 = list()

        val_acc_list22 = list()
        train_acc_list22 = list()
        ep_list22 = list()
        lr_list22 = list()

        # Release the previous configuration's session before building a new
        # graph; the last session stays open for the cells that follow.
        if sess2 is not None:
            sess2.close()
        tf.reset_default_graph()
        # The graph-level seed belongs to the default graph, so it has to be
        # set again after every reset.
        tf.set_random_seed(random_state)
        sess2 = tf.Session()

        print(
            'batch_size, vocab_size, sentence_size, memory_size, embedding_size, hops, max_grad_norm\n', batch_size, vocab_size,
            sentence_size, memory_size, embedding_size, hops, max_grad_norm)
        model2 = MemN2N(batch_size, vocab_size, sentence_size, memory_size, embedding_size, session=sess2,
                        hops=hops, max_grad_norm=max_grad_norm)

        train_acc2 = 0.0
        for t in range(1, epochs + 1):
            anneal = 1.0
            lr = learning_rate / anneal

            np.random.shuffle(batches)
            total_cost21 = 0.0
            total_cost22 = 0.0

            for start, end in batches:
                s2 = trainS2[start:end]
                q2 = trainQ2[start:end]
                a21 = trainA21[start:end]
                a22 = trainA22[start:end]

                cost_t21 = model2.batch_fit(s2, q2, a21, a22, lr)
                total_cost21 += cost_t21

            if t % evaluation_interval == 0:
                train_preds21 = []
                train_preds22 = []
                for start in range(0, n_train, batch_size):
                    end = start + batch_size
                    s2 = trainS2[start:end]
                    q2 = trainQ2[start:end]

                    # Only full batches are scored; the trailing partial
                    # batch is skipped.
                    if len(s2) != batch_size:
                        continue

                    preds, att = model2.predict(s2, q2)
                    train_preds21 += list(preds[0])
                    train_preds22 += list(preds[1])

                # One forward pass over the test set serves both answers.
                test_preds21, att = model2.predict(testS2, testQ2)
                test_preds22 = test_preds21
                print(np.array(test_preds21).shape, len(test_labels21))

                train_acc21 = metrics.accuracy_score(np.array(train_preds21), train_labels21[:len(train_preds21)])
                train_acc_list21.append(train_acc21)
                val_acc21 = metrics.accuracy_score(test_preds21[0], test_labels21)
                val_acc_list21.append(val_acc21)
                ep_list21.append(t)

                # Second-answer predictions are scored against the
                # second-answer labels (previously compared to labels21).
                train_acc22 = metrics.accuracy_score(np.array(train_preds22), train_labels22[:len(train_preds22)])
                train_acc_list22.append(train_acc22)
                val_acc22 = metrics.accuracy_score(test_preds22[1], test_labels22)
                val_acc_list22.append(val_acc22)
                ep_list22.append(t)

        test_preds22, att = model2.predict(testS2, testQ2)
        test_acc22 = metrics.accuracy_score(test_preds22[1], test_labels22)

        tp, fp, tn, fn = evaluate(evaluate_data, keyword_dict)
        # Undefined ratios are reported as 0.0 so that every configuration
        # still produces a complete, newline-terminated CSV row.
        precison = 1.0 * tp / (tp + fp) if (tp + fp) else 0.0
        recall = 1.0 * tp / (tp + fn) if (tp + fn) else 0.0
        f1Score = 2.0 * precison * recall / (precison + recall) if (precison + recall) else 0.0

        print(tp, fp, tn, fn, precison, recall, f1Score)
        row = [epochs, learning_rate, batch_size, embedding_size, hops,
               tp, fp, fn, precison, recall, f1Score]
        config_binary_results.write(','.join(str(value) for value in row) + '\n')
        # Flush per row so finished configurations survive an interruption.
        config_binary_results.flush()

        plt.scatter(ep_list22, train_acc_list22)
        plt.show()

        plt.scatter(ep_list22, val_acc_list22)
        plt.show()


In [0]:
# Scatter-plot accuracy against epoch for the last trained configuration:
# first answer (train, validation), then second answer (train, validation).
for epoch_axis, accuracy_axis in (
        (ep_list21, train_acc_list21),
        (ep_list21, val_acc_list21),
        (ep_list22, train_acc_list22),
        (ep_list22, val_acc_list22)):
    mpl.pyplot.scatter(epoch_axis, accuracy_axis)
    mpl.pyplot.show()


In [0]:
# Verbosity flag: when truthy, predictLong plots the attention matrix and
# evaluate prints its per-line progress.
log_info =1

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def plot_attention(in_seq, out_seq, attentions):
    """Show `attentions` as a matrix image labelled by the two sequences.

    Adapted from
    http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

    Args:
        in_seq: items whose string form labels the x axis.
        out_seq: items whose string form labels the y axis.
        attentions: 2-D data handed to ``matshow``.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.matshow(attentions, cmap='bone')

    # A blank label is prepended to each axis, as in the tutorial.
    x_labels = [' '] + [str(item) for item in in_seq]
    y_labels = [' '] + [str(item) for item in out_seq]
    axes.set_xticklabels(x_labels, rotation=90)
    axes.set_yticklabels(y_labels)

    # One tick (and therefore one label) per row/column.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [0]:
import re


def predictLong(se):
    """Predict the two argument positions for a ``sentence##keyword`` string.

    The text before ``##`` is the sentence and the text after it the query.
    The first two space-separated query tokens are kept and padded with two
    ``point`` tokens before parsing and vectorizing.

    Returns:
        ``([a1], [a2])`` with the first prediction of each answer head, or
        ``(-1, -1)`` when either prediction is not a valid index into the
        whitespace-split sentence.
    """
    sentence_part, query_part = se.split('##')
    raws = sentence_part.split()

    query_head = ' '.join(query_part.split(' ')[:2])
    padded_line = sentence_part + '##' + query_head + ' point' + ' point'

    s, q, a = parse_stories3([padded_line])[0]
    s, q, a = vectorize_datas_for_predict([[s, q, a]], word_idx, sentence_size, memory_size)

    answers, attention = model2.predict(s, q)
    attention = attention.reshape(hops, memory_size)
    a1 = answers[0][0]
    a2 = answers[1][0]

    # Reject predictions that point past the end of the sentence.
    token_count = len(raws)
    if a1 >= token_count or a2 >= token_count:
        return -1, -1

    if log_info:
        hop_labels = [str(i) for i in range(hops)]
        plot_attention(raws, hop_labels, attention)
    return [a1], [a2]

def _encode_rule(predicate, first_arg, second_arg):
    """Return a comparison key built from the vocabulary ids of a binary rule.

    The ids are joined with a space so that different id triples can never
    produce the same key (plain concatenation made e.g. 1|23 and 12|3 equal).
    """
    return ' '.join(str(word_idx[token]) for token in (predicate, first_arg, second_arg))


def evaluate(lines, keyword_dict, rule_dict=rule_dict):
    """Predict binary rules for every line and score them against `rule_dict`.

    Each line has the form ``key##sentence``. For every sentence word whose
    `keyword_dict` value is 2 (except ``on``), `predictLong` proposes two
    argument positions; the resulting rule counts as a true positive when it
    matches one of the gold rules of that key in either argument order, and
    as a false positive otherwise. After each line, every gold rule that is
    not among the rules predicted so far counts as a false negative.

    The de-duplicated predicted rules are written, one per line, to
    ``predict_double_rule_file_mm.txt``.

    Args:
        lines: iterable of ``key##sentence`` strings.
        keyword_dict: mapping from keyword to its arity code.
        rule_dict: mapping from key to a list of gold rules written as
            ``predicate arg1 arg2``.

    Returns:
        ``(tp, fp, tn, fn)``; ``tn`` is never incremented and is always 0.

    Raises:
        KeyError: if a rule token is missing from the module-level
            ``word_idx``.
    """
    double_rule_list = list()
    # Keys of every rule predicted so far. NOTE(review): this accumulates
    # across lines, so a prediction made for an earlier line can satisfy a
    # gold rule of a later one; confirm that this is intended.
    selected_rules = list()
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for line in lines:
        parts = line.split('##')
        qKey = parts[0]
        line = parts[1]

        pos_rules = rule_dict.get(qKey, [])

        # Gold rules encoded in both argument orders.
        pos_rule_encs = list()
        pos_rule_encs2 = list()
        for rule in pos_rules:
            rule_words = [rule_word.replace('@', '') for rule_word in rule.split(' ')]
            pos_rule_encs.append(_encode_rule(rule_words[0], rule_words[1], rule_words[2]))
            pos_rule_encs2.append(_encode_rule(rule_words[0], rule_words[2], rule_words[1]))

        if log_info:
            print('line', line)
            print('pos_rules', pos_rules)

        words = line.split(' ')

        #   pruning
        if len(words) > memory_size:
            words = words[:memory_size]
            line = ' '.join(words)

        for wordId, word in enumerate(words):
            if keyword_dict.get(word) != 2 or word == 'on':
                continue

            a1, a2 = predictLong(line + ' ##' + word)
            if a1 == a2:
                # Covers both the (-1, -1) "invalid" result and a prediction
                # that points twice at the same word.
                continue
            pred_word = str(words[a1[0]])
            pred_word2 = str(words[a2[0]])

            # "lies"/"lie", and "is" directly followed by "on", map to the
            # "<word>_on" predicate. The bounds check keeps a sentence-final
            # "is" from indexing past the end of `words`.
            followed_by_on = wordId + 1 < len(words) and words[wordId + 1] == 'on'
            predicate = word
            if word.lower() in ['lies', 'lie'] or (word.lower() == 'is' and followed_by_on):
                predicate = word + '_on'

            # Duplicates are detected with the same predicate spelling that
            # is stored below; checking the un-suffixed word never matched
            # the stored "_on" entries.
            if containRuleAlready(qKey, predicate, pred_word, pred_word2, double_rule_list):
                continue
            print('predicted rule ', word, pred_word, pred_word2)

            first_arg = words[a1[0]].replace('@', '')
            second_arg = words[a2[0]].replace('@', '')
            lss3 = _encode_rule(predicate, first_arg, second_arg)
            lss4 = _encode_rule(predicate, second_arg, first_arg)

            double_rule_list.append(qKey + '##' + predicate + ' ' + pred_word2 + ' ' + pred_word)
            # Record the *predicted* rule. The previous code appended the
            # last gold rule instead, which hid false negatives and raised
            # when a line had no gold rules.
            selected_rules.append(lss3)

            if containList(lss3, lss4, pos_rule_encs, 'tp'):
                tp = tp + 1
            else:
                fp = fp + 1
                if log_info:
                    print('inc fp ', fp)
            if log_info:
                print('inc_over', tp, fp)

        for gold_enc, gold_enc_swapped in zip(pos_rule_encs, pos_rule_encs2):
            if not containList(gold_enc, gold_enc_swapped, selected_rules, 'fp'):
                fn = fn + 1
                if log_info:
                    print('inc fn ', fn)

    # Sorted so that the output file is identical across runs.
    double_rule_list = sorted(set(double_rule_list))
    with open("predict_double_rule_file_mm.txt", "w+") as predict_double_rule_file:
        for v in double_rule_list:
            predict_double_rule_file.write(v)
            predict_double_rule_file.write('\n')

    return tp, fp, tn, fn


def containList(list1, list2, lists, a):
    """Return True when `list1` or `list2` is a member of `lists`.

    `a` is accepted for call-site labelling only and is not used.
    """
    return any(candidate in lists for candidate in (list1, list2))


def containRuleAlready(qKey, word, pred_word, pred_word2, double_rule_list):
    """Return True when the rule is already listed in either argument order.

    Entries of `double_rule_list` have the form ``qKey##word arg arg``.
    """
    prefix = qKey + '##' + word + ' '
    orderings = (
        prefix + pred_word + ' ' + pred_word2,
        prefix + pred_word2 + ' ' + pred_word,
    )
    return any(candidate in double_rule_list for candidate in orderings)

# Score the currently loaded model on the evaluation sentences.
tp, fp, tn, fn = evaluate(evaluate_data, keyword_dict)
# Report undefined ratios as 0.0 instead of raising ZeroDivisionError when
# there are no predictions, no gold rules, or no true positives.
precison = 1.0 * tp / (tp + fp) if (tp + fp) else 0.0
recall = 1.0 * tp / (tp + fn) if (tp + fn) else 0.0
f1Score = 2.0 * precison * recall / (precison + recall) if (precison + recall) else 0.0

print(tp, fp, tn, fn, precison, recall, f1Score)
