# Assignment 6

## Task
+ Build an End-to-End Memory Network
+ Solve some babi-tasks with it

__Task of choice:__ Positional reasoning

### Imports

In [59]:
# blocks
from blocks.algorithms import GradientDescent, Scale
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.extensions import FinishAfter
from blocks.graph import ComputationGraph
from blocks.main_loop import MainLoop
from blocks.model import Model

# fuel
from fuel.datasets import Dataset
from fuel.streams import DataStream

# numpy
from numpy import unique, vstack, where, array, transpose
from numpy.random import randn

# theano
from theano import scan, function, shared
from theano.tensor import tensor3, tensor4, matrix, TensorVariable, batched_dot, dot, TensorType, vector
from theano.tensor.nnet import softmax

# others
from nltk import word_tokenize

### Constants

In [2]:
DATA_FILE = './data/17/100.txt'
batch_size = 1


### Read data

We represent out input sentences $x_1\dots x_n$ as __BOW__-vectors $\vec{b}_1 \dots \vec{b}_n$. For this task $n = 2$. Further there is a BOW-vector $\vec{q}$ representing the question as well as an answer vector $\vec{y}$, which is also BOW-type but refers to a different vocabulary of size $2$ representing 'yes' and 'no'.

#### Dimensions:

$\vec{b}_1, \vec{b}_2$ and $\vec{q}_i$ have dimension $V$, whereas $\vec{a}_i \in\bigg\{\bigg(\begin{matrix}1\\0
\end{matrix}\bigg), \bigg(\begin{matrix}0\\1
\end{matrix}\bigg)\bigg\}$.

$\vec{b}_1$ and $\vec{b}_2$ are glued together to a $V\times 2$-matrix $M_b$.

After obtaining the representations, $M_b$ is stacked $k$ times to an input tensor $x$ with dimensions $k\cdot V\times 2$ with $k$ being the number of questions/answers, such that our training data for __1__ input sentence pair looks like the following:

$
x = \left(\begin{matrix}
M_b\\
M_b\\
\vdots\\
M_b
\end{matrix}\right), q = \left( \begin{matrix}\vec{q}_1\\\vdots\\\vec{q}_k \end{matrix} \right), y = \left( \begin{matrix} \vec{y}_1\\\vdots\\\vec{y}_k \end{matrix} \right)
$

In [91]:
class BAbIDataset(Dataset):
    def __init__(self, source_file_location):
        self.axis_labels = None
        self.sources = ['sentences', 'questions', 'answers']
        with open(source_file_location, 'r') as f:
            lines = f.readlines()
            txt = ' '.join([line[2:].strip() for line in lines])
            vocab = unique(word_tokenize(txt))
            avocab = array(('yes', 'no'))  # used to be array
        
        # contains entries ([BOW_1 BOW_2], Q_i, A_i)
        sets = []
        for i in range(0, len(lines), 10):
            b_1 = self._get_bow(lines[i][2:].strip(), vocab)
            b_2 = self._get_bow(lines[i+1][2:].strip(), vocab)
            bm = transpose(array((b_1, b_2)))  # used to be array
            q_i = []
            a_i = []
            for j in range(2, 10):
                l = lines[i+j][2:].strip().split('\t')
                q_i.append(self._get_bow(l[0], vocab))
                a_i.append(self._get_bow(l[1], avocab))
            sets.append((bm, q_i, a_i))
        # stack all the data

        ## first step: stack for one sentence-pair (x)
        elements = [(array([s[0]]*len(s[1])), array(s[1]), array(s[2])) for s in sets]  # used to be array

        ## second step: stack everything
        self.x = array([e[0] for e in elements], dtype='float32')  # used to be array
        self.q = array([e[1] for e in elements], dtype='float32')  # used to be array
        self.y = array([e[2] for e in elements], dtype='float32')  # used to be array
        
        self._vocab = vocab
        self._avocab = avocab
        
        self._pointer = 0
        self._max_pointer = self.x.shape[0]
        self.v = len(self._vocab)
        
    def _get_bow(self, txt, vocab):
        t = array(word_tokenize(txt))
        v = [len(where(t == vocab[i])[0]) for i in range(len(vocab))]
        return array(v)
    
    def get_vocab(self):
        return self._vocab
    
    def get_avocab(self):
        return self._avocab
    
    def get_data(self, a, b):
        size = batch_size # workaround
        p = self._pointer
        self._pointer = min(self._max_pointer, p+size)
        return self.x[p:p+size][0], self.q[p:p+size][0], self.y[p:p+size]
        


In [85]:
data = BAbIDataset(DATA_FILE)

#### Inspection area

In [90]:
data.q.shape

(100, 8, 24)

### Wiring the network with theano operations

#### Getting started: Embedding sentences $x_i$ and questions $q_j$
The following thoughts refer to a single input sentence pair with $k$ questions/answers. First compute $m_1, m_2$ of dimension $d$:

$
m = (\vec{m}_1 \vec{m}_2) = A \cdot M_b
$

$A$ has dimensions $d\times V$, but __*how to choose d*__?

Also there is embedding matrix $B$ for computing the question $q_j$'s memory-vectors which is the $d$-dimensional inner state $u_j$ for $1 \le j \le k$. $B$ also has dimensions $d \times V$:

$
u = (\vec{u}_1 \dots \vec{u}_k) = B \cdot (\vec{q}_1^T\dots\vec{q}_k^T)
$

In [67]:
# hyperparameters/initial Values:
d = 100
v = data.v

In [72]:
# paramters
A = shared(randn(d, v), name='A')
B = shared(randn(d, v), name='B')
C = shared(randn(d, v), name='C')
W = shared(randn(2, d), name='W')

# io
x_in = matrix('sentences', dtype='float32')
q = matrix('questions', dtype='float32')
y_hat = matrix('y_hat', dtype='float32')
y = matrix('answers', dtype='float32')

Now the _match_ (__ASK ABOUT THIS__) between $m_i$ and $u_j$ has to be computed as

$
p_{ij} = Softmax(\vec{u}_j^T\cdot \vec{m}_i),
$

giving us the „probability over the inputs“.

In a next step the output representations should be computed.

#### 1. Transform the inputs $c_i$
$
c = (\vec{c}_1 \vec{c}_2) = C\cdot M_b,
$

where $C$ is another embedding matrix with dimensions $d \times V$.

#### 2. Compute the memory's response to the input
$
\vec{o}_j = \sum_i{p_{ij} \vec{c}_i} = p_{1j}\vec{c}_1 + p_{2j}\vec{c}_2
$

#### Final computation of the predicted answer
$
\hat{y} = Softmax(W\cdot(\vec{o}_j + \vec{u}_j)),
$

where $W$ has dimensions $2\times d$, since the answering-vocabular has size 2 („yes“, „no“). As a consequence $\hat{y}$ is 2-dimensional.

Summarized in one formula we compute:

$
\hat{y} = Softmax(W\cdot (\vec{p}^T\cdot (c_1, c_2))+B\cdot\vec{q}_j^T)\\
= Softmax(W\cdot (p_{1j}\vec{c}_1 + p_{2j}\vec{c}_2 + B\cdot\vec{q}_j^T))\\
= Softmax(W\cdot (p_{1j}C\cdot\vec{b}_1 + p_{2j}C\cdot\vec{b}_2 + B\cdot\vec{q}_j^T))\\
= Softmax(W\cdot ( Softmax(\vec{u}_j^T\cdot A\cdot\vec{b}_1)\cdot C\cdot\vec{b}_1 + Softmax(\vec{u}_j^T\cdot A \cdot \vec{b}_2)\cdot C\cdot\vec{b}_2 + B\cdot\vec{q}_j^T))
$ 

|matrix|dimensions||vector|dimension|
|---|-----------||---|---|
|$A$|$d\times V$||$b$|$V$|
|$B$|$d\times V$||$m, u$|$d$|
|$C$|$d\times V$||$c, o$|$d$|
|$W$|$2\times d$||$\hat{y}$|$2$|

__NOTE:__ Biases were ignored in the equations.


In [77]:
m = batched_dot(A, x_in)
u = batched_dot(B, q)  # warning: transpose?
p = softmax(batched_dot(u, m))  # warning: transpose?
c = batched_dot(C, x_in)
o = batched_dot(p, c)  # is this correct? check the whole thing with dummy values and make weight-matrices eyes
y_hat = softmax(batched_dot(W, o+u))

tf = function(inputs=[x_in, q], outputs=[y_hat], allow_input_downcast=True)  # not used so far

### Training the network
This procedure optimizes weight-matrices $A, B, C$ and $W$.

In [86]:
epochs = 3

cost = CategoricalCrossEntropy().apply(y, y_hat)
cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, parameters=[A, B, C, W], step_rule=Scale(learning_rate=0.1))

data_stream = DataStream.default_stream(data)

main_loop = MainLoop(data_stream=data_stream, algorithm=algorithm, extensions=[FinishAfter(after_n_epochs=epochs)])

In [87]:
main_loop.run()

TypeError: ('Bad input argument to theano function with name "/home/klotzmaz/anaconda3/lib/python3.5/site-packages/blocks/algorithms/__init__.py:261"  at index 1(0-based), Wrong number of dimensions: expected 2, got 3 with shape (8, 24, 2).\n\nOriginal exception:\n\tTypeError: Bad input argument to theano function with name "/home/klotzmaz/anaconda3/lib/python3.5/site-packages/blocks/algorithms/__init__.py:261"  at index 1(0-based), Wrong number of dimensions: expected 2, got 3 with shape (8, 24, 2).', 'Wrong number of dimensions: expected 2, got 3 with shape (8, 24, 2).')

### Using the network