In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
import itertools
import tensorflow as tf

Let's deal at first with Common Nouns Question Answering.

In [2]:
sample_train = pd.read_csv('data/CBT_CN_train.csv',delimiter=';')
sample_valid = pd.read_csv('data/CBT_CN_valid.csv',delimiter=';')

In [3]:
sample_train = sample_train.sample(n=20000, replace=True)

In [4]:
sample_train['Document'] = sample_train['Document'].apply(literal_eval)
sample_train['Query'] = sample_train['Query'].apply(literal_eval)
sample_train['Candidates'] = sample_train['Candidates'].apply(literal_eval)

In [5]:
sample_train.head()

Unnamed: 0,Document,Query,Candidates,Answer
61630,"[he, asked, ,, stopping, before, a, window, fu...","[yes, ,, xxxxx, ,, and, jo, felt, as, calm, an...","[miss, block, bundles, dress, grapes, marketin...",sir
35840,"[even, the, fairies, who, had, been, bathing, ...","[the, fairy, opened, her, eyes, slowly, and, l...","[anyone, castle, emperor, flute, middle, palac...",flute
10415,"[the, king, asked, her, who, she, was, ., she,...","[the, procession, was, marching, on, quite, sm...","[contrary, courage, fate, idea, king, man, mot...",king
111560,"[very, beautiful, ,, very, beautiful, indeed, ...","[what, s, the, matter, ,, peter, rabbit, ,, wh...","[sir, clothes, feet, heart, matter, splash, su...",matter
49195,"[i, do, n, t, think, any, one, but, you, would...","[all, our, branches, are, famous, in, one, xxx...","[arch, breath, deal, family, muscle, sea, sigh...",way


### 1. Maximum frequency model.

In [6]:
# choose the candidate that has the maximum frequency in Query/Document
def maximum_frequency_prediction(row, use_Document_info=False):
    temp = row['Query']
    if use_Document_info:
        temp += row['Document']
    freqs = dict(Counter(temp))
    ans = row['Candidates'][0]
    for w in row['Candidates']:
        if w in freqs.keys() and (ans not in freqs.keys() or (freqs[w] > freqs[ans])):
            ans = w
    return ans

In [7]:
def max_freq_result(df, use_Document_info=False):
    accuracy = 0
    for _, row in df.iterrows():
        if maximum_frequency_prediction(row, use_Document_info) == row['Answer']:
            accuracy += 1
    print('\t Accuracy:', round(accuracy / df.shape[0], 2))

In [8]:
print('Maximum frequency (corpus):')
max_freq_result(sample_train, False)
print('Maximum frequency (corpus + context):')
max_freq_result(sample_train, True)

Maximum frequency (corpus):
	 Accuracy: 0.13
Maximum frequency (corpus + context):
	 Accuracy: 0.27


In the paper "THE GOLDILOCKS PRINCIPLE: READING CHILDREN’S BOOKS..." the results are $0.158$ with corpus-only and $0.281$ with corpus+context, which is pretty similar.

### 2. Embedding model.

We simultaneously learn input and output embedding matrices $I, O \in \mathbb{R}^{pxd}$, where $p$ is the embedding dimension and $d$ is the dictionary size.


For a given input passage $q$ and candidate word $c$ we compute score as:

$S(q,w) = (I\phi(q))^TO\phi(c)$, where $\phi(q)$ indicates one-hot representation of $q$.

In [9]:
def categorize_row(row, word_to_id):
    newrow = []
    for word in row:
        newrow.append(word_to_id[word])
    return np.array(newrow)
        
def categorize_data(df):
    words = list(itertools.chain.from_iterable(df['Document'].values))
    words += list(itertools.chain.from_iterable(df['Query'].values))
    words += list(itertools.chain.from_iterable(df['Candidates'].values))
    words += list(df['Answer'].values)
    print('Random word:', words[19374])
    words = set(words)
    print('Dictionary size:', len(words))
    word_to_id = {t: i for i, t in enumerate(words)}
    id_to_word = {i: t for i, t in enumerate(words)}
    
    df['Document'] = df['Document'].apply(lambda row: categorize_row(row, word_to_id))
    df['Query'] = df['Query'].apply(lambda row: categorize_row(row, word_to_id))
    df['Candidates'] = df['Candidates'].apply(lambda row: categorize_row(row, word_to_id))
    df['Answer'] = df['Answer'].apply(lambda word: word_to_id[word])
    return len(words), word_to_id, id_to_word

In [10]:
np.max([len(ar) for ar in sample_train['Query'].values])

1346

In [11]:
dictionary_size, word_to_id, id_to_word = categorize_data(sample_train)

Random word: of
Dictionary size: 37714


In [12]:
word_to_id['xxxxx']

25107

In [13]:
np.where(sample_train['Query'].iloc[0] == word_to_id['xxxxx'])

(array([2]),)

In [14]:
sample_train.head()

Unnamed: 0,Document,Query,Candidates,Answer
61630,"[10979, 1873, 6718, 13520, 28857, 2164, 24553,...","[3008, 6718, 25107, 6718, 26010, 21349, 6433, ...","[8752, 1196, 24403, 25492, 17069, 25301, 5153,...",9891
35840,"[15562, 1209, 2630, 1810, 35088, 35977, 27719,...","[1209, 26598, 17699, 19163, 33550, 12507, 2601...","[35112, 11611, 1121, 11316, 32622, 11917, 2520...",11316
10415,"[1209, 15376, 1873, 19163, 1810, 26044, 19030,...","[1209, 426, 19030, 12360, 22427, 27653, 26037,...","[28373, 13318, 10922, 34622, 15376, 19752, 333...",15376
111560,"[31873, 23443, 6718, 31873, 23443, 36951, 6718...","[33817, 27066, 1209, 2734, 6718, 567, 8934, 67...","[9891, 16073, 15128, 4312, 2734, 13996, 21278,...",2734
49195,"[17548, 4388, 7229, 19864, 37624, 31786, 33177...","[30029, 35929, 34620, 18148, 453, 13517, 33177...","[2826, 11292, 33679, 23932, 37155, 6425, 1223,...",4797


In [32]:
np.array(sample_train.iloc[0]['Candidates'] == sample_train.iloc[0]['Answer'],dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [35]:
def sample_batch(data, batch_size):
    inds = np.random.randint(0, len(data), size=batch_size)
    X = []
    for arr in data.iloc[inds]['Query'].values:
        i = np.where(arr == word_to_id['xxxxx'])[0][0]
        arr = list(arr)
        d1 = i
        d2 = len(arr) - 1 - i
        if d1 > 4 and d2 > 4:
            X += [arr[i-5:i] + arr[i+1:i+6]]
        elif d1 > 4:
            X += [arr[i-5:i] + arr[i+1:len(arr)] + [dictionary_size]*(5-d2)]
        elif d2 > 4:
            X += [[dictionary_size]*(5-d1) + arr[0:i] + arr[i+1:i+6]]
    X = np.array(X)
    C, y = [], []
    for i in inds:
        arr = data.iloc[i]['Candidates']
        C += [list(arr)]
        y += [list(np.array(arr == data.iloc[i]['Answer'],dtype=int))]
    C = np.array(C)
    y = np.array(y)
    return X, C, y

In [36]:
X_ex, C_ex, y_ex = sample_batch(sample_train, 3)

In [42]:
y_ex.shape

(3, 10)

In [43]:
embedding_size = 30
tf.reset_default_graph()
X = tf.placeholder(tf.int32, [None, 10], name='input_passage')
C = tf.placeholder(tf.int32, [None, 10], name='candidates')
y = tf.placeholder(tf.int32, [None, 10], name='answers')

input_embeddings = tf.Variable(tf.random_uniform([dictionary_size, embedding_size], 0, 0.1, dtype=tf.float32))
output_embeddings = tf.Variable(tf.random_uniform([dictionary_size, embedding_size], 0, 0.1, dtype=tf.float32))

emb_X = tf.nn.embedding_lookup(input_embeddings, X)
emb_X = tf.reduce_sum(emb_X, axis=1)
print(emb_X.shape)
emb_C = tf.nn.embedding_lookup(output_embeddings, C)
print(emb_C.shape)

scores = tf.reduce_sum(tf.tensordot(emb_C, emb_X, axes=[2, 1]), axis=2)
predictions = tf.nn.softmax(scores)
print(predictions.shape)

loss = tf.losses.log_loss(labels=y, predictions=predictions)

train_op = tf.train.AdamOptimizer().minimize(loss, )

(?, 30)
(?, 10, 30)
(?, 10)


In [None]:
s = tf.Session()
    
s.run(tf.global_variables_initializer())

n_epochs = 200
batches_per_epoch = 1000
batch_size = 10

train_losses = []
test_losses = []

for epoch in tqdm.tqdm(range(n_epochs)):

    #print("-------\n")
    old_test_loss = avg_test_loss
    avg_test_loss = 0
    avg_train_loss = 0
    for batch in range(batches_per_epoch):
        x_, y_, len_, inds_ = sample_batch(train_ix, raw_data['EAP'], raw_data['HPL'], raw_data['MWS'], batch_size)

        _, iloss, y_pred = s.run([train_op, loss, predictions], {X: x_,
                                                                y: y_,
                                                                lengths: len_,
                                                                learning_rate_ph: lr})
        avg_train_loss += iloss
        x_test, y_test, len_test, _ = sample_batch(test_ix, 
                                                   raw_data['EAP'], raw_data['HPL'], raw_data['MWS'], batch_size)
        
        iloss = s.run(loss, {X: x_test,y: y_test, lengths: len_test})
        avg_test_loss += iloss

    #print("EPOCH: ", epoch)
    #print("AVERAGE TRAIN LOSS: ", avg_train_loss/batches_per_epoch)
    #print("AVERAGE TEST LOSS: ", avg_test_loss/batches_per_epoch)
    train_losses.append([avg_train_loss/batches_per_epoch])
    test_losses.append([avg_test_loss/batches_per_epoch])
    
    if epoch != 0 and avg_test_loss > old_test_loss:
        pass
        #print('problems...')

In [36]:
s = tf.Session()
l2 = s.run([one_hotted], {l: sample_train['Document'].iloc[0
                                                          ]})

In [37]:
l2[0].shape

(37906,)

In [38]:
l2[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)