In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
import itertools
import random
import tensorflow as tf
from tqdm import tqdm
from matplotlib import pyplot as plt

# 1. Data preprocessing
(Collect word dictionary, transform each word to dict index)

In [2]:
sample_train = pd.read_csv('data/CBT_CN_train.csv',delimiter=';')
sample_valid = pd.read_csv('data/CBT_CN_valid.csv',delimiter=';')

In [3]:
%%time
sample_train['Document'] = sample_train['Document'].apply(literal_eval)
sample_valid['Document'] = sample_valid['Document'].apply(literal_eval)
sample_train['Query'] = sample_train['Query'].apply(literal_eval)
sample_valid['Query'] = sample_valid['Query'].apply(literal_eval)
sample_train['Candidates'] = sample_train['Candidates'].apply(literal_eval)
sample_valid['Candidates'] = sample_valid['Candidates'].apply(literal_eval)

CPU times: user 2min 17s, sys: 1.14 s, total: 2min 18s
Wall time: 2min 18s


In [4]:
# transform list of words to their indices
def categorize_text(text, word_to_id, mapped_words):
    newtext = []
    text_to_process = text
    if type(text) is not list:
        text_to_process = [text]
    for word in text_to_process:
        if mapped_words is None or word in mapped_words:
            newtext.append(word_to_id[word])
        else:
            newtext.append(word_to_id['<NA>'])
    return newtext

def categorize_df(df, word_to_id=None):
    cat_df = pd.DataFrame(dtype=str).reindex_like(df)
    cat_df['Document'] = [[]] * len(df)
    cat_df['Query'] = [[]] * len(df)
    cat_df['Candidates'] = [[]] * len(df)
    cat_df['Answer'] = ['<NA>'] * len(df)
    words = []
    mapped_words = None
    id_to_word = None
    if word_to_id is None:
        print('Processing train data...')
        words += list(itertools.chain.from_iterable(df['Document'].values))
        words += list(itertools.chain.from_iterable(df['Query'].values))
        words += list(itertools.chain.from_iterable(df['Candidates'].values))
        words += list(df['Answer'].values)
        print('\t random word:', words[19374])
        words += ['<NA>']
        words = set(words)
        print('\t dictionary size(with NA):', len(words))
        word_to_id = {t: i for i, t in enumerate(words)}
        id_to_word = {i: t for i, t in enumerate(words)}
    else:
        print('Processing test data...')
        mapped_words = set(word_to_id.keys())
    
    cat_df['Document'] = df['Document'].apply(lambda row: categorize_text(row, word_to_id, mapped_words))
    cat_df['Query'] = df['Query'].apply(lambda row: categorize_text(row, word_to_id, mapped_words))
    cat_df['Candidates'] = df['Candidates'].apply(lambda row: categorize_text(row, word_to_id, mapped_words))
    cat_df['Answer'] = df['Answer'].apply(lambda row: categorize_text(row, word_to_id, mapped_words)[0])
    
    return cat_df, len(words), word_to_id, id_to_word

In [5]:
def check_categorization(id_to_word, df, cat_df, isTrain=True):
    print('Checking categorization...')
    ind_s = np.random.randint(0, high=len(df), size=100)
    allClear = True
    NA_freq = 0
    tot_len = 0
    for i in ind_s:
        for col in df.columns[:-1]:
            temp = cat_df[col].iloc[i]
            assert type(temp) is list
            tot_len += len(temp)
            for j in range(len(temp)):
                if id_to_word[temp[j]] != df[col].iloc[i][j]:
                    if isTrain or id_to_word[temp[j]] != '<NA>':
                        allClear = False
                    else:
                        NA_freq += 1
        tot_len += 1
        if id_to_word[cat_df['Answer'].iloc[i]] != df['Answer'].iloc[i]:
            if isTrain or id_to_word[cat_df['Answer'].iloc[i]] != '<NA>':
                allClear = False
            else:
                NA_freq += 1
    NA_freq = 100.0 * float(NA_freq) / float(tot_len)
    if allClear:
        print('\t Sector is clear')
        if NA_freq > 0:
            print('\t Percentage of <NA> words:', NA_freq)
    else:
        print('\t ... Not clear! Not clear!')

In [6]:
cat_train, dictionary_size, word_to_id, id_to_word = categorize_df(sample_train)
check_categorization(id_to_word, sample_train, cat_train)

Processing train data...
	 random word: too
	 dictionary size(with NA): 42246
Checking categorization...
	 Sector is clear


In [7]:
cat_valid, _, _, _ = categorize_df(sample_valid, word_to_id)
check_categorization(id_to_word, sample_valid, cat_valid, isTrain=False)

Processing test data...
Checking categorization...
	 Sector is clear
	 Percentage of <NA> words: 0.8508800345535039


In [8]:
cat_train.head()

Unnamed: 0,Document,Query,Candidates,Answer
0,"[21208, 18336, 36403, 37681, 41172, 15806, 278...","[16838, 5773, 27599, 38830, 5773, 19355, 38301...","[39207, 19720, 1070, 14617, 17864, 34787, 2715...",8119
1,"[21208, 18336, 36403, 37681, 41172, 15806, 278...","[16838, 5773, 8119, 38830, 5773, 27599, 38301,...","[40271, 19720, 19355, 38509, 34787, 18542, 311...",19355
2,"[21208, 18336, 36403, 37681, 41172, 15806, 278...","[16838, 5773, 8119, 38830, 5773, 19355, 38301,...","[39207, 40271, 10628, 24077, 12051, 823, 36403...",40271
3,"[1237, 8340, 5773, 19355, 1817, 33083, 14575, ...","[18399, 32458, 2276, 37733, 8150, 29269, 37917...","[40271, 39465, 1070, 31738, 12051, 24437, 3980...",19355
4,"[1237, 8340, 5773, 19355, 1817, 33083, 14575, ...","[18399, 32458, 2276, 37733, 8150, 29269, 37917...","[40271, 39465, 1070, 24077, 31738, 22089, 3113...",1070


In [9]:
cat_train.shape

(120769, 4)

# 2. Batching and utils
(How to batch data from train/valid + some util functions)

In [10]:
def sample_batch(data, batch_size, word_to_id, D_max_len=1000, Q_max_len=100, offset=None):
    if offset is None:
        inds = random.sample(range(len(data)), batch_size)
    else:
        inds = range(offset, offset + batch_size)
    D_len = np.array([min(len(arr), D_max_len) for arr in data.iloc[inds]['Document'].values])
    Q_len = np.array([min(len(arr), Q_max_len) for arr in data.iloc[inds]['Query'].values])
    
    # D is document indices padded with '<NA>', same for Q
    # we use DQ_mask to indicate what part of matching scores matrix to take
    D, Q, DQ_mask, y = [], [], [], []
    for arr in data.iloc[inds].values:
        c_D_len, c_Q_len = len(arr[0]), len(arr[1])
        if c_D_len < D_max_len:
            # pad too short
            D += [arr[0] + [word_to_id['<NA>']]*(D_max_len - len(arr[0]))]
        else:
            # crop too long
            c_D_len = D_max_len
            D += [arr[0][0:D_max_len]]
        
        if c_Q_len < Q_max_len:
            Q += [arr[1] + [word_to_id['<NA>']]*(Q_max_len - len(arr[1]))]
        else:
            c_Q_len = Q_max_len
            Q += [arr[1][0:Q_max_len]]
        
        DQ_mask += [list(np.pad(np.ones([c_D_len, c_Q_len]), (0, max(D_max_len-c_D_len, Q_max_len-c_Q_len)), 
                                'constant', constant_values=(0,0))[:D_max_len, :Q_max_len])]
        # mark all places where y(answer word) is in Document
        y += [list(np.array(np.array(D[-1]) == arr[3],dtype=int))]
    D, Q = np.array(D), np.array(Q)
    DQ_mask, y = np.array(DQ_mask,dtype=float), np.array(y,dtype=float)
    return D, D_len, Q, Q_len, DQ_mask, y

In [11]:
%%time
# check time performance
sample_dict_ex = sample_batch(cat_train, 32, word_to_id)

CPU times: user 108 ms, sys: 36 ms, total: 144 ms
Wall time: 144 ms


In [12]:
DQ_mask_ex, y_ex = sample_dict_ex[4], sample_dict_ex[5]
print(DQ_mask_ex.shape, y_ex.shape)
print(np.where(DQ_mask_ex[0] == 1))
print(sample_dict_ex[1][0], sample_dict_ex[3][0])

(32, 1000, 100) (32, 1000)
(array([  0,   0,   0, ..., 278, 278, 278]), array([ 0,  1,  2, ..., 13, 14, 15]))
279 16


In [13]:
def softmax(M, axis, mask, EPS=1e-12, name=None):
    with tf.name_scope(name, 'softmax', [M]):
        max_axis = tf.reduce_max(M, axis, keep_dims=True)
        M_exp = tf.exp(M - max_axis) * mask
        norm = tf.reduce_sum(M_exp, axis, keep_dims=True)
        return M_exp / (norm + EPS)

In [15]:
def compute_accuracy(model, data):
    y_pred = model.predict(data,100)
    accuracy = np.sum(y_pred[:,0] == data['Answer'])
    print('accuracy:', accuracy/len(data))

# 3. Model
We'll implement Attention-over-attention reader, as described in https://arxiv.org/abs/1607.04423

In [28]:
class Attention_over_Attention_Reader():
    def __init__(self, dict_size, embedding_dim=100, hidden_state_dim=50, 
                 D_max=1000, Q_max=100, l2_w=0.0001, dropout_rate=0.1,
                 adam_w=0.001, grad_clip=5):
        self.dict_size = dict_size
        self.emb_dim, self.rnn_dim = embedding_dim, hidden_state_dim
        self.D_max, self.Q_max = D_max, Q_max
        self.l2_w, self.adam_w = l2_w, adam_w
        self.dropout, self.grad_clip = 1 - dropout_rate, grad_clip
        self.comp_graph = tf.Graph()
        self.init_graph()
        
        # initialize embedding matrix and rnn
        self.sess = tf.Session(graph=self.comp_graph)
        self.sess.run(self.init_all_op)
    
    def init_params(self):
        self.D = tf.placeholder(tf.int32, [None, self.D_max], name='Document')
        self.lenD = tf.placeholder(tf.int32, [None], name='Document_length')
        self.Q = tf.placeholder(tf.int32, [None, self.Q_max], name='Query')
        self.lenQ = tf.placeholder(tf.int32, [None], name='Query_length')
        self.DQ_mask = tf.placeholder(tf.float32, 
                                      [None, self.D_max, self.Q_max], name='Document_Query_mask')
        self.y = tf.placeholder(tf.float32, [None, self.D_max], name='Answer_mask')
        
        self.embedding_mtx = tf.Variable(
            tf.random_uniform([self.dict_size, self.emb_dim], 
                              -0.05, 0.05, dtype=tf.float32), name='Embedding_matrix')
    
    def process_text(self):
        # Embed input texts
        embedded_D = tf.nn.dropout(
            tf.nn.embedding_lookup(self.embedding_mtx, self.D, name='Embedded_document'), 
            self.dropout)
        embedded_Q = tf.nn.dropout(
            tf.nn.embedding_lookup(self.embedding_mtx, self.Q, name='Embedded_query'), 
            self.dropout)
        # Process Document using bi-GRU
        with tf.variable_scope('Document_processor', initializer=tf.orthogonal_initializer()):
            fwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            bwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            
            h_out, _ = tf.nn.bidirectional_dynamic_rnn(fwd_cell, bwd_cell, embedded_D, 
                                         sequence_length=self.lenD, dtype=tf.float32)
            self.h_Doc = tf.concat(h_out, 2)
        # Process Query using bi-GRU
        with tf.variable_scope('Query_processor', initializer=tf.orthogonal_initializer()):
            fwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            bwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            
            h_out, _ = tf.nn.bidirectional_dynamic_rnn(fwd_cell, bwd_cell, embedded_Q, 
                                         sequence_length=self.lenQ, dtype=tf.float32)
            self.h_Query = tf.concat(h_out, 2)
    
    def compute_scores(self):
        # get matching scores for Document and Query
        M = tf.matmul(self.h_Doc, self.h_Query, transpose_b=True, name='Matching_scores')
        # apply softmax Document-wise
        self.alpha = softmax(M, 1, self.DQ_mask, name='Query_to_Document_attention')
        # apply softmax Query-wise, then average to get importance of each word in Query
        self.beta = tf.reduce_sum(softmax(M, 2, self.DQ_mask), 
                                  1, keep_dims=True, name='Document_to_Query_attention')
        self.beta_imp = tf.div(self.beta, tf.maximum(tf.reduce_sum(self.DQ_mask,axis=1,keep_dims=True),1))
        self.s = tf.matmul(self.alpha, self.beta_imp, transpose_b=True, name='Final_scores')
    
    def init_graph(self):
        tf.reset_default_graph()
        with self.comp_graph.as_default():
            self.init_params()
            self.process_text()
            self.compute_scores()
            # get probability that y is the answer word
            with tf.variable_scope('Aggregating_results'):
                self.p_y = tf.reduce_sum(
                    tf.reduce_sum(self.s, axis=2) * self.y, 
                    axis=1)
            # train to maximize negative log loss of the answer word
            self.loss = -tf.reduce_mean(tf.log(tf.maximum(self.p_y, 1e-12)))
            self.loss += self.l2_w * tf.nn.l2_loss(self.embedding_mtx)
            
            optimizer = tf.train.AdamOptimizer(learning_rate=self.adam_w)
            # use gradient clipping to avoid exploding
            gvs = optimizer.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -self.grad_clip, self.grad_clip), 
                           var) for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(capped_gvs)
            
            self.init_all_op = tf.global_variables_initializer()
    
    def fit(self, data, n_steps, batches_per_step, batch_size=32, valid_data=None):
        self.train_losses = []
        self.valid_losses = []
        for step in tqdm(range(n_steps)):
            temp,temp2 = 0,0
            for i in range(batches_per_step):
                sample_data = sample_batch(data, batch_size, word_to_id,
                                                 D_max_len=self.D_max, Q_max_len=self.Q_max)
                inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y] 
                _, iloss = self.sess.run([self.train_op, self.loss], 
                                         feed_dict={i:d for i,d in zip(inputs,sample_data)})
                assert not np.isinf(iloss)
                temp += iloss
            if valid_data is not None:
                sample_data = sample_batch(valid_data, 1000, word_to_id,
                                                 D_max_len=self.D_max, Q_max_len=self.Q_max)
                inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y]
                temp2 = self.sess.run(self.loss, 
                                      feed_dict={i:d for i,d in zip(inputs,sample_data)})
                
            self.train_losses.append([temp/batches_per_step])
            if step != 0 and self.valid_losses[-1] < temp2 * 0.98:
                print('valid loss has reached minimum')
                break
            self.valid_losses.append([temp2])
    
    def predict(self, data, batch_size=100):
        y_hat = [[0,0]]*len(data)
        for step in tqdm(range(0, len(data), batch_size)):
            sample_data = sample_batch(data, batch_size, word_to_id,
                                      self.D_max, self.Q_max, offset=step)
            inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y]
            
            scores = np.sum(self.sess.run(self.s,
                                   feed_dict={i:d for i,d in zip(inputs,sample_data)}),2)
            
            for i,doc in enumerate(sample_data[0]):
                ans = doc[0]
                p_ans = 0
                p_tot = 0
                for word in np.unique(doc):
                    p_word = np.sum(scores[i][np.where(doc == word)])
                    p_tot += p_word
                    if p_word > p_ans:
                        ans = word
                        p_ans = p_word
                y_hat[step + i] = [ans, p_ans]
        return np.array(y_hat)      

In [29]:
Att_Reader = Attention_over_Attention_Reader(dictionary_size)
tf.summary.FileWriter("logs", Att_Reader.comp_graph).close()

In [31]:
print('initial accuracy:')
compute_accuracy(Att_Reader, cat_valid)

  0%|          | 0/20 [00:00<?, ?it/s]

initial accuracy:


100%|██████████| 20/20 [00:56<00:00,  2.81s/it]

accuracy: 0.0





In [32]:
train_losses = []
valid_losses = []
valid_accuracies = []

# 32 * 200 * 50 = 320,000
# 32 * 100 takes ~5min, 
total_n_steps = 200
batches_per_step = 50

In [36]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

100%|██████████| 5/5 [23:55<00:00, 287.06s/it]
100%|██████████| 5/5 [20:28<00:00, 245.65s/it]
 40%|████      | 2/5 [07:29<11:14, 224.82s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-15:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 40%|████      | 2/5 [08:06<12:09, 243.31s/it]

valid loss has reached minimum





In [37]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 40%|████      | 2/5 [10:02<15:04, 301.37s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-16:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 40%|████      | 2/5 [08:36<12:55, 258.48s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

valid loss has reached minimum


100%|██████████| 5/5 [22:45<00:00, 273.09s/it]
100%|██████████| 5/5 [53:12<00:00, 638.53s/it]


In [38]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 20%|██        | 1/5 [07:15<29:03, 435.82s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-19:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 5/5 [33:29<00:00, 401.92s/it]
100%|██████████| 5/5 [37:48<00:00, 453.76s/it]
100%|██████████| 5/5 [37:25<00:00, 449.06s/it]


In [None]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

100%|██████████| 5/5 [37:09<00:00, 445.82s/it]
100%|██████████| 5/5 [45:20<00:00, 544.05s/it]
100%|██████████| 5/5 [52:50<00:00, 634.16s/it]
100%|██████████| 5/5 [39:18<00:00, 471.66s/it]


In [None]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 80%|████████  | 4/5 [33:09<08:17, 497.44s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

valid loss has reached minimum


[A
Exception in thread Thread-26:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 5/5 [46:59<00:00, 563.90s/it]
100%|██████████| 5/5 [1:03:06<00:00, 757.38s/it]
 60%|██████    | 3/5 [35:34<23:43, 711.65s/it]

valid loss has reached minimum


In [None]:
print('accuracy after 100 steps')
compute_accuracy(Att_Reader, cat_valid)




accuracy after 100 steps


  0%|          | 0/20 [00:00<?, ?it/s][A
Exception in thread Thread-28:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 20/20 [01:07<00:00,  3.37s/it]


accuracy: 0.295


In [None]:
train_losses

In [None]:
print('accuracy after 100 steps')
compute_accuracy(Att_Reader, cat_valid)

In [None]:
plt.figure(figsize=(12,8))
plt.title('Negative log loss', size=25)
plt.plot(train_losses, label='train')
plt.plot(valid_losses, label='valid')
plt.legend(loc='best',fontsize=20)
plt.show()