In [1]:
from cbt_preprocessing import CBTProcessor

# give path to text corpus and embeddings
data_loader = CBTProcessor('data/cbt_train.txt', 'data/glove.6B.50d.txt', 50)

Words extracted. Total number: 33012
Number of pre-trained: 32416


In [2]:
# prepare for batch generation on train and val splits
data_loader.fit_on_texts('data/cbtest_CN_train.txt', 'train')
data_loader.fit_on_texts('data/cbtest_CN_valid_2000ex.txt', 'val')

In [3]:
# check batch sampling
ex_batch_train = data_loader.sample_batch('train', 32)
data_loader.show_example(ex_batch_train)

DOC:
it would be a singular thing for me to keep it . it 's not to be supposed that i would be any hindrance to gentlemen in your situation ; that would be a singular thing ! '' cries he , and began to pull gold out of his pocket with a mighty red face . alan said nothing , only looked on the ground . `` will you step to the door with me , sir ? '' said i. cluny said he would be very glad , and followed me readily enough , but he looked flustered and put out . `` and now , sir , '' says i , `` i must first acknowledge your generosity . '' `` nonsensical nonsense ! '' cries cluny . `` where 's the generosity ? this is just a most unfortunate affair ; but what would ye have me do -- boxed up in this of a cage of mine -- but just set my friends to the cartes , when i can get them ? and if they lose , of course , it 's not to be supposed -- '' and here he came to a pause . `` yes , '' said i , `` if they lose , you give them back their money ; and if they win , they carry away yours in the

In [4]:
# each batch contains:
# docs and queries
ex_D_train, ex_Q_train = ex_batch_train[0], ex_batch_train[1]
print('Doc shape:', ex_D_train.shape)
print('Query shape:', ex_Q_train.shape)
# 10 possible candidates
ex_C_train = ex_batch_train[2]
print('-'*10)
print('Cands shape:', ex_C_train.shape)
# real answers
ex_A_train = ex_batch_train[3]
print('-'*10)
print('Answer shape:', ex_A_train.shape)
# Doc2Query Masks
ex_mask_train = ex_batch_train[4]
print('-'*10)
print('Mask shape:', ex_mask_train.shape)

Doc shape: (32, 1000)
Query shape: (32, 150)
----------
Cands shape: (32, 10)
----------
Answer shape: (32,)
----------
Mask shape: (32, 1000, 150)


# 3. Model
We'll implement Attention-over-attention reader, as described in https://arxiv.org/abs/1607.04423

In [28]:
class Attention_over_Attention_Reader():
    def __init__(self, dict_size, embedding_dim=100, hidden_state_dim=50, 
                 D_max=1000, Q_max=100, l2_w=0.0001, dropout_rate=0.1,
                 adam_w=0.001, grad_clip=5):
        self.dict_size = dict_size
        self.emb_dim, self.rnn_dim = embedding_dim, hidden_state_dim
        self.D_max, self.Q_max = D_max, Q_max
        self.l2_w, self.adam_w = l2_w, adam_w
        self.dropout, self.grad_clip = 1 - dropout_rate, grad_clip
        self.comp_graph = tf.Graph()
        self.init_graph()
        
        # initialize embedding matrix and rnn
        self.sess = tf.Session(graph=self.comp_graph)
        self.sess.run(self.init_all_op)
    
    def init_params(self):
        self.D = tf.placeholder(tf.int32, [None, self.D_max], name='Document')
        self.lenD = tf.placeholder(tf.int32, [None], name='Document_length')
        self.Q = tf.placeholder(tf.int32, [None, self.Q_max], name='Query')
        self.lenQ = tf.placeholder(tf.int32, [None], name='Query_length')
        self.DQ_mask = tf.placeholder(tf.float32, 
                                      [None, self.D_max, self.Q_max], name='Document_Query_mask')
        self.y = tf.placeholder(tf.float32, [None, self.D_max], name='Answer_mask')
        
        self.embedding_mtx = tf.Variable(
            tf.random_uniform([self.dict_size, self.emb_dim], 
                              -0.05, 0.05, dtype=tf.float32), name='Embedding_matrix')
    
    def process_text(self):
        # Embed input texts
        embedded_D = tf.nn.dropout(
            tf.nn.embedding_lookup(self.embedding_mtx, self.D, name='Embedded_document'), 
            self.dropout)
        embedded_Q = tf.nn.dropout(
            tf.nn.embedding_lookup(self.embedding_mtx, self.Q, name='Embedded_query'), 
            self.dropout)
        # Process Document using bi-GRU
        with tf.variable_scope('Document_processor', initializer=tf.orthogonal_initializer()):
            fwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            bwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            
            h_out, _ = tf.nn.bidirectional_dynamic_rnn(fwd_cell, bwd_cell, embedded_D, 
                                         sequence_length=self.lenD, dtype=tf.float32)
            self.h_Doc = tf.concat(h_out, 2)
        # Process Query using bi-GRU
        with tf.variable_scope('Query_processor', initializer=tf.orthogonal_initializer()):
            fwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            bwd_cell = tf.nn.rnn_cell.GRUCell(self.rnn_dim)
            
            h_out, _ = tf.nn.bidirectional_dynamic_rnn(fwd_cell, bwd_cell, embedded_Q, 
                                         sequence_length=self.lenQ, dtype=tf.float32)
            self.h_Query = tf.concat(h_out, 2)
    
    def compute_scores(self):
        # get matching scores for Document and Query
        M = tf.matmul(self.h_Doc, self.h_Query, transpose_b=True, name='Matching_scores')
        # apply softmax Document-wise
        self.alpha = softmax(M, 1, self.DQ_mask, name='Query_to_Document_attention')
        # apply softmax Query-wise, then average to get importance of each word in Query
        self.beta = tf.reduce_sum(softmax(M, 2, self.DQ_mask), 
                                  1, keep_dims=True, name='Document_to_Query_attention')
        self.beta_imp = tf.div(self.beta, tf.maximum(tf.reduce_sum(self.DQ_mask,axis=1,keep_dims=True),1))
        self.s = tf.matmul(self.alpha, self.beta_imp, transpose_b=True, name='Final_scores')
    
    def init_graph(self):
        tf.reset_default_graph()
        with self.comp_graph.as_default():
            self.init_params()
            self.process_text()
            self.compute_scores()
            # get probability that y is the answer word
            with tf.variable_scope('Aggregating_results'):
                self.p_y = tf.reduce_sum(
                    tf.reduce_sum(self.s, axis=2) * self.y, 
                    axis=1)
            # train to maximize negative log loss of the answer word
            self.loss = -tf.reduce_mean(tf.log(tf.maximum(self.p_y, 1e-12)))
            self.loss += self.l2_w * tf.nn.l2_loss(self.embedding_mtx)
            
            optimizer = tf.train.AdamOptimizer(learning_rate=self.adam_w)
            # use gradient clipping to avoid exploding
            gvs = optimizer.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -self.grad_clip, self.grad_clip), 
                           var) for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(capped_gvs)
            
            self.init_all_op = tf.global_variables_initializer()
    
    def fit(self, data, n_steps, batches_per_step, batch_size=32, valid_data=None):
        self.train_losses = []
        self.valid_losses = []
        for step in tqdm(range(n_steps)):
            temp,temp2 = 0,0
            for i in range(batches_per_step):
                sample_data = sample_batch(data, batch_size, word_to_id,
                                                 D_max_len=self.D_max, Q_max_len=self.Q_max)
                inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y] 
                _, iloss = self.sess.run([self.train_op, self.loss], 
                                         feed_dict={i:d for i,d in zip(inputs,sample_data)})
                assert not np.isinf(iloss)
                temp += iloss
            if valid_data is not None:
                sample_data = sample_batch(valid_data, 1000, word_to_id,
                                                 D_max_len=self.D_max, Q_max_len=self.Q_max)
                inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y]
                temp2 = self.sess.run(self.loss, 
                                      feed_dict={i:d for i,d in zip(inputs,sample_data)})
                
            self.train_losses.append([temp/batches_per_step])
            if step != 0 and self.valid_losses[-1] < temp2 * 0.98:
                print('valid loss has reached minimum')
                break
            self.valid_losses.append([temp2])
    
    def predict(self, data, batch_size=100):
        y_hat = [[0,0]]*len(data)
        for step in tqdm(range(0, len(data), batch_size)):
            sample_data = sample_batch(data, batch_size, word_to_id,
                                      self.D_max, self.Q_max, offset=step)
            inputs = [self.D, self.lenD, self.Q, self.lenQ, self.DQ_mask, self.y]
            
            scores = np.sum(self.sess.run(self.s,
                                   feed_dict={i:d for i,d in zip(inputs,sample_data)}),2)
            
            for i,doc in enumerate(sample_data[0]):
                ans = doc[0]
                p_ans = 0
                p_tot = 0
                for word in np.unique(doc):
                    p_word = np.sum(scores[i][np.where(doc == word)])
                    p_tot += p_word
                    if p_word > p_ans:
                        ans = word
                        p_ans = p_word
                y_hat[step + i] = [ans, p_ans]
        return np.array(y_hat)      

In [29]:
Att_Reader = Attention_over_Attention_Reader(dictionary_size)
tf.summary.FileWriter("logs", Att_Reader.comp_graph).close()

In [31]:
print('initial accuracy:')
compute_accuracy(Att_Reader, cat_valid)

  0%|          | 0/20 [00:00<?, ?it/s]

initial accuracy:


100%|██████████| 20/20 [00:56<00:00,  2.81s/it]

accuracy: 0.0





In [32]:
train_losses = []
valid_losses = []
valid_accuracies = []

# 32 * 200 * 50 = 320,000
# 32 * 100 takes ~5min, 
total_n_steps = 200
batches_per_step = 50

In [36]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

100%|██████████| 5/5 [23:55<00:00, 287.06s/it]
100%|██████████| 5/5 [20:28<00:00, 245.65s/it]
 40%|████      | 2/5 [07:29<11:14, 224.82s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-15:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 40%|████      | 2/5 [08:06<12:09, 243.31s/it]

valid loss has reached minimum





In [37]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 40%|████      | 2/5 [10:02<15:04, 301.37s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-16:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 40%|████      | 2/5 [08:36<12:55, 258.48s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

valid loss has reached minimum


100%|██████████| 5/5 [22:45<00:00, 273.09s/it]
100%|██████████| 5/5 [53:12<00:00, 638.53s/it]


In [38]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 20%|██        | 1/5 [07:15<29:03, 435.82s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A

valid loss has reached minimum



Exception in thread Thread-19:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 5/5 [33:29<00:00, 401.92s/it]
100%|██████████| 5/5 [37:48<00:00, 453.76s/it]
100%|██████████| 5/5 [37:25<00:00, 449.06s/it]


In [None]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

100%|██████████| 5/5 [37:09<00:00, 445.82s/it]
100%|██████████| 5/5 [45:20<00:00, 544.05s/it]
100%|██████████| 5/5 [52:50<00:00, 634.16s/it]
100%|██████████| 5/5 [39:18<00:00, 471.66s/it]


In [None]:
total_n_steps = 20
for steps in range(0, total_n_steps, 5):
    Att_Reader.fit(cat_train, 5, batches_per_step, valid_data=cat_valid)
    train_losses += Att_Reader.train_losses
    valid_losses += Att_Reader.valid_losses

 80%|████████  | 4/5 [33:09<08:17, 497.44s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

valid loss has reached minimum


[A
Exception in thread Thread-26:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 5/5 [46:59<00:00, 563.90s/it]
100%|██████████| 5/5 [1:03:06<00:00, 757.38s/it]
 60%|██████    | 3/5 [35:34<23:43, 711.65s/it]

valid loss has reached minimum


In [None]:
print('accuracy after 100 steps')
compute_accuracy(Att_Reader, cat_valid)




accuracy after 100 steps


  0%|          | 0/20 [00:00<?, ?it/s][A
Exception in thread Thread-28:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 20/20 [01:07<00:00,  3.37s/it]


accuracy: 0.295


In [None]:
train_losses

In [None]:
print('accuracy after 100 steps')
compute_accuracy(Att_Reader, cat_valid)

In [None]:
plt.figure(figsize=(12,8))
plt.title('Negative log loss', size=25)
plt.plot(train_losses, label='train')
plt.plot(valid_losses, label='valid')
plt.legend(loc='best',fontsize=20)
plt.show()