# Import dependencies

In [1]:
import tensorflow as tf
import data_utils
from sklearn.cross_validation import train_test_split
import numpy as np
import time



# Read a dataset

In [2]:
# Load the pre-encoded corpus: X holds the encoded article texts and Y the
# encoded headlines (see the inspection cell below); word2idx / idx2word map
# between tokens and ids. The sixth returned value is discarded.
# NOTE(review): the '.pkl' extension suggests data_utils unpickles this file --
# only load trusted files; confirm against data_utils.
X, Y, word2idx, idx2word, vocab, _ = data_utils.read_data_set('data.pkl') 

# Inspect data

In [3]:
print 'First article headline - encoded:\n', Y[0]
print [idx2word[idx] for idx in Y[0]]
print '\nFirst article text - encoded:\n', X[0]
print '\nMost freq. words:\n', vocab[:50]

First article headline - encoded:
[3402, 6428, 48, 209, 1862]
['ECB', 'defends', 'England', 'tour', 'schedule']

First article text - encoded:
[24, 48, 6, 146, 458, 732, 20, 1174, 364, 651, 5, 2, 23926, 7, 2, 209, 3, 135, 1988, 17146, 7, 2, 6540, 9212, 29, 287, 275, 5, 1771, 12159, 2, 51, 7, 76, 66, 3, 66, 3401, 3402, 554, 7, 269, 8285, 386, 3270, 101, 70, 2307, 147, 23107, 45, 12, 832, 2, 39, 11647, 26, 2, 135, 632, 11219, 10907, 135, 248, 14, 98, 36, 5795, 9795, 2101, 26, 51, 1483, 3, 2, 169, 7, 22748, 135, 248, 5244, 291, 2149, 3, 287, 1872, 6, 22, 2381, 35, 2696, 13, 183, 43, 594, 6, 393, 5854, 70, 250, 3, 62, 397, 148, 292, 6, 309, 295, 1524, 210, 2, 807, 7, 2, 136, 2101, 3, 4368, 2, 5203, 18, 3157, 7207, 57, 7, 451, 1612, 70, 32, 473, 3, 22829, 4, 1145, 1081, 7, 269, 50, 2, 1642, 2239, 10622, 182, 31, 4760, 18, 5659, 5, 4, 80, 5923, 142, 5, 1509, 7, 45, 2, 269, 9, 4752, 123, 63, 86, 2328, 18, 2, 9711, 7, 174, 269, 16, 2, 3766, 6, 135, 1974, 2101, 5, 17217, 1954, 16332, 658, 7, 14

# Data preprocessing

In [4]:
# Four ids are reserved on top of the corpus vocabulary; the two highest ones
# are assigned here to the padding and decoder-start symbols.
vocab_size = len(vocab) + 4
for offset, token in ((2, '<pad>'), (1, '<go>')):
    token_id = vocab_size - offset
    word2idx[token] = token_id
    idx2word[token_id] = token

# data padding
def padding(x, y, input_len=100, headline_len=8):
    """Turn raw id sequences into fixed-length encoder chunks and decoder labels.

    Args:
        x: list of articles, each a list of token ids.
        y: list of headlines (lists of token ids), aligned with ``x``.
        input_len: length of every encoder chunk (default 100).
        headline_len: maximum number of headline tokens kept (default 8).

    Returns:
        (inputs, labels) where
        * ``inputs`` is a list of ``(chunk, article_index)`` pairs; every chunk
          is left-padded with ``<pad>`` to ``input_len`` ids and
          ``article_index`` points into ``labels``;
        * ``labels[i]`` is ``<go>`` + headline + ``<eos>`` right-padded with
          ``<pad>`` to exactly ``headline_len + 2`` ids.
    """
    go, eos, pad = word2idx['<go>'], word2idx['<eos>'], word2idx['<pad>']

    labels = []
    for headline in y:
        # Truncate over-long headlines so every label has the same length and
        # always keeps its <eos> marker.
        headline = list(headline[:headline_len])
        labels.append([go] + headline + [eos] + (headline_len - len(headline)) * [pad])

    inputs = []
    for i, article in enumerate(x):
        # Number of chunks is ceil(len / input_len). The previous
        # `len % 2 + 1` depended on parity, which dropped the tail of odd-length
        # long articles and added an all-pad chunk for even-length short ones.
        # An empty article still yields one all-pad chunk.
        n_chunks = max(1, -(-len(article) // input_len))
        for j in range(n_chunks):
            part = list(article[j * input_len:(j + 1) * input_len])
            inputs.append(((input_len - len(part)) * [pad] + part, i))
    return inputs, labels

# data splitting
# Hold out 10% of the articles for testing. A fixed random_state keeps the
# split identical across kernel restarts so results can be reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 42)

# The unsplit data is not used after this point; drop the references.
del X
del Y

# Only the training split is chunked and padded here; X_test / Y_test keep
# their raw, variable-length form.
X_train, Y_train = padding(X_train, Y_train)

# Building a model

In [5]:
input_seq_len = 100
output_seq_len = 10

# placeholders for sequences: one int32 vector of token ids per time step
encoder_inputs = [
    tf.placeholder(tf.int32, shape = [None], name = 'encoder{}'.format(step))
    for step in range(input_seq_len)
]

decoder_inputs = [
    tf.placeholder(tf.int32, shape = [None], name = 'decoder{}'.format(step))
    for step in range(output_seq_len)
]

# the target for decoder step t is the decoder input of step t+1
targets = decoder_inputs[1:]

# output projection - dim reduction
# The decoder emits output_dim-sized vectors; (w, b) maps them to vocab_size.
output_dim = 512
w_t = tf.get_variable("proj_w", [vocab_size, output_dim], dtype=tf.float32)
w = tf.transpose(w_t)
b = tf.get_variable("proj_b", [vocab_size], dtype=tf.float32)
output_projection = (w, b)

model = tf.nn.seq2seq.embedding_attention_seq2seq(
    encoder_inputs,
    decoder_inputs,
    tf.nn.rnn_cell.BasicLSTMCell(output_dim),
    num_encoder_symbols = vocab_size,
    num_decoder_symbols = vocab_size,
    embedding_size = 100,
    feed_previous = False,
    output_projection = output_projection,
    dtype = tf.float32)

# Definition of loss function

In [6]:
def sampled_loss(labels, logits):
    """Sampled-softmax loss for one decoder step.

    Args:
        labels: tensor of target token ids; reshaped to a single column.
        logits: despite the name, this is handed to the loss as its ``inputs``
            argument and is projected internally with ``w_t`` / ``b``.

    Returns:
        The tensor produced by ``tf.nn.sampled_softmax_loss`` using 128
        sampled classes out of ``vocab_size``.
    """
    label_column = tf.reshape(labels, [-1, 1])
    return tf.nn.sampled_softmax_loss(weights=w_t,
                                      biases=b,
                                      labels=label_column,
                                      inputs=logits,
                                      num_sampled=128,
                                      num_classes=vocab_size)


# Some helper functions

In [7]:
# helper function for feeding data into placeholders
def feed_dict(x, y, batch_size = 200):
    """Sample a random batch and lay it out per time step for the placeholders.

    Args:
        x: list of ``(chunk, label_index)`` pairs as produced by ``padding``.
        y: list of label sequences; ``label_index`` selects the row.
        batch_size: number of examples drawn (with replacement).

    Returns:
        dict mapping each encoder/decoder placeholder name to a 1-D array
        holding that time step's token id for every sampled example.
    """
    picks = np.random.choice(len(x), size = batch_size)
    chunks = [x[j][0] for j in picks]
    headlines = [y[x[j][1]] for j in picks]

    feed = {}
    for step in range(input_seq_len):
        feed[encoder_inputs[step].name] = np.array([chunk[step] for chunk in chunks])

    for step in range(output_seq_len):
        feed[decoder_inputs[step].name] = np.array([headline[step] for headline in headlines])

    return feed

# predicting a headline for article
def predict(sess, feed_dict):
    """Run the seq2seq graph on one batch.

    Args:
        sess: session used to evaluate the graph.
        feed_dict: placeholder feed for the encoder/decoder inputs.

    Returns:
        (outputs, labels): the first element of the evaluated ``model`` pair
        and the evaluated ``targets``, obtained in two separate runs.
    """
    outputs, _final_state = sess.run(model, feed_dict = feed_dict)
    return outputs, sess.run(targets, feed_dict = feed_dict)

# output projection
def proj(sess, output_seq):
    """Apply the output projection to every step of a decoded sequence.

    Args:
        sess: session used to evaluate the projection ops.
        output_seq: indexable of per-step decoder outputs; the first
            ``output_seq_len`` entries are used.

    Returns:
        numpy array stacking ``output_seq[t] x W + b`` for each step ``t``.
    """
    projected_ops = []
    for step in range(output_seq_len):
        projected_ops.append(tf.matmul(output_seq[step], output_projection[0]) + output_projection[1])
    return np.array(sess.run(projected_ops))
    
# decoding predicted headline
def decode_output_seq(output_seq):
    """Map per-step score vectors to words by taking the most likely id.

    Args:
        output_seq: indexable of score vectors; the first ``output_seq_len``
            entries are decoded.

    Returns:
        list of ``output_seq_len`` words looked up in ``idx2word``.
    """
    return [idx2word[np.argmax(softmax(output_seq[t]))]
            for t in range(output_seq_len)]

# decoding a label
def decode_label(label):
    """Translate a sequence of token ids into words via ``idx2word``."""
    return [idx2word[token_id] for token_id in label]

# simple softmax function
def softmax(x):
    """Softmax over all elements of ``x``.

    The global maximum is subtracted before exponentiating so large scores
    do not overflow; the result sums to 1 across the whole array.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()

# Training the model

In [8]:
steps = 10
learning_rate = 1.0
batch_size = 200

labels_tensors = []
for _ in range(output_seq_len):
    labels_tensors.append(tf.placeholder(tf.int32, shape = [None], name = 'labels{}'.format(_)))
    
logits_tensors = []
for _ in range(output_seq_len):
    logits_tensors.append(tf.placeholder(tf.float32, shape = [None, output_dim], name = 'logits{}'.format(_)))
    
# calculate a loss for a whole seq
def calculate_loss():
    loss = sampled_loss(labels_tensors[0], logits_tensors[0])
    
    for i in range(1, output_seq_len):
        loss += sampled_loss(labels_tensors[i], logits_tensors[i])
        
    return tf.reduce_mean(loss)

loss = calculate_loss()
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    
    feed = feed_dict(X_train, Y_train)
    outputs, labels = predict(sess, feed)
    # projecting only one output_seq not a whole batch - because of memory!   
    output_seq = proj(sess, [outputs[i][199].reshape(1, output_dim) for i in range(output_seq_len)])
    
    # decoding predicted headline     
    output_seq = np.reshape(output_seq, [output_seq_len, vocab_size])
    words = decode_output_seq(output_seq)
    print 'Predicted headline:'
    for word in words:
        print word,
    print '\n'
    # decoding corresponding label     
    label = [labels[i][199] for i in range(output_seq_len-1)] + [word2idx['<pad>']]
    words = decode_label(label)
    print 'Actual headline:'
    for word in words:
        print word,
    print '\n\n---------TRAINING---------\n\n'
    
    # training
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train)
        outputs, labels = predict(sess, feed)
        labels.append(np.full(shape = [batch_size], fill_value = word2idx['<pad>']))
        
        feed = {}
        for i in range(output_seq_len):
            feed[labels_tensors[i].name] = labels[i]
            feed[logits_tensors[i].name] = outputs[i]
            
        sess.run(optimizer, feed_dict = feed)
        
        if step % (steps-1) == 0 or step == 0:
            loss_value = sess.run(loss, feed_dict = feed)
            print 'step: {}, loss: {}'.format(step, loss_value)
            
    print 'Training time for {} steps:{}s'.format(steps, time.time() - t)
    

Predicted headline:
personality. personality. primary primary primary primary primary primary primary primary 

Actual headline:
England stutter to Zimbabwe win <eos> <pad> <pad> <pad> <pad> 

---------TRAINING---------


step: 0, loss: 40.7962417603
step: 9, loss: 35.7465782166
Training time for 10 steps:40.6767930984s


## I will train the model for more steps later and test it!!!