In [1]:
import numpy as np
import tensorflow as tf
import json
import subprocess, os
from tensorflow.python.layers import core as layers_core
import matplotlib.pyplot as plt
import math
import tempfile

In [2]:
params = dict()
params['rnn_size'] = 400
params['encoder_emb'] = 400
params['decoder_emb'] = 400
params['init_weights'] = 0.35
params['dropout'] = 0.5
params['normalize'] = 1
params['beam_size'] = 10
params['saved'] = None #Use saved model
params['lr'] = 0.5
params['dataset'] = 'Theano,youtube-dl,node,angular,react,opencv,CNTK,\
bitcoin,tensorflow,caffe,elasticsearch,guava'.split(',')
params['language'] = 'python,python,javascript,javascript,javascript,cpp,cpp,cpp,cpp,cpp,java,java'.split(',')
params['layers'] = 1
params['decay'] = 0.8
params['max_grad_norm'] = 5
params['max_length'] = 20
params['batch_size'] = 100
params['max_code_length'] = 100
params['max_nl_length'] = 100
params['graph'] = 'train'

In [3]:
data_types = ['train', 'valid', 'test']
data = {}
work_dir = os.environ['WORK_DIR']
work_dir = os.path.join(work_dir,'preprocessing')
dataset_name = '_'.join(params['dataset'])
langauge_name = '_'.join(params['language'])

for kind in data_types:
    file_name = dataset_name + "." + langauge_name + "." + kind + ".npz"
    file_dir = os.path.join(work_dir,file_name)
    data[kind] = np.load(file_dir, encoding='bytes')
file_name = dataset_name + "." + langauge_name + "." + "vocab.json"
file_dir = os.path.join(work_dir,file_name)
with open(file_dir) as f:
    data['vocab'] = json.load(f)

In [66]:
params['code_vocab_size'] = data['vocab']['max_code'] + 1
params['nl_vocab_size'] = data['vocab']['max_nl'] + 1
file_name = dataset_name + "." + langauge_name + ".ref.txt"
params['dev_ref_file'] = os.path.join(work_dir,file_name)

<numpy.lib.npyio.NpzFile at 0x7f3e88514470>

In [42]:
def make_train(ids):
#     if type(ids)==list:
    x_train = np.vstack([data['train']['X'][()][params['dataset'][idx]] for idx in ids]).T
    y_train = np.vstack([data['train']['Y'][()][params['dataset'][idx]] for idx in ids]).T
    xlen_train = np.hstack([data['train']['Xlen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
    ylen_train = np.hstack([data['train']['Ylen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1) 
#     else:
#         x_train = data['train']['X'][()][params['dataset'][ids]].T
#         y_train = data['train']['Y'][()][params['dataset'][ids]].T
#         xlen_train = np.hstack((data['train']['Xlen'][()][params['dataset'][ids]])).reshape(-1)
#         ylen_train = np.hstack((data['train']['Ylen'][()][params['dataset'][ids]])).reshape(-1) 
    return x_train, y_train, xlen_train, ylen_train
def make_val_test(ids):
#     if type(ids)==list:
    x_valid = np.vstack([data['valid']['X'][()][params['dataset'][idx]] for idx in ids]).T
    y_valid = np.vstack([data['valid']['Y'][()][params['dataset'][idx]] for idx in ids]).T
    xlen_valid = np.hstack([data['valid']['Xlen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
    ylen_valid = np.hstack([data['valid']['Ylen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
    x_test = np.hstack([data['test']['X'][()][params['dataset'][idx]] for idx in ids]).T
    y_test = np.hstack([data['test']['Y'][()][params['dataset'][idx]] for idx in ids]).T
    xlen_test = np.hstack([data['test']['Xlen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
    ylen_test = np.hstack([data['test']['Ylen'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
    ids_test = np.hstack([data['test']['ids'][()][params['dataset'][idx]] for idx in ids]).reshape(-1)
#     else:
#         x_valid = data['valid']['X'][()][params['dataset'][ids]].T
#         y_valid = data['valid']['Y'][()][params['dataset'][ids]].T
#         xlen_valid = data['valid']['Xlen'][()][params['dataset'][ids]]
#         ylen_valid = data['valid']['Ylen'][()][params['dataset'][ids]]
#         x_test = data['test']['X'][()][params['dataset'][ids]].T
#         y_test = data['test']['Y'][()][params['dataset'][ids]].T
#         xlen_test = data['test']['Xlen'][()][params['dataset'][ids]]
#         ylen_test = data['test']['Ylen'][()][params['dataset'][ids]]
#         ids_test = data['test']['ids'][()][params['dataset'][ids]]
    return x_valid, y_valid, xlen_valid, ylen_valid, x_test, y_test, xlen_test, ylen_test, ids_test

In [80]:
train_datasets = [9,10]
valid_datasets = [7]

In [81]:
x_train, y_train, xlen_train, ylen_train = make_train(train_datasets)

In [82]:
x_valid, y_valid, xlen_valid, ylen_valid, x_test, y_test, xlen_test, ylen_test, ids_test = make_val_test(valid_datasets)

In [46]:
def num_to_nl(prediction):
    nl = ""
    for word in prediction:
        token = data['vocab']['num_to_nl'][str(word[0])]
        if not token in ['UNK','CODE_START','CODE_END']: 
            nl += token
            nl += " "
    if nl == "":
        return "mysql"
    return nl[:-1]

def run_bleu(predictions, ids):
    os.chdir('/home/mohamed/tf-commitgen/')
    with tempfile.NamedTemporaryFile(dir='/tmp', delete=False, mode='w') as tmp_file:
        for i in range(ids.size):
            tmp_file.write(str(ids[i]) + '\t' + predictions[i] + '\n')
        tmp_name = tmp_file.name
    bleu = subprocess.check_output(["python","./model/bleu.py",
                                    params['dev_ref_file']], stdin=open(tmp_name))
    print("BLEU:", str(bleu)[2:-3])

In [47]:
def model(X, Y, Xlen, Ylen):
    #encoder embedding and cell
    encoder_emb = tf.get_variable("encoder_emb", [params['code_vocab_size'], params['encoder_emb']])
    encoder_emb_inp = tf.nn.embedding_lookup(encoder_emb, X)
    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(params['encoder_emb'])
    #encoding
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, encoder_emb_inp, 
                                                                dtype=tf.float32,
                                                               time_major=True, sequence_length=Xlen)
    #decoder embedding and cell
    decoder_emb = tf.get_variable("decoder_emb", [params['nl_vocab_size'], params['rnn_size']])
    decoder_emb_inp = tf.nn.embedding_lookup(decoder_emb, Y)
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(params['decoder_emb'])
    #dropout and attention
    if params['graph'] == 'train':
        decoder_cell = tf.contrib.rnn.DropoutWrapper(cell=decoder_cell, input_keep_prob=params['dropout'])
    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(params['rnn_size'], attention_states,
                                                            memory_sequence_length=Xlen)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,attention_mechanism,
                                                       attention_layer_size=params['rnn_size'])
    #fix error due to LSTM state being a tuple 
    decoder_initial_state = decoder_cell.zero_state(tf.shape(Y)[1], tf.float32).clone(cell_state=encoder_state)
    
    #beam search
    if params['graph'] == 'test':
        decoder_initial_state = tf.contrib.seq2seq.tile_batch(decoder_initial_state, 
                                                              multiplier=params['beam_size'])
        decoder = tf.contrib.seq2seq.BeamSearchDecoder(decoder_cell, decoder_emb_inp, 3, 4,
                                                   decoder_initial_state, beam,
                                                   output_layer=projection_layer,
                                                   length_penalty_weight=0.0)
        outputs, _, ylen_out = tf.contrib.seq2seq.dynamic_decode(decoder)
        return outputs.predicted_ids, ylen_out
        
    #helper
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, Ylen, time_major=True)
    projection_layer = layers_core.Dense(params['nl_vocab_size'], use_bias=False)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, decoder_initial_state, output_layer=projection_layer)
    
    #decoding
    outputs, _, ylen_out = tf.contrib.seq2seq.dynamic_decode(decoder)
    return outputs.rnn_output, ylen_out

In [48]:
def run_model(session, predict, predict_lengths, loss_val,
              batch_size=100, print_every=100,
              training=None, verbose = True, plot_losses=False):
    predict = tf.argmax(predict,2)
    
    # have tensorflow compute accuracy
    accuracy=tf.constant([0])
    
    # shuffle indicies
    train_indicies = np.arange(x_train.shape[1])
    np.random.shuffle(train_indicies)
    valid_indicies = np.arange(x_valid.shape[1])

    training_now = training is not None
    
    # setting up variables we want to compute (and optimizing)
    # if we have a training function, add that to things we compute
    variables = [mean_loss,predict,predict_lengths,accuracy]
    if training_now:
        variables[-1] = training
    # counter 
    iter_cnt = 0
    val_accs = []
    e = 0
    all_losses = []
    best_acc = 0
    while True:
        e += 1
        # keep track of losses and accuracy
        accs = []
        losses = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(x_train.shape[1]/batch_size))):
            # generate indicies for the batch
            start_idx = (i*batch_size)%x_train.shape[1]
            idx = train_indicies[start_idx:start_idx+batch_size]
            # get batch size
            actual_batch_size = y_train[:,idx].shape[1]
            # create a feed dictionary for this batch
            feed_dict = {X: x_train[:,idx],
                         Y: y_train[:,idx],
                         Xlen: xlen_train[idx],
                         Ylen: ylen_train[idx]}
            params['graph'] = 'train'
            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            batch_loss, pred, pred_lengths, _ = session.run(variables,feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(batch_loss*actual_batch_size)
            epoch_accs = []
            for j,index in enumerate(idx):
                sample_length = max(ylen_train[index], pred_lengths[j])
                targets = y_train[:sample_length,index]
                acc = np.mean(np.equal(targets, 
                                       pred[:sample_length,j]))
                epoch_accs.append(acc)
            acc = np.mean(epoch_accs)
            #print every now and then
            if verbose:
                print('\r[Training] %.2f%%' %(i/(int(math.ceil(x_train.shape[1]/batch_size))-1)*100), end='')
            iter_cnt += 1
            accs.append(acc) 
        print('\r',end='')
        total_acc = np.mean(accs)
        total_loss = np.sum(losses)/x_train.shape[1] 
        
        if plot_losses:
            all_losses += losses
            plt.plot(all_losses)
            plt.grid(True)
            plt.title('Epoch {} Loss'.format(e))
            plt.xlabel('minibatch number')
            plt.ylabel('minibatch loss')
            plt.show()
            
        for i in range(int(math.ceil(x_valid.shape[1]/batch_size))):
            epoch_accs = []
            start_idx = (i*batch_size)%x_valid.shape[1]
            idx = valid_indicies[start_idx:start_idx+batch_size]
            val_feed_dict = {X: x_valid[:,idx],
                             Y: y_valid[:,idx],
                             Xlen: xlen_valid[idx],
                             Ylen: ylen_valid[idx]}
            params['graph'] = 'valid'
            pred, pred_lengths = session.run([predict,predict_lengths] , feed_dict=val_feed_dict)
            iter_val_accs = []
            for j,index in enumerate(idx):
                sample_length = max(ylen_valid[index], pred_lengths[j])

                acc = np.mean(np.equal(y_valid[:sample_length,index], 
                                       pred[:sample_length,j]))
                iter_val_accs.append(acc)
            acc = np.mean(iter_val_accs)
            epoch_accs.append(acc)
            if verbose:
                print('[Validating] %.2f%%\r' %(i/(int(math.ceil(x_valid.shape[1]/batch_size))-1)*100), flush=True, end='')
        print('\r',end='')
        acc = np.mean(epoch_accs)
        val_accs.append(acc)
        if len(val_accs) > 10 and val_accs[-5] >= val_accs[-1]:
            params['lr'] *= params['decay']
            
        print("Epoch {2}, Overall loss = {0:.3g}, training accuracy of {1:.3g}, validation accuracy of {3: .3g} and learning rate = {4: .4g}" \
              .format(total_loss,total_acc,e, val_accs[-1], params['lr']))
        
        
#         if e>3:
#             test_predictions = []
#             for i in range(x_test.shape[0]):
#                 feed_dict = {X: np.array(x_test[i]).reshape(-1,1),
#                              Y: np.array(y_test[i]).reshape(-1,1),
#                              Xlen: xlen_test[i].reshape(1),
#                              Ylen: ylen_test[i].reshape(1)}
#                 params['graph'] = 'test'
#                 pred = session.run(predict,feed_dict=feed_dict)
#                 test_predictions.append(num_to_nl(pred))
#                 if verbose:
#                     print('[Testing] %.2f%%\r' %(i/(x_test.shape[0]-1)*100), flush=True, end='')
#             print('\r',end='')
#             run_bleu(test_predictions, ids_test)
#         if e>15:
#             print(test_predictions[:20])
        
        if val_accs[-1] > best_acc:
            save_path = work_dir + '/' + ''.join([str(x) for x in train_datasets]) + '.ckpt'
            saver.save(session, save_path)
            best_acc = val_accs[-1]
            print('Model Saved (%.2f > %.2f)'%(val_accs[-1], best_acc))
        if params['lr'] < 1e-3:
            break
    return total_loss,total_acc

In [83]:
tf.reset_default_graph()
params['lr'] = 0.005
params['saved'] = False
params['graph'] = 'train'
X = tf.placeholder(tf.int32, [None, None])
Y = tf.placeholder(tf.int64, [None, None])
Xlen = tf.placeholder(tf.int32, [None])
Ylen = tf.placeholder(tf.int32, [None])
y_out, ylen_out = model(X, Y, Xlen, Ylen)
y_out = tf.transpose(y_out, [1,0,2])
targets = Y[:tf.shape(y_out)[0]]
targets = tf.one_hot(targets, params['nl_vocab_size'])
loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels = targets, logits = y_out)
mean_loss = tf.reduce_mean(loss)

trainable = tf.trainable_variables()
gradients = tf.gradients(mean_loss, trainable)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, params['max_grad_norm'])

optimizer = tf.train.AdamOptimizer(params['lr'])
train_step = optimizer.apply_gradients(zip(clipped_gradients, trainable))

saver = tf.train.Saver()

with tf.Session() as sess:
    if params['saved']:
        save_path = work_dir + '/' + ''.join([str(x) for x in train_datasets]) + '.ckpt'
        saver.restore(sess, save_path)
        print('Model Restored')
    else:
        sess.run(tf.global_variables_initializer())
    run_model(sess,y_out,ylen_out,mean_loss,params['batch_size'],100,train_step, True, False)

Epoch 1, Overall loss = 1.53, training accuracy of 0.201, validation accuracy of  0.392 and learning rate =  0.005
Model Saved
Epoch 2, Overall loss = 0.748, training accuracy of 0.513, validation accuracy of  0.59 and learning rate =  0.005
Model Saved
Epoch 3, Overall loss = 0.448, training accuracy of 0.669, validation accuracy of  0.684 and learning rate =  0.005
Model Saved
Epoch 4, Overall loss = 0.313, training accuracy of 0.753, validation accuracy of  0.729 and learning rate =  0.005
Model Saved
Epoch 5, Overall loss = 0.23, training accuracy of 0.813, validation accuracy of  0.744 and learning rate =  0.005
Model Saved
Epoch 6, Overall loss = 0.161, training accuracy of 0.859, validation accuracy of  0.767 and learning rate =  0.005
Model Saved
Epoch 7, Overall loss = 0.11, training accuracy of 0.898, validation accuracy of  0.786 and learning rate =  0.005
Model Saved
Epoch 8, Overall loss = 0.0713, training accuracy of 0.929, validation accuracy of  0.808 and learning rate 

KeyboardInterrupt: 

In [84]:
with tf.Session() as sess:
    save_path = work_dir + '/' + ''.join([str(x) for x in train_datasets]) + '.ckpt'
    saver.restore(sess, save_path)
    test_predictions = []
    n = x_test.shape[0]
    params['graph'] = 'test'
    for i in range(n):
        feed_dict = {X: np.array(x_test[i]).reshape(-1,1),
                     Y: np.array(y_test[i]).reshape(-1,1),
                     Xlen: xlen_test[i].reshape(1),
                     Ylen: ylen_test[i].reshape(1)}
        pred = sess.run(tf.argmax(y_out,2),feed_dict=feed_dict)
        test_predictions.append(num_to_nl(pred))
        print('\r%.2f%%' %(i/n*100),end='',flush=True)
    print()
    run_bleu(test_predictions, ids_test)

INFO:tensorflow:Restoring parameters from /home/mohamed/data/preprocessing/910.ckpt
99.91%
BLEU: 37.8696172544
