### Preprocess Data

In [67]:
%load_ext autoreload
%autoreload 2

import ast
import glob
import re
from pathlib import Path

import astor
import pandas as pd
import spacy
import tensorflow as tf

from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

from general_utils import apply_parallel, flattenlist
from lang_model_utils import tokenize_docstring, tokenize_code

tf.logging.set_verbosity(tf.logging.ERROR)

EN = spacy.load('en')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Data can be pulled from
gs://conala/conala-corpus

In [90]:
df_train = pd.read_json("./data/conala-corpus/conala-train.json")
df_train.head()

Unnamed: 0,intent,question_id,rewritten_intent,snippet
0,How to convert a list of multiple integers int...,41067960,Concatenate elements of a list 'x' of multiple...,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))"
1,How to convert a list of multiple integers int...,41067960,convert a list of integers into a single integer,"r = int(''.join(map(str, x)))"
2,how to convert a datetime string back to datet...,4170655,convert a DateTime string back to a DateTime o...,datetime.strptime('2010-11-13 10:33:54.227806'...
3,Averaging the values in a dictionary based on ...,29565452,get the average of a list values for each key ...,"[(i, sum(j) / len(j)) for i, j in list(d.items..."
4,zip lists in python,13704860,"zip two lists `[1, 2]` and `[3, 4]` into a lis...","zip([1, 2], [3, 4])"


In [94]:
%%time
import itertools
contents = Path("./data/conala-corpus/conala-mined.jsonl").read_text()
contents = contents.splitlines()
df_mined = pd.DataFrame([dict(eval(x)) for x in contents])

CPU times: user 15.2 s, sys: 532 ms, total: 15.7 s
Wall time: 15.7 s


In [97]:
df_mined.head()

Unnamed: 0,id,intent,parent_answer_post_id,prob,question_id,snippet
0,34705205_34705233_0,Sort a nested list by two elements,34705233,0.869,34705205,"sorted(l, key=lambda x: (-int(x[1]), x[0]))"
1,13905936_13905946_0,converting integer to list in python,13905946,0.85267,13905936,[int(x) for x in str(num)]
2,13837848_13838041_0,Converting byte string in unicode string,13838041,0.852143,13837848,c.decode('unicode_escape')
3,23490152_23490179_0,List of arguments with argparse,23490179,0.850829,23490152,"parser.add_argument('-t', dest='table', help='..."
4,2721782_2721807_0,How to convert a Date string to a DateTime obj...,2721807,0.840372,2721782,"datetime.datetime.strptime(s, '%Y-%m-%dT%H:%M:..."


In [103]:
def get_intent_snippet_pairs(row):
    return (' '.join(tokenize_code(row.snippet)),
            ' '.join(tokenize_docstring(
                row.rewritten_intent if 'rewritten_intent' in row and row.rewritten_intent != None else row.intent)))

In [115]:
"""Tokenize data and split into code and intent"""
pairs = df_train.apply(get_intent_snippet_pairs, axis="columns")
pairs_mined = df_mined.apply(get_intent_snippet_pairs, axis="columns")

In [119]:
train_code, train_comment = zip(*pairs_mined.append(pairs))
assert len(train_code) == len(train_comment)

In [157]:
from tokenize import tokenize
from io import BytesIO
from keras.preprocessing.text import text_to_word_sequence

pytok = lambda s : [t.string for t in tokenize(BytesIO(s.encode('utf-8')).readline)]

print(text_to_word_sequence(train_code[0]))
print(pytok(train_code[0]))

['sorted', 'l', 'key', 'lambda', 'x', 'int', 'x', '1', 'x', '0']
['utf-8', 'sorted', 'l', 'key', 'lambda', 'x', 'int', 'x', '1', 'x', '0', '']


In [158]:
"""Tokenize Code-Intent"""
from ktext.preprocess import processor
import logging
   
code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
code_proc.set_tokenizer(tok)
t_code = code_proc.fit_transform(train_code)

comment_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=14000, padding ='post')
t_comment = comment_proc.fit_transform(train_comment)

 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [159]:
""" Saving processed files to disk"""
import dill as dpickle
import numpy as np
from pathlib import Path

OUTPUT_PATH = Path('./data/conala-corpus/pickle')
OUTPUT_PATH.mkdir(exist_ok=True)

# Save the preprocessor
with open(OUTPUT_PATH/'conala_code_proc.dpkl', 'wb') as f:
    dpickle.dump(code_proc, f)

with open(OUTPUT_PATH/'conala_comment_proc.dpkl', 'wb') as f:
    dpickle.dump(comment_proc, f)

# Save the processed data
np.save(OUTPUT_PATH/'conala_t_code_vecs.npy', t_code)
np.save(OUTPUT_PATH/'conala_t_comment_vecs.npy', t_comment)

In [160]:
%reload_ext autoreload
%autoreload 2
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'conala_t_code_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'conala_t_comment_vecs.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'conala_code_proc.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'conala_comment_proc.dpkl')

Shape of encoder input: (596270, 15)
Shape of decoder input: (596270, 9)
Shape of decoder target: (596270, 9)
Size of vocabulary for data/conala-corpus/pickle/conala_code_proc.dpkl: 20,002
Size of vocabulary for data/conala-corpus/pickle/conala_comment_proc.dpkl: 12,428


### Building Model 

In [173]:
from seq2seq_utils import build_seq2seq_model
warmup_steps = 4000
initial_lr = 2.0
hidden_dim = 1000
seq2seq_Model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=hidden_dim,
                                    encoder_seq_len=encoder_seq_len,
                                    num_encoder_tokens=num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)
seq2seq_Model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 800)    9942400     Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 15)           0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 800)    3200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

In [174]:
from keras.models import Model, load_model
import pandas as pd
import logging

from keras.callbacks import CSVLogger, ModelCheckpoint
import numpy as np
from keras import optimizers

LOG_PATH = Path('./data/conala-corpus/logs')
LOG_PATH.mkdir(exist_ok=True)

seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.01), loss='sparse_categorical_crossentropy')

script_name_base = 'conala_func_'
csv_logger = CSVLogger('{:}.log'.format(script_name_base))

model_checkpoint = ModelCheckpoint(str(LOG_PATH/'{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)),
                                   save_best_only=True)

batch_size = 256
epochs = 16
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

Train on 524717 samples, validate on 71553 samples
Epoch 1/16


  '. They will not be included '


Epoch 2/16


  '. They will not be included '


Epoch 3/16


  '. They will not be included '


Epoch 4/16


  '. They will not be included '


Epoch 5/16


  '. They will not be included '


Epoch 6/16


  '. They will not be included '


Epoch 7/16


  '. They will not be included '


Epoch 8/16


  '. They will not be included '


Epoch 9/16


  '. They will not be included '


Epoch 10/16


  '. They will not be included '


Epoch 11/16


  '. They will not be included '


Epoch 12/16


  '. They will not be included '


Epoch 13/16


  '. They will not be included '


Epoch 14/16


  '. They will not be included '


Epoch 15/16


  '. They will not be included '


Epoch 16/16


  '. They will not be included '


In [175]:
df_test = pd.read_json("./data/conala-corpus/conala-test.json")
df_test.head()

Unnamed: 0,intent,question_id,rewritten_intent,snippet
0,How can I send a signal from a python program?,15080500,send a signal `signal.SIGUSR1` to the current ...,"os.kill(os.getpid(), signal.SIGUSR1)"
1,Decode Hex String in Python 3,3283984,decode a hex string '4a4b4c' to UTF-8.,bytes.fromhex('4a4b4c').decode('utf-8')
2,check if all elements in a list are identical,3844801,check if all elements in list `myList` are ide...,all(x == myList[0] for x in myList)
3,Format string dynamically,4302166,format number of spaces between strings `Pytho...,"print('%*s : %*s' % (20, 'Python', 20, 'Very G..."
4,How to convert a string from CP-1251 to UTF-8?,7555335,,d.decode('cp1251').encode('utf8')


In [176]:
from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

pairs = df_test.apply(get_intent_snippet_pairs, axis="columns")
test_code, test_comment = zip(*pairs)

seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                 decoder_preprocessor=dec_pp,
                                 seq2seq_model=seq2seq_Model)

demo_testdf = pd.DataFrame({'code':test_code, 'comment':test_comment, 'ref':''})
seq2seq_inf.demo_model_predictions(n=15, df=demo_testdf)




Original Input:
 x 1 for x in b 

Original Output:
 get reverse of list items from list ' b ' using extended slicing

****** Predicted Output ******:
 how to convert a list of tuples into a list



Original Input:
 int 

Original Output:
 function to convert strings into integers

****** Predicted Output ******:
 how to convert a list of tuples into a list



Original Input:
 os chdir owd 

Original Output:
 change working directory to the directory ` owd `

****** Predicted Output ******:
 how to get the current directory info hash in python



Original Input:
 for i in range 256 for j in range 256 ip 192 168 d d i j print ip 

Original Output:
 loop through the ip address range " 192.168.x.x "

****** Predicted Output ******:
 how to use a list of dictionaries as a table



Original Input:
 nums int x for x in intstringlist 

Original Output:
 converting list of strings ` intstringlist ` to list of integer ` nums `

****** Predicted Output ******:
 how to convert a list of tuples 

In [177]:
seq2seq_inf.evaluate_model(input_strings=test_code, 
                           output_strings=test_comment, 
                           max_len=None)



HBox(children=(IntProgress(value=0, max=500), HTML(value='')))






0.02662236674559463

### Transformer Model

Processing the data 

In [218]:
from general_utils import create_token_map, build_vocab, build_data

TRANS_PATH = Path('./data/transformer/')

tr_src_tokens = build_vocab(create_token_map(train_code), outpath=TRANS_PATH/"train_code_tokens.txt")
tr_target_tokens = build_vocab(create_token_map(train_comment), outpath=TRANS_PATH/"train_comment_tokens.txt")

tst_src_tokens = build_vocab(create_token_map(test_code), outpath=TRANS_PATH/"test_code_tokens.txt")
tst_target_tokens = build_vocab(create_token_map(test_comment), outpath=TRANS_PATH/"test_comment_tokens.txt")

In [219]:
X_train, y_train = build_data(source_data=train_code, target_data=train_comment, 
                              src_tokens=tr_src_tokens, tar_tokens=tr_target_tokens)
X_test, y_test = build_data(source_data=test_code, target_data=test_comment, 
                              src_tokens=tst_src_tokens, tar_tokens=tst_target_tokens)

In [230]:
from transformer_models.keras_trans.transformer import Transformer, LRSchedulerPerStep
from keras.optimizers import Adam
hidden_dim = 256
s2s = Transformer(tr_src_tokens, tr_target_tokens, len_limit=70, d_model=hidden_dim, d_inner_hid=512, \
    n_head=8, layers=2, dropout=0.1)


csv_logger = CSVLogger('./data/logs/trans.log')

mfile = './models/trans.model.h5'
lr_scheduler = LRSchedulerPerStep(hidden_dim, 4000) 
model_saver = ModelCheckpoint(mfile, save_best_only=True, save_weights_only=True)

s2s.compile(Adam(0.001, 0.9, 0.98, epsilon=1e-9))
s2s.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
lambda_59 (Lambda)              (None, None)         0           input_4[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
lambda_62 (Lambda)              (None, None)         0           lambda_59[0][0]                  
__________________________________________________________________________________________________
lambda_61 

In [None]:
s2s.model.fit([X_train, y_train], batch_size=batch_size, epochs=epochs,
              validation_split=0.12,
              callbacks=[lr_scheduler, model_saver, csv_logger])

Train on 5405983 samples, validate on 737180 samples
Epoch 1/16
 422656/5405983 [=>............................] - ETA: 2:41:08 - loss: 3.0299 - ppl: 610.8049 - accu: 0.3922