In [1]:
import numpy as np
from os import path
import json
import pickle
import os
import argparse
from commitgen.data import build_data, split_list, build_vocab

3.5.2 (default, Nov 23 2017, 16:37:01) 
[GCC 5.4.0 20160609]


In [2]:
work_dir = os.environ['WORK_DIR']
work_dir = os.path.join(work_dir, "preprocessing")
if not os.path.isdir(work_dir):
    os.mkdir(work_dir)

nl_max_length = 100
code_max_length = 100
code_unk_threshold = 2
nl_unk_threshold = 2
test = True
ratio = 0.8 


datasets = ['Theano']
languages = ["python"]

In [3]:
def process_data(data, test = False):
    if test:
        data = build_data(data, vocab)
        X = []
        Y = []
        Ylen = []
        Xlen = []
        ids = []
        for sample in data:
            x = sample['code_num']
            y = sample['nl_num']
            ids.append(sample['id'])
            X.append(x)
            Y.append(y)
            Xlen.append(sample['code_sizes'])
            Ylen.append(len(sample['nl_num']))
        X = np.array(X)
        Y = np.array(Y)
    else:
        data = build_data(data, vocab,
                          max_code_length=code_max_length,
                          max_nl_length=nl_max_length)
        X = np.array([])
        ids = []
        for sample in data:
            x = np.ones(code_max_length)
            np.put(x, range(sample['code_sizes']), sample['code_num'])
            nl_num = sample['nl_num']
            y = np.ones(nl_max_length)
            np.put(y, range(len(nl_num)), nl_num)
            ids.append(sample['id'])
            if X.size == 0:
                X = x
                Y = y
                Xlen = [sample['code_sizes']]
                Ylen = [len(sample['nl_num'])]
                continue
            X = np.vstack((X, x))
            Y = np.vstack((Y, y))
            Xlen.append(sample['code_sizes'])
            Ylen.append(len(sample['nl_num']))
    return X, Y, np.array(Xlen), np.array(Ylen), np.array(ids)

In [4]:
per_dataset_parsed_commits = []
all_parsed_commits = []

for dataset in datasets:
  filepath = os.path.join(work_dir, dataset + ".pickle")
  if os.path.isfile(filepath):
      with open(filepath, "rb") as f:
          parsed_commits = pickle.load(f)
      per_dataset_parsed_commits.append(parsed_commits)
      all_parsed_commits += parsed_commits
  else:
    raise IOError("Pickle file does not exist")

vocab = build_vocab(all_parsed_commits, code_unk_threshold, nl_unk_threshold)

dataset_name = "_".join(datasets)
language_name =  "_".join(languages)

# storing vocab
vocab_file_name = ".".join([dataset_name, language_name, 'vocab.json'])
with open(path.join(work_dir, vocab_file_name), 'w') as f:
    json.dump(vocab, f)

per_dataset_train = []
per_dataset_valid = []
per_dataset_test = []
all_test = []

for parsed_commits in per_dataset_parsed_commits:
    # splitting dataset
    train, valid, test = split_list(parsed_commits, generate_test=test, ratio=ratio)
    per_dataset_train.append(train)
    per_dataset_valid.append(valid)
    per_dataset_test.append(test)
    all_test += test


# generating data and saving files

data_to_save = dict()
data_to_save['X'] = dict()
data_to_save['Y'] = dict()
data_to_save['Xlen'] = dict()
data_to_save['Ylen'] = dict()
data_to_save['ids'] = dict()
for i in range(len(datasets)):
    dataset_X, dataset_Y, dataset_Xlen, dataset_Ylen, dataset_ids = process_data(per_dataset_train[i])
    data_to_save['X'][datasets[i]] = dataset_X
    data_to_save['Y'][datasets[i]] = dataset_Y
    data_to_save['Xlen'][datasets[i]] = dataset_Xlen
    data_to_save['Ylen'][datasets[i]] = dataset_Ylen

train_name = ".".join([dataset_name, language_name,"train"])
np.savez(os.path.join(work_dir, train_name), 
         **data_to_save)
print("Successfully generated train data")

data_to_save = dict()
data_to_save['X'] = dict()
data_to_save['Y'] = dict()
data_to_save['Xlen'] = dict()
data_to_save['Ylen'] = dict()
data_to_save['ids'] = dict()
for i in range(len(datasets)):
    dataset_X, dataset_Y, dataset_Xlen, dataset_Ylen, dataset_ids = process_data(per_dataset_valid[i])
    data_to_save['X'][datasets[i]] = dataset_X
    data_to_save['Y'][datasets[i]] = dataset_Y
    data_to_save['Xlen'][datasets[i]] = dataset_Xlen
    data_to_save['Ylen'][datasets[i]] = dataset_Ylen

valid_name = ".".join([dataset_name, language_name, "valid"])
np.savez(os.path.join(work_dir, valid_name), 
         **data_to_save)
print("Successfully generated valid data")



# we don't set a maximum length ONLY for test data
_, ref_data = build_data(all_test, vocab, ref=True)
ref_name = ".".join([dataset_name, language_name, "ref.txt"])
with open(os.path.join(work_dir, ref_name), 'w') as f:
    for sha, nl in ref_data:
        try:
            f.write(str(sha )+ "\t" + nl.decode('utf-8').encode('ascii', 'ignore') + "\n")
        except:
            f.write(str(sha )+ "\t" + nl + "\n")
        
data_to_save = dict()
data_to_save['X'] = dict()
data_to_save['Y'] = dict()
data_to_save['Xlen'] = dict()
data_to_save['Ylen'] = dict()
data_to_save['ids'] = dict()
for i in range(len(datasets)):
    dataset_X, dataset_Y, dataset_Xlen, dataset_Ylen, dataset_ids = process_data(per_dataset_test[i], test)
    data_to_save['X'][datasets[i]] = dataset_X
    data_to_save['Y'][datasets[i]] = dataset_Y
    data_to_save['Xlen'][datasets[i]] = dataset_Xlen
    data_to_save['Ylen'][datasets[i]] = dataset_Ylen
    data_to_save['ids'][datasets[i]] = dataset_ids

test_name = ".".join([dataset_name, language_name, "test"])
np.savez(os.path.join(work_dir, test_name), 
         **data_to_save)
print("Successfully generated test data")

Total size = 8177
Total skipped = 40
Successfully generated train data
Total size = 923
Total skipped = 3
Successfully generated valid data
Total size = 2304
Total skipped = 0
Total size = 2304
Total skipped = 0
Successfully generated test data


In [21]:
data_to_save['']['Theano'].shape

(8202, 100)