In [1]:
import os
import re
import pickle

import javalang

import nltk
nltk.download('punkt')
from nltk import Text
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
rq1 = './data_RQ1/'
VOCAB_SIZE = 30000

PAD_ = '<pad>'
UNK = '<unk>'
SOS = '<start>'
EOS = '<end>'

PAD_ID = 0
UNK_ID = 1
SOS_ID = 2
EOS_ID = 3

train_code_path = os.path.join(rq1, 'train/train.token.code')
train_nl_path = os.path.join(rq1,'train/train.token.nl')
valid_code_path = os.path.join(rq1, 'valid/valid.token.code')
valid_nl_path = os.path.join(rq1, 'valid/valid.token.nl')
test_code_path = os.path.join(rq1, 'test/test.token.code')
test_nl_path =  os.path.join(rq1, 'test/test.token.nl')

### 코드 전처리

In [3]:
COMMENT_RX = re.compile("(?<!:)\\/\\/.*|\\/\\*(\\s|.)*?\\*\\/", re.MULTILINE)

def process_source(code):
    code = code.replace('\n',' ').strip()
    tokens = list(javalang.tokenizer.tokenize(code))
    tks = []
    for tk in tokens:
        if tk.__class__.__name__ == 'String' or tk.__class__.__name__ == 'Character':
            tks.append('STR_')
        elif 'Integer' in tk.__class__.__name__ or 'FloatingPoint' in tk.__class__.__name__:
            tks.append('NUM_')
        elif tk.__class__.__name__ == 'Boolean':
            tks.append('BOOL_')
        else:
            tks.append(tk.value)
    return " ".join(tks)

def hump2underline(hunp_str):
    '''
    밑줄을 칠 CamelCase 문자열
    :param hunp_str: CamelCase 밑줄
    :return: 모두 소문자로 된 밑줄이 그어진 문자열
    '''
    p = re.compile(r'([a-z]|\d)([A-Z])') # 일반 일치, 소문자와 대문자의 경계 일치
    sub = re.sub(p, r'\1 \2', hunp_str).lower() # 여기서 두 번째 매개변수는 일반 그룹화된 역참조를 사용
    return sub

# code processing
def split_code(lines):
    result = []
    for line in lines:
        code = COMMENT_RX.sub('', line)
        processed_code = process_source(code)
        code_seq = ' '.join([hump2underline(i) for i in processed_code.split()])
        result.append(code_seq)
    
    return result

### vocab 만들기

In [10]:
def make_token_instance(lines):
    tokens = []
    for line in lines:
        tokens.extend(word_tokenize(str(line))) # 문장을 단어로 tokenize

    t = Text(tokens) # Token을 기반으로 정보를 담기 위한 인스턴스

    return t

# print(len(nl_t), len(set(nl_t))) # 191785개의 단어가 t 객체에 있음, 중복 제거시 58422
def make_vocab(lines, dic_type):
    t = make_token_instance(lines)
    vocab = t.vocab().most_common(VOCAB_SIZE) # 상위 vocab_size개의 단어만 보존 

    if dic_type == 'nl':
        word_to_index = {word[0] : index + 4 for index, word in enumerate(vocab)} # 각 단어에 대해 고유한 정수 부여하기(indexing)

        word_to_index['<pad>'] = 0
        word_to_index['<unk>'] = 1
        word_to_index['<start>'] = 2
        word_to_index['<end>'] = 3
    else:
        word_to_index = {word[0] : index + 2 for index, word in enumerate(vocab)} # 각 단어에 대해 고유한 정수 부여하기(indexing)

        word_to_index['<pad>'] = 0
        word_to_index['<unk>'] = 1

    # sorted_nl_dic = sorted(word_to_index.items(), key=lambda x:x[1]) 
    vocab_list = [x[0] for x in sorted(word_to_index.items(), key=lambda x:x[1])] # value 기준으로 정렬하고 키값(토큰)만 추출
    return vocab_list

# Code Vocab 만들기

In [11]:
f_name = train_nl_path

In [12]:
with open(f_name, 'r', encoding='utf-8') as f:
    code_lines = f.readlines()

print(len(code_lines)) # 문장개수

for line in code_lines[:5]:
    print(line)

445812
create native global variables from the modules the returned object can be reused for different instances of environments .

just a simple check to see if the x y pair actually fits into the pixel array .

patches the given resource and will also remove private properties if it is an external call based upon context .

to zoom out .

creates a new plot .



In [13]:
code_tokens = split_code(code_lines)
code_vocab = make_vocab(code_tokens, "code")

In [14]:
code_vocab[:10]

['<pad>', '<unk>', '.', 'the', 'a', 'to', 'of', 'this', 'and', 'is']

In [17]:
path = './data_RQ1/vocab_park/'
if not os.path.isdir(path):
    os.mkdir(path)

with open('vocab.code', 'w', encoding='utf-8') as f:
    for i in code_vocab:
        f.write(i+'\n')

# nl vocab 만들기

In [None]:
f_name = train_nl_path

In [None]:
with open(f_name, 'r', encoding='utf-8') as f:
    nl_lines = f.readlines()

print(len(nl_lines)) # 문장개수

for line in nl_lines[:5]:
    print(line)

In [None]:
nl_vocab = make_vocab(nl, "nl")

In [None]:
for i in lines[:5]:
    print(sent_tokenize(str(i)))
for i in lines[:5]:
    print(word_tokenize(str(i)))    

    

# Vocab 불러와서 문장 indexing

In [None]:
encoded = [] 
for line in tokenized: #입력 데이터에서 1줄씩 문장을 읽음 
    temp = [] for w in line: #각 줄에서 1개씩 글자를 읽음 
    try: 
        temp.append(word_to_index[w]) # 글자를 해당되는 정수로 변환 
    except KeyError: # 단어 집합에 없는 단어일 경우 unk로 대체된다. 
        temp.append(word_to_index['unk']) # unk의 인덱스로 변환 
    encoded.append(temp)

In [None]:
print(encoded[:10])


In [None]:

def process_source(file_name, save_file):
    with open(file_name, 'r', encoding='utf-8') as source:
        lines = source.readlines()
    with open(save_file, 'w+', encoding='utf-8') as save:
        for line in lines:
            code = line.strip()
            tokens = list(javalang.tokenizer.tokenize(code))
            tks = []
            for tk in tokens:
                if tk.__class__.__name__ == 'String' or tk.__class__.__name__ == 'Character':
                    tks.append('STR_')
                elif 'Integer' in tk.__class__.__name__ or 'FloatingPoint' in tk.__class__.__name__:
                    tks.append('NUM_')
                elif tk.__class__.__name__ == 'Boolean':
                    tks.append('BOOL_')
                else:
                    tks.append(tk.value)
            save.write(" ".join(tks) + '\n')

In [None]:
def initialize_vocabulary(vocabulary_path):
    """
    Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {'dog': 0, 'cat': 1}, and a reversed vocabulary ['dog', 'cat'].

    :param vocabulary_path: path to the file containing the vocabulary.
    :return:
      the vocabulary (a dictionary mapping string to integers), and
      the reversed vocabulary (a list, which reverses the vocabulary mapping).
    """
    if os.path.exists(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path) as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.rstrip('\n') for line in rev_vocab]
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return namedtuple('vocab', 'vocab reverse')(vocab, rev_vocab)
    else:
        raise ValueError("vocabulary file %s not found", vocabulary_path)

In [None]:
 def read_vocab(self):
        # don't try reading vocabulary for encoders that take pre-computed features
        self.vocabs = [
            None if binary else utils.initialize_vocabulary(vocab_path)
            for vocab_path, binary in zip(self.filenames.vocab, self.binary)
            ]
        self.src_vocab, self.trg_vocab = self.vocabs[:len(self.src_ext)], self.vocabs[len(self.src_ext):]

In [3]:
f = open('./vocabulary/nl', 'r', encoding='utf-8')
s = f.readlines()
f.close()
dic_word = {}
key = 0
for c in s:
    dic_word[key] = c.strip()
    key += 1

In [None]:
def read_dataset(paths, extensions, vocabs, max_size=None, character_level=None, sort_by_length=False,
                 max_seq_len=None, from_position=None, binary=None, use_unknown=True):
    data_set = []

    if from_position is not None:
        debug('reading from position: {}'.format(from_position))

    line_reader = read_lines_from_position(paths, from_position=from_position, binary=binary)
    character_level = character_level or {}

    positions = None

    for inputs, positions in line_reader:
        if len(data_set) > 0 and len(data_set) % 100000 == 0:
            debug("  lines read: {}".format(len(data_set)))
        lines = [
            input_ if binary_ else
            sentence_to_token_ids(input_, vocab.vocab,ext, character_level=character_level.get(ext), use_unknown=use_unknown)
            for input_, vocab, binary_, ext in zip(inputs, vocabs, binary, extensions)
        ]

        if not all(lines):  # skip empty inputs
            continue
        # skip lines that are too long
        if max_seq_len and any(len(line) > max_seq_len[ext] for line, ext in zip(lines, extensions)):
            continue

        data_set.append(lines)

        if max_size and len(data_set) >= max_size:
            break

    debug('files: {}'.format(' '.join(paths)))
    debug('lines reads: {}'.format(len(data_set)))

    if sort_by_length:
        data_set.sort(key=lambda lines: list(map(len, lines)))

    return data_set, positions

In [5]:
def sentence_to_token_ids(sentence, vocabulary, ext, character_level=False):
    """
    Convert a string to list of integers representing token-ids.

    For example, a sentence "I have a dog" may become tokenized into
    ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
    "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].

    :param sentence: a string, the sentence to convert to token-ids
    :param vocabulary: a dictionary mapping tokens to integers
    :param character_level: treat sentence as a string of characters, and
        not as a string of words
    :return: a list of integers, the token-ids for the sentence.
    """
    sentence = sentence.strip()
    sentence = sentence.rstrip('\n') if character_level else sentence.split(' ')
    if ext =='nl':
        use_unknown=True
    if use_unknown:
        return [vocabulary.get(w, UNK_ID) for w in sentence]
    else:
        tks = []
        for w in sentence:
            if w not in vocabulary:
                w = w.split('_')[0]
            tks.append(vocabulary[w])
        return tks