## 1. Comment-Intent Labeling Tool (COIN)

In [1]:
from src.comment_classifier.model import commentClassifier
from transformers import AutoTokenizer
import string
import torch

In [2]:
def coin_preprocess(tokenizer, comment):
    """Convert a raw comment string into the four input tensors passed to the COIN classifier.

    Args:
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.
        comment: the comment text.

    Returns:
        A 4-tuple of tensors, each with a leading dimension of size 1:
        * token ids of ``[CLS] + comment tokens + [SEP]``;
        * an all-ones attention mask of the same length;
        * a flag that is 1 when the comment has fewer than three
          whitespace-separated words and 0 otherwise;
        * (number of punctuation characters + number of digit characters)
          divided by the number of whitespace-separated words.
    """
    comment_tokens = tokenizer.tokenize(comment)
    cc_tokens = [tokenizer.cls_token] + comment_tokens + [tokenizer.sep_token]
    cc_ids = tokenizer.convert_tokens_to_ids(cc_tokens)
    cc_att_mask = [1] * len(cc_tokens)

    word_count = len(comment.strip().split())

    # string.punctuation and string.digits are disjoint, so counting against
    # their union equals the sum of the two separate counts.
    special_chars = set(string.punctuation) | set(string.digits)
    special_count = sum(1 for ch in comment if ch in special_chars)

    # An empty or whitespace-only comment has zero words; dividing by the word
    # count would raise ZeroDivisionError, so report a ratio of 0.0 instead.
    punc_num = special_count / word_count if word_count else 0.0

    # Despite its name, this is a 0/1 "short comment" flag, not a length.
    comment_len = 1 if word_count < 3 else 0

    return torch.tensor(cc_ids).unsqueeze(0), torch.tensor(cc_att_mask).unsqueeze(0), \
           torch.tensor(comment_len).unsqueeze(0), torch.tensor(punc_num).unsqueeze(0)

### COIN identifies the developer intent behind a given comment

In [3]:
# 1. load the well-trained COIN
coin_backbone_dir = './src/comment_classifier/pretrained_codebert'
coin_weights_path = "./src/comment_classifier//saved_model/comment_classifier.pkl"

# Build the classifier first, then restore its trained weights from disk.
classifier = commentClassifier(coin_backbone_dir, 6, 0.2)
coin_state_dict = torch.load(coin_weights_path)
classifier.load_state_dict(coin_state_dict)

# The later cells move their inputs with .cuda(), so the model lives on the GPU too.
classifier.cuda()
print("load the parameters of the pretrained classifier!")

load the parameters of the pretrained classifier!


In [4]:
# 2. input a comment and preprocess it
comment1 = 'Starts the background initialization'
comment2 = 'After the construction of a BackgroundInitializer() object it start() method has to be called .'
tokenizer = AutoTokenizer.from_pretrained('./src/comment_classifier/pretrained_codebert')

# Run both demo comments through the same preprocessing and unpack each 4-tuple.
(cc_ids1, cc_att_mask1, comment_len1, punc_num1), \
    (cc_ids2, cc_att_mask2, comment_len2, punc_num2) = (
        coin_preprocess(tokenizer, text) for text in (comment1, comment2)
    )

In [5]:
# 3. predict the intent for the comment
classifier.eval()
# Index i of this list is the label for the i-th logit column.
class_name = ['what', 'why', 'how-to-use', 'how-it-is-done', 'property', 'others']
# Inference only: disable autograd so no computation graph is built or kept
# alive for the logits.
with torch.no_grad():
    logits1 = classifier(cc_ids1.cuda(), cc_att_mask1.cuda(), comment_len1.cuda(), punc_num1.cuda())
prediction1 = class_name[int(torch.argmax(logits1, 1))]
print('comment:', comment1, '\nintent:', prediction1)

comment: Starts the background initialization 
intent: what


In [6]:
# Same prediction for the second comment. Inference only, so autograd is
# disabled and no computation graph is built.
with torch.no_grad():
    logits2 = classifier(cc_ids2.cuda(), cc_att_mask2.cuda(), comment_len2.cuda(), punc_num2.cuda())
prediction2 = class_name[int(torch.argmax(logits2, 1))]
print('comment:', comment2, '\nintent:', prediction2)

comment: After the construction of a BackgroundInitializer() object it start() method has to be called . 
intent: how-to-use


## 2. Developer-Intent Driven Comment Generator (DOME)

In [7]:
from src.comment_generator.DOME import Generator
from tokenizers import Tokenizer
import numpy as np
import torch
import json

### DOME can generate various comments that are coherent with the given intents

In [8]:
# 1. load the well-trained DOME
class Config:
    """Settings for the DOME demo: BPE tokenizer, per-intent token tables and model sizes.

    Creating an instance reads the tokenizer JSON file from disk.
    """

    def __init__(self):
        # --- tokenizer and the values derived from it ---
        self.bpe_model = './src/Application_Demo/bpe_tokenizer_all_token.json'
        self.tokenizer = Tokenizer.from_file(self.bpe_model)
        self.vocab_size = self.tokenizer.get_vocab_size()
        self.eos_token = self.tokenizer.token_to_id('[EOS]')

        # --- per-intent lookup tables, all keyed by the same five intent names ---
        # Integer id of each intent.
        self.intent2id = {
            'what': 0,
            'why': 1,
            'usage': 2,
            'done': 3,
            'property': 4,
        }
        # Token string used as the decoder's first (BOS) token for each intent.
        self.intent2bos_id = {
            'what': "[WHAT/]",
            'why': "[WHY/]",
            'usage': "[USAGE/]",
            'done': "[DONE/]",
            'property': "[PROP/]",
        }
        # Token string prepended to the code token ids for each intent.
        self.intent2cls_id = {
            'what': "[/WHAT]",
            'why': "[/WHY]",
            'usage': "[/USAGE]",
            'done': "[/DONE]",
            'property': "[/PROP]",
        }

        # --- model dimensions ---
        self.d_model = 512
        self.d_intent = 128
        self.d_ff = 2048
        self.head_num = 8
        self.enc_layer_num = 6
        self.dec_layer_num = 6

        # --- input / output size limits ---
        self.max_token_inline = 25   # tokens kept (and padded to) per code statement
        self.max_line_num = 15       # code statements kept per snippet
        self.max_comment_len = 30    # tokens kept per exemplar comment
        self.clip_dist_code = 8

        # --- remaining hyper-parameters ---
        self.intent_num = 5
        self.stat_k = 5
        self.token_k = 10
        self.beam_width = 5
        self.batch_size = 64
        self.dropout = 0.2
    
config = Config()

# Generator takes its hyper-parameters positionally; keep this order unchanged.
generator_args = (
    config.d_model,
    config.d_intent,
    config.d_ff,
    config.head_num,
    config.enc_layer_num,
    config.dec_layer_num,
    config.vocab_size,
    config.max_comment_len,
    config.clip_dist_code,
    config.eos_token,
    config.intent_num,
    config.stat_k,
    config.token_k,
    config.dropout,
    None,
)
generator = Generator(*generator_args)

# Restore the trained weights, then move the model to the GPU.
generator_weights = torch.load("./src/comment_generator/saved_model/tlcodesum/comment_generator.pkl")
generator.load_state_dict(generator_weights)
generator.cuda()
print("load the parameters of the pretrained generator!")

load the parameters of the pretrained generator!


In [9]:
# 2. load the data
# Each file holds one JSON document per line; line i of every file describes the
# same code snippet. Read them as UTF-8 rather than the platform default.
with open('./src/Application_Demo/demo_generator_dataset/raw_code.demo', 'r', encoding='utf-8') as f:
    raw_code_lines = f.readlines()
with open('./src/Application_Demo/demo_generator_dataset/code_split.demo', 'r', encoding='utf-8') as f:
    code_stat_lines = f.readlines()
with open('./src/Application_Demo/demo_generator_dataset/similar_comment.demo', 'r', encoding='utf-8') as f:
    similar_comment_lines = f.readlines()

# Loop-invariant values, looked up once instead of once per statement / record.
pad_id = config.tokenizer.token_to_id('[PAD]')
exemplar_intents = ('what', 'why', 'done', 'usage', 'property')

raw_code, input_code, code_valid_len, input_exemplar = [], [], [], []
for raw_code_line, code_stat_line, exemplar_line in zip(raw_code_lines, code_stat_lines, similar_comment_lines):
    raw_code.append(json.loads(raw_code_line.strip())['raw_code'])
    statement_line = json.loads(code_stat_line.strip())

    # Parse the exemplar record once, then encode and truncate the comment for each intent.
    exemplars = json.loads(exemplar_line.strip())
    input_exemplar.append({
        intent: config.tokenizer.encode(exemplars[intent]).ids[:config.max_comment_len]
        for intent in exemplar_intents
    })

    # Keep at most max_line_num statements; each one is truncated and right-padded
    # to exactly max_token_inline ids, then all are concatenated into one flat list.
    statements = statement_line['code'][:config.max_line_num]
    temp_code = []
    for stat in statements:
        cur_stat = config.tokenizer.encode(stat).ids[:config.max_token_inline]
        temp_code.extend(cur_stat)
        temp_code.extend([pad_id] * (config.max_token_inline - len(cur_stat)))
    input_code.append(temp_code)

    code_valid_len.append(len(statements))

In [10]:
def prediction(code, exemplar, intent, code_valid_len):
    """Generate one comment for a code snippet under the requested intent.

    Args:
        code: flat list of token ids for the snippet (one ``input_code`` entry).
        exemplar: dict mapping each intent name to the token ids of its
            exemplar comment (one ``input_exemplar`` entry).
        intent: a key of ``config.intent2id`` ('what', 'why', 'usage', 'done'
            or 'property'); any other value raises ``KeyError``.
        code_valid_len: number of statements kept for this snippet.

    Returns:
        The text obtained by decoding the first sequence returned by the generator.
    """
    input_intent = torch.tensor(config.intent2id[intent]).unsqueeze(0).cuda()
    # The intent-specific BOS token is the decoder's first input token.
    bos = torch.tensor([config.tokenizer.token_to_id(config.intent2bos_id[intent])]).unsqueeze(0).cuda()
    # The intent-specific marker token is prepended to the code token ids.
    input_code = torch.tensor([config.tokenizer.token_to_id(config.intent2cls_id[intent])] + code).unsqueeze(0).cuda()
    input_exemplar = torch.tensor(exemplar[intent]).unsqueeze(0).cuda()
    code_valid_len = torch.tensor([code_valid_len]).cuda()
    exemplar_valid_len = torch.tensor([len(exemplar[intent])]).cuda()
    generator.eval()
    # Inference only: disable autograd so no computation graph is built during generation.
    with torch.no_grad():
        pred = generator(input_code, input_exemplar, bos, code_valid_len, exemplar_valid_len, input_intent)
    pred = config.tokenizer.decode(pred[0])
    return pred

In [11]:
# 3.prediction
# (printed label, intent key) pairs, in the order they are shown for each snippet.
intent_labels = (
    ("what:", 'what'),
    ("why:", 'why'),
    ("how-it-is-done:", 'done'),
    ("usage:", 'usage'),
    ("property:", 'property'),
)

for i, code_ids in enumerate(input_code):
    print("code:\n", raw_code[i])
    for label, intent in intent_labels:
        print(label, prediction(code_ids, input_exemplar[i], intent, code_valid_len[i]))
    print("=============================================================================")

code:
 public int hashCode(){
  return value.hashCode();
}
what: generates a hash code .
why: generates code for this object .
how-it-is-done: a method that generates a hashcode based on the contents of the string representations .
usage: this method is used when this class is used as the code .
property: return a hashcode for this text attribute .
code:
 protected void writeQualifiedName(String nsAlias,String name) throws IOException {
  if (nsAlias != null && nsAlias.length() > 0) {
    writer.write(nsAlias);
    writer.write(':');
  }
  writer.write(name);
}
what: writes a qualified name to a file .
why: writes the beginning of the generated name to the given alias .
how-it-is-done: copy a qualified name , using the given class .
usage: below method will be used to write the idex file
property: returns a managed name path holding the value of the specified string .
code:
 <T>List<T> onFind(Class<T> modelClass,String[] columns,String[] conditions,String orderBy,String limit,boolean i