In [1]:
import torch
# Deserialize the checkpoint whose file name ends in "_label_1.pt".
# The loaded object is bound to `a`; none of the cells visible in this
# notebook read `a` afterwards.
a = torch.load(
    'code_clone_detection_java_model_train_on_30_percent_label_' + str(1) + '.pt'
)

In [2]:
import pandas as pd
# --- Notebook configuration -------------------------------------------------
root = 'data/'   # base data directory (relative to the notebook)
lang = 'java'    # language sub-directory under `root`

# 5 categories when the language is Java, 1 for anything else.
categories = 5 if lang == 'java' else 1

# Folder whose .java files are analysed by the later cells.
path = root + lang + '/demo/'

# Reference table loaded from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load a
# query_source.pkl that comes from a trusted source.
query_source = pd.read_pickle(root + lang + '/query_source.pkl')

In [3]:
import torch
from model import BatchProgramCC
from gensim.models.word2vec import Word2Vec

def get_device():
    """Pick the torch device used for the models.

    Returns:
        torch.device: ``cuda`` when CUDA is available and device 0 is not
        reported as 'GeForce GT 730'; ``cpu`` in every other case.
    """
    # The name lookup only happens when CUDA is available (short-circuit),
    # exactly like the nested-if form.
    use_cuda = (
        torch.cuda.is_available()
        and torch.cuda.get_device_name(0) != 'GeForce GT 730'
    )
    return torch.device('cuda' if use_cuda else 'cpu')

device = get_device()

# Pre-trained node embeddings; only the keyed vectors (.wv) are kept.
word2vec = Word2Vec.load(root+lang+"/train/embedding/node_w2v_128").wv
MAX_TOKENS = word2vec.vectors.shape[0]     # number of rows in the embedding matrix
EMBEDDING_DIM = word2vec.vectors.shape[1]  # number of columns in the embedding matrix
vocab = word2vec.vocab

# Hyper-parameters handed to BatchProgramCC below.
HIDDEN_DIM = 100
ENCODE_DIM = 128
LABELS = 1
EPOCHS = 10
BATCH_SIZE = 1

# One model per entry; type_str[i] is the display name for type_list[i].
type_list = [1, 2, 3, 4, 5]
type_str = ['type-1', 'type-2', 'S type-3', 'M type-3', 'type-4']

models = []
for t in type_list:
    # MAX_TOKENS+1: tree_to_index maps out-of-vocabulary tokens to index
    # MAX_TOKENS, so one extra slot beyond the vocabulary is requested.
    model = BatchProgramCC(EMBEDDING_DIM,
                           HIDDEN_DIM,
                           MAX_TOKENS+1,
                           ENCODE_DIM,
                           LABELS,
                           BATCH_SIZE,
                           device
                           )
    model.to(device)
    # map_location remaps the stored tensors onto the selected device, so a
    # checkpoint saved from CUDA tensors can still be restored when
    # get_device() falls back to the CPU.
    model.load_state_dict(
        torch.load('code_clone_detection_java_model_train_on_30_percent_label_' + str(t) + '.pt',
                   map_location=device))
    model.eval()  # inference mode
    models.append(model)

    Found GPU0 GeForce GT 730 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability that we support is 3.5.
    
GeForce GT 730 with CUDA capability sm_30 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 compute_37.
If you want to use the GeForce GT 730 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [4]:
import javalang
def parse_program(func):
    """Parse Java source text as a single member declaration.

    Args:
        func: Java source text handed to the javalang tokenizer.

    Returns:
        The tree returned by ``Parser.parse_member_declaration()``.
    """
    token_stream = javalang.tokenizer.tokenize(func)
    return javalang.parser.Parser(token_stream).parse_member_declaration()

from utils import get_blocks_v1 as func
def tree_to_index(node):
    """Convert a node and its descendants into a nested list of vocabulary indices.

    Args:
        node: object exposing ``token`` and ``children``.

    Returns:
        list: ``[index_of_node, subtree_of_child_1, subtree_of_child_2, ...]``
        where a token missing from ``vocab`` is encoded as ``MAX_TOKENS``.
    """
    token = node.token
    # Unknown tokens share the single out-of-vocabulary index MAX_TOKENS.
    own_index = vocab[token].index if token in vocab else MAX_TOKENS
    return [own_index] + [tree_to_index(child) for child in node.children]

def trans2seq(r):
    """Turn a parsed tree into a list of index trees, one per block.

    Args:
        r: tree passed to ``func`` (``utils.get_blocks_v1``), which fills the
           supplied list with blocks.

    Returns:
        list: ``tree_to_index(block)`` for every collected block, in order.
    """
    collected_blocks = []
    func(r, collected_blocks)  # appends into collected_blocks in place
    return [tree_to_index(block) for block in collected_blocks]

class vector_calculation():
    """Callable wrapper that encodes one sample with a fixed model."""

    def __init__(self, model):
        # Model exposing ``encode(list_of_samples)``.
        self.model = model

    def __call__(self, x):
        """Encode ``x`` and return the result flattened to a single row.

        Args:
            x: one sample; it is wrapped in a one-element list for ``encode``.

        Returns:
            The encoder output reshaped to ``(1, -1)`` (kept as returned by
            the model, not converted to NumPy), computed without gradients.
        """
        with torch.no_grad():
            encoded = self.model.encode([x])
            return encoded.reshape(1, -1)

class confidence_calculation():
    """Callable that scores candidate vectors against one fixed base vector.

    Args:
        model: object exposing ``predict(base_vector, vector)``.
        base_vector: value passed unchanged as the first argument of
            ``predict`` on every call.
    """

    def __init__(self, model, base_vector):
        self.model = model
        self.base_vector = base_vector

    def __call__(self, vector):
        """Return the model's confidence for ``vector`` versus the base vector.

        Args:
            vector: NumPy array; converted with ``torch.from_numpy`` and
                reshaped to a single row.

        Returns:
            Python scalar obtained via ``.cpu().item()`` on the prediction.
        """
        vector = torch.from_numpy(vector).reshape(1,-1)
        with torch.no_grad():
            # Use the model this instance was built with. Reading a
            # module-level `model` here would score every clone type with
            # whichever model that global happens to reference.
            confidence = self.model.predict(self.base_vector, vector)
        return confidence.cpu().item()

In [5]:
import os
# Names of all entries found in the demo folder.
files = os.listdir(path)

# Report skeleton; the counters and the result list are filled in by the
# cells that follow. Key order is kept because it is the order written to JSON.
result_dict = {
    'language': 'java',
    'path': path,
    'date': '2020.09.14',
    'database': 'BCB v1.0.1',
    'models': {
        type_str[0]: 'ASTNN t1 v1.3.0',
        type_str[1]: 'ASTNN t2 v1.3.0',
        type_str[2]: 'ASTNN st3 v1.3.0',
        type_str[3]: 'ASTNN mt3 v1.3.0',
        type_str[4]: 'ASTNN t4 v1.3.0',
    },
    'number of files': 0,
    'number of functions': 0,
    'clone detection result': [],
}

def get_function_name(function_ast):
    """Return the ``name`` attribute of the given parsed declaration."""
    return getattr(function_ast, 'name')

# vectors[i][j][t-1] holds the encoding of function j of file i produced by
# the model for clone type t.
vectors = []
for file in files:  # walk through every entry of the folder
    # Skip directories and anything whose name does not end in ".java"
    # (case-insensitive); only Java source files are opened.
    if os.path.isdir(path+file) or file[-5:].lower() != '.java':
        continue

    # Per-file report entry; LOC and token counts are constant placeholders.
    file_result = {
        'file name': file,
        'file path': path+file,
        'LOC': 100,
        'tokens num': 1000,
        'function num': 0,
        'file result': [],
    }

    with open(path+'/'+file,'r') as f:
        code = f.read()

    # Each file is parsed as one member declaration, so it yields one function.
    code = parse_program(code)
    function_name = get_function_name(code)
    code = trans2seq(code)

    # Per-function report entry; LOC and token counts are constant placeholders.
    function_result = {
        'function name': function_name,
        'LOC': 10,
        'tokens num': 100,
        'function result': [],
    }

    # One encoding per clone type, in type_list order.
    type_vectors = [vector_calculation(models[t-1])(code) for t in type_list]
    function_vectors = [type_vectors]

    function_count = len(function_vectors)
    file_result['function num'] = function_count
    result_dict['number of functions'] += function_count
    file_result['file result'].append(function_result)
    vectors.append(function_vectors)

    result_dict['clone detection result'].append(file_result)

result_dict['number of files'] = len(vectors)

In [6]:
k = 2      # number of best matches kept per clone type (top-k)
digit = 3  # number of decimal places kept for the confidence

for i, function_vectors in enumerate(vectors):  # one entry per file
    file_result = result_dict['clone detection result'][i]
    for j, type_vectors in enumerate(function_vectors):  # one entry per function
        function_result = file_result['file result'][j]
        for t in type_list:  # one pass per clone type
            result_key = type_str[t-1]+' result'
            confi_cal = confidence_calculation(models[t-1], type_vectors[t-1])

            # Score every stored vector of this type, then keep the k rows
            # with the highest confidence. The 'confidence' column is
            # overwritten on each pass.
            query_source['confidence'] = query_source['vector-'+str(t)].apply(confi_cal)
            best_match = query_source.nlargest(k, ['confidence'])

            # LOC and token counts are constant placeholders.
            matches = [
                {
                    'function id': idex,
                    'LOC': 10,
                    'tokens num': 100,
                    'confidence': round(item['confidence'], digit),  # rounded to `digit` decimals
                }
                for idex, item in best_match.iterrows()
            ]
            type_result = {result_key: matches}
            function_result['function result'].append(type_result)

In [7]:
import json

# Serialise the report once (insertion order kept, 4-space indent), show it,
# and write the same text to result.json inside the demo folder.
result_str = json.dumps(result_dict, indent=4, separators=(',', ': '), sort_keys=False)
print(result_str)

with open(path+'result.json', 'w') as f:
    f.write(result_str)

{
    "language": "java",
    "path": "data/java/demo/",
    "date": "2020.09.14",
    "database": "BCB v1.0.1",
    "models": {
        "type-1": "ASTNN t1 v1.3.0",
        "type-2": "ASTNN t2 v1.3.0",
        "S type-3": "ASTNN st3 v1.3.0",
        "M type-3": "ASTNN mt3 v1.3.0",
        "type-4": "ASTNN t4 v1.3.0"
    },
    "number of files": 10,
    "number of functions": 10,
    "clone detection result": [
        {
            "file name": "code_example_1044656.java",
            "file path": "data/java/demo/code_example_1044656.java",
            "LOC": 100,
            "tokens num": 1000,
            "function num": 1,
            "file result": [
                {
                    "function name": "open",
                    "LOC": 10,
                    "tokens num": 100,
                    "function result": [
                        {
                            "type-1 result": [
                                {
                                    "function id": 194