In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

from scipy.spatial.distance import cosine

In [2]:
# Directory handed to `from_pretrained(..., local_files_only=True)` below;
# relative, so it is resolved against the notebook's working directory.
path = "codebert-base/"

In [3]:
# Load the tokenizer first and then the model, both from the local `path`
# directory only (`local_files_only=True`).
tokenizer, model = (
    loader.from_pretrained(path, local_files_only=True)
    for loader in (AutoTokenizer, AutoModel)
)

In [8]:
def get_code_embedding(code, tokenizer, model):
    """Embed ``code`` as the model's hidden state at the first token position.

    Args:
        code: Text handed straight to ``tokenizer``.
        tokenizer: Callable invoked as ``tokenizer(code, return_tensors='pt',
            padding=True, truncation=True)``; its result is unpacked as
            keyword arguments into ``model``.
        model: Callable whose output exposes a ``last_hidden_state`` tensor.

    Returns:
        ``last_hidden_state[:, 0, :]`` — for every row, the hidden vector at
        index 0 of the second axis. The tensor is computed without autograd
        tracking, so it does not require grad.
    """
    inputs = tokenizer(code, return_tensors='pt', padding=True, truncation=True)
    # Inference only: skip building the autograd graph so embeddings do not
    # retain activations and can be converted to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]

In [18]:
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings.

    Both inputs are squeezed first, so size-1 axes are dropped before they
    are handed to SciPy's ``cosine`` distance; the similarity is one minus
    that distance.
    """
    flat_first, flat_second = embedding1.squeeze(), embedding2.squeeze()
    distance = cosine(flat_first, flat_second)
    return 1 - distance

In [28]:
def code_similarity_match(candidate_list, patch_str, tokenizer=None, model=None):
    """Rank candidate code snippets by cosine similarity to ``patch_str``.

    Args:
        candidate_list: Iterable of code strings to score.
        patch_str: Code string every candidate is compared against.
        tokenizer: Optional already-loaded tokenizer. When omitted, one is
            loaded from ``path`` with ``local_files_only=True``.
        model: Optional already-loaded model, with the same fallback.

    Returns:
        A list of ``(candidate_code, similarity)`` tuples sorted by
        similarity, highest first.
    """
    # Loading from disk is expensive; only do it when the caller did not
    # hand in objects that are already loaded.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
    if model is None:
        model = AutoModel.from_pretrained(path, local_files_only=True)

    # The target vector is loop-invariant: embed and convert it once.
    target_vector = get_code_embedding(patch_str, tokenizer, model).detach().numpy()

    code_similarities = []
    for candidate_code in candidate_list:
        candidate_vector = get_code_embedding(candidate_code, tokenizer, model).detach().numpy()
        # The helper returns a similarity (1 - cosine distance), not a distance.
        similarity = calculate_cosine_similarity(candidate_vector, target_vector)
        code_similarities.append((candidate_code, similarity))

    code_similarities.sort(key=lambda pair: pair[1], reverse=True)
    return code_similarities

In [31]:
# Patch string every candidate below is compared with.
patch_str = "int value = value % 10;"

# Candidate code snippets to rank against the patch string.
candidate_list = [
    "int value = value2 / 10;",
    "int value = value / 10;",
    "double value = value % 10;",
]

# (candidate, similarity) pairs, ordered from most to least similar.
sentence_similarities = code_similarity_match(
    candidate_list=candidate_list,
    patch_str=patch_str,
)

In [32]:
# Show the ranked (candidate, similarity) pairs, highest similarity first.
sentence_similarities

[('int value = value / 10;', 0.998833289629509),
 ('double value = value % 10;', 0.9980462673824129),
 ('int value = value2 / 10;', 0.9932692032065936)]