In [19]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Embeddings Code Diff

In [20]:
def create_code_embeddings_added_deleted(model, added_code, deleted_code):
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    
    # Tokenize the added and deleted code
    added_tokens = tokenizer.tokenize(added_code)
    deleted_tokens = tokenizer.tokenize(deleted_code)
    print(added_tokens)
    print(deleted_tokens)

    # Adding CLS token, SEP token and EOS token
    tokens = [tokenizer.cls_token]+added_tokens+[tokenizer.sep_token]+deleted_tokens+[tokenizer.eos_token]
    print(tokens)
    
    #Convert tokens to IDs
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens[1:])
    print(tokens_ids)
    
    # Create embeddings
    code_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]
    
    print(code_embeddings)

    return code_embeddings

# Embeddings Code Diff Sum

In [21]:
def create_code_embeddings_added_deleted_sum(model, added_code, deleted_code):
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    
    # Tokenize the added and deleted code
    added_tokens = tokenizer.tokenize(added_code)
    deleted_tokens = tokenizer.tokenize(deleted_code)
    print(added_tokens)
    print(deleted_tokens)

    # Adding CLS token, SEP token and EOS token
    tokens = [tokenizer.cls_token]+added_tokens+[tokenizer.sep_token]+deleted_tokens+[tokenizer.eos_token]
    print(tokens)
    
    #Convert tokens to IDs
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens[1:])
    print(tokens_ids)
    
    # Create embeddings
    code_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]
    summed_embeddings = torch.sum(code_embeddings, dim=1)
    
    print(summed_embeddings)

    return summed_embeddings

# Embeddings Task Description

In [22]:
def create_task_description_embeddings (model, task_description):
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    
    # Tokenize the task descriptions
    task_description_tokens = tokenizer.tokenize(task_description)
    print(task_description_tokens)

    # Adding CLS token, SEP token and EOS token
    tokens = [tokenizer.cls_token]+task_description_tokens+[tokenizer.eos_token]
    print(tokens)
    
    #Convert tokens to IDs
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens[1:])
    print(tokens_ids)
    
    # Create embeddings
    task_description_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]

    print(task_description_embeddings)

    return task_description_embeddings

# Embeddings Task Description Sum

In [23]:
def create_task_description_embeddings_sum (model, task_description):
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    
    # Tokenize the task descriptions
    task_description_tokens = tokenizer.tokenize(task_description)
    print(task_description_tokens)

    # Adding CLS token, SEP token and EOS token
    tokens = [tokenizer.cls_token]+task_description_tokens+[tokenizer.eos_token]
    print(tokens)
    
    #Convert tokens to IDs
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens[1:])
    print(tokens_ids)
    
    # Create embeddings
    task_description_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]
    summed_embeddings = torch.sum(task_description_embeddings, dim=1)

    print(summed_embeddings)

    return summed_embeddings

# Cosine Similarity Calculation

In [24]:
def calculate_cosine_similarity(code_embedding, task_description_embedding):

    size_code_embedding = code_embedding.size()
    size_task_description_embedding = task_description_embedding.size()

    # truncate if necessary
    if size_code_embedding.numel() < size_task_description_embedding.numel():
        task_description_embedding = task_description_embedding[:, :size_code_embedding[1], :]

    elif size_code_embedding.numel() > size_task_description_embedding.numel():
        code_embedding = code_embedding[:, :size_task_description_embedding[1], :]

    # calculate cosine similarity
    code_embedding_np = code_embedding.detach().numpy().reshape(1, -1)
    task_description_embedding_np = task_description_embedding.detach().numpy().reshape(1, -1)

    similarity = cosine_similarity(code_embedding_np, task_description_embedding_np)
    similarity_value = similarity[0, 0]
    return similarity_value

# Cosine Similarity Calculation Sum

In [25]:
def calculate_cosine_similarity_sum(code_embedding, task_description_embedding):

    # calculate cosine similarity
    code_embedding_np = code_embedding.detach().numpy().reshape(1, -1)
    task_description_embedding_np = task_description_embedding.detach().numpy().reshape(1, -1)

    similarity = cosine_similarity(code_embedding_np, task_description_embedding_np)
    similarity_value = similarity[0, 0]
    return similarity_value


# Results 

In [26]:
model = AutoModel.from_pretrained("microsoft/codebert-base")

added_code = 'ess.broadcastMessage("essentials.banip.notify", tl("playerBanIpAddress", senderName, ipAddress, banReason));'
deleted_code = 'ess.broadcastMessage("essentials.ban.notify", tl("playerBanIpAddress", senderName, ipAddress, banReason));'
task_description_correct = "Silent ban messages?"
task_description_incorrect = "Reduce permission check calls in PlayerCommandSendEvent"

code_embedding = create_code_embeddings_added_deleted(model, added_code, deleted_code)
task_description_embedding_correct = create_task_description_embeddings(model, task_description_correct)
task_description_embedding_incorrect = create_task_description_embeddings(model, task_description_incorrect)
cosine_similarity_correct = calculate_cosine_similarity(code_embedding, task_description_embedding_correct)
cosine_similarity_incorrect = calculate_cosine_similarity(code_embedding, task_description_embedding_incorrect)

print("Cosine similarity correct:", cosine_similarity_correct)
print("Cosine similarity incorrect:", cosine_similarity_incorrect)

['ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', 'ip', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));']
['ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));']
['<s>', 'ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', 'ip', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));', '</s>', 'ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));', '</s>']
[3361, 4, 32990, 5182, 423

# Results Sum

In [27]:
model = AutoModel.from_pretrained("microsoft/codebert-base")

added_code = 'ess.broadcastMessage("essentials.banip.notify", tl("playerBanIpAddress", senderName, ipAddress, banReason));'
deleted_code = 'ess.broadcastMessage("essentials.ban.notify", tl("playerBanIpAddress", senderName, ipAddress, banReason));'
task_description_correct = "Silent ban messages?"
task_description_incorrect = "Reduce permission check calls in PlayerCommandSendEvent"

code_embedding = create_code_embeddings_added_deleted_sum(model, added_code, deleted_code)
task_description_embedding_correct = create_task_description_embeddings_sum(model, task_description_correct)
task_description_embedding_incorrect = create_task_description_embeddings_sum(model, task_description_incorrect)
cosine_similarity_correct = calculate_cosine_similarity_sum(code_embedding, task_description_embedding_correct)
cosine_similarity_incorrect = calculate_cosine_similarity_sum(code_embedding, task_description_embedding_incorrect)

print("Cosine similarity (added) correct:", cosine_similarity_correct)
print("Cosine similarity (added) incorrect:", cosine_similarity_incorrect)

['ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', 'ip', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));']
['ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));']
['<s>', 'ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', 'ip', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));', '</s>', 'ess', '.', 'broad', 'cast', 'Message', '("', 'ess', 'entials', '.', 'ban', '.', 'not', 'ify', '",', 'Ġt', 'l', '("', 'player', 'Ban', 'I', 'p', 'Address', '",', 'Ġsender', 'Name', ',', 'Ġip', 'Address', ',', 'Ġban', 'Reason', '));', '</s>']
[3361, 4, 32990, 5182, 423