In [67]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

CODE = "def returnInt() -> <mask>:"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.2260781228542328,
  'token': 6979,
  'token_str': ' int',
  'sequence': 'def returnInt() -> int:'},
 {'score': 0.15783804655075073,
  'token': 5053,
  'token_str': ' Any',
  'sequence': 'def returnInt() -> Any:'},
 {'score': 0.1569615751504898,
  'token': 9291,
  'token_str': ' None',
  'sequence': 'def returnInt() -> None:'},
 {'score': 0.12274446338415146,
  'token': 1907,
  'token_str': ' type',
  'sequence': 'def returnInt() -> type:'},
 {'score': 0.06580942869186401,
  'token': 1666,
  'token_str': '...',
  'sequence': 'def returnInt() ->...:'}]

In [68]:
class CustomModel(torch.nn.Module):
    def __init__(self, model, d, vocabulary_size = 50265): 
        super(CustomModel, self).__init__() 
        self.d = d
        self.model = model
        self.config = model.config
        self.layer = torch.nn.Linear(vocabulary_size, d)
    
    def forward(self, input_ids=None, attention_mask=None):
        model_output = self.model.forward(input_ids=input_ids, attention_mask=attention_mask)        
        final_output_tensor = self.layer.forward(model_output[0])
        model_output.logits = final_output_tensor
        
        return model_output

In [69]:
nl_tokens=tokenizer.tokenize("")
code_tokens=tokenizer.tokenize("def returnInt() ->")
tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
model(torch.tensor(tokens_ids)[None,:])[0].size()

torch.Size([1, 8, 50265])

In [70]:
custom_model = CustomModel(model, 8)
custom_model.forward(torch.tensor(tokens_ids)[None,:])



MaskedLMOutput(loss=None, logits=tensor([[[ 0.9974,  1.8173,  2.9009,  0.6223, -2.3383, -2.3489, -0.2263,
           1.8593],
         [ 2.1772,  2.6819,  1.8731,  1.0957, -0.8751, -3.7013,  2.4598,
           1.5739],
         [ 3.7235,  2.1757,  3.4237,  3.7053, -1.2009, -3.2981,  2.5223,
           0.7235],
         [ 3.2575,  2.2046,  1.5974, -1.8128, -2.8777, -0.9669,  0.1867,
           0.5435],
         [ 0.8468,  3.6607,  3.4923,  1.6033, -1.0270, -4.8929,  1.9278,
           0.2467],
         [-0.5875,  4.3031,  2.4883,  1.2344,  0.3943, -0.5856,  0.0650,
           0.3482],
         [ 1.3970,  2.8812,  2.4330, -0.4276, -0.8746, -1.0335, -0.2439,
           0.5437],
         [ 3.9880,  0.5474,  2.6009, -1.1463, -2.1115, -0.9738,  1.2335,
           1.8181]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [77]:
custom_model = CustomModel(model, 8)
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.408164918422699,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.20400215685367584,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.16126449406147003,
  'token': 6,
  'token_str': ',',
  'sequence': 'def returnInt() ->,:'},
 {'score': 0.08908458799123764,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.07929743081331253,
  'token': 5,
  'token_str': ' the',
  'sequence': 'def returnInt() -> the:'}]

In [72]:
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [73]:
from tqdm.notebook import tqdm

def tokenize_code(code):
    nl_tokens = tokenizer.tokenize("")
    code_tokens = tokenizer.tokenize(code)
    tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor(tokens_ids)[None,:]

epochs = 1
data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

for epoch in tqdm(range(epochs), desc="Epochs"):
    custom_model.train()
    running_loss = []
    for step, (t_a, t_p, t_n, anchor_label) in enumerate(data):
        
        optimizer.zero_grad()
        anchor_out = custom_model(tokenize_code(t_a))
        positive_out = custom_model(tokenize_code(t_p))
        negative_out = custom_model(tokenize_code(t_n))
        
        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

In [74]:
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.9997450709342957,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.00016059390327427536,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 5.0995793571928516e-05,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 4.3344498408259824e-05,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 3.261212372200617e-15,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'}]

In [75]:
import numpy as np
from annoy import AnnoyIndex
from torch.utils.data import DataLoader
import random

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
KNN_TREE_SIZE = 20

def compute_validation_loss_dsl():
    with torch.no_grad():
        # model.eval()
        computed_embed_batches_train = []
        computed_embed_labels_train = []
        # for step, (t_a, t_p, t_n, anchor_label) in enumerate(DataLoader(data)):
        output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
        computed_embed_batches_train.append(output.logits.cpu().numpy())
        # computed_embed_labels_train.append(t_a[1].logits.cpu().numpy())
             
        annoy_index = create_knn_index(np.vstack(computed_embed_batches_train), None, computed_embed_batches_train[0].shape[1])
    return annoy_index

def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim:int) -> AnnoyIndex:
    """
    Creates KNNs index for given type embedding vectors, taken from Type4Py
    """

    annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean')

    for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed),
                          desc="KNN index")):
        for l in v:
            annoy_idx.add_item(i, l)

    # if valid_types_embed is not None:
    #     for i, v in enumerate(valid_types_embed):
    #         annoy_idx.add_item(len(train_types_embed) + i, v)

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx


In [78]:
annoy_idx = compute_validation_loss_dsl()
print(annoy_idx)

KNN index:   0%|          | 0/1 [00:00<?, ?it/s]

<annoy.Annoy object at 0x7fa5dc9d43f0>


In [87]:
from collections import defaultdict

def get_test_embedding():
    with torch.no_grad():
        nl_tokens=tokenizer.tokenize("")
        code_tokens=tokenizer.tokenize("def returnInt() ->")
        tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
        tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
        computed_embed_batches_test = []
        output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
        print(output)
        computed_embed_batches_test.append(output.logits.cpu().numpy())
        test_label = tokenizer.convert_tokens_to_ids(tokenizer("int"))
        print(test_label)
        return computed_embed_batches_test, test_label

def predict_type_embed(types_embed_array: np.array, types_embed_labels: np.array, indexed_knn: AnnoyIndex, k: int):
    """
    Predict type of given type embedding vectors
    """

    pred_types_embed = []
    pred_types_score = []
    for i, e in enumerate(tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
        for embed_vec in e:
            idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True)
            pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
            pred_types_embed.append([i for i, s in pred_idx_scores])
            pred_types_score.append(pred_idx_scores)
    
    return pred_types_embed, pred_types_score

def compute_types_score(types_dist: list, types_idx: list, types_embed_labels: np.array):
        types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2
        types_dist /= np.sum(types_dist)
        types_score = defaultdict(int)
        for n, d in zip(types_idx, types_dist):
            types_score[types_embed_labels[n]] += d
        
        return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], reverse=True)
    
types_embed_array, types_embed_labels = get_test_embedding()
pred_type_embed, pred_type_score = predict_type_embed(np.vstack(types_embed_array), types_embed_labels, annoy_idx, 8)
print(pred_type_embed)
print(pred_type_score)

MaskedLMOutput(loss=None, logits=tensor([[[ 0.7088, -3.2723,  1.4179, -2.5493, -0.4054, -0.8069,  0.2873,
           1.0417],
         [ 1.1670, -2.6667,  0.7293, -3.0402, -0.1084, -0.4715,  1.1692,
           0.5224],
         [ 0.9119, -2.8948,  0.7886, -2.1832,  0.1219, -0.0357,  0.6262,
           1.2775],
         [ 1.1808, -3.3710,  0.9670, -2.4556,  0.1107,  0.0985,  0.6259,
           1.1026],
         [ 1.1007, -3.1372,  1.5742, -2.8584,  0.1584, -0.8895,  0.8356,
           1.2035],
         [ 0.4043, -3.0784,  0.9782, -2.8364,  0.8008, -0.8753,  0.5216,
           0.9065],
         [ 0.9808, -3.5654,  1.2455, -2.2411,  0.0260, -0.9857,  0.3142,
           0.9441],
         [ 1.1272, -3.4024,  0.4743, -3.0634,  0.8844, -0.4214,  0.6487,
           0.5105]]]), hidden_states=None, attentions=None)
[3, 3]


Finding KNNs & Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[3], [3], [3], [3], [3], [3], [3], [3]]
[[(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)]]
