In [98]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

CODE = "def returnInt() -> <mask>:"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.22607609629631042,
  'token': 6979,
  'token_str': ' int',
  'sequence': 'def returnInt() -> int:'},
 {'score': 0.1578385978937149,
  'token': 5053,
  'token_str': ' Any',
  'sequence': 'def returnInt() -> Any:'},
 {'score': 0.1569618135690689,
  'token': 9291,
  'token_str': ' None',
  'sequence': 'def returnInt() -> None:'},
 {'score': 0.12274535745382309,
  'token': 1907,
  'token_str': ' type',
  'sequence': 'def returnInt() -> type:'},
 {'score': 0.06580983847379684,
  'token': 1666,
  'token_str': '...',
  'sequence': 'def returnInt() ->...:'}]

In [99]:
class CustomModel(torch.nn.Module):
    def __init__(self, model, d, vocabulary_size = 50265): 
        super(CustomModel, self).__init__() 
        self.d = d
        self.model = model
        self.config = model.config
        self.layer = torch.nn.Linear(vocabulary_size, d)
    
    def forward(self, input_ids=None, attention_mask=None):
        model_output = self.model.forward(input_ids=input_ids, attention_mask=attention_mask)        
        final_output_tensor = self.layer.forward(model_output[0])
        model_output.logits = final_output_tensor
        return model_output

In [100]:
nl_tokens=tokenizer.tokenize("")
code_tokens=tokenizer.tokenize("def returnInt() -> blahblah")
tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
model(torch.tensor(tokens_ids)[None,:])[0].size()

torch.Size([1, 11, 50265])

In [101]:
custom_model = CustomModel(model, 8)
custom_model.forward(torch.tensor(tokens_ids)[None,:])



MaskedLMOutput(loss=None, logits=tensor([[[-1.5327,  1.6024, -3.1271, -1.6551, -2.2588, -0.2849,  2.3658,
           3.5348],
         [-0.8810,  0.8842, -3.3689, -1.4831, -1.7109, -0.4538,  1.2516,
           3.4641],
         [-0.1972,  1.8971, -5.9111, -0.9745, -0.7997,  0.1484,  1.2759,
           1.6533],
         [-0.4901,  1.4421, -3.8161, -1.2706, -2.7123,  2.3424,  0.4357,
           3.1755],
         [-1.8994,  1.4675, -1.9338, -3.1568, -3.7597, -1.4410,  1.3943,
           5.7186],
         [-2.5603, -0.4156, -4.5737, -2.9418, -0.9588, -2.8346,  1.6243,
           6.9443],
         [-3.2496,  2.7400, -3.1106, -5.1585, -2.7332,  0.5180,  2.1054,
           7.4531],
         [-0.2799, -0.9783, -2.5475, -2.1471, -3.5733,  3.7248, -0.3224,
           4.1424],
         [-0.1317, -0.1612, -4.8593, -2.6546,  2.1494, -0.1348, -0.2017,
           4.6845],
         [ 0.4310, -0.0933, -4.4425, -3.5494, -2.8485, -0.0183, -0.6244,
           4.3259],
         [-1.5258,  1.6011, -3.1276, 

In [102]:
custom_model = CustomModel(model, 8)
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.8491508364677429,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.061621226370334625,
  'token': 6,
  'token_str': ',',
  'sequence': 'def returnInt() ->,:'},
 {'score': 0.05688280612230301,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.027118658646941185,
  'token': 5,
  'token_str': ' the',
  'sequence': 'def returnInt() -> the:'},
 {'score': 0.0028568394482135773,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'}]

In [103]:
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [104]:
from tqdm.notebook import tqdm

def tokenize_code(code):
    nl_tokens = tokenizer.tokenize("")
    code_tokens = tokenizer.tokenize(code)
    tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor(tokens_ids)[None,:]

epochs = 1
data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

for epoch in tqdm(range(epochs), desc="Epochs"):
    custom_model.train()
    running_loss = []
    for step, (t_a, t_p, t_n, anchor_label) in enumerate(data):
        
        optimizer.zero_grad()
        anchor_out = custom_model(tokenize_code(t_a))
        positive_out = custom_model(tokenize_code(t_p))
        negative_out = custom_model(tokenize_code(t_n))
        
        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

In [105]:
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.9662728905677795,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.03372710570693016,
  'token': 5,
  'token_str': ' the',
  'sequence': 'def returnInt() -> the:'},
 {'score': 2.3076533851260636e-16,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 2.2461171363397533e-18,
  'token': 7,
  'token_str': ' to',
  'sequence': 'def returnInt() -> to:'},
 {'score': 7.931263707988504e-23,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'}]

In [106]:
import numpy as np
from annoy import AnnoyIndex
from torch.utils.data import DataLoader
import random

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
KNN_TREE_SIZE = 20

def compute_validation_loss_dsl():
    with torch.no_grad():
        # model.eval()
        computed_embed_batches_train = []
        computed_embed_labels_train = []
        # for step, (t_a, t_p, t_n, anchor_label) in enumerate(DataLoader(data)):
        output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
        computed_embed_batches_train.append(output.logits.cpu().numpy()) # 11 * 8
        # computed_embed_labels_train.append(t_a[1].logits.cpu().numpy())
             
        annoy_index = create_knn_index(np.vstack(computed_embed_batches_train), None, computed_embed_batches_train[0].shape[2])
    return annoy_index

def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim:int) -> AnnoyIndex:
    """
    Creates KNNs index for given type embedding vectors, taken from Type4Py
    """

    annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean')

    for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed), desc="KNN index")):
        annoy_idx.add_item(i, v[0])

    # if valid_types_embed is not None:
    #     for i, v in enumerate(valid_types_embed):
    #         annoy_idx.add_item(len(train_types_embed) + i, v)

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx


In [107]:
annoy_idx = compute_validation_loss_dsl()
print(annoy_idx)

KNN index:   0%|          | 0/1 [00:00<?, ?it/s]

<annoy.Annoy object at 0x0000013BA5E38A30>


In [108]:
from collections import defaultdict

input_list = [
    "def returnInt() -> <mask>: \n\t x: int = 42 \n\t return x",
    "def setInt(self, x: int) -> <mask>: \n\t self.x = x",
    "def getInt(self) -> <mask>: \n\t return self.x",
    "def concatString(self, s1: str, s2: str) -> <mask>: \n\t return s1 + s2",
    "def setStr(self, s: str) -> <mask>: \n\t self.s = s",
    "def getStr(self) -> <mask>: \n\t return self.s",
    "def isInt(self, x) -> <mask>: \n\t return x % 1 == 0",
    "def isString(self, s) -> <mask>: \n\t return type(s, str)"
]
labels = [
    "int",
    "int",
    "int",
    "str",
    "str",
    "str",
    "bool",
    "bool"
]

def get_test_embedding():
    with torch.no_grad():
        computed_embed_batches_test = []
        computed_embed_labels_test = []
        
        for i in range(8):
            nl_tokens=tokenizer.tokenize("")
            code_tokens=tokenizer.tokenize(input_list[i])
            tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
            tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
            
            output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
            computed_embed_batches_test.append(output.logits.cpu().numpy())
#             test_label = tokenizer.convert_tokens_to_ids(tokenizer(labels[i]))
#             computed_embed_labels_test.append(labels[i])
        return computed_embed_batches_test, labels

def predict_type_embed(types_embed_array: np.array, types_embed_labels: np.array, indexed_knn: AnnoyIndex, k: int):
    """
    Predict type of given type embedding vectors
    """

    pred_types_embed = []
    pred_types_score = []
    for i, embed_vec in enumerate(tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True)
        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
        pred_types_embed.append([i for (i, s) in pred_idx_scores])
        pred_types_score.append(pred_idx_scores)
    
    return pred_types_embed, pred_types_score

def compute_types_score(types_dist: list, types_idx: list, types_embed_labels: np.array):
        types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2
        types_dist /= np.sum(types_dist)
        types_score = defaultdict(int)
        for n, d in zip(types_idx, types_dist):
            types_score[types_embed_labels[n]] += d
        
        return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], reverse=True)
    
types_embed_array, types_embed_labels = get_test_embedding()
# print(types_embed_array[0])
# print(np.vstack(types_embed_array[0]))
knn_K = 8
pred_type_embed, pred_type_score = predict_type_embed(np.vstack(types_embed_array[0]), types_embed_labels, annoy_idx, knn_K)
# for i in pred_type_embed[0]:
#     print(types_embed_labels[i])
print(tokenizer.convert_ids_to_tokens(pred_type_embed[0][0]))
print(pred_type_score)

[[[-1.5693102e+01 -2.5255030e+01 -1.6359529e+01  3.6548542e+01
    3.3064899e-01  3.5709141e+01 -4.7024246e+01 -3.2363412e+00]
  [-1.6196732e+01 -2.7523052e+01 -1.2040886e+01  3.6689110e+01
    1.6002575e-01  3.5438446e+01 -4.2520947e+01 -3.1085544e+00]
  [-2.7576933e+01 -3.8618401e+01 -2.0782473e+00  3.7252720e+01
    1.0839910e-01  4.1041142e+01 -3.6700027e+01 -4.7793503e+00]
  [-2.3177340e+01 -4.6333946e+01  4.1112299e+00  3.8342468e+01
   -7.1163245e-02  4.6451714e+01 -3.1646967e+01 -3.9649751e+00]
  [-2.0026312e+01 -4.2045143e+01  3.4292309e+00  2.9566652e+01
    2.4433419e-01  4.5869267e+01 -4.1784382e+01 -4.4153147e+00]
  [-2.2947287e+01 -3.4197121e+01 -6.5307174e+00  3.5513668e+01
    3.3909369e-01  3.8166130e+01 -3.4583218e+01 -4.0050516e+00]
  [-2.3524315e+01 -3.4618809e+01 -4.5251775e+00  3.8357151e+01
   -3.8281280e-01  4.2755749e+01 -4.0301003e+01 -3.8047602e+00]
  [-1.5997654e+01 -2.5336708e+01 -1.6155655e+01  3.7271442e+01
    3.4625167e-01  3.4819809e+01 -4.5939411e+01 

Finding KNNs & Prediction:   0%|          | 0/23 [00:00<?, ?it/s]

<unk>
[[(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)], [(3, 1.0)]]
