In [1]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

CODE = "def returnInt() -> <mask>:"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.2260766625404358,
  'token': 6979,
  'token_str': ' int',
  'sequence': 'def returnInt() -> int:'},
 {'score': 0.157838374376297,
  'token': 5053,
  'token_str': ' Any',
  'sequence': 'def returnInt() -> Any:'},
 {'score': 0.156962051987648,
  'token': 9291,
  'token_str': ' None',
  'sequence': 'def returnInt() -> None:'},
 {'score': 0.1227448433637619,
  'token': 1907,
  'token_str': ' type',
  'sequence': 'def returnInt() -> type:'},
 {'score': 0.06580957025289536,
  'token': 1666,
  'token_str': '...',
  'sequence': 'def returnInt() ->...:'}]

In [2]:
class CustomModel(torch.nn.Module):
    def __init__(self, model, d, vocabulary_size = 50265): 
        super(CustomModel, self).__init__() 
        self.d = d
        self.model = model
        self.config = model.config
        self.layer = torch.nn.Linear(vocabulary_size, d)
    
    def forward(self, input_ids=None, attention_mask=None):
        model_output = self.model.forward(input_ids=input_ids, attention_mask=attention_mask)        
        final_output_tensor = self.layer.forward(model_output[0])
        model_output.logits = final_output_tensor
        return model_output

In [3]:
nl_tokens=tokenizer.tokenize("")
code_tokens=tokenizer.tokenize("def returnInt() -> <mask>")
tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
output = model(torch.tensor(tokens_ids)[None,:])[0]
# tokens[7]
output[0][7][torch.argmax(output[0][7])]

tensor(13.4799, grad_fn=<SelectBackward0>)

In [4]:
custom_model = CustomModel(model, 8)
custom_model.forward(torch.tensor(tokens_ids)[None,:])



MaskedLMOutput(loss=None, logits=tensor([[[-1.6605, -2.1943, -2.8319, -1.1643,  2.9283, -5.6575, -3.2562,
           1.0868],
         [-3.7877, -1.1551, -2.6151, -1.6566,  2.9266, -5.8890, -2.5256,
           0.1403],
         [-1.5121, -3.2002, -2.8317, -2.6426,  4.2348, -7.1305, -0.1336,
           0.1063],
         [-3.2301, -1.1848, -0.8329, -1.4263,  3.8917, -6.7515, -0.9728,
           1.0566],
         [-2.3751, -4.1675,  0.1800,  0.6472,  2.7231, -5.8361, -3.1794,
           0.4383],
         [-3.2134, -0.7541, -2.2230, -1.3668,  2.9409, -5.4193, -1.6290,
           1.0584],
         [-3.4316, -3.1963, -2.5261, -1.8759,  1.8716, -7.3175, -0.5786,
           0.2265],
         [-2.9279, -1.3168, -1.5731, -1.6301,  2.7295, -3.8154, -2.3004,
           1.4263],
         [-2.0348, -2.7027, -1.7724, -0.8066,  3.2134, -4.1598, -2.6139,
          -0.1610]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [5]:
custom_model = CustomModel(model, 8)
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.8262478709220886,
  'token': 5,
  'token_str': ' the',
  'sequence': 'def returnInt() -> the:'},
 {'score': 0.09357747435569763,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.03982565924525261,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.033154960721731186,
  'token': 7,
  'token_str': ' to',
  'sequence': 'def returnInt() -> to:'},
 {'score': 0.006195381283760071,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'}]

In [6]:
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [7]:
from tqdm.notebook import tqdm

def tokenize_code(code):
    nl_tokens = tokenizer.tokenize("")
    code_tokens = tokenizer.tokenize(code)
    tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor(tokens_ids)[None,:]

epochs = 1
data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

for epoch in tqdm(range(epochs), desc="Epochs"):
    custom_model.train()
    running_loss = []
    for step, (t_a, t_p, t_n, anchor_label) in enumerate(data):
        
        optimizer.zero_grad()
        anchor_out = custom_model(tokenize_code(t_a))
        positive_out = custom_model(tokenize_code(t_p))
        negative_out = custom_model(tokenize_code(t_n))
        
        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.7488725185394287,
  'token': 6,
  'token_str': ',',
  'sequence': 'def returnInt() ->,:'},
 {'score': 0.2510627508163452,
  'token': 7,
  'token_str': ' to',
  'sequence': 'def returnInt() -> to:'},
 {'score': 5.411247911979444e-05,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 1.0583322364254855e-05,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'},
 {'score': 1.3184968966204727e-20,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'}]

In [71]:
import numpy as np
from annoy import AnnoyIndex
from torch.utils.data import DataLoader
import random
from collections import defaultdict
import time

input_list = [
    "def returnInt() -> <mask>: \n\t x: int = 42 \n\t return x",
    "def setInt(self, x: int) -> <mask>: \n\t self.x = x",
    "def getInt(self) -> <mask>: \n\t return self.x",
    "def concatString(self, s1: str, s2: str) -> <mask>: \n\t return s1 + s2",
    "def setStr(self, s: str) -> <mask>: \n\t self.s = s",
    "def getStr(self) -> <mask>: \n\t return self.s",
    "def isInt(self, x) -> <mask>: \n\t return x % 1 == 0",
    "def doSomething() -> <mask>: \n\t pass",
    "def isString(self, s) -> <mask>: \n\t return type(s, str)"
]

labels = [
    "lol",
    "float",
    "int",
    "str",
    "str",
    "str",
    "bool",
    "None",
    "bool"
]

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
KNN_TREE_SIZE = 20

def compute_validation_loss_dsl(inputs=input_list[:4], labels=labels[:4]):
    computed_embed_batches_train = []
    with torch.no_grad():
        for inp, label in zip(inputs, labels):
            nl_tokens = tokenizer.tokenize("")
            code_tokens = tokenizer.tokenize(inp)
            tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
            tokens_ids=tokenizer.convert_tokens_to_ids(tokens)

            output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
            
            # Select masked tokens
            masked_tokens = [c for c, token in enumerate(code_tokens) if token == "<mask>"]
            assert len(masked_tokens) == 1
            
            vals = output.logits.cpu().numpy()
            predicted_masks = [vals[0][i] for i in masked_tokens]
            
            computed_embed_batches_train.append(predicted_masks)

        annoy_index = create_knn_index(computed_embed_batches_train, None, computed_embed_batches_train[0][0].size)
    return annoy_index

def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim:int) -> AnnoyIndex:
    """
    Creates KNNs index for given type embedding vectors, taken from Type4Py
    """

    annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean')

    for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed), desc="KNN index")):
        print(v[0])
        annoy_idx.add_item(i, v[0])

    # if valid_types_embed is not None:
    #     for i, v in enumerate(valid_types_embed):
    #         annoy_idx.add_item(len(train_types_embed) + i, v)

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx

annoy_idx = compute_validation_loss_dsl()
print(annoy_idx)

KNN index:   0%|          | 0/4 [00:00<?, ?it/s]

[-25.71658    -31.977694    -2.2333093   -2.6045978    0.67572707
 -48.578865    16.938425    29.472477  ]
[-39.842625   -34.684433   -23.886139    -0.29078707   1.4387879
 -50.335125    13.408825    13.3979225 ]
[-20.764132   -31.093374    -1.7169559   -2.0099554    0.83564806
 -43.067097    10.4085      28.954763  ]
[-22.318245   -26.959398    -5.9409437   -2.5109363    0.46577013
 -42.52149     12.835083    30.072348  ]
<annoy.Annoy object at 0x7f11e2eff630>


In [72]:
def get_test_embedding(inputs=input_list[4:], labels=labels[:4]):
    with torch.no_grad():
        computed_embed_batches_test = []
        computed_embed_labels_test = []
        computed_mask_positions = []
        
        for i in range(len(labels)):
            nl_tokens = tokenizer.tokenize("")
            code_tokens = tokenizer.tokenize(inputs[i])
            
            # Select masked tokens
            masked_tokens = [c for c, token in enumerate(code_tokens) if token == "<mask>"]            
            assert len(masked_tokens) == 1
            
            tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
            tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
            
            output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
            
            vals = output.logits.cpu().numpy()
            predicted_masks = [vals[0][i] for i in masked_tokens]
            
            computed_embed_batches_test.append(predicted_masks)
        print(computed_embed_batches_test)
        return computed_embed_batches_test, labels

def predict_type_embed(types_embed_array: np.array, types_embed_labels: np.array, indexed_knn: AnnoyIndex, k: int):
    """
    Predict type of given type embedding vectors
    """

    pred_types_embed = []
    pred_types_score = []
    for i, embed_vec in enumerate(tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec[0], k, include_distances=True)
        
        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
        
        pred_types_embed.append([i for (i, s) in pred_idx_scores])
        pred_types_score.append(pred_idx_scores)
    
    return pred_types_embed, pred_types_score

def compute_types_score(types_dist: list, types_idx: list, types_embed_labels: np.array):
        types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2
        types_dist /= np.sum(types_dist)
        types_score = defaultdict(int)
        for n, d in zip(types_idx, types_dist):
            types_score[types_embed_labels[n]] += d
        
        return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], reverse=True)
    
types_embed_array, types_embed_labels = get_test_embedding()

for c, mapping in enumerate(types_embed_array):
    print(f"{mapping}, {labels[c]}")
pred_type_embed, pred_type_score = predict_type_embed(types_embed_array, types_embed_labels, annoy_idx, knn_K,)
print(pred_type_score)

[[array([-21.343218 , -30.613182 ,  -3.0923553,  -2.3816638,   0.8759284,
       -43.35853  ,  11.127215 ,  25.9648   ], dtype=float32)], [array([-18.42064  , -27.302011 ,  -2.6667924,  -1.7992612,   0.8901088,
       -44.03419  ,  11.145033 ,  18.890875 ], dtype=float32)], [array([-22.887123 , -29.524544 ,  -3.8122504,  -2.34594  ,   0.7078566,
       -43.817497 ,  15.684399 ,  27.696667 ], dtype=float32)], [array([-22.875221 , -36.807476 ,   2.7585974,  -2.0631034,   0.7342264,
       -45.711044 ,  10.870563 ,  33.997448 ], dtype=float32)]]
[array([-21.343218 , -30.613182 ,  -3.0923553,  -2.3816638,   0.8759284,
       -43.35853  ,  11.127215 ,  25.9648   ], dtype=float32)], lol
[array([-18.42064  , -27.302011 ,  -2.6667924,  -1.7992612,   0.8901088,
       -44.03419  ,  11.145033 ,  18.890875 ], dtype=float32)], float
[array([-22.887123 , -29.524544 ,  -3.8122504,  -2.34594  ,   0.7078566,
       -43.817497 ,  15.684399 ,  27.696667 ], dtype=float32)], int
[array([-22.875221 , -36.8

Finding KNNs & Prediction:   0%|          | 0/4 [00:00<?, ?it/s]

[[('int', 0.7802072427014553), ('str', 0.21979275729854467)], [('int', 0.5593838300039669), ('str', 0.44061616999603315)], [('str', 0.6059886566418033), ('int', 0.3940113433581967)], [('int', 0.5763279523356901), ('lol', 0.4236720476643099)]]
