In [1]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

CODE = "def returnInt() -> <mask>:"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

[{'score': 0.2260781228542328,
  'token': 6979,
  'token_str': ' int',
  'sequence': 'def returnInt() -> int:'},
 {'score': 0.15783804655075073,
  'token': 5053,
  'token_str': ' Any',
  'sequence': 'def returnInt() -> Any:'},
 {'score': 0.1569615751504898,
  'token': 9291,
  'token_str': ' None',
  'sequence': 'def returnInt() -> None:'},
 {'score': 0.12274446338415146,
  'token': 1907,
  'token_str': ' type',
  'sequence': 'def returnInt() -> type:'},
 {'score': 0.06580942869186401,
  'token': 1666,
  'token_str': '...',
  'sequence': 'def returnInt() ->...:'}]

In [2]:
class CustomModel(torch.nn.Module):
    def __init__(self, model, d, vocabulary_size = 50265): 
        super(CustomModel, self).__init__() 
        self.d = d
        self.model = model
        self.config = model.config
        self.layer = torch.nn.Linear(vocabulary_size, d)
    
    def forward(self, input_ids=None, attention_mask=None):
        model_output = self.model.forward(input_ids=input_ids, attention_mask=attention_mask)        
        final_output_tensor = self.layer.forward(model_output[0])
        model_output.logits = final_output_tensor
        
        return model_output

In [3]:
nl_tokens=tokenizer.tokenize("")
code_tokens=tokenizer.tokenize("def returnInt() ->")
tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
model(torch.tensor(tokens_ids)[None,:])[0].size()

torch.Size([1, 8, 50265])

In [4]:
custom_model = CustomModel(model, 5)
custom_model.forward(torch.tensor(tokens_ids)[None,:])



MaskedLMOutput(loss=None, logits=tensor([[[-2.8276e+00,  2.4216e+00,  3.9869e+00, -1.0463e+00,  1.0917e+00],
         [-1.2637e+00,  1.0037e+00,  4.2818e+00, -1.4421e+00,  5.5309e-01],
         [-1.1521e+00,  9.6312e-01,  4.9411e+00,  1.2044e+00,  1.8036e+00],
         [-1.4185e+00,  4.0227e+00,  2.9961e+00, -4.8033e-01,  8.6555e-01],
         [-3.3025e+00,  1.0812e+00,  3.2022e+00, -3.3987e+00, -6.3526e-03],
         [-1.0949e+00,  2.0268e+00,  7.7756e+00,  9.2707e-01,  3.8056e+00],
         [ 6.8033e-01,  4.1091e+00,  4.0951e+00, -9.2743e-01,  3.2098e+00],
         [ 2.3712e-01,  1.5028e+00,  3.9121e+00, -4.0188e-01,  1.0961e-01]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [5]:
custom_model = CustomModel(model, 5)
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 0.9643492698669434,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.03478380665183067,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.0006143652135506272,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.00024761210079304874,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 4.905385594611289e-06,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'}]

In [6]:
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [7]:
from tqdm.notebook import tqdm

def tokenize_code(code):
    nl_tokens = tokenizer.tokenize("")
    code_tokens = tokenizer.tokenize(code)
    tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor(tokens_ids)[None,:]

epochs = 1
data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

for epoch in tqdm(range(epochs), desc="Epochs"):
    custom_model.train()
    running_loss = []
    for step, (t_a, t_p, t_n, anchor_label) in enumerate(data):
        
        optimizer.zero_grad()
        anchor_out = custom_model(tokenize_code(t_a))
        positive_out = custom_model(tokenize_code(t_p))
        negative_out = custom_model(tokenize_code(t_n))
        
        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

[{'score': 1.0,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'},
 {'score': 2.3913810927138346e-13,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 8.368926076229508e-14,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 1.6425211836604224e-35,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.0,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'}]

In [30]:
import numpy as np
from annoy import AnnoyIndex

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
KNN_TREE_SIZE = 20

def compute_validation_loss_dsl():
    with torch.no_grad():
        # model.eval()
        computed_embed_batches_train = []
        computed_embed_labels_train = []
        for epoch in tqdm(range(epochs), desc="Epochs"):
            for step, (t_a, t_p, t_n, anchor_label) in enumerate(data):
                output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
                computed_embed_batches_train.append(output.logits.cpu().numpy())
                # computed_embed_labels_train.append(t_a[1].logits.cpu().numpy())

        annoy_index = create_knn_index(np.vstack(computed_embed_batches_train), None, computed_embed_batches_train[0].shape[1])
    return annoy_index

def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim:int) -> AnnoyIndex:
    """
    Creates KNNs index for given type embedding vectors, taken from Type4Py
    """

    annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean')

    for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed),
                          desc="KNN index")):
        print(i)
        print(v)
        annoy_idx.add_item(i, v)

    # if valid_types_embed is not None:
    #     for i, v in enumerate(valid_types_embed):
    #         annoy_idx.add_item(len(train_types_embed) + i, v)

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx


In [31]:
annoy_idx = compute_validation_loss_dsl()
print(annoy_idx)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

KNN index:   0%|          | 0/1 [00:00<?, ?it/s]

0
[[ 26.284954 -23.429432  27.446783  56.30995  -65.48645 ]
 [ 26.281193 -23.468046  27.453445  56.31244  -65.486244]
 [ 28.097422 -34.463352  31.892488  58.57457  -72.57966 ]
 [ 28.29975  -37.697678  33.283176  59.11005  -73.90328 ]
 [ 23.934975 -33.45297   28.592031  61.48916  -71.910736]
 [ 31.411812 -34.704235  36.51779   57.64144  -71.293465]
 [ 28.166185 -39.511665  33.261097  60.465538 -74.42499 ]
 [ 26.281128 -23.468134  27.45339   56.31243  -65.48622 ]]


TypeError: only size-1 arrays can be converted to Python scalars