In [97]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

# Load the masked-LM checkpoint and its matching tokenizer from a single,
# named identifier so the two can never drift apart.
MODEL_NAME = "microsoft/codebert-base-mlm"
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForMaskedLM.from_pretrained(MODEL_NAME)

# Baseline: ask the unmodified model to fill in the masked return annotation.
CODE = "def returnInt() -> <mask>:"
fill_mask = pipeline(
    task='fill-mask',
    model=model,
    tokenizer=tokenizer,
)

outputs = fill_mask(CODE)
outputs

[{'score': 0.2260766625404358,
  'token': 6979,
  'token_str': ' int',
  'sequence': 'def returnInt() -> int:'},
 {'score': 0.157838374376297,
  'token': 5053,
  'token_str': ' Any',
  'sequence': 'def returnInt() -> Any:'},
 {'score': 0.156962051987648,
  'token': 9291,
  'token_str': ' None',
  'sequence': 'def returnInt() -> None:'},
 {'score': 0.1227448433637619,
  'token': 1907,
  'token_str': ' type',
  'sequence': 'def returnInt() -> type:'},
 {'score': 0.06580957025289536,
  'token': 1666,
  'token_str': '...',
  'sequence': 'def returnInt() ->...:'}]

In [94]:
class CustomModel(torch.nn.Module):
    """Wrap a model and project its first output down to ``d`` dimensions.

    The wrapped model is called with ``input_ids`` and ``attention_mask``; its
    first output (index 0) is passed through a linear layer of shape
    ``vocabulary_size -> d`` and the result is stored back on the output
    object as ``logits``.

    Args:
        model: The module to wrap. It must expose a ``config`` attribute and
            return an output object that supports ``output[0]`` and
            assignment to ``output.logits``.
        d: Size of the projected last dimension.
        vocabulary_size: Size of the last dimension of the wrapped model's
            first output (input width of the projection layer).
    """

    def __init__(self, model, d, vocabulary_size = 50265):
        super().__init__()
        self.d = d
        self.model = model
        # Mirror the wrapped model's config on the wrapper.
        # NOTE(review): presumably needed so helpers that read ``model.config``
        # (such as the transformers ``pipeline``) accept the wrapper - confirm.
        self.config = model.config
        self.layer = torch.nn.Linear(vocabulary_size, d)

    def forward(self, input_ids=None, attention_mask=None):
        """Run the wrapped model and replace its logits with the projection.

        Returns:
            The wrapped model's output object, with ``logits`` overwritten by
            the ``d``-dimensional projection of its first output.
        """
        # Call the modules themselves rather than ``.forward`` so that any
        # registered forward/pre-forward hooks are executed.
        model_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        projected = self.layer(model_output[0])
        model_output.logits = projected

        return model_output

In [100]:
# Assemble the input by hand: <cls>, an empty natural-language part, <sep>,
# the code tokens, and a closing <sep>.
nl_tokens = tokenizer.tokenize("")
code_tokens = tokenizer.tokenize("def returnInt() ->")
tokens = [
    tokenizer.cls_token,
    *nl_tokens,
    tokenizer.sep_token,
    *code_tokens,
    tokenizer.sep_token,
]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch dimension and inspect the shape of the model's first output.
model(torch.tensor(tokens_ids).unsqueeze(0))[0].size()

torch.Size([1, 8, 50265])

In [101]:
# Smoke-test the wrapper: the last dimension of the logits should now be 5
# (the requested projection size) instead of the vocabulary size.
custom_model = CustomModel(model, 5)
# Invoke the module itself instead of ``.forward()`` so registered hooks run.
custom_model(torch.tensor(tokens_ids)[None,:])



MaskedLMOutput(loss=None, logits=tensor([[[-3.3597, -0.5786, -0.8628, -1.6961,  4.4505],
         [-2.2103, -1.5020, -1.8677, -0.6764,  3.6895],
         [-3.9333, -0.7501, -2.3433, -1.9681,  5.3239],
         [-4.3055, -2.1125, -0.9487, -3.1717,  5.0609],
         [ 0.3511, -1.6958, -3.5025, -3.5813,  1.7566],
         [-4.7110,  0.2014, -2.3685, -3.6495,  3.1118],
         [-1.9819, -1.4330, -1.1298, -2.0145,  5.2455],
         [-1.3276, -2.7520, -1.4874, -2.5914,  2.7173]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [102]:
# Build a fresh wrapper (its projection layer is newly initialised) and plug
# it into the fill-mask pipeline in place of the original model.
custom_model = CustomModel(model, 5)
fill_mask = pipeline(
    task='fill-mask',
    model=custom_model,
    tokenizer=tokenizer,
)

# NOTE(review): the wrapper emits only 5 values per token, and the candidates
# shown below are token ids 0-4 - presumably the pipeline reads the projected
# dimensions as vocabulary logits. Confirm this is the intended experiment.
outputs = fill_mask(CODE)
outputs

[{'score': 0.6465588212013245,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.32681992650032043,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.021621134132146835,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 0.004525703378021717,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.0004744056495837867,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'}]

In [82]:
class TripletLoss(torch.nn.Module):
    """Margin-based triplet loss on squared Euclidean distances.

    For each triplet the loss is
    ``relu(dist(anchor, positive) - dist(anchor, negative) + margin)``,
    and the mean over all resulting elements is returned.

    Args:
        margin: Value added to the distance gap before the ReLU.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def calc_euclidean(self, x1, x2):
        """Return the squared difference of ``x1`` and ``x2`` summed over dim 1.

        No square root is taken, so this is the *squared* Euclidean distance
        when dim 1 is the feature dimension.
        """
        difference = x1 - x2
        return torch.pow(difference, 2).sum(dim=1)

    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        """Compute the mean triplet loss for the given anchor/positive/negative tensors."""
        pos_dist = self.calc_euclidean(anchor, positive)
        neg_dist = self.calc_euclidean(anchor, negative)
        # Hinge: only triplets whose gap is within the margin contribute.
        hinge = torch.relu(pos_dist - neg_dist + self.margin)

        return torch.mean(hinge)

In [103]:
from tqdm.notebook import tqdm

def tokenize_code(code):
    """Turn a code string into a batch of one sequence of token ids.

    The sequence is laid out as ``<cls>``, an (empty) natural-language part,
    ``<sep>``, the tokenised ``code``, ``<sep>``. Uses the notebook-level
    ``tokenizer``.

    Args:
        code: Source-code string to tokenise.

    Returns:
        A tensor of token ids with a leading batch dimension of size 1.
    """
    prefix = [tokenizer.cls_token] + tokenizer.tokenize("") + [tokenizer.sep_token]
    body = tokenizer.tokenize(code)
    pieces = prefix + body + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(pieces)
    return torch.tensor(ids).unsqueeze(0)

epochs = 1
# Each item is (anchor code, positive code, negative code, anchor label).
data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

epoch_bar = tqdm(range(epochs), desc="Epochs")
for epoch in epoch_bar:
    # Per-step loss values for the current epoch.
    running_loss = []
    for t_a, t_p, t_n, anchor_label in data:
        optimizer.zero_grad()

        # NOTE(review): tokenize_code applies no padding, so the three snippets
        # presumably need to tokenise to compatible lengths for the
        # element-wise subtraction inside TripletLoss - confirm for new data.
        anchor_out = custom_model(tokenize_code(t_a))
        positive_out = custom_model(tokenize_code(t_p))
        negative_out = custom_model(tokenize_code(t_n))

        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

        # Actually record the loss; the list was previously never filled.
        running_loss.append(loss.item())

    # Surface the mean epoch loss on the progress bar (skip if there was no data).
    if running_loss:
        epoch_bar.set_postfix(loss=sum(running_loss) / len(running_loss))

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

In [104]:
# Rebuild the fill-mask pipeline around the wrapper after the training cell
# and query it with the same masked snippet as before.
fill_mask = pipeline(
    task='fill-mask',
    model=custom_model,
    tokenizer=tokenizer,
)

outputs = fill_mask(CODE)
outputs

[{'score': 0.9998101592063904,
  'token': 3,
  'token_str': '<unk>',
  'sequence': 'def returnInt() ->:'},
 {'score': 0.00018984028429258615,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'def returnInt() ->:'},
 {'score': 7.956073300174893e-11,
  'token': 4,
  'token_str': '.',
  'sequence': 'def returnInt() ->.:'},
 {'score': 8.717174904512714e-23,
  'token': 1,
  'token_str': '<pad>',
  'sequence': 'def returnInt() ->:'},
 {'score': 4.3312271805322544e-39,
  'token': 2,
  'token_str': '</s>',
  'sequence': 'def returnInt() ->:'}]