In [74]:
from torch.utils.data.dataloader import DataLoader
from datasets import load_dataset
from transformers import RobertaTokenizerFast, RobertaModel

# Uncomment if you want to download the full dataset from hugging face
#dataset = load_dataset ( ' kevinjesse /ManyTypes4TypeScript ')

#load the small selected local dataset using the py script 
dataset = load_dataset('ManyTypes4TypeScript.py', ignore_verifications=True)

#fast tokenizer for roberta - please stick to the fast one or expect bugs and slowdown
tokenizer = RobertaTokenizerFast.from_pretrained("microsoft/codebert-base", add_prefix_space=True)

model = RobertaModel.from_pretrained("microsoft/codebert-base")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset many_types4_type_script (/home/user/.cache/huggingface/datasets/many_types4_type_script/ManyTypes4TypeScript/1.0.0/f87845becfdb639f5c328d25ec0bba30e959da6024bdbe0575b34d62aa7f188d)


  0%|          | 0/3 [00:00<?, ?it/s]

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [76]:
model.config.to_dict()['hidden_size']

768

In [72]:
def generate_label_masks(labels):
    """
    Generates the derived masked label list and masks from labels

    :labels: a list containg the labels ([0 1 0 0 0 1])

    :return: 
        :label_list: a list of the labels with one of them being masked (turn into tokenizer.mask_token_id 
        [0 100 0 0 0 1], [[0 1 0 0 0 100]]

        :mask_list: a list of the masked labels 
        ([1, 1])
    """ 
    label_list = []
    mask_list = []
    for i in range(len(labels)):
        # if the label is None, we don't need to generate masks
        if labels[i] is None:
                continue
        # else create a copy where you mask, then pass it and the original value
        copy_label = labels.copy()
        copy_label[i] = tokenizer.mask_token_id
        label_list.append(copy_label)
        mask_list.append(labels[i])
    return label_list, mask_list

# Doesn't work due to ValueError: Class label 50264 greater than configured num_classes 50001
# Would work if you change 'labels' to something else, for example, masked labels
# tries to generate the mask before the tokenization
def map_dataset(inputs_):
    """
    :inputs_: partition of the dataset, for example, train.
    It's a dictionary with features: ['id', 'tokens', 'labels']

    :return: 
        :inputs_new: the modified partition, which now includes the added 'masked_label' field
    """ 

    # create new partition which includes masked_label field
    inputs_new = {'id': [], 'tokens': [], "labels": [], 'masked_label': []}

    for i in range(len(inputs_['labels'])):
        label_list, mask_list = generate_label_masks( inputs_['labels'][i])

        # add a record to the new partition, each record includes:
        # 1) the original id
        # 2) the original list of tokens
        # 3) the new labels list where one of the labels is masked
        # 4) the original value of the masked label
        for j in range(len(label_list)):
            inputs_new['id'].append(inputs_['id'][i])
            inputs_new['tokens'].append(inputs_['tokens'][i])
            inputs_new['labels'].append(label_list[j])
            inputs_new['masked_label'].append(mask_list[j])
    return inputs_new

maped_ds = dataset.map(map_dataset, batched=True)
    

  0%|          | 0/1 [00:00<?, ?ba/s]

ValueError: Class label 50264 greater than configured num_classes 50001

In [None]:
def tokenize_and_align_labels(examples):
    def divide_chunks(l1, l2, n):
        for i in range(0, len(l1), n):
            yield {'input_ids': [0] + l1[i:i + n] + [2], 'labels': [-100] + l2[i:i + n] + [-100]}

    window_size = 510
    tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=False,
                                    add_special_tokens=False)
    inputs_ = {'input_ids': [], 'labels': []}

    for encoding, label in zip(tokenized_inputs.encodings, examples['labels']):
        word_ids = encoding.word_ids  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                l = label[word_idx] if label[word_idx] is not None else -100
                label_ids.append(l)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        s_labels = set(label_ids)
        if len(s_labels) == 1 and list(s_labels)[0] == -100:
            continue
        for e in divide_chunks(encoding.ids, label_ids, window_size):
            for k, v in e.items():
                inputs_[k].append(v)

    inputs_new = {'input_ids': [], 'm_labels': [], "masks": []}

    for i in range(len(inputs_['labels'])):
        if len(inputs_['input_ids'][i]) != 512:
            continue    
        for j in range(len(inputs_['labels'][i])):
            if inputs_['labels'][i][j]==-100:
                continue
            copy_label = inputs_['labels'][i].copy()
            copy_label[j] = tokenizer.mask_token_id
            inputs_new['input_ids'].append(inputs_['input_ids'][i])
            inputs_new['m_labels'].append(copy_label)
            inputs_new['masks'].append(inputs_['labels'][i][j])
    return inputs_new

tokenized_hf = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['id', 'tokens', 'labels'])

In [None]:
[len(e) for e in tokenized_hf['train']['input_ids'] if len(e) != 512]

#print(tokenized_hf)

#print(tokenized_hf2['validation'][0])

In [None]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

#model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
#tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

CODE = "def returnInt() -> <mask>:"
CODE1 = "def returnBool() + <mask> 0"
CODE2 = "<mask>"
CODE3 = "mask"
print(tokenizer(CODE))
print(tokenizer(CODE1))
print(tokenizer(CODE2))
print(tokenizer(CODE3))

# fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# outputs = fill_mask(CODE)
# outputs

In [None]:
nl_tokens=tokenizer.tokenize("")
code_tokens=tokenizer.tokenize("def returnInt() -> <mask>")
tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
output = model(torch.tensor(tokens_ids)[None,:])[0]
# tokens[7]
output[0][7][torch.argmax(output[0][7])]

In [None]:
custom_model = CustomModel(model, 8)
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

In [None]:
input_list = [
    "def returnInt() -> <mask>: \n\t x: int = 42 \n\t return x",
    "def setInt(self, x: int) -> <mask>: \n\t self.x = x",
    "def getInt(self) -> <mask>: \n\t return self.x",
    "def concatString(self, s1: str, s2: str) -> <mask>: \n\t return s1 + s2",
    "def setStr(self, s: str) -> <mask>: \n\t self.s = s",
    "def getStr(self) -> <mask>: \n\t return self.s",
    "def isInt(self, x) -> <mask>: \n\t return x % 1 == 0",
    "def doSomething() -> <mask>: \n\t pass",
    "def isString(self, s) -> <mask>: \n\t return type(s, str)"
]

labels = [
    "int",
    "None",
    "int",
    "str",
    "None",
    "str",
    "bool",
    "None",
    "bool"
]

In [88]:
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(0)
    
    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [89]:
from torch.utils.data import TensorDataset
from typing import Tuple
import torch

class TripletDataset(torch.utils.data.Dataset):

    def __init__(self, *in_sequences: torch.Tensor, m_labels: torch.Tensor, labels: torch.Tensor, dataset_name: str,
                 train_mode: bool=True):
        self.data = TensorDataset(*in_sequences)
        self.m_labels = m_labels
        self.labels = labels
        self.dataset_name = dataset_name
        self.train_mode = train_mode

        self.get_item_func = self.get_item_train if self.train_mode else self.get_item_test

    def get_item_train(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor],
                                         Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """
        It returns three tuples. Each one is a (data, label)
         - The first tuple is (data, label) at the given index
         - The second tuple is similar (data, label) to the given index
         - The third tuple is different (data, label) from the given index 
        """

         # Find a similar datapoint randomly
        mask = self.labels == self.labels[index]
        mask[index] = False # Making sure that the similar pair is NOT the same as the given index
        mask = mask.nonzero()
        a = mask[torch.randint(high=len(mask), size=(1,))][0]

        # Find a different datapoint randomly
        mask = self.labels != self.labels[index]
        mask = mask.nonzero()
        b = mask[torch.randint(high=len(mask), size=(1,))][0]
        
        return (self.data[index], self.m_labels[index]), (self.data[a.item()], self.m_labels[a.item()]), \
               (self.data[b.item()], self.m_labels[b.item()])

    def get_item_test(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], list, list]:
        return (self.data[index], self.labels[index]), [], []
    
    def __getitem__(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor],
                                         Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
         return self.get_item_func(index)

    def __len__(self) -> int:
        return len(self.data)

In [None]:
import random

def make_triplet(input, labels, index):
    pos_labels = []
    neg_labels = []
    for i, l in enumerate(labels):
        if not i == index:
            if l == labels[index]:
                pos_labels.append(i)
            else:
                neg_labels.append(i)

    return input[index], input[pos_labels[random.randint(0, len(pos_labels)-1)]], input[neg_labels[random.randint(0, len(neg_labels)-1)]]

def make_data(input, labels):
    data = []
    for i in range(len(input)):
        data.append(make_triplet(input, labels, i))
    return data

In [None]:
from regex import P
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pad_sequence, unpad_sequence

# def tokenize_code(code):
#     nl_tokens = tokenizer.tokenize("")
#     code_tokens = tokenizer.tokenize(code)
#     tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
#     tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
#     return torch.tensor(tokens_ids)[None,:]

# def tokenize_input(input):
#     token_ids_list = torch.Tensor()
#     for code in input:
#         nl_tokens = tokenizer.tokenize("")
#         code_tokens = tokenizer.tokenize(code)
#         tokens = [tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
#         tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
#         token_ids_list = torch.cat((token_ids_list, torch.tensor(tokens_ids)), 0)
#     # print(token_ids_list)
#     # data = pad_sequence(token_ids_list, batch_first=True)
#     # print(data)
#     return token_ids_list

epochs = 1
# data = [("def returnInt() -> <mask>:", "def calcInt() -> <mask>:","def returnFloat() -> <mask>:", "int")]
# tokenized_input = tokenize_input(input_list)
# tokenized_labels = tokenize_input(labels)
# print(tokenized_input)
# print(tokenized_labels[0])
# data = TripletDataset(tokenized_input, labels=tokenized_labels, dataset_name="test")
# print(data.get_item_train(0))
# data = make_data(input_list, labels)
# print(data)

class CustomModel(torch.nn.Module):
    def __init__(self, model, d, codebert_output_dim = 393216, input_dim = 512): # 50265 + sep + 512 (labels) = 50778
        super(CustomModel, self).__init__() 
        self.d = d
        self.model = model
        self.config = model.config
        self.layer = torch.nn.Linear(codebert_output_dim + input_dim, d)
        self.input_dim = input_dim
        self.codebert_output_dim = codebert_output_dim
    
    def forward(self, input_ids=None, attention_mask=None):
        
        assert input_ids.shape[0] == 1024
        
        tokens, labels = torch.split(input_ids, self.input_dim)
        
        model_output = self.model.forward(input_ids=tokens.unsqueeze(0))[0]
        
        ll_input = torch.cat((model_output.view(1, self.codebert_output_dim).squeeze(0), labels), 0)
        assert ll_input.shape[0] == self.codebert_output_dim + self.input_dim
        
        final_output_tensor = self.layer.forward(ll_input)
        
        return final_output_tensor

custom_model = CustomModel(model, 8)
dataset = TripletDataset(torch.tensor(tokenized_hf['train']['input_ids']), m_labels=torch.tensor(tokenized_hf['train']['m_labels']), labels=torch.tensor(tokenized_hf['train']['masks']), dataset_name="train")

optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.001)
criterion = torch.jit.script(TripletLoss())

for epoch in tqdm(range(epochs), desc="Epochs"):
    custom_model.train()
    running_loss = []
    for step in range(len(dataset)):
        (t_a, t_p, t_n) = dataset.get_item_func(step)
        
        optimizer.zero_grad()
        anchor_out = custom_model(input_ids=torch.cat((t_a[0][0], t_a[1]), 0))
        positive_out = custom_model(input_ids=torch.cat((t_p[0][0], t_p[1]), 0))
        negative_out = custom_model(input_ids=torch.cat((t_n[0][0], t_n[1]), 0))
        
        print(anchor_out)
        
        loss = criterion(anchor_out[0], positive_out[0], negative_out[0])
        loss.backward()
        optimizer.step()

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([-70.3263, -33.7461, -23.0762,   8.1758, -61.7985,  67.6001, -31.6710,
         75.0479], grad_fn=<AddBackward0>)
tensor([ 67.4757,  34.3271,  58.7532, -16.2151,  59.7282,  34.7843, -42.9827,
        -68.0706], grad_fn=<AddBackward0>)


In [None]:
inputs = tokenizer("Hello world!", return_tensors="pt")

# Model apply
outputs = model(**inputs)
outputs

In [34]:
# for epoch in tqdm(range(epochs), desc="Epochs"):
#     custom_model.train()
#     running_loss = []
#     for step in range(len(dataset)):
#         (t_a, t_p, t_n) = dataset.get_item_func(step)
        
#         print(len(t_a[0]))
        
(t_a, t_p, t_n) = dataset.get_item_func(0)
t_a[1]

tensor(-100)

In [19]:
fill_mask = pipeline('fill-mask', model=custom_model, tokenizer=tokenizer)

outputs = fill_mask(CODE)
outputs

TypeError: CustomModel.forward() missing 1 required positional argument: 'm_labels'

In [20]:
import numpy as np
from annoy import AnnoyIndex
from torch.utils.data import DataLoader
import random
from collections import defaultdict
import time

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
KNN_TREE_SIZE = 20
DISTANCE_METRIC = 'euclidean'


def create_type_space(inputs=input_list[:4], labels=labels[:4]):
    """
    Creates the type space based on the inputs and their corresponding labels
    """
    
    # Make sure imputs are labeled
    assert len(inputs) == len(labels)
    
    # Cache the type space mappings
    computed_mapped_batches_train = []
    with torch.no_grad():
        
        # Iterate through the data set
        for inp, label in zip(inputs, labels):
            
            # Tokenize the code
            nl_tokens = tokenizer.tokenize("")
            code_tokens = tokenizer.tokenize(inp)
            tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
            tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
            
            # Get the type space mapping from the model
            output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
            
            # Select masked tokens
            masked_tokens = [c for c, token in enumerate(code_tokens) if token == "<mask>"]
            
            print(masked_tokens)
            
            # For this version, assume only one mask
            assert len(masked_tokens) == 1
            
            # Selected only the masked tokens from the output
            vals = output.logits.cpu().numpy()
            predicted_masks = [vals[0][i] for i in masked_tokens]
            
            # Cache the mapping of the masked token only
            computed_mapped_batches_train.append(predicted_masks)
        
        # Create the type space
        annoy_index = create_knn_index(computed_mapped_batches_train, None, computed_mapped_batches_train[0][0].size)
    return annoy_index

def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim:int) -> AnnoyIndex:
    """
    Creates KNNs index for given type embedding vectors, taken from Type4Py
    """
    
    annoy_idx = AnnoyIndex(type_embed_dim, DISTANCE_METRIC)

    for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed), desc="KNN index")):
        print(v[0])
        annoy_idx.add_item(i, v[0])

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx

annoy_idx = create_type_space()
print(annoy_idx)

ValueError: You have to specify either input_ids or inputs_embeds

In [21]:
def map_type(inputs=input_list[:4]):
    """
    Maps an input to the type space
    """
    with torch.no_grad():
        computed_embed_batches_test = []
        computed_embed_labels_test = []
        
        for inp in inputs:

            # Tokenize the code
            nl_tokens = tokenizer.tokenize("")
            code_tokens = tokenizer.tokenize(inp)
            tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
            tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
            
            # Get the type space mapping from the model
            output = custom_model.forward(torch.tensor(tokens_ids)[None,:])
            
            # Select masked tokens
            masked_tokens = [c for c, token in enumerate(code_tokens) if token == "<mask>"]
            
            # For this version, assume only one mask
            assert len(masked_tokens) == 1
            
            # Selected only the masked tokens from the output
            vals = output.logits.cpu().numpy()
            predicted_masks = [vals[0][i] for i in masked_tokens]

            # Cache the mapping of the masked token only
            computed_embed_batches_test.append(predicted_masks)
        
        return computed_embed_batches_test

def predict_type(types_embed_array: np.array, types_embed_labels: np.array, indexed_knn: AnnoyIndex, k: int):
    """
    Predict type of given type embedding vectors
    """

    pred_types_embed = []
    pred_types_score = []
    for i, embed_vec in enumerate(tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
        
        # Get the distances to the KNN
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec[0], k, include_distances=True)
        
        # Compute the scores according to the formula
        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
        
        # Cache the scores and the labels
        pred_types_embed.append([i for (i, s) in pred_idx_scores])
        pred_types_score.append(pred_idx_scores)
    
    return pred_types_embed, pred_types_score

def compute_types_score(types_dist: list, types_idx: list, types_embed_labels: np.array):
        types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2
        types_dist /= np.sum(types_dist)
        types_score = defaultdict(int)
        for n, d in zip(types_idx, types_dist):
            types_score[types_embed_labels[n]] += d
        
        return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], reverse=True)
    
types_embed_array = map_type()
knn_K = 2
pred_type_embed, pred_type_score = predict_type(types_embed_array, labels[:4], annoy_idx, knn_K,)
print(input_list[:4])
print(labels[:4])
print(pred_type_score)

ValueError: You have to specify either input_ids or inputs_embeds