In [1]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from transformers import  RobertaConfig, RobertaModel, RobertaTokenizer
import argparse
import json
import os
from model2 import Model
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
import random
import multiprocessing
from tqdm import tqdm, trange
import numpy as np
import javalang
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
np.random.seed(0)
import seaborn as sns
import collections
import pickle
import sklearn
from matplotlib import cm
from sklearn import manifold

In [2]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, sample_size=1000, file_path='train', block_size=512,pool=None):
        postfix=file_path.split('/')[-1].split('.txt')[0]
        self.examples = []
        index_filename=file_path
        print("Creating features from index file at %s ", index_filename)
        url_to_code={}
        with open('/'.join(index_filename.split('/')[:-1])+'/data.jsonl') as f:
            for line in f:
                line=line.strip()
                js=json.loads(line)
                url_to_code[js['idx']]=js['func']
        data=[]
        cache={}
        f=open(index_filename)
        with open(index_filename) as f:
            for line in f:
                line=line.strip()
                url1,url2,label=line.split('\t')
                if url1 not in url_to_code or url2 not in url_to_code:
                    continue
                if label=='0':
                    label=0
                else:
                    label=1
                data.append((url1,url2,label,tokenizer,cache,url_to_code))
        data=data[:sample_size]

        self.examples=pool.map(get_example,tqdm(data,total=len(data)))
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item].input_ids),torch.tensor(self.examples[item].label)


In [3]:
def get_example(item):
    url1,url2,label,tokenizer,cache,url_to_code=item
    if url1 in cache:
        code1=cache[url1].copy()
    else:
        try:
            code=' '.join(url_to_code[url1].split())
        except:
            code=""
        code1=tokenizer.tokenize(code)
    if url2 in cache:
        code2=cache[url2].copy()
    else:
        try:
            code=' '.join(url_to_code[url2].split())
        except:
            code=""
        code2=tokenizer.tokenize(code)
        
    return convert_examples_to_features(code1,code2,label,url1,url2,tokenizer,block_size, cache)

In [4]:
class InputFeatures(object):
    """A single training/test features for a example."""
    def __init__(self,
                 input_tokens,
                 input_ids,
                 label,
                 url1,
                 url2):
        self.input_tokens = input_tokens
        self.input_ids = input_ids
        self.label=label
        self.url1=url1
        self.url2=url2

In [5]:
def load_and_cache_examples(tokenizer, 
                            test_data_file,
                            block_size, 
                            sample_size=1000,
                            evaluate=False,
                            test=False,
                            pool=None):
    dataset = TextDataset(tokenizer, 
                          file_path=test_data_file,
                          sample_size=sample_size,
                          block_size=block_size,
                          pool=pool)
    return dataset

In [6]:
def convert_examples_to_features(code1_tokens,code2_tokens,label,url1,url2,tokenizer,block_size,cache):
    code1_tokens=code1_tokens[:block_size-2]
    code1_tokens =[tokenizer.cls_token]+code1_tokens+[tokenizer.sep_token]
    code2_tokens=code2_tokens[:block_size-2]
    code2_tokens =[tokenizer.cls_token]+code2_tokens+[tokenizer.sep_token]  
    
    code1_ids=tokenizer.convert_tokens_to_ids(code1_tokens)
    padding_length = block_size - len(code1_ids)
    code1_ids+=[tokenizer.pad_token_id]*padding_length
    
    code2_ids=tokenizer.convert_tokens_to_ids(code2_tokens)
    padding_length = block_size - len(code2_ids)
    code2_ids+=[tokenizer.pad_token_id]*padding_length
    
    source_tokens=code1_tokens+code2_tokens
    source_ids=code1_ids+code2_ids
    return InputFeatures(source_tokens,source_ids,label,url1,url2)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = RobertaConfig.from_pretrained('microsoft/codebert-base')

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

model = RobertaModel.from_pretrained('microsoft/codebert-base',
                                    output_attentions=True, 
                                    output_hidden_states=True)

model=Model(model,config,tokenizer)

model = model.to(device)

In [8]:
file_path = "../dataset/valid.txt"
postfix=file_path.split('/')[-1].split('.txt')[0]
index_filename=file_path
url_to_code={}
with open('/'.join(index_filename.split('/')[:-1])+'/data.jsonl') as f:
    for line in f:
        line=line.strip()
        js=json.loads(line)
        url_to_code[js['idx']]=js['func']
data=[]
cache={}
f=open(index_filename)
with open(index_filename) as f:
    # lines = 1000
    added_lines = 0
    for line in f:
        # if added_lines >= lines:
        #     break
        line=line.strip()
        url1,url2,label=line.split('\t')
        if url1 not in url_to_code or url2 not in url_to_code:
            continue
        if label=='0':
            label=0
        else:
            label=1
        data.append((url1,url2,label,' '.join(url_to_code[url1].split()), ' '.join(url_to_code[url2].split())))
        added_lines += 1

In [9]:
len(data)

415416

In [10]:
data[0]

('13653451',
 '21955002',
 0,
 'public ViewInitListener() throws IOException { URL url = this.getClass().getResource(VIEW_INIT_CONFIG); log.debug("Loading configuration from: " + url); config = new Properties(); InputStream in = url.openStream(); config.load(in); in.close(); }',
 'public void run() { String s, s2; s = ""; s2 = ""; try { URL url = new URL("http://www.m-w.com/dictionary/" + Word); BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream())); String str; while (((str = in.readLine()) != null) && (!stopped)) { s = s + str; } in.close(); } catch (MalformedURLException e) { } catch (IOException e) { } Pattern pattern = Pattern.compile("popWin\\\\(\'/cgi-bin/(.+?)\'", Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(s); if ((!stopped) && (matcher.find())) { String newurl = "http://m-w.com/cgi-bin/" + matcher.group(1); try { URL url2 = new URL(newurl); BufferedReader in2 = new BufferedReader(new InputStreamReader(url2.openStream(

In [11]:
def get_syntax_types_for_code(code_snippet):
  types = ["[CLS]"]
  code = ["<s>"]
  tree = list(javalang.tokenizer.tokenize(code_snippet))
  
  for i in tree:
    j = str(i)
    j = j.split(" ")
    if j[1] == '"MASK"':
      types.append('[MASK]')
      code.append('<mask>')
    else:
      types.append(j[0].lower())
      code.append(j[1][1:-1])
    
  types.append("[SEP]")
  code.append("</s>")
  return np.array(types), ' '.join(code)

In [12]:
block_size = 400

code_sample = data[0]
types_1, rewrote_code_1 = get_syntax_types_for_code(code_sample[3])
types_2, rewrote_code_2 = get_syntax_types_for_code(code_sample[4])

tokenized_ids_1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(rewrote_code_1))
tokenized_ids_2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(rewrote_code_2))
if len(tokenized_ids_2) > block_size:
    tokenized_ids_2 = tokenized_ids_2[:block_size-1] + [tokenizer.sep_token_id]

if len(tokenized_ids_1) > block_size:
    tokenized_ids_1 = tokenized_ids_1[:block_size-1] + [tokenizer.sep_token_id]

padding_length = block_size - len(tokenized_ids_1)
tokenized_ids_1+=[tokenizer.pad_token_id]*padding_length
padding_length = block_size - len(tokenized_ids_2)
tokenized_ids_2+=[tokenizer.pad_token_id]*padding_length

source_ids = tokenized_ids_1 + tokenized_ids_2
labels = code_sample[2]
source_ids = torch.tensor(source_ids).unsqueeze(0).to(device)
labels = torch.tensor(labels).unsqueeze(0).to(device)

with torch.no_grad():
    output = model(block_size,source_ids,labels)

In [13]:
def create_pos_attn_patterns(attentions):
    '''
    Creates attention patterns related to positional encoding for attention initialization
    one_to_one - pays attention to the corresponding token
    next_token - pays attention to the next token
    prev_token - pays attention to the previous token
    cls_token  - pays attention to the first index ([CLS])
    '''

    one_to_one = torch.eye(attentions[0].shape[-1])
    next_token = torch.cat((torch.cat((torch.zeros(attentions[0].shape[-1]-1, 1), torch.eye(attentions[0].shape[-1]-1)), dim=1),\
        torch.zeros(1, attentions[0].shape[-1])), dim=0)
    prev_token = torch.cat((torch.zeros(1, attentions[0].shape[-1]), \
        torch.cat((torch.eye(attentions[0].shape[-1]-1), torch.zeros(attentions[0].shape[-1]-1, 1)), dim=1)), dim=0)
    cls_token = torch.zeros(attentions[0].shape[-1], attentions[0].shape[-1])
    cls_token[:,0] = 1.

    return [one_to_one, next_token, prev_token, cls_token]   

In [16]:
def compute_ag_loss(inputs, attentions, device, attn_head_types='0,1,1,4'):
        '''
        Adds a random loss based on attention values
        To test gradients
        outputs[-1] contains the attention values (tuple of size num_layers)
        and each elements is of the shape
        [batch_size X num_heads X max_sequence_len X max_sequence_len]
        '''
        # Get the attention head types
        attn_head_types = [int(i) for i in attn_head_types.split(',')]

        # The number attention heads of each type. one-to-one, next, previous, first
        numbers = attn_head_types
        print('numbers:', numbers)
        cum_sum = np.cumsum(numbers)
        print('cum_sum:', cum_sum)
        # Matrices containing the attention patterns
        targets = create_pos_attn_patterns(attentions)
        # Loss for positional attention patterns
        expanded_targets = []
        loss = torch.nn.MSELoss()
        total_loss = 0.        
        # Change the tensor's dimension
        for (num, target) in zip(numbers, targets):
            if num == 0:
                expanded_targets.append(None)
            else:
                # Add dimensions so that the tensor can be repeated
                target = torch.unsqueeze(target, 0)
                target = torch.unsqueeze(target, 0)
                # Change the target tensor's dimension so that it matches batch_size X num_heads[chosen]
                target = target.repeat(attentions[0].shape[0], num, 1, 1)
                target = target.to(device)
                expanded_targets.append(target)

        # Go over all the layers
        for i in range(len(attentions)):
            for j in range(len(numbers)):
                if expanded_targets[j] is not None:
                    if j == 0:
                        total_loss += loss(expanded_targets[j], attentions[i][:,0:cum_sum[j]])
                    else:
                        total_loss += loss(expanded_targets[j], attentions[i][:,cum_sum[j-1]:cum_sum[j]])
        return total_loss

In [17]:
scale = 1
mlm_loss = output[0]
ag_loss = compute_ag_loss(source_ids, 
                            output[2].attentions, 
                            device, attn_head_types='0,1,1,4')

# loss = mlm_loss + ag_loss * scale * linear_schedule_for_scale()

numbers: [0, 1, 1, 4]
cum_sum: [0 1 2 6]


In [18]:
print('mlm_loss:', mlm_loss.item(), 'ag_loss:', ag_loss.item(), 'total_loss:', mlm_loss.item() + ag_loss.item() * scale)

mlm_loss: 0.772885262966156 ag_loss: 0.09443830698728561 total_loss: 0.8673235699534416
