In [1]:
import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import os
import random
import custom
import models
import regex
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from custom import CustomDataCollatorForLanguageModeling

## Pre-requisites stuff

In [2]:
## Set default device (GPU or CPU)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
## Deterministic/reproducible flags

seedlist = [42, 834, 692, 489, 901, 408, 819, 808, 531, 166]

seed = seedlist[5]
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
## Weights and Biases flags

os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'dryrun'
# os.environ["CUDA_VISIBLE_DEVICES"]=""
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Pretrain word-level VulBERTa on Draper'
#os.environ['WANDB_NAME'] = 'linux'
#os.environ['WANDB_PROJECT'] = 'projectName'

## Load/initialise custom tokenizer

In [5]:
## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    
    cidx = cindex.Index.create()
        

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        ## Tokkenize using clang
        tok = []
        tu = self.cidx.parse('tmp.c',
                       args=[''],  
                       unsaved_files=[('tmp.c', str(normalized_string.original))],  
                       options=0)
        for t in tu.get_tokens(extent=tu.cursor.extent):
            spelling = t.spelling.strip()
            
            if spelling == '':
                continue
                
            ## Keyword no need

            ## Punctuations no need

            ## Literal all to BPE
            
            #spelling = spelling.replace(' ', '')
            tok.append(NormalizedString(spelling))

        return(tok)
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Init new tokenizers
#my_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
#my_tokenizer = Tokenizer(BPE())


## Load pre-trained tokenizers
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
my_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)


### Choose and prepare testing dataset

In [6]:
### Choose the dataset ('draper','vuldeepecker','devign','reveal')
mydataset = 'devign'

In [7]:
my_tokenizer.enable_truncation(max_length=1024)
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

In [8]:
def process_encodings(encodings):
    input_ids=[]
    attention_mask=[]
    for enc in encodings:
        input_ids.append(enc.ids)
        attention_mask.append(enc.attention_mask)
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

In [9]:
def cleaner(code):
    ## Remove code comments
    pat = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    code = re.sub(pat,'',code)
    code = re.sub('\n','',code)
    code = re.sub('\t','',code)
    return(code)

In [10]:
class MyCustomDataset(Dataset):
    def __init__(self, encodings, labels, index):
        self.encodings = encodings
        #print(encodings)
        self.labels = labels
        self.index=index
        #print(index)
        assert len(self.encodings['input_ids']) == len(self.encodings['attention_mask']) ==  len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['index']=idx
        #print('retrieving:')
        #print(idx)
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
if mydataset=='devign':
    test_index=set()

    with open('data/finetune/devign/valid.txt') as f:
        for line in f:
            line=line.strip()
            #print(line)
            #print(int(line))
            test_index.add(int(line))
    mydata = pd.read_json('data/finetune/devign/Devign.json')
    m3=mydata.iloc[list(test_index)]
    print(test_index)
    print(m3)
    mydata = None
    del(mydata)
    m3.func = m3.func.apply(cleaner)

    test_encodings = my_tokenizer.encode_batch(m3.func)
    test_encodings = process_encodings(test_encodings)
    test_dataset = MyCustomDataset(test_encodings, m3.target.tolist(),m3.index)
else:
    m3 = pd.read_pickle('data/finetune/%s/%s_test.pkl'%(mydataset,mydataset))
    
    
    try:
        m3.functionSource = m3.functionSource.apply(cleaner)
        test_encodings = my_tokenizer.encode_batch(m3.functionSource)
        test_encodings = process_encodings(test_encodings)
        
        if  mydataset =='draper':
            test_dataset = MyCustomDataset(test_encodings, (m3['combine']*1).tolist())
        else:
            test_dataset = MyCustomDataset(test_encodings, m3.label.tolist())
    except:
        m3.func = m3.func.apply(cleaner)
        test_encodings = my_tokenizer.encode_batch(m3.func)
        test_encodings = process_encodings(test_encodings)
        test_dataset = MyCustomDataset(test_encodings, m3.label.tolist())

{8196, 8, 10, 24587, 16396, 15, 24597, 22, 16408, 16409, 8218, 16411, 24600, 24602, 16414, 8226, 24611, 24612, 24615, 16425, 8233, 16427, 24620, 16435, 51, 24632, 8249, 8250, 24635, 62, 8255, 24644, 8262, 71, 24649, 24650, 24651, 75, 24654, 16464, 8274, 24659, 8276, 16468, 88, 8284, 24673, 16484, 16487, 8297, 24682, 24688, 16500, 24695, 16503, 8311, 24699, 16508, 125, 16510, 127, 16511, 16515, 24711, 137, 24716, 143, 16527, 8337, 146, 16533, 24726, 24727, 24728, 155, 16541, 161, 162, 166, 24743, 8366, 174, 16558, 24757, 8374, 183, 24758, 16569, 16570, 24763, 24764, 188, 16577, 16578, 16582, 16584, 16596, 16597, 24790, 8419, 227, 24806, 8424, 235, 8428, 24813, 24815, 16624, 8440, 16633, 24827, 253, 24830, 255, 257, 8449, 8451, 8456, 16648, 24842, 8460, 268, 274, 16659, 16660, 8469, 24859, 24863, 24865, 292, 24872, 298, 300, 16685, 301, 24885, 311, 16698, 319, 16704, 16709, 24903, 8527, 16721, 24918, 342, 345, 346, 8538, 16731, 16737, 354, 355, 24935, 16744, 24937, 24938, 24939, 368, 167

In [12]:
################### D2A ONLY
#task = 'function'
#m3 = pd.read_csv('data/finetune/%s/%s/d2a_lbv1_%s_val.csv'%(mydataset,task,task))
#m3.code = m3.code.apply(cleaner)
#test_encodings = my_tokenizer.encode_batch(m3.code)
#test_encodings = process_encodings(test_encodings)
#test_dataset = MyCustomDataset(test_encodings, m3.label.tolist())


###########################test_dataset = MyCustomDataset(test_encodings, [0]*len(m3))

## Load fine-tuned VulBERTa-MLP model

In [13]:
mymodel=mydataset

In [14]:
model = RobertaForSequenceClassification.from_pretrained('/home/<ANONYMOUS>/VulBERTa/models/fine_tuned/VB-MLP_devign/')
print(model.num_parameters())

124836866


In [15]:
test_loader = DataLoader(test_dataset, batch_size=64)

In [16]:
def softmax_accuracy(probs,all_labels):
    #print(probs)
    def getClass(x):
        return(x.index(max(x)))
    
    all_labels = all_labels.tolist()
    probs_list=probs.tolist()
    probs = pd.Series(probs.tolist())
    #print(probs)
    all_predicted = probs.apply(getClass)
    all_predicted.reset_index(drop=True, inplace=True)
    vc = pd.value_counts(all_predicted == all_labels)
    try:
        acc = vc[1]/len(all_labels)
    except:
        if(vc.index[0]==False):
            acc = 0
        else:
            acc = 1
    
    #print(acc)
    #print(all_predicted)
    #print(probs)
    return(acc,all_predicted, probs)


In [17]:
%%capture

multigpu=False
if multigpu:
    model = torch.nn.DataParallel(model)
model.to(device)

### Predict

In [18]:
all_pred=[]
all_labels=[]
all_probs=[]
all_predictions=[]
all_index=[]
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        all_index+=batch['index']
        #print(batch)
        #print(all_index)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        acc_val,pred, predictions = softmax_accuracy(torch.nn.functional.softmax(outputs[1],dim=1),labels)
        all_pred += pred.tolist()
        all_labels += labels.tolist()
        all_probs += outputs[1].tolist()
        all_predictions += predictions.tolist()

### Calculate the evaluation metrics

In [19]:
confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

probs2=[]
for x in all_probs:
    probs2.append(x[1])

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_pred)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_pred)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_pred)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_pred)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=all_labels, y_score=probs2)))
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=all_labels, y_score=probs2)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=all_labels, y_pred=all_pred)))

Confusion matrix: 
 [[1204  341]
 [ 588  599]]

TP: 599
FP: 341
TN: 1204
FN: 588

Accuracy: 0.6599560761346999
Precision: 0.6372340425531915
Recall: 0.5046335299073293
F-measure: 0.5632346027268453
Precision-Recall AUC: 0.6949226108207741
AUC: 0.7271891009125286
MCC: 0.29624773462883225


In [20]:
print(len(all_labels))
print(type(all_labels))
#print(all_labels)
print(np.array(all_labels).T)

2732
<class 'list'>
[0 1 1 ... 1 0 0]


In [21]:
print(pred)

0     0
1     0
2     1
3     0
4     1
5     0
6     0
7     1
8     0
9     1
10    0
11    1
12    0
13    0
14    0
15    0
16    1
17    0
18    0
19    0
20    0
21    1
22    0
23    1
24    1
25    1
26    0
27    0
28    1
29    0
30    1
31    0
32    1
33    0
34    0
35    1
36    1
37    0
38    1
39    1
40    0
41    0
42    0
43    1
dtype: int64


In [22]:
print(len(all_pred))
print(all_pred)

2732
[0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1

In [23]:
print(all_accs)

NameError: name 'all_accs' is not defined

In [None]:
print(pred)
print(acc_val)

In [None]:
print(outputs)

In [None]:
print(len(all_predictions))
print(all_predictions)