In [1]:
import numpy as np
import copy
from pathlib import Path
import pandas as pd
import random
import re
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import torchtext.vocab as vocab
import sklearn.metrics
from transformers import RobertaModel
from transformers import RobertaConfig
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam,RMSprop
from torch.utils.data import Dataset, DataLoader, IterableDataset
from clang import *


# Seed every random number generator used in the notebook with the same value.
seed = 1234
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes - confirm if hash determinism matters.
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
# Keep cuDNN enabled, but request deterministic kernels and disable its benchmark mode.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
import warnings
# Suppress all warnings for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

In [None]:
# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multigpu = False
if device == torch.device('cuda'):
	# With more than one CUDA device the model is later wrapped in DataParallel.
	multigpu = torch.cuda.device_count() > 1
print('Device: ',device)
print('MultiGPU: ',multigpu)


In [None]:
## Training & vocab parameters
DATA_PATH = 'data'
VOCAB_SIZE = 50000
# Batch size used by the train/validation/test iterators in the full (non TEST_ONLY) run.
BATCH_SIZE = 128
# Vocabulary size plus two extra slots; handed to myCNN as its EMBED_SIZE argument.
EMBED_SIZE = VOCAB_SIZE+2
# Embedding width; it is the in_channels value of every convolution in myCNN.
EMBED_DIM = 768 #768

In [None]:
## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    """Custom pre-tokenizer that splits source text into clang tokens.

    An instance is handed to ``PreTokenizer.custom``; the tokenizers library
    then calls ``pre_tokenize`` on every input.
    """

    # A single libclang index, created once at class-definition time and shared by all instances.
    cidx = cindex.Index.create()

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split one piece of text into clang tokens.

        The text is parsed as an in-memory file named ``tmp.c`` (supplied via
        ``unsaved_files``). Every token spelling is stripped of surrounding
        whitespace and empty spellings are dropped; keywords, punctuation and
        literals are all kept as they are.

        Args:
            i: Index of the split; not used here.
            normalized_string: The text to split; its ``original`` content is parsed.

        Returns:
            One ``NormalizedString`` per non-empty clang token, in source order.
        """
        unit = self.cidx.parse(
            'tmp.c',
            args=[''],
            unsaved_files=[('tmp.c', str(normalized_string.original))],
            options=0,
        )
        spellings = (token.spelling.strip()
                     for token in unit.get_tokens(extent=unit.cursor.extent))
        return [NormalizedString(text) for text in spellings if text != '']

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Entry point called by the tokenizers library: split ``pretok`` in place with ``clang_split``."""
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Load pre-trained tokenizers
# NOTE: `vocab` rebinds the name that the import cell used for the `torchtext.vocab` alias.
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

# Normalization and pre-tokenization: strip accents, replace spaces, then split with clang.
my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
# Only one post-processor can be active; the template below is the one in effect
# (a ByteLevel post-processor assigned just before it was overwritten and had no effect).
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)


### PREPARE DATA

In [None]:
# True: load and tokenize only the evaluation split (m3). False: also load the train (m1) and validation (m2) splits.
TEST_ONLY = True

In [None]:
# Dataset selector: 'devign' and 'd2a' have dedicated loaders below; any other
# name is read from data/finetune/<name>/<name>_{train,val,test}.pkl.
mydataset = 'd2a'

In [None]:
# Cut every encoding to at most 1024 tokens (the same value as the fix_length of the CODES field).
my_tokenizer.enable_truncation(max_length=1024)
# Pad on the right with <pad> (id 1); length=None leaves the target length to the tokenizer.
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

In [None]:
def cleaner(code):
    """Strip comments, newlines and tabs from a source-code string.

    Args:
        code: Source text as a ``str``.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments removed and every
        newline and tab character deleted (not replaced by a space).
    """
    # First alternative: a /* ... */ block comment; second: a // comment up to end of line.
    comment_re = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    stripped = comment_re.sub('', code)
    return stripped.replace('\n', '').replace('\t', '')

In [None]:
def process_encodings(encodings):
    """Collect token ids and attention masks from a sequence of encodings.

    Args:
        encodings: Iterable of objects exposing ``ids`` and ``attention_mask``.

    Returns:
        Dict with keys ``'input_ids'`` and ``'attention_mask'``, each a list
        holding one entry per encoding, in input order.
    """
    # Single pass so that one-shot iterables (generators) are handled as well.
    pairs = [(enc.ids, enc.attention_mask) for enc in encodings]
    return {
        'input_ids': [ids for ids, _ in pairs],
        'attention_mask': [mask for _, mask in pairs],
    }

In [None]:
def replace_colname(x):
    """Normalize dataset-specific column names to ``func`` and ``target``.

    ``functionSource`` and ``code`` are renamed to ``func``; ``label`` is
    renamed to ``target``. Columns that are not present are left alone
    (``DataFrame.rename`` ignores unknown keys), so no exception handling is
    needed for missing columns, and genuine errors are no longer swallowed.

    Args:
        x: DataFrame holding one dataset split.

    Returns:
        A new DataFrame with the renamed columns; ``x`` itself is not modified.
    """
    return x.rename(columns={
        'functionSource': "func",
        'code': "func",
        'label': "target",
    })

def _read_index_set(path):
    """Read one integer row position per line from `path` and return them as a set."""
    with open(path) as f:
        return {int(line.strip()) for line in f}


# Load the raw splits: m1 = train, m2 = validation, m3 = test.
# In TEST_ONLY mode only m3 is created.
if mydataset =='devign':
    # Devign ships as one JSON file plus text files listing the row positions of each split.
    mydata = pd.read_json('data/finetune/devign/Devign.json')
    if not TEST_ONLY:
        m1=mydata.iloc[list(_read_index_set('data/finetune/devign/train.txt'))]
        m2=mydata.iloc[list(_read_index_set('data/finetune/devign/valid.txt'))]
    m3=mydata.iloc[list(_read_index_set('data/finetune/devign/test.txt'))]

    # The full frame is no longer needed once the splits have been taken.
    del(mydata)


elif mydataset =='d2a':
    task = 'function'

    if TEST_ONLY:
        # NOTE(review): TEST_ONLY evaluates on the *dev* CSV, whereas the full run
        # below uses the *test* CSV for m3 - confirm this is intended.
        m3 = pd.read_csv('data/finetune/%s/%s/d2a_lbv1_%s_dev.csv'%(mydataset,task,task))
        m3 = replace_colname(m3)
    else:
        m1 = pd.read_csv('data/finetune/%s/%s/d2a_lbv1_%s_train.csv'%(mydataset,task,task))
        m2 = pd.read_csv('data/finetune/%s/%s/d2a_lbv1_%s_dev.csv'%(mydataset,task,task))
        m3 = pd.read_csv('data/finetune/%s/%s/d2a_lbv1_%s_test.csv'%(mydataset,task,task))

        m1 = replace_colname(m1)
        m2 = replace_colname(m2)
        m3 = replace_colname(m3)


else:
    # Any other dataset is stored as pickled DataFrames, one file per split.
    if TEST_ONLY:
        m3 = pd.read_pickle('data/finetune/%s/%s_test.pkl'%(mydataset,mydataset))
        m3 = replace_colname(m3)

    else:
        m1 = pd.read_pickle('data/finetune/%s/%s_train.pkl'%(mydataset,mydataset))
        m2 = pd.read_pickle('data/finetune/%s/%s_val.pkl'%(mydataset,mydataset))
        m3 = pd.read_pickle('data/finetune/%s/%s_test.pkl'%(mydataset,mydataset))

        m1 = replace_colname(m1)
        m2 = replace_colname(m2)
        m3 = replace_colname(m3)

def _encode_split(frame):
    """Clean and tokenize one split.

    The ``func`` column of `frame` is replaced in place by its cleaned text
    and then encoded with ``my_tokenizer``.

    Returns:
        A list with one ``{'func': token_ids, 'target': label}`` dict per row.
    """
    frame.func = frame.func.apply(cleaner)
    encodings = my_tokenizer.encode_batch(frame.func)
    # Labels normally live in `target`; otherwise fall back to the `combine`
    # column (multiplied by 1, presumably to turn booleans into integers).
    if 'target' in frame.columns:
        labels = frame.target.tolist()
    else:
        labels = (frame['combine']*1).tolist()
    return [{'func':enc.ids,'target':lab} for enc,lab in zip(encodings,labels)]


if TEST_ONLY:
    test_encodings = _encode_split(m3)
else:
    train_encodings = _encode_split(m1)
    val_encodings = _encode_split(m2)
    test_encodings = _encode_split(m3)


In [None]:
# Token ids are already numeric, so no torchtext vocabulary is built (use_vocab=False);
# fix_length matches the 1024-token truncation configured on the tokenizer.
CODES = torchtext.data.Field(batch_first=True, fix_length=1024,use_vocab=False)
LABEL = torchtext.data.LabelField(dtype=torch.long, is_target=True,use_vocab=False)
# Map each example dict key to (batch attribute name, field): 'func' -> batch.codes, 'target' -> batch.label.
fields = {'func': ('codes', CODES), 'target': ('label', LABEL)}

class TabularDataset_From_List(torchtext.data.Dataset):
    """torchtext dataset built from an in-memory list instead of a file."""

    def __init__(self, input_list, format, fields, skip_header=False, **kwargs):
        """Build one torchtext Example per item of `input_list`.

        Args:
            input_list: Items to convert (dicts for 'dict', JSON strings for 'json').
            format: 'json' or 'dict' (case-insensitive); any other value raises KeyError.
            fields: Dict mapping item keys to fields, as expected by the Example factories.
            skip_header: Accepted but not used.
            **kwargs: Forwarded to ``torchtext.data.Dataset``.
        """
        factories = {
            'json': torchtext.data.Example.fromJSON,
            'dict': torchtext.data.Example.fromdict,
        }
        make_example = factories[format.lower()]

        # Examples are created with the dict form of `fields`, before it is flattened below.
        examples = [make_example(item, fields) for item in input_list]

        if make_example in (torchtext.data.Example.fromdict, torchtext.data.Example.fromJSON):
            # The parent constructor receives a flat list of the dict's values.
            flat_fields = []
            for field in fields.values():
                flat_fields += field if isinstance(field, list) else [field]
            fields = flat_fields

        super().__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, path=None, root='.data', train=None, validation=None,
               test=None, **kwargs):
        """Create datasets for whichever of train/validation/test are given.

        Returns:
            Tuple of the datasets that were built, in train, validation, test order.
        """
        if path is None:
            path = cls.download(root)
        built = []
        for source in (train, validation, test):
            if source is not None:
                built.append(cls(source, **kwargs))
        return tuple(built)


## Import the 100K data as TabularDataset

# The training and validation datasets are only needed for a full run;
# the test dataset is built in both modes.
if not TEST_ONLY:
    train_data = TabularDataset_From_List(train_encodings,'dict',fields = fields)
    val_data = TabularDataset_From_List(val_encodings,'dict',fields = fields)
test_data = TabularDataset_From_List(test_encodings,'dict',fields = fields)

### IF ITERABLE DATASET

In [None]:
class MyDataset(IterableDataset):
    """Iterable dataset that streams pickled records from a single file.

    The file is expected to contain consecutive pickled dicts with the keys
    'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, filename, rcount):
        """Remember the file to stream and the record count reported by ``len()``."""
        super().__init__()
        self.filename = filename
        self.len_labels = rcount

    def process(self, filename):
        """Yield one dict of tensors per pickled record until the file is exhausted."""
        import pickle
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(filename, "rb") as stream:
            try:
                while True:
                    record = pickle.load(stream)
                    yield {
                        'input_ids': torch.tensor(record['input_ids']),
                        'attention_mask': torch.tensor(record['attention_mask']),
                        'labels': torch.tensor(record['labels']),
                    }
            except EOFError:
                # pickle.load raises EOFError once every record has been read.
                return

    def __len__(self):
        """Return the record count given at construction (the file is not inspected)."""
        return self.len_labels

    def __iter__(self):
        """Start a fresh pass over the file."""
        return self.process(self.filename)

In [None]:
# The row count is taken from the full draper pickle and is only used as the value
# returned by len(); the records themselves are streamed from the *_stream_* file.
# The draper paths are fixed and do not depend on `mydataset`.
train_rcount = len(pd.read_pickle('data/draper/draper_train.pkl'))
train_dataset = MyDataset('data/draper/draper_stream_train.pkl', train_rcount)

In [None]:
# Validation counterpart: length from the full pickle, records from the stream file.
val_rcount = len(pd.read_pickle('data/draper/draper_val.pkl'))
val_dataset = MyDataset('data/draper/draper_stream_val.pkl', val_rcount)

In [None]:
# Test counterpart: length from the full pickle, records from the stream file.
test_rcount = len(pd.read_pickle('data/draper/draper_test.pkl'))
test_dataset = MyDataset('data/draper/draper_stream_test.pkl', test_rcount)

### END ITERABLE DATASET

In [None]:
MAX_VOCAB_SIZE = VOCAB_SIZE

# place into iterators
# Sorting and shuffling are both disabled, so batches follow dataset order.

if TEST_ONLY:
    # Test-only mode evaluates one example per batch.
    test_iterator = torchtext.data.BucketIterator(
        test_data, 
        batch_size = 1,
        sort = False,
        shuffle = False)
    
else:
    train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
        (train_data, val_data, test_data), 
        batch_size = BATCH_SIZE,
        sort = False,
        shuffle = False)

# Ids of <unk> and <pad>, matching the special_tokens list given to the tokenizer's post-processor.
UNK_IDX = 3
PAD_IDX = 1

# test_iterator = torchtext.data.BucketIterator(
#     test_data, 
#     batch_size = BATCH_SIZE,
#     sort = False,
#     shuffle = False)

#from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=BATCH_SIZE)
# val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
# test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)



### Define VulBERTa-CNN model

In [None]:
class myCNN(nn.Module):
    """CNN classifier on top of frozen pretrained VulBERTa word embeddings.

    Three parallel 1-D convolutions (kernel sizes 3, 4 and 5, 200 filters each)
    are max-pooled over the sequence, concatenated and passed through three
    linear layers that end in two output logits.
    """

    def __init__(self, EMBED_SIZE, EMBED_DIM):
        """Build the layers.

        Args:
            EMBED_SIZE: Accepted but not used; the embedding table size comes
                from the pretrained weights.
            EMBED_DIM: Number of input channels of each convolution.
        """
        super().__init__()

        # Reuse the word-embedding matrix of the pretrained RoBERTa model and keep it frozen.
        roberta = RobertaModel.from_pretrained('./models/VulBERTa/')
        self.embed = nn.Embedding.from_pretrained(
            roberta.embeddings.word_embeddings.weight,
            freeze=True,
            padding_idx=1,
        )

        # Attribute names and registration order are part of the checkpoint format.
        self.conv1 = nn.Conv1d(EMBED_DIM, 200, 3)
        self.conv2 = nn.Conv1d(EMBED_DIM, 200, 4)
        self.conv3 = nn.Conv1d(EMBED_DIM, 200, 5)

        self.dropout = nn.Dropout(0.5)

        self.fc1 = nn.Linear(200*3, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        """Map a batch of token ids to two logits per example."""
        # Move the embedding dimension to the channel axis that Conv1d convolves over.
        channels_first = self.embed(x).permute(0, 2, 1)

        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            activated = F.relu(conv(channels_first))
            # A kernel as wide as the whole feature map gives one maximum per filter.
            pooled.append(F.max_pool1d(activated, activated.shape[2]))

        features = torch.cat(pooled, dim=1).flatten(1)
        features = self.dropout(features)

        hidden = F.relu(self.fc1(features))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    

In [None]:
# Instantiating the model loads the pretrained embeddings from ./models/VulBERTa/.
model = myCNN(EMBED_SIZE,EMBED_DIM)

In [None]:
#model.embed.weight.data[UNK_IDX] = torch.zeros(EMBED_DIM)
# Overwrite the <pad> row of the embedding table with zeros.
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBED_DIM)

In [None]:
# With several GPUs the model is wrapped in DataParallel before being moved to the device.
if multigpu:
    model = torch.nn.DataParallel(model)
model.to(device)
print(model)

In [None]:
# Only parameters with requires_grad=True are counted.
print('Num of trainable param: ',sum(p.numel() for p in model.parameters() if p.requires_grad))

### Prepare loss function

In [None]:

import sklearn

if TEST_ONLY:
    # No training split (m1) is loaded in test-only mode, so class weights cannot
    # be derived. Fall back to an unweighted loss; it is only used to accumulate
    # the test loss in the evaluation cell.
    criterion = nn.CrossEntropyLoss()
else:
    # Training labels live in `label`, or in `target` once replace_colname has run.
    label_col = 'label' if 'label' in m1.columns else 'target'
    # 'balanced' weights each class inversely to its frequency in the training labels.
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=np.array([0,1]),y=m1[label_col].tolist())
    c_weights = torch.FloatTensor([cw[0], cw[1]])
    criterion = nn.CrossEntropyLoss(weight=c_weights)
criterion = criterion.to(device)

optimizer = Adam(model.parameters(), lr=0.0005)

In [None]:
def softmax_accuracy(probs,all_labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        probs: Object with ``tolist()`` yielding one list of class scores per
            sample (e.g. a 2-D tensor of logits).
        all_labels: Object with ``tolist()`` yielding one integer label per sample.

    Returns:
        Accuracy in [0, 1]; 0.0 when there are no samples.

    Raises:
        ValueError: If the number of score rows and labels differ.
    """
    labels = all_labels.tolist()
    rows = probs.tolist()
    if len(rows) != len(labels):
        raise ValueError('probs and all_labels must have the same length')
    if not labels:
        return 0.0
    # list.index(max(...)) picks the first maximum when scores are tied.
    hits = sum(1 for row, label in zip(rows, labels) if row.index(max(row)) == label)
    return hits / len(labels)

### Start training

In [None]:
# Checkpoints of this run are written to models/<model_foldername>/.
model_foldername = 'VB-CNN_%s'%(mydataset)
# Create the checkpoint directory up front so torch.save in the training loop does
# not fail on a missing folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.join('models', model_foldername), exist_ok=True)

In [None]:
print('Training started.....')

EPOCHS=20
BEST_VAL = 9999.9
BEST_MODEL = None
BEST_EPOCH = None

# False (current setting): checkpoint every epoch.
# True: checkpoint only epochs that improve the validation loss and append the log lines to res.txt.
selected_model = False

for e in range(EPOCHS):
    running_acc = 0
    running_loss = 0
    timer = time.time()
    model.train()

    # ---- one pass over the training data ----
    for batch in train_iterator:
        batch.codes, batch.label = batch.codes.to(device), batch.label.to(device)
        optimizer.zero_grad()
        output = model(batch.codes)
        loss = criterion(output, batch.label)
        loss.backward()
        optimizer.step()
        acc = softmax_accuracy(output,batch.label)
        running_acc += acc
        running_loss += loss.item()

    # ---- validation pass, without gradient tracking ----
    with torch.no_grad():
        model.eval()
        running_acc_val = 0
        running_loss_val = 0
        for batch in valid_iterator:
            batch.codes, batch.label = batch.codes.to(device), batch.label.to(device)
            output_val = model(batch.codes)
            loss_val = criterion(output_val,batch.label)
            acc_val = softmax_accuracy(output_val,batch.label)
            running_acc_val += acc_val
            running_loss_val += loss_val.item()

    # Reported figures are means over batches (every batch counts equally).
    mean_val_loss = running_loss_val/len(valid_iterator)
    print_out = "Epoch %d - Training acc: %.4f -Training loss: %.4f - Val acc: %.4f - Val loss: %.4f - Time: %.4fs \n" % (e+1,
    running_acc/len(train_iterator),
    running_loss/len(train_iterator),
    running_acc_val/len(valid_iterator),
    mean_val_loss,
    (time.time()-timer))

    model_name = 'models/%s/model_ep_%d.tar' % (model_foldername,e+1)
    # `loss` is the loss tensor of the last training batch of this epoch.
    epoch_state = {
        'epoch': e+1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss}

    if selected_model:
        # The context manager closes res.txt even if saving the checkpoint fails.
        with open("res.txt", "a") as myfile:
            if mean_val_loss < BEST_VAL:
                print('Val_loss decreased!')
                print(print_out, end='')
                myfile.write('Val_loss decreased!')
                myfile.write(print_out)

                BEST_VAL = mean_val_loss
                BEST_MODEL = copy.deepcopy(model)
                BEST_EPOCH = e+1
                torch.save(epoch_state, model_name)
            else:
                print(print_out, end='')
                myfile.write(print_out)

    else:
        print(print_out, end='')
        torch.save(epoch_state, model_name)



print('Training completed!')

### Define evaluation function

In [None]:
def evaluate_testing(all_pred, all_labels):
    """Print a confusion matrix and, for binary problems, the usual metrics.

    Args:
        all_pred: List with one list of class scores per sample.
        all_labels: List with one integer label per sample.

    Returns:
        pandas Series holding the predicted class (index of the first maximum
        score) for every sample.
    """
    def getClass(x):
        return(x.index(max(x)))

    probs = pd.Series(all_pred)
    all_predicted = probs.apply(getClass)
    all_predicted.reset_index(drop=True, inplace=True)

    confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_predicted)
    print('Confusion matrix: \n',confusion)

    # The metrics below are only defined for a 2x2 confusion matrix. Check the
    # shape explicitly instead of relying on a failed tuple unpacking.
    if confusion.shape != (2, 2):
        print('This is multiclass prediction')
        return(all_predicted)

    # Score of class 1, used as the ranking score for the AUC-style metrics.
    probs2 = [x[1] for x in probs]

    tn, fp, fn, tp = confusion.ravel()
    print('\nTP:',tp)
    print('FP:',fp)
    print('TN:',tn)
    print('FN:',fn)

    try:
        ## Performance measure
        print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_predicted)))
        print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_predicted)))
        print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_predicted)))
        print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_predicted)))
        print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=all_labels, y_score=probs2)))
        print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=all_labels, y_score=probs2)))
        print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=all_labels, y_pred=all_predicted)))
    except ValueError as err:
        # Keep evaluation best-effort, but report the real cause instead of hiding it.
        print('Could not compute all binary metrics:', err)
    return(all_predicted)
    

### Evaluate on the testing set

In [None]:
print('Testing started.......')
## Testing
# NOTE(review): the checkpoint path points at the draper run and epoch 15 regardless
# of the `mydataset` setting - confirm it matches the dataset being evaluated.
# map_location=device lets the checkpoint load on CPU-only machines as well.
checkpoint = torch.load('models/VB-CNN_draper/model_ep_15.tar', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

model.eval()
with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    # Raw model outputs and labels of every test sample, in iteration order.
    all_pred=[]
    all_labels=[]
    for batch in test_iterator:
        batch.codes, batch.label = batch.codes.to(device), batch.label.to(device)
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = softmax_accuracy(output_test,batch.label)
        running_acc_test += acc_test
        running_loss_test += loss_test.item()
        all_pred += output_test.tolist()
        all_labels += batch.label.tolist()

ap=evaluate_testing(all_pred, all_labels)

#### Use below only on MVD dataset

In [None]:
# Class names of the MVD dataset, indexed by label id. A dedicated name is used
# so the list is not clobbered by the `tn` (true negatives) count computed in the
# following cells, which keeps this cell re-runnable.
cwe_names=['non-vulnerable','CWE-404','CWE-476','CWE-119','CWE-706','CWE-670','CWE-673','CWE-119, CWE-666, CWE-573','CWE-573','CWE-668','CWE-400, CWE-665, CWE-020','CWE-662','CWE-400','CWE-665','CWE-020','CWE-074','CWE-362','CWE-191','CWE-190','CWE-610','CWE-704','CWE-170','CWE-676','CWE-187','CWE-138','CWE-369','CWE-662, CWE-573','CWE-834','CWE-400, CWE-665','CWE-400, CWE-404','CWE-221','CWE-754','CWE-311','CWE-404, CWE-668','CWE-506','CWE-758','CWE-666','CWE-467','CWE-327','CWE-666, CWE-573','CWE-469']
report = sklearn.metrics.classification_report(y_true=all_labels, y_pred=ap, digits=6,labels=np.arange(len(cwe_names)),target_names=cwe_names)
print(report)

In [None]:
# Collapse the multiclass result to a binary one in which label 0 is the positive class (1)
# and every other label is 0.
# labels=[0, 1] forces a 2x2 matrix even when one of the two values never occurs,
# so the four-way unpacking below cannot fail.
confusion = sklearn.metrics.confusion_matrix(
    y_true=[1 if x == 0 else 0 for x in all_labels],
    y_pred=[1 if x == 0 else 0 for x in ap],
    labels=[0, 1])
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

In [None]:
# One-vs-rest false-positive rate per class, reported as a plain mean and as a
# mean weighted by the number of true samples of each class.
n_classes = 41
all_fpr = []
w_all_fpr = []
aug_y_true_sum = 0
for counter in range(n_classes):
    aug_y_true = [1 if x == counter else 0 for x in all_labels]
    aug_y_pred = [1 if x == counter else 0 for x in ap]
    # labels=[0, 1] keeps the matrix 2x2 for classes that appear in neither
    # the labels nor the predictions.
    confusion = sklearn.metrics.confusion_matrix(y_true=aug_y_true, y_pred=aug_y_pred, labels=[0, 1])
    tn, fp, fn, tp = confusion.ravel()
    negatives = fp+tn
    # With no negative samples the rate is undefined; count it as 0 instead of producing NaN.
    fpr = fp/negatives if negatives else 0.0
    support = aug_y_true.count(1)
    all_fpr.append(fpr)  ## FPR
    w_all_fpr.append(fpr*support)  ## w_FPR
    aug_y_true_sum += support

print('FPR: ', sum(all_fpr)/float(n_classes)*100.0)
print('Weighted FPR: ', sum(w_all_fpr)/aug_y_true_sum*100.0)