In [None]:
import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import os
import random
import custom
import models
import regex_def
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from custom import CustomDataCollatorForLanguageModeling

## Prerequisites

In [None]:
## Set default device (GPU or CPU)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
## Deterministic/reproducible flags

# Candidate seeds; only the first entry is used in this notebook.
seedlist = [42, 834, 692, 489, 901, 408, 819, 808, 531, 166]
seed = seedlist[0]

# NOTE(review): hash randomisation is normally fixed when the interpreter starts,
# so this presumably only matters for processes spawned from here -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed every random number generator used by the notebook with the same value.
for seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
    seed_fn(seed)

# cuDNN stays enabled, with deterministic mode on and benchmark mode off.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
## Weights and Biases flags

# Set both Weights & Biases switches before any Trainer is created.
os.environ.update({
    'WANDB_DISABLED': 'true',
    'WANDB_MODE': 'dryrun',
})
# Optional overrides, left disabled:
# os.environ["CUDA_VISIBLE_DEVICES"]=""
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Pretrain word-level VulBERTa on Draper'
#os.environ['WANDB_NAME'] = 'linux'
#os.environ['WANDB_PROJECT'] = 'projectName'

## Load/initialise custom tokenizer

In [None]:
## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    """Custom pre-tokenizer that splits C source text into libclang tokens.

    An instance is meant to be wrapped with `PreTokenizer.custom(...)`, which
    calls `pre_tokenize` on it.
    """

    # A single libclang index, created once at class definition and shared by all instances.
    cidx = cindex.Index.create()

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split one piece of text into its clang tokens.

        Args:
            i: index supplied by `PreTokenizedString.split`; not used here.
            normalized_string: the piece to split; its `original` text is
                what gets parsed.

        Returns:
            One new `NormalizedString` per clang token whose stripped
            spelling is non-empty, in token order.
        """
        # Parse the text as an in-memory file named 'tmp.c'; nothing is read from disk.
        source_text = str(normalized_string.original)
        tu = self.cidx.parse(
            'tmp.c',
            args=[''],
            unsaved_files=[('tmp.c', source_text)],
            options=0,
        )

        # Keywords, punctuation and literals are all passed through unchanged;
        # the BPE model handles them downstream.
        spellings = (t.spelling.strip() for t in tu.get_tokens(extent=tu.cursor.extent))
        return [NormalizedString(s) for s in spellings if s != '']

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split `pretok` in place using `clang_split`."""
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Init new tokenizers
#my_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
#my_tokenizer = Tokenizer(BPE())


## Load pre-trained tokenizers
# Vocabulary and merge rules are read from previously saved tokenizer files.
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

# Normalisation: strip accents, then replace every space with the marker character.
# NOTE(review): the marker must stay exactly as it was when the vocabulary was built -- do not "fix" it.
my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
# Token boundaries come from libclang via the custom pre-tokenizer.
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
# A ByteLevel post-processor used to be assigned here and was overwritten by the
# very next statement, so it never took effect; only the template processor is set.
# NOTE(review): the ids below are assumed to match the loaded vocabulary -- confirm.
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
        ("<s>", 0),
        ("<pad>", 1),
        ("</s>", 2),
        ("<unk>", 3),
        ("<mask>", 4),
    ],
)


## Choose dataset

In [None]:
# Name of the fine-tuning dataset. Later cells build paths under data/finetune/<name>/
# and the model output directory from it; 'devign' and 'draper' get dataset-specific handling.
mydataset = 'devign'

### Tokenize dataset

In [None]:
# Cap every encoded sequence at 1024 tokens.
my_tokenizer.enable_truncation(max_length=1024)

# Pad on the right with '<pad>' (id 1, the same id listed for '<pad>' in the
# TemplateProcessing special tokens). No fixed length or multiple is requested.
my_tokenizer.enable_padding(
    direction='right',
    pad_id=1,
    pad_type_id=0,
    pad_token='<pad>',
    length=None,
    pad_to_multiple_of=None,
)

In [None]:
def process_encodings(encodings):
    """Collect token ids and attention masks from a batch of encodings.

    Args:
        encodings: iterable of objects exposing `ids` and `attention_mask`.

    Returns:
        dict with keys 'input_ids' and 'attention_mask', each a list holding
        the corresponding attribute of every encoding, in input order.
    """
    batch = list(encodings)
    return {
        'input_ids': [enc.ids for enc in batch],
        'attention_mask': [enc.attention_mask for enc in batch],
    }

In [None]:
# Matches C block comments (/* ... */) and line comments (// ... to end of line).
_COMMENT_PATTERN = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')


def cleaner(code):
    """Strip comments, newlines and tabs from a piece of C source text.

    Comments are removed first, so a `//` comment still ends at its own line
    break. The pattern is purely textual: a `//` inside a string literal is
    treated as a comment start as well.

    Args:
        code: source text as a `str`.

    Returns:
        The text with comments deleted and every newline and tab character removed.
    """
    without_comments = _COMMENT_PATTERN.sub('', code)
    return without_comments.replace('\n', '').replace('\t', '')

In [None]:
def _clean_and_encode(frame, column):
    """Clean `frame[column]` in place with `cleaner`, then tokenize it.

    Returns the dict built by `process_encodings` for the cleaned column.
    """
    frame[column] = frame[column].apply(cleaner)
    # Hand the tokenizer a plain list rather than a pandas Series.
    return process_encodings(my_tokenizer.encode_batch(frame[column].tolist()))


def _read_index_file(path):
    """Return the set of integers found in `path`, one per non-blank line."""
    with open(path) as f:
        return {int(line) for line in map(str.strip, f) if line}


if mydataset == 'devign':
    # Devign ships as one JSON file plus text files of row positions per split.
    train_index = _read_index_file('data/finetune/devign/train.txt')
    valid_index = _read_index_file('data/finetune/devign/valid.txt')

    mydata = pd.read_json('data/finetune/devign/Devign.json')
    # .copy() gives independent frames, so cleaning them below does not write
    # into a slice of `mydata`.
    m1 = mydata.iloc[list(train_index)].copy()
    m2 = mydata.iloc[list(valid_index)].copy()
    del mydata

    train_encodings = _clean_and_encode(m1, 'func')
    val_encodings = _clean_and_encode(m2, 'func')

else:
    m1 = pd.read_pickle('data/finetune/%s/%s_train.pkl'%(mydataset,mydataset))
    m2 = pd.read_pickle('data/finetune/%s/%s_val.pkl'%(mydataset,mydataset))

    # Pick the source column explicitly; a blanket `except` used to do this and
    # would also have hidden unrelated errors.
    has_function_source = 'functionSource' in m1.columns and 'functionSource' in m2.columns
    source_col = 'functionSource' if has_function_source else 'func'

    if has_function_source and mydataset == 'draper':
        # `* 1` turns the 'combine' column into numeric labels.
        m1['target'] = m1['combine']*1
        m2['target'] = m2['combine']*1

    train_encodings = _clean_and_encode(m1, source_col)
    val_encodings = _clean_and_encode(m2, source_col)


### Prepare dataset

In [None]:
class MyCustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: dict with at least 'input_ids' and 'attention_mask'; each
            value is indexable by sample position.
        labels: sequence of labels, one per sample.

    Raises:
        AssertionError: if the ids, masks and labels differ in length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # All three sequences must describe the same number of samples.
        assert len(encodings['input_ids']) == len(encodings['attention_mask']) == len(labels)

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# Labels are read from 'target' when both frames have it, otherwise from 'label'.
# The column is chosen explicitly so that genuine errors (for example a length
# mismatch inside MyCustomDataset) are raised instead of being swallowed.
label_col = 'target' if ('target' in m1.columns and 'target' in m2.columns) else 'label'

train_labels = m1[label_col].tolist()
train_dataset = MyCustomDataset(train_encodings, train_labels)
val_dataset = MyCustomDataset(val_encodings, m2[label_col].tolist())

In [None]:
## D2A dataset ONLY

task = 'function'


def _prepare_d2a(csv_path):
    """Read one D2A CSV, clean its `code` column and tokenize it.

    Returns a tuple (frame, encodings dict, MyCustomDataset built from `label`).
    """
    frame = pd.read_csv(csv_path)
    frame.code = frame.code.apply(cleaner)
    encoded = process_encodings(my_tokenizer.encode_batch(frame.code))
    return frame, encoded, MyCustomDataset(encoded, frame.label.tolist())


# Training split
m1, train_encodings, train_dataset = _prepare_d2a(
    'data/finetune/%s/%s/d2a_lbv1_%s_train.csv'%(mydataset,task,task)
)

# Validation ("dev") split
m2, val_encodings, val_dataset = _prepare_d2a(
    'data/finetune/%s/%s/d2a_lbv1_%s_dev.csv'%(mydataset,task,task)
)

### Streaming dataset ONLY

In [None]:
################################## STREAMING DATASET #############################
def _load_stream_frame(split):
    """Read the `split` pickle of `mydataset`, clean its source column and add `target`.

    Returns the DataFrame with `functionSource` cleaned and a numeric `target`
    column derived from `combine`.
    """
    frame = pd.read_pickle('data/finetune/%s/%s_%s.pkl' % (mydataset, mydataset, split))
    frame.functionSource = frame.functionSource.apply(cleaner)
    frame['target'] = frame['combine']*1
    return frame


def _write_stream(frame, split, n_chunks=5):
    """Tokenize `frame` in `n_chunks` pieces and write one pickled record per row.

    Each record is a dict with 'input_ids', 'attention_mask' and 'labels'.
    Records are written back to back into
    data/finetune/draper/draper_stream_<split>.pkl, in row order.
    """
    with open('data/finetune/draper/draper_stream_%s.pkl' % split, 'wb') as f:
        # Split row positions rather than the frame itself: the chunks are the
        # same, without depending on how numpy handles a DataFrame.
        for positions in np.array_split(np.arange(len(frame)), n_chunks):
            chunk = frame.iloc[positions]
            encodings = my_tokenizer.encode_batch(chunk.functionSource.tolist())
            for enc, label in zip(encodings, chunk.target.tolist()):
                pickle.dump({'input_ids':enc.ids, 'attention_mask':enc.attention_mask, 'labels':label}, f)


# m1/m2/m3 stay DataFrames (they are no longer replaced by lists of chunks),
# so later cells can still read their label columns.
m1 = _load_stream_frame('train')
_write_stream(m1, 'train')

m2 = _load_stream_frame('val')
_write_stream(m2, 'val')

m3 = _load_stream_frame('test')
_write_stream(m3, 'test')

In [None]:
class MyDataset(IterableDataset):
    """Iterable dataset that streams pickled records from a single file.

    The file is expected to contain consecutive pickles, each a dict with
    'input_ids', 'attention_mask' and 'labels'.

    Args:
        filename: path of the stream file.
        rcount: number of records; reported by `len()` and not checked
            against the file.
    """

    def __init__(self, filename, rcount):
        super().__init__()
        self.filename = filename
        self.len_labels = rcount

    def __len__(self):
        """Return the record count given at construction."""
        return self.len_labels

    def __iter__(self):
        """Start a fresh pass over the stream file."""
        return self.process(self.filename)

    def process(self, filename):
        """Yield every record of `filename` as a dict of tensors, until end of file."""
        # NOTE: pickle.load can execute arbitrary code; only read stream files you created.
        with open(filename, "rb") as stream:
            while True:
                try:
                    record = pickle.load(stream)
                except EOFError:
                    # End of file reached: no more records.
                    return
                yield {
                    'input_ids': torch.tensor(record['input_ids']),
                    'attention_mask': torch.tensor(record['attention_mask']),
                    'labels': torch.tensor(record['labels']),
                }

In [None]:
# The record count is taken from the original training pickle; the stream
# file is opened later, when the dataset is iterated.
train_rcount = len(
    pd.read_pickle('data/finetune/draper/draper_train.pkl')
)
train_dataset = MyDataset(
    filename='data/finetune/draper/draper_stream_train.pkl',
    rcount=train_rcount,
)

In [None]:
# The record count is taken from the original validation pickle; the stream
# file is opened later, when the dataset is iterated.
val_rcount = len(
    pd.read_pickle('data/finetune/draper/draper_val.pkl')
)
val_dataset = MyDataset(
    filename='data/finetune/draper/draper_stream_val.pkl',
    rcount=val_rcount,
)

In [None]:
# Save the dataset objects so a later session can reload them.
# `with` closes each file handle as soon as the dump finishes.
with open("td.pkl", "wb") as f:
    pickle.dump(train_dataset, f)
with open("vd.pkl", "wb") as f:
    pickle.dump(val_dataset, f)

In [None]:
# Reload the dataset objects written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("td.pkl", "rb") as f:
    train_dataset = pickle.load(f)
with open("vd.pkl", "rb") as f:
    val_dataset = pickle.load(f)

### Load pretrained model

In [None]:
## Pre-trained RoBERTa

# Load the sequence-classification model from the local VulBERTa checkpoint
# directory, then print its parameter count.
model = RobertaForSequenceClassification.from_pretrained(
    './models/VulBERTa/'
)
print(model.num_parameters())

### Custom loss function with class weights

In [None]:
# Balanced class weights for the two classes, computed from the training labels.
# The label column is chosen explicitly ('label' if present, otherwise 'target')
# rather than through a blanket `except`, so genuine errors are raised.
label_col = 'label' if 'label' in m1.columns else 'target'

cw = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced',
    # Passed as an ndarray: scikit-learn documents `classes` as an ndarray.
    classes=np.array([0, 1]),
    y=m1[label_col].tolist(),
)

# Weight tensor in class order [0, 1] for the loss function.
c_weights = torch.FloatTensor([cw[0], cw[1]])

In [None]:
# Class-weighted cross entropy; use the commented line for the unweighted variant.
#criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.CrossEntropyLoss(weight=c_weights)
criterion.to(device)


class MyTrainer(Trainer):
    """Trainer that computes the loss with the module-level weighted `criterion`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the
                forward pass.
            return_outputs: when true, return `(loss, outputs)` instead of
                just the loss.
            **kwargs: accepted and ignored, so the override keeps working if
                the base Trainer passes extra keyword arguments.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs["logits"]
        #logits = outputs[0]        #### USE THIS IF CNN OR LSTM VulBERTa
        loss = criterion(logits, labels)
        return (loss, outputs) if return_outputs else loss

## Train the model

In [None]:
training_args = TrainingArguments(
        output_dir="models/VB-MLP_%s" % mydataset,
        overwrite_output_dir=False,
        per_device_train_batch_size=4,
        num_train_epochs=10,
        evaluation_strategy='epoch',
        # Checkpoint on the same schedule as evaluation, so that
        # load_best_model_at_end has an evaluated checkpoint to restore.
        save_strategy='epoch',
        save_total_limit=20,
        seed=seed,
        learning_rate=3e-05,
        # Mixed precision only when CUDA is present; the notebook falls back to CPU otherwise.
        fp16=torch.cuda.is_available(),
        # NOTE(review): None presumably leaves the library's default reporting
        # integrations active; the string "none" is what disables them -- confirm.
        report_to=None,
        load_best_model_at_end=True
)

In [None]:
# Connect the model, training arguments and both datasets to the custom-loss trainer.
trainer = MyTrainer(model=model, args=training_args,
                    train_dataset=train_dataset, eval_dataset=val_dataset)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# End the active MLflow run, if any.
# NOTE(review): presumably a run is opened by the Trainer's logging integration -- confirm.
import mlflow
mlflow.end_run()