Imports

In [None]:
from transformers import get_linear_schedule_with_warmup
import gc
gc.collect()
import torch.nn as nn
import torch.utils.data as data
import torch
import time
import tqdm
import pandas as pd
import numpy as np
import json
import random
import uuid
import os
import sys

In [None]:
# Root of the repository checkout. It is placed at the front of sys.path so the
# project-local `src.*` packages imported below can be resolved.
# On another machine use e.g.: repo_dir = r'PATH_TO_REPO'
repo_dir = 'D:\\Projects\\aaadoc'
sys.path.insert(0, repo_dir)

from src.utils.utils import get_files_in_from_directory
from src.dl.dl_utils import get_repo_seminames, get_files_in_set, chunks
from src.dl.datasets.BaseRawDataset import BaseRawDataset
from src.dl.datasets.SampleLevelRawDataset import SampleLevelRawDataset
from src.dl.datasets.CommitLevelRawDataset import CommitLevelRawDataset
from src.dl.datasets.sampling.OverSampledDataset import OverSampledDataset
from src.dl.datasets.sampling.UnderSampledDataset import UnderSampledDataset


from src.dl.models.BertAndLinear import BertAndLinear
from src.dl.models.LstmAggregator import LstmAggregator
from src.dl.models.ConvAggregator import ConvAggregator
from src.dl.models.MeanAggregator import MeanAggregator

Config

In [None]:
# Model config
# Identifier passed to BertAndLinear(...) in fine_tune().
base_model = 'microsoft/graphcodebert-base'
# Batch size of the train/eval DataLoaders built in fine_tune().
batch_size_ = 2
# Number of epochs train_model() runs.
num_epochs_ = 3

# Fraction of each fold's file lists that load_fold_info() keeps
# (random.sample of int(fraction * len(files)) entries).
fraction_of_data = 1

# Limits forwarded to load_data() for the training and the eval dataset.
sample_limit = 1_000_000
eval_sample_limit = 1_000_000
# test_percentage = 0.15
# eval_percentage = 0.05
# Number of fold description files (fold-<n>-files.json) that are rotated over.
folds_count = 5
# Fold currently being processed; set by do_a_fold()/do_folds_aggregators() and
# embedded in checkpoint file names. -1 means "no fold started yet".
current_fold = -1

# Learning rate of the AdamW optimizer used for fine-tuning.
learning_rate = 2e-6
oversampling_ratio = None # if None, no ratio control will be applied
# Passed to load_data() together with oversampling_ratio and sample_limit.
class_ratio = 5

# Aggregator stage: epochs run by train_aggregator(), ratio handed to
# UnderSampledDataset, and AdamW learning rate.
aggregator_num_epochs_ = 5
aggregator_class_ratio = 5
aggregator_learning_rate = 2e-4



In [None]:
# When True, train_model()/train_aggregator() write a checkpoint after every epoch.
save_model_in_each_epoch = True
# NOTE(review): not read anywhere in the code visible in this file; train_model()
# evaluates after each epoch whenever an eval loader is supplied.
eval_model_in_each_epoch = True

# Name of this run; used in directory names and checkpoint file names.
model_guid = 'debug_run'
model_name = model_guid

# Directory that receives the model checkpoints.
work_dir = f'{repo_dir}\\src\\5_train_dl\\binaries\\{model_name}'
# Directory that receives the embedded JSON files written by embed_files().
# NOTE(review): there is no separator between 'data' and the model name — confirm intended.
results_dir = f'{repo_dir}\\src\\5_train_dl\\binaries\\data{model_name}'
# Directory containing the fold-<n>-files.json descriptions read by load_fold_info().
raw_input_path = f'{repo_dir}\\src\\5_train_dl\\binaries\\debug_test_folds'


In [None]:
# Make sure both output directories exist before anything is written to them.
# makedirs(..., exist_ok=True) also creates missing parent directories, is a
# no-op when the directory already exists, and still raises if the path exists
# but is not a directory (which the previous try/except silently ignored).
os.makedirs(work_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

In [None]:
# One seed shared by the Python, NumPy and PyTorch random number generators.
seed = 42
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
def train_model(model, optimizer, data_loader, loss_module, scheduler, eval_loader = None):
    """Train ``model`` for ``num_epochs_`` epochs on ``data_loader``.

    Each batch is expected to be ``((commit_ids, inputs), labels)``; only
    ``inputs`` and ``labels`` are used. A label row whose first element is 1
    is counted as a positive sample for the diagnostics printed at the end.

    Args:
        model: module to train; moved to the global ``device``.
        optimizer: optimizer stepped once per batch.
        data_loader: iterable of training batches (must support ``len``).
        loss_module: callable ``loss_module(preds, labels.float())``.
        scheduler: LR scheduler stepped once per batch.
        eval_loader: optional loader; when given, ``eval_model`` is run on it
            after every epoch.

    Side effects:
        Writes a checkpoint per epoch into ``work_dir`` when
        ``save_model_in_each_epoch`` is set, and prints progress/diagnostics.
    """
    torch.cuda.empty_cache()

    model.train()
    model.to(device)

    all_samples = 0
    positive_samples = 0

    for epoch in range(num_epochs_):
        print(f'Epoch {epoch}/{num_epochs_}')
        # eval_model() switches the model to eval mode at the end of an epoch,
        # so training mode has to be restored before every epoch.
        model.train()
        accumulated_loss = 0

        with tqdm.tqdm(total=len(data_loader)) as pbar:
            for data_inputs_x, data_labels in data_loader:
                # data_inputs_x[0] carries the commit ids, which are not needed here.
                data_inputs = data_inputs_x[1]

                # Step 0: Diagnostics
                positive_samples += sum(1 for x in data_labels if x[0] == 1)
                all_samples += len(data_labels)

                # Step 1: Move data to device
                data_inputs = data_inputs.to(device)
                data_labels = data_labels.to(device)

                # Step 2: Calculate model output
                preds = model(data_inputs)
                preds = preds.squeeze(dim=0)

                # Step 3: Calculate loss
                loss = loss_module(preds, data_labels.float())
                accumulated_loss += loss.item()

                # Step 4: Perform backpropagation
                optimizer.zero_grad()
                loss.backward()

                # Step 5: Update the parameters
                optimizer.step()
                scheduler.step()

                # Step 6: Progress bar
                pbar.update()
        print('Loss in this epoch:', accumulated_loss)

        if save_model_in_each_epoch:
            torch.save(model.state_dict(), f'{work_dir}/model_{model_name}_fold_{current_fold}_epoch_{epoch}.pickle')

        if eval_loader is not None:
            eval_model(model, eval_loader)

    background_samples = all_samples - positive_samples
    print(f'Model saw positive samples {positive_samples} times and background samples {background_samples}')
    if positive_samples:
        print(f'Ratio 1:{background_samples/positive_samples}')
    else:
        # Avoid a ZeroDivisionError when no positive sample was seen at all.
        print('Ratio 1:n/a (no positive samples seen)')


def eval_model(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print precision/recall.

    Each batch is expected to be ``((commit_ids, inputs), labels)``. A row is
    treated as class 1 when its element 0 is greater than its element 1; this
    rule is applied to both the predictions and the labels.

    The model is left in eval mode when the function returns.

    Args:
        model: module to evaluate; moved to the global ``device``.
        data_loader: iterable of evaluation batches (must support ``len``).

    Returns:
        Tuple ``(precision, recall)``; each is 0 when its denominator is 0.
    """
    torch.cuda.empty_cache()
    gc.collect()
    model.eval()
    model.to(device)

    all_labels = []
    all_predictions = []
    data_size = len(data_loader)
    # Inference only: disabling autograd avoids building graphs for every batch.
    with torch.no_grad(), tqdm.tqdm(total=data_size) as pbar:
        for data_inputs_x, data_labels in data_loader:
            # data_inputs_x[0] carries the commit ids, which are not needed here.
            data_inputs = data_inputs_x[1]

            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=0)

            labels_in_memory = data_labels.cpu().detach().numpy()
            if len(labels_in_memory.shape) == 1:
                all_labels.append(labels_in_memory)
            else:
                all_labels.extend(labels_in_memory)

            preds_in_memory = preds.cpu().detach().numpy()
            # With a single-row batch squeeze(dim=0) has already removed the
            # batch axis, so the prediction is stored as one entry.
            if labels_in_memory.shape[0] == 1:
                all_predictions.append(preds_in_memory)
            else:
                all_predictions.extend(preds_in_memory)

            pbar.update()

    predictions_arr = [1 if x[0]>x[1] else 0 for x in all_predictions] #TODO softmax
    targets_arr = [1 if x[0]>x[1] else 0 for x in all_labels]
    pairs = list(zip(predictions_arr, targets_arr))
    P = sum(1 for pred in predictions_arr if pred == 1)
    TP = sum(1 for pred, target in pairs if pred == 1 and target == 1)
    FP = sum(1 for pred, target in pairs if pred == 1 and target == 0)
    FN = sum(1 for pred, target in pairs if pred == 0 and target == 1)
    TN = sum(1 for pred, target in pairs if pred == 0 and target == 0)
    N = sum(1 for pred in predictions_arr if pred == 0)

    precission = TP/(TP+FP) if (TP+FP)!=0 else 0
    recall = TP/(TP+FN) if (TP+FN)!=0 else 0
    print('Precission:',f'{TP}/{TP+FP}', precission)
    print('Recall', f'{TP}/{TP+FN}', recall)
    print(f'P:{P},', f'TP:{TP},', f'FP:{FP},', f'FN:{FN},', f'TN:{TN},', f'N:{N}')

    return precission, recall


def load_data(input_data, oversampling_ratio=None, class_ratio=None, sample_limit=None):
    """Build a ``SampleLevelRawDataset`` from positive and background files.

    Args:
        input_data: pair ``(positive_files, background_files)``.
        oversampling_ratio, class_ratio, sample_limit: when all three are
            given they are forwarded to ``dataset.setup_ratios``. When only
            ``sample_limit`` is given, ``dataset.limit_data(sample_limit)``
            is applied instead.

    Returns:
        The loaded dataset.

    NOTE(review): any other combination (e.g. ``class_ratio`` set while
    ``oversampling_ratio`` is None) applies neither ratios nor the limit —
    confirm this is intended.
    """
    positive_files = input_data[0]
    background_files = input_data[1]

    dataset = SampleLevelRawDataset()
    dataset.load_files(positive_files, background_files)

    has_oversampling = oversampling_ratio is not None
    has_class_ratio = class_ratio is not None
    has_limit = sample_limit is not None

    if has_oversampling and has_class_ratio and has_limit:
        dataset.setup_ratios(oversampling_ratio, class_ratio, sample_limit)

    if not has_oversampling and not has_class_ratio and has_limit:
        dataset.limit_data(sample_limit)

    return dataset


def embed_files(tokenizer, data_files, marker):
    """Embed the samples of every JSON file and write the results to ``results_dir``.

    For each input file, every entry with a non-empty ``commit_sample`` is fed
    to ``tokenizer`` as a batch of one, and ``result[0][:, 0, :]`` is stored as
    the new ``commit_sample``. Files that yield at least one embedding are
    written to ``<results_dir>/<marker>-embedded-<original file name>``.

    Args:
        tokenizer: callable applied to the integer input tensor. Despite the
            name, the caller in this file passes the transformer module
            (``model.codebert``).
        data_files: paths of the JSON files to embed.
        marker: prefix identifying the run/fold in the output file names.
    """
    with tqdm.tqdm(total=len(data_files)) as pbar:
        for data_file in data_files:
            with open(data_file, 'r') as f:
                data_points = json.load(f)

            embeddings = []
            # Inference only: no autograd graph is needed for the embeddings.
            with torch.no_grad():
                for data_point in data_points:
                    commit_sample = data_point.get('commit_sample')
                    if not commit_sample:
                        # Missing, None or empty sample: nothing to embed.
                        continue

                    # Build the integer tensor directly instead of going
                    # through a float32 tensor first.
                    tensor = torch.tensor(commit_sample, dtype=torch.int)
                    tensor = tensor[None, :] # Extend to a batch mode
                    tensor = tensor.to(device)
                    result = tokenizer(tensor)
                    # Position 0 along the second axis of the first output —
                    # presumably the first-token embedding; TODO confirm.
                    labels = result[0][:,0,:]
                    labels_in_memory = labels.cpu()
                    embeddings.append({
                        'commit_id': data_point['commit_id'],
                        'file_name': data_point['file_name'],
                        'is_security_related': data_point['is_security_related'],
                        'commit_sample': labels_in_memory.tolist()
                    })

            if embeddings:
                file_name = os.path.basename(data_file)
                new_file = os.path.join(results_dir, marker + '-embedded-' + file_name)
                with open(new_file, 'w') as f:
                    json.dump(embeddings, f)

            pbar.update()


def map_files_to_new_repo(data_files, marker):
    """Translate raw file paths into the paths of their embedded counterparts.

    The embedded path is ``<results_dir>/<marker>-embedded-<base name>``.
    Only paths that exist on disk are returned, in input order.
    """
    candidates = (
        os.path.join(results_dir, marker + '-embedded-' + os.path.basename(original))
        for original in data_files
    )
    return [embedded for embedded in candidates if os.path.exists(embedded)]


def load_fold_info(fold_number):
    """Read ``fold-<fold_number>-files.json`` and return its (sub-sampled) file lists.

    Returns:
        Tuple ``(positives, backgrounds)``; each list is a random sample of
        ``int(fraction_of_data * len(list))`` entries of the corresponding
        ``positive_files`` / ``background_files`` list from the JSON file.
    """
    fold_path = os.path.join(raw_input_path, f'fold-{fold_number}-files.json')
    with open(fold_path, 'r') as handle:
        fold_description = json.load(handle)

    # Paths are used exactly as stored in the JSON file (they are not joined
    # with repo_dir).
    positive_pool = fold_description['positive_files']
    background_pool = fold_description['background_files']

    def _subsample(pool):
        return random.sample(pool, int(fraction_of_data*len(pool)))

    # Positives are sampled before backgrounds so the RNG is consumed in a fixed order.
    positives = _subsample(positive_pool)
    backgrounds = _subsample(background_pool)
    return (positives, backgrounds)
    

In [None]:
def fine_tune(train_dataset, eval_dataset):
    """Fine-tune a fresh ``BertAndLinear(base_model)`` and save the final weights.

    Args:
        train_dataset: dataset used for training.
        eval_dataset: dataset evaluated after every epoch by ``train_model``.

    Returns:
        The trained model. Its state dict is also written to
        ``<work_dir>/model_<model_name>_fold_<current_fold>_final.pickle``.
    """
    # Define model
    model = BertAndLinear(base_model)
    loss_module = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

    # Prep the loaders
    train_data_loader = data.DataLoader(train_dataset, batch_size=batch_size_, drop_last=True, shuffle=True)
    eval_data_loader = data.DataLoader(eval_dataset, batch_size=batch_size_, drop_last=True, shuffle=True)

    # The scheduler is stepped once per batch in train_model(), so its horizon
    # must be counted in batches (len of the loader), not in samples.
    steps_per_epoch = len(train_data_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
        num_warmup_steps=int(steps_per_epoch*0.25),
        num_training_steps=steps_per_epoch*num_epochs_)

    # Train the model
    train_model(model, optimizer, train_data_loader, loss_module, scheduler, eval_loader=eval_data_loader)
    torch.save(model.state_dict(), f'{work_dir}/model_{model_name}_fold_{current_fold}_final.pickle')
    return model


def embed_files_with_model(model, files, marker):
    """Freeze ``model.codebert`` and use it to embed ``files`` via ``embed_files``.

    The encoder's parameters get ``requires_grad = False``, it is switched to
    eval mode and moved to the global ``device`` before embedding starts.
    """
    encoder = model.codebert
    for weight in encoder.parameters():
        weight.requires_grad = False
    encoder.eval()
    encoder.to(device)

    print('Embedding files with transformer from', marker)
    embed_files(encoder, files, marker)


def make_commit_level_datesets(train_data, eval_data, test_data, marker):
    """Build commit-level datasets from the embedded files of each split.

    Every split is a pair ``(positive_files, background_files)`` of raw file
    paths; they are first mapped to the embedded files written for ``marker``.
    Only the training dataset is wrapped in ``UnderSampledDataset`` (with
    ``aggregator_class_ratio``).

    Returns:
        Tuple ``(train_dataset, eval_dataset, test_dataset)``.
    """
    # Resolve all embedded file lists up front (positives first, then backgrounds).
    embedded_splits = [
        (map_files_to_new_repo(split[0], marker), map_files_to_new_repo(split[1], marker))
        for split in (train_data, eval_data, test_data)
    ]

    def _load(embedded_split):
        positives, backgrounds = embedded_split
        dataset = CommitLevelRawDataset()
        dataset.load_files(positives, backgrounds)
        return dataset

    train_dataset_embeded = UnderSampledDataset(_load(embedded_splits[0]), aggregator_class_ratio)
    eval_dataset_embeded = _load(embedded_splits[1])
    test_dataset_embeded = _load(embedded_splits[2])
    return train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded


def train_aggregator(model, optimizer, data_loader, loss_module, scheduler, test_loader = None):
    """Train an aggregator model for ``aggregator_num_epochs_`` epochs.

    Each batch is expected to be ``((commit_ids, inputs), labels)`` where
    ``inputs`` is a sequence of tensors that is combined with ``torch.stack``.
    A label row whose first element is 1 is counted as a positive sample.

    Args:
        model: aggregator module; moved to the global ``device``.
        optimizer: optimizer stepped once per batch.
        data_loader: iterable of training batches (must support ``len``).
        loss_module: callable ``loss_module(preds, labels.float())``.
        scheduler: LR scheduler stepped once per batch.
        test_loader: optional loader evaluated with ``eval_aggregator`` after
            every epoch. When None, no evaluation is run.

    Returns:
        List with one dict per evaluated epoch:
        ``{'epoch', 'eval_set_loss', 'precission', 'recall'}``. The list is
        empty when ``test_loader`` is None.

    Side effects:
        Writes a checkpoint per epoch into ``work_dir`` when
        ``save_model_in_each_epoch`` is set, and prints progress/diagnostics.
    """
    torch.cuda.empty_cache()
    model.train()
    model.to(device)

    all_samples = 0
    positive_samples = 0
    results = []

    for epoch in range(aggregator_num_epochs_):
        print(f'Epoch {epoch}/{aggregator_num_epochs_}')
        accumulated_loss = 0
        # eval_aggregator() switches to eval mode, so re-enable training mode.
        model.train()

        with tqdm.tqdm(total=len(data_loader)) as pbar:
            for data_inputs_x, data_labels in data_loader:
                # data_inputs_x[0] carries the commit ids, which are not needed here.
                data_inputs = data_inputs_x[1]

                # Step 0: Diagnostics
                positive_samples += sum(1 for x in data_labels if x[0] == 1)
                all_samples += len(data_labels)

                #TODO different commit mode and sample mode
                data_inputs = torch.stack(data_inputs)

                # Step 1: Move data to device
                data_inputs = data_inputs.to(device)
                data_labels = data_labels.to(device)

                # Step 2: Calculate model output
                preds = model(data_inputs)

                #TODO different commit mode and sample mode
                # preds = preds.squeeze(dim=0)

                # Step 3: Calculate loss
                loss = loss_module(preds, data_labels.float())
                accumulated_loss += loss.item()

                # Step 4: Perform backpropagation
                optimizer.zero_grad()
                loss.backward()

                # Step 5: Update the parameters
                optimizer.step()
                scheduler.step()

                # Step 6: Progress bar
                pbar.update()
        print('Loss in this epoch:', accumulated_loss)

        if save_model_in_each_epoch:
            torch.save(model.state_dict(), f'{work_dir}/model_agg_{model_name}_fold_{current_fold}_epoch_{epoch}.pickle')

        # test_loader is optional; evaluating None would fail on len(None).
        if test_loader is not None:
            eval_set_loss, precission, recall = eval_aggregator(model, test_loader, loss_module)
            results.append({
                'epoch':epoch,
                'eval_set_loss':eval_set_loss,
                'precission':precission,
                'recall':recall,
            })

    background_samples = all_samples - positive_samples
    print(f'Model saw positive samples {positive_samples} times and background samples {background_samples}')
    if positive_samples:
        print(f'Ratio 1:{background_samples/positive_samples}')
    else:
        # Avoid a ZeroDivisionError when no positive sample was seen at all.
        print('Ratio 1:n/a (no positive samples seen)')
    return results


def eval_aggregator(model, data_loader, loss_module):
    """Evaluate an aggregator model and print loss, precision and recall.

    Each batch is expected to be ``((commit_ids, inputs), labels)`` where
    ``inputs`` is a sequence of tensors that is combined with ``torch.stack``.
    A label row is class 1 when its element 0 is greater than its element 1.
    Each stored prediction is read as ``x[0, 0] > x[0, 1]``, i.e. it is
    expected to keep a leading axis of size 1.

    The model is left in eval mode when the function returns.

    Args:
        model: aggregator module; moved to the global ``device``.
        data_loader: iterable of evaluation batches (must support ``len``).
        loss_module: callable ``loss_module(preds, labels.float())``.

    Returns:
        Tuple ``(accumulated_loss, precision, recall)`` where the loss is the
        sum of the per-batch losses and precision/recall are 0 when their
        denominator is 0.
    """
    torch.cuda.empty_cache()
    model.eval()
    model.to(device)

    all_labels = []
    all_predictions = []
    data_size = len(data_loader)
    accumulated_loss = 0
    # Inference only: disabling autograd avoids building graphs for every batch.
    with torch.no_grad(), tqdm.tqdm(total=data_size) as pbar:
        for data_inputs_x, data_labels in data_loader:
            # data_inputs_x[0] carries the commit ids, which are not needed here.
            data_inputs = data_inputs_x[1]

            #TODO different commit mode and sample mode
            data_inputs = torch.stack(data_inputs)

            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)
            preds = model(data_inputs)

            loss = loss_module(preds, data_labels.float())
            accumulated_loss += float(loss.item())

            #TODO different commit mode and sample mode
            # preds = preds.squeeze(dim=0)

            labels_in_memory = data_labels.cpu().detach().numpy()
            if len(labels_in_memory.shape) == 1:
                all_labels.append(labels_in_memory)
            else:
                all_labels.extend(labels_in_memory)

            preds_in_memory = preds.cpu().detach().numpy()
            if labels_in_memory.shape[0] == 1:
                # Single-row batch: keep the prediction with its leading axis.
                all_predictions.append(preds_in_memory)
            else:
                all_predictions.extend(preds_in_memory)

            pbar.update()

    #TODO different commit mode and sample mode
    predictions_arr = [1 if x[0,0]>x[0,1] else 0 for x in all_predictions]
    targets_arr = [1 if x[0]>x[1] else 0 for x in all_labels]
    pairs = list(zip(predictions_arr, targets_arr))
    P = sum(1 for pred in predictions_arr if pred == 1)
    TP = sum(1 for pred, target in pairs if pred == 1 and target == 1)
    FP = sum(1 for pred, target in pairs if pred == 1 and target == 0)
    FN = sum(1 for pred, target in pairs if pred == 0 and target == 1)
    TN = sum(1 for pred, target in pairs if pred == 0 and target == 0)
    N = sum(1 for pred in predictions_arr if pred == 0)

    precission = TP/(TP+FP) if (TP+FP)!=0 else 0
    recall = TP/(TP+FN) if (TP+FN)!=0 else 0
    print('Loss:', accumulated_loss)
    print('Precission:',f'{TP}/{TP+FP}', precission)
    print('Recall', f'{TP}/{TP+FN}', recall)
    print(f'P:{P},', f'TP:{TP},', f'FP:{FP},', f'FN:{FN},', f'TN:{TN},', f'N:{N}')

    return accumulated_loss, precission, recall


def train_and_eval_aggregator(model, train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded):
    """Train an aggregator, pick its best epoch on the eval set and test it.

    The model is trained with ``train_aggregator`` (batch size 1). The epoch
    with the lowest eval-set loss is then restored from its per-epoch
    checkpoint and evaluated on the test dataset.

    Args:
        model: aggregator module to train.
        train_dataset_embeded: training dataset.
        eval_dataset_embeded: dataset used for per-epoch evaluation/selection.
        test_dataset_embeded: dataset used for the final evaluation.

    Returns:
        Tuple ``(accumulated_loss, precision, recall)`` measured on the test
        dataset by ``eval_aggregator``.
    """
    loss_module = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=aggregator_learning_rate, eps=1e-8)

    # Prep the loaders
    train_data_embeded_loader = data.DataLoader(train_dataset_embeded, batch_size=1, drop_last=True, shuffle=True)
    eval_data_embeded_loader = data.DataLoader(eval_dataset_embeded, batch_size=1, drop_last=True, shuffle=True)

    # train_aggregator() runs aggregator_num_epochs_ epochs and steps the
    # scheduler once per batch, so the schedule must cover exactly that many
    # steps. (Sizing it with num_epochs_ drove the learning rate to zero
    # before training finished.)
    steps_per_epoch = len(train_data_embeded_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
        num_warmup_steps=int(steps_per_epoch*0.25),
        num_training_steps=steps_per_epoch*aggregator_num_epochs_)

    performance_results = train_aggregator(model, optimizer, train_data_embeded_loader, loss_module, scheduler, test_loader=eval_data_embeded_loader)

    if performance_results and save_model_in_each_epoch:
        # min() returns the earliest epoch among equal losses, matching a
        # strict "lower than" scan, without relying on a magic upper bound.
        best_result = min(performance_results, key=lambda res: res['eval_set_loss'])
        lowest_loss_epoch = best_result['epoch']
        model_path = f'{work_dir}/model_agg_{model_name}_fold_{current_fold}_epoch_{lowest_loss_epoch}.pickle'
        model.load_state_dict(torch.load(model_path))
    else:
        # No per-epoch checkpoints from this run exist; loading would either
        # fail or pick up a stale file from an earlier run.
        print('No per-epoch checkpoint available; evaluating the final weights')

    test_data_embeded_loader = data.DataLoader(test_dataset_embeded, drop_last=True, batch_size=1)
    accumulated_loss, precission, recall = eval_aggregator(model, test_data_embeded_loader, loss_module)

    return accumulated_loss, precission, recall


def evaluate_aggregators(train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded):
    """Train and test the LSTM, convolutional and mean aggregators in turn.

    Each aggregator is created fresh, passed to ``train_and_eval_aggregator``
    and released before the next one is built.

    Returns:
        Tuple ``(lstm_results, conv_results, mean_results)``.
    """
    def _run(aggregator_cls):
        # The instance only lives for the duration of this call.
        return train_and_eval_aggregator(
            aggregator_cls(), train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded)

    return tuple(_run(aggregator_cls) for aggregator_cls in (LstmAggregator, ConvAggregator, MeanAggregator))


def temp_f1_score(precission, recall):
    """Return the F1 score (harmonic mean) of precision and recall.

    Yields 0 when precision and recall sum to zero.
    """
    if precission+recall==0:
        return 0
    return 2*precission*recall/(precission+recall)


def print_summary(resutls, folds):
    """Print per-fold averages of loss, precision, recall and F1.

    Args:
        resutls: sequence of ``(accumulated_loss, precision, recall)`` entries,
            one per fold.
        folds: divisor used for every average.
    """
    def _average(values):
        return sum(values)/folds

    print('avg_accumulated_loss', _average([entry[0] for entry in resutls]))
    avg_precission = _average([entry[1] for entry in resutls])
    avg_recall = _average([entry[2] for entry in resutls])
    print('avg_precission', avg_precission)
    print('avg_recall', avg_recall)
    # F1 is computed per fold and then averaged (not derived from the averages).
    avg_f1 = _average([temp_f1_score(entry[1], entry[2]) for entry in resutls])
    print('avg_f1', avg_f1)

Run

In [None]:
def do_a_fold(i):
    """Fine-tune the transformer for fold rotation ``i`` and embed all files with it.

    The fold descriptions are rotated by ``i``: the last rotated fold is the
    test split, the one before it the eval split, and the remaining ones are
    merged into the training split. After fine-tuning, every file of every
    fold is embedded under the marker ``fold-<i>``.
    """
    global current_fold
    current_fold = i

    data_files = [load_fold_info(fold_number) for fold_number in range(folds_count)]

    with_offset = [(x+i)%folds_count for x in range(folds_count)]
    train_data = [[], []]
    for position in range(len(data_files)-2):
        fold = data_files[with_offset[position]]
        train_data[0] += fold[0]
        train_data[1] += fold[1]
    eval_data = data_files[with_offset[-2]]
    test_data = data_files[with_offset[-1]]

    train_dataset = load_data(train_data, oversampling_ratio, class_ratio, sample_limit)
    eval_dataset = load_data(eval_data, sample_limit=eval_sample_limit)

    # Every file of every fold gets embedded, positives before backgrounds.
    all_files = []
    for positives, backgrounds in data_files:
        all_files.extend(positives)
        all_files.extend(backgrounds)

    print('FOLD', i)
    for split in (train_data, eval_data, test_data):
        print(len(split[0]), len(split[1]))
    print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
    print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

    model = fine_tune(train_dataset, eval_dataset)
    embed_files_with_model(model, all_files, f'fold-{i}')

    # Release the model before the next fold starts.
    del model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
def do_fold_finetuning():
    """Run the fine-tuning and embedding stage once per fold rotation."""
    for fold_index in range(folds_count):
        do_a_fold(fold_index)


# Kick off the fine-tuning stage for all folds.
do_fold_finetuning()


In [None]:
def do_a_fold_aggregator(i):
    """Train and test all aggregators on the embeddings of fold rotation ``i``.

    Uses the same rotation as the fine-tuning stage: the last rotated fold is
    the test split, the one before it the eval split, the rest is training
    data. The embedded files are looked up under the marker ``fold-<i>``.

    Returns:
        Tuple ``(lstm_results, conv_results, mean_results)``.
    """
    data_files = [load_fold_info(fold_number) for fold_number in range(folds_count)]

    with_offset = [(x+i)%folds_count for x in range(folds_count)]
    train_data = [[], []]
    for position in range(len(data_files)-2):
        fold = data_files[with_offset[position]]
        train_data[0] += fold[0]
        train_data[1] += fold[1]
    eval_data = data_files[with_offset[-2]]
    test_data = data_files[with_offset[-1]]

    print('FOLD', 'Embedded', i)
    print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
    print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

    train_dataset, eval_dataset, test_dataset = make_commit_level_datesets(train_data, eval_data, test_data, f'fold-{i}')
    aggregator_results = evaluate_aggregators(train_dataset, eval_dataset, test_dataset)
    lstm_results, conv_results, mean_results = aggregator_results

    gc.collect()
    torch.cuda.empty_cache()
    return lstm_results, conv_results, mean_results

# Train aggregators

In [None]:
def do_folds_aggregators():
    """Run the aggregator stage for every fold rotation and collect the results.

    Returns:
        Tuple of three lists (LSTM, conv, mean), each holding one result per fold.
    """
    global current_fold
    collected = ([], [], [])
    for fold_index in range(folds_count):
        # Checkpoint file names are derived from the global fold number.
        current_fold = fold_index
        lstm_results, conv_results, mean_results = do_a_fold_aggregator(fold_index)
        for bucket, fold_result in zip(collected, (lstm_results, conv_results, mean_results)):
            bucket.append(fold_result)

    return collected


lstm_folds_results, conv_folds_results, mean_folds_results = do_folds_aggregators()

# Summary

In [None]:
# Report the per-fold averages for each aggregator type.
print('done')
for summary_title, summary_results in (
    ('LSTM', lstm_folds_results),
    ('CONV', conv_folds_results),
    ('MEAN', mean_folds_results),
):
    print(summary_title)
    print_summary(summary_results, folds_count)