Imports

In [None]:
from transformers import get_linear_schedule_with_warmup
import gc
gc.collect()
import torch.nn as nn
import torch.utils.data as data
import torch
import time
import tqdm
import pandas as pd
import numpy as np
import json
import random
import uuid
import os

In [None]:
def get_files_in_from_directory(dir, extension=None, startswith=None):
    """Recursively collect file paths below ``dir``, optionally filtered by name.

    Args:
        dir: Root directory handed to ``os.walk``.
        extension: When given, only file names ending with this suffix are kept.
        startswith: When given, only file names starting with this prefix are kept.

    Returns:
        List of paths (walk root joined with the file name) in ``os.walk`` order.
    """
    def _name_matches(name):
        # Both filters are optional; a missing filter accepts every name.
        return (extension is None or name.endswith(extension)) and \
            (startswith is None or name.startswith(startswith))

    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir)
        for name in names
        if _name_matches(name)
    ]

In [None]:
def get_repo_seminames(files):
    """Extract the set of repository "semi names" from encoding file paths.

    The file name (not the whole path) is split on ``'-'`` and the third and
    fourth segments are joined back with a dash, e.g.
    ``positive-encodings-apache-kafka-0.json`` -> ``apache-kafka``.

    Only the base name is inspected so that hyphens in directory names cannot
    shift the segment positions.

    Args:
        files: Iterable of file paths.

    Returns:
        Set of ``'<segment 2>-<segment 3>'`` strings.

    Raises:
        IndexError: If a file name has fewer than four dash-separated segments.
    """
    repos = set()
    for path in files:
        segments = os.path.basename(path).split('-')
        repos.add(f'{segments[2]}-{segments[3]}')

    return repos


def get_files_in_set(filenames, test_repos):
    """Keep the file names that contain at least one of the given repo names.

    Matching is a plain substring test (``repo in filename``); the input order
    of ``filenames`` is preserved.
    """
    return [
        name
        for name in filenames
        if any(repo in name for repo in test_repos)
    ]

In [None]:
from abc import abstractmethod
import json
import torch
import torch.utils.data as data
import random

# ['js', 'jsx', 'ts', 'tsx', ]
# File extensions (without the dot) whose records are kept by BaseRawDataset.load_files.
VALID_EXTENSIONS = set(['java'])


class BaseRawDataset(data.Dataset):
    """Two-class dataset that keeps "positive" and "background" items in separate lists.

    Indexing walks through the positive items first and the background items
    afterwards; positives are labelled ``[1, 0]`` and background items
    ``[0, 1]``. Subclasses implement ``_load_file`` to turn the records of one
    JSON file into items.
    """

    def __init__(self):
        self.positive_data = []
        self.background_data = []

    @abstractmethod
    def _load_file(self, collection, json_file):
        """Add the items extracted from ``json_file`` (a list of records) to ``collection``."""
        pass

    def crop_data(self, positive_count, background_count):
        """Randomly keep at most the given number of positive and background items."""
        self.positive_data = random.sample(self.positive_data, min(positive_count, len(self.positive_data)))
        self.background_data = random.sample(self.background_data, min(background_count, len(self.background_data)))

    def limit_data(self, limit):
        """Randomly shrink the dataset to about ``limit`` items, keeping the class proportions.

        An empty dataset is left untouched (there is no proportion to keep).
        """
        total = len(self.positive_data) + len(self.background_data)
        if total == 0:
            return

        final_positive_count = int(limit*len(self.positive_data)/total)
        self.positive_data = random.sample(self.positive_data, min(final_positive_count, len(self.positive_data)))
        self.background_data = random.sample(self.background_data, min(limit-final_positive_count, len(self.background_data)))

    def setup_ratios(self, oversampling_ratio, class_ratio, samples_limit):
        """Oversample positives, undersample background and cap the total size.

        Args:
            oversampling_ratio: How many times every positive item is repeated.
                ``-1`` derives it as ``int(#background / (class_ratio * #positive))``.
            class_ratio: Number of background items kept per (oversampled) positive item.
            samples_limit: Upper bound for the total number of items.

        When no positive items remain after oversampling (and therefore no
        background items are requested either) the dataset simply ends up
        empty instead of raising ``ZeroDivisionError``.
        """
        if oversampling_ratio == -1:
            denominator = class_ratio*len(self.positive_data)
            oversampling_ratio = int(len(self.background_data)/denominator) if denominator else 0

        positive_count = len(self.positive_data) * oversampling_ratio
        self.positive_data = self.positive_data * oversampling_ratio
        # int() keeps random.sample happy when class_ratio is fractional.
        background_count = int(class_ratio*len(self.positive_data))
        self.background_data = random.sample(self.background_data, min(background_count, len(self.background_data)))

        total = positive_count + background_count
        if total == 0:
            return

        final_positive_count = int(samples_limit*positive_count/total)
        self.positive_data = random.sample(self.positive_data, min(final_positive_count, len(self.positive_data)))
        self.background_data = random.sample(self.background_data, min(samples_limit-final_positive_count, len(self.background_data)))

    def split_data(self, fraction):
        """Shuffle both lists in place and split them into two new datasets.

        Args:
            fraction: Share of each class that goes into the first part.

        Returns:
            ``(part_a, part_b)`` as plain ``BaseRawDataset`` instances.
        """
        positive_cut_point = int(fraction*len(self.positive_data))
        background_cut_point = int(fraction*len(self.background_data))

        random.shuffle(self.positive_data)
        random.shuffle(self.background_data)
        part_a = BaseRawDataset()
        part_a.positive_data = self.positive_data[:positive_cut_point]
        part_a.background_data = self.background_data[:background_cut_point]

        part_b = BaseRawDataset()
        part_b.positive_data = self.positive_data[positive_cut_point:]
        part_b.background_data = self.background_data[background_cut_point:]

        return part_a, part_b

    def _read_json_files(self, json_files):
        """Load every JSON file, keep records with a valid extension and collect their items."""
        collected = []
        for filename in json_files:
            try:
                with open(filename, 'r') as f:
                    records = json.load(f)
                records = [x for x in records if x['file_name'].split('.')[-1] in VALID_EXTENSIONS]
                self._load_file(collected, records)
            except Exception as e:
                # Best effort: one unreadable or malformed file must not abort the whole load.
                print('Failed to load', filename)
                print(e)
        return collected

    def load_files(self, positive_json_files, background_json_files):
        """Replace the dataset content with the items read from the given JSON files.

        Files that cannot be read or parsed are reported on stdout and skipped.
        """
        positive_data_temp = self._read_json_files(positive_json_files)
        background_data_temp = self._read_json_files(background_json_files)

        self.positive_data = positive_data_temp
        self.background_data = background_data_temp

    def __len__(self):
        return len(self.positive_data) + len(self.background_data)

    def __getitem__(self, idx):
        """Return ``(item, label)``; indices below ``len(positive_data)`` address positives."""
        if idx < len(self.positive_data):
            data_point = self.positive_data[idx]
            data_label = torch.Tensor([1, 0])
        else:
            data_point = self.background_data[idx - len(self.positive_data)]
            data_label = torch.Tensor([0, 1])

        return data_point, data_label
        

In [None]:
import torch
import numpy as np


class SampleLevelRawDataset(BaseRawDataset):
    """Raw dataset whose items are individual, de-duplicated ``commit_sample`` tensors."""

    def __init__(self):
        super().__init__()

    def _load_file(self, collection, json_file):
        """Add the unique, non-empty ``commit_sample`` rows of one file to ``collection``.

        Every sample is converted to an int tensor; the samples of the file are
        stacked and duplicate rows are removed with ``torch.unique`` before the
        rows are appended one by one.
        """
        samples = []
        for record in json_file:
            if 'commit_sample' not in record:
                continue
            sample = record['commit_sample']
            if sample is None or len(sample) == 0:
                continue
            samples.append(sample)

        if not samples:
            return

        stacked = torch.stack([torch.Tensor(sample).int() for sample in samples])
        # Iterating a 2-D tensor yields its rows, so each unique row becomes one item.
        collection.extend(torch.unique(stacked, dim=0))


In [None]:
import torch


class CommitLevelRawDataset(BaseRawDataset):
    """Raw dataset whose items are lists of ``commit_sample`` tensors, one list per file."""

    def __init__(self):
        super().__init__()

    def _load_file(self, collection, json_file):
        """Append one item (a list of float tensors) built from the file's non-empty samples.

        Files without any usable ``commit_sample`` contribute nothing.
        """
        samples = []
        for record in json_file:
            if 'commit_sample' not in record:
                continue
            sample = record['commit_sample']
            if sample is None or len(sample) == 0:
                continue
            samples.append(sample)

        if samples:
            collection.append([torch.Tensor(sample) for sample in samples])

    


In [None]:
from math import floor
import torch.utils.data as data
import torch
import random

class OverSampledDataset(data.Dataset):
    """View of a raw dataset in which positives are repeated to reach a class ratio.

    The number of positive items becomes ``int(#background / ratio)``: whole
    copies of the positive list plus a random sample for the remainder. All
    background items are kept. Labels are an int tensor with ``[1, 0]`` for
    positives and ``[0, 1]`` for background items.
    """

    def __init__(self, base_dataset: BaseRawDataset, ratio):
        super().__init__()
        self.base_dataset = base_dataset
        self.ratio = ratio
        self.data = []
        self.labels = []

        self._oversample_to_ratio()

    def _oversample_to_ratio(self):
        """(Re)build ``self.data`` and ``self.labels`` from the base dataset."""
        self.data = []
        self.labels = []

        positives = self.base_dataset.positive_data
        background = self.base_dataset.background_data

        wanted_positive = int(len(background) / self.ratio)
        full_copies = floor(wanted_positive / len(positives))
        oversampled = positives * full_copies
        # Top up with a random subset so the positive count hits the target exactly.
        oversampled.extend(random.sample(positives, wanted_positive - len(oversampled)))

        label_rows = [[1, 0]] * len(oversampled) + [[0, 1]] * len(background)
        self.data = oversampled + background
        self.labels = torch.Tensor(label_rows).int()
        print('Data loaded')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [None]:
import torch.utils.data as data
import torch
import random

class UnderSampledDataset(data.Dataset):
    """View of a raw dataset in which the background class is randomly reduced.

    All positive items are kept and ``int(#positive * ratio)`` background items
    are sampled without replacement. Labels are int tensors, ``[1, 0]`` for
    positives and ``[0, 1]`` for background items.
    """

    def __init__(self, base_dataset:BaseRawDataset, ratio):
        super().__init__()
        self.data = []
        self.labels = []
        self.base_dataset = base_dataset

        self._undersample_to_ratio(ratio)

    def _undersample_to_ratio(self, ratio):
        """Fill ``self.data`` / ``self.labels``.

        Raises:
            Exception: If the base dataset holds fewer background items than requested.
        """
        positives = self.base_dataset.positive_data
        background = self.base_dataset.background_data

        wanted_background = int(len(positives) * ratio)
        if wanted_background > len(background):
            raise Exception("Cannot undersample")

        kept_background = random.sample(background, wanted_background)
        self.data = positives + kept_background

        positive_labels = [torch.Tensor([1, 0]).int() for _ in positives]
        background_labels = [torch.Tensor([0, 1]).int() for _ in kept_background]
        self.labels = positive_labels + background_labels
        print('Data loaded')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [None]:
from transformers import AutoModel
import torch.nn as nn

class BertAndLinear(nn.Module):
    """Pretrained transformer encoder followed by a 768 -> 2 linear layer.

    The linear layer is applied to the encoder's first output at sequence
    position 0 (presumably the [CLS]/<s> token embedding - confirm for the
    chosen base model). Raw scores are returned; no activation is applied.
    """

    def __init__(self, base_model):
        super().__init__()
        self.codebert = AutoModel.from_pretrained(base_model)
        self.linear1 = nn.Linear(768, 2)

    def forward(self, x):
        encoder_outputs = self.codebert(x)
        # First output, position 0 along dim 1 for every batch element.
        first_position = encoder_outputs[0][:, 0, :]
        return self.linear1(first_position)

In [None]:
from unicodedata import bidirectional
from transformers import AutoModel
import torch.nn as nn

class LstmAggregator(nn.Module):
    """Aggregates a sequence of 768-wide vectors with a 5-layer LSTM and a linear head.

    The input is treated as one sequence (batch size 1); the LSTM output of
    the last time step is projected to two raw scores.
    """

    def __init__(self):
        super().__init__()
        lstm_options = dict(
            input_size=768,
            hidden_size=512,
            num_layers=5,
            dropout=0.2,
            batch_first=False,
        )
        self.lstm = nn.LSTM(**lstm_options)
        self.linear1 = nn.Linear(512, 2)
        # Kept as an attribute although forward() does not apply it.
        self.act_fn = nn.Softmax()

    def forward(self, x):
        # Drop (up to) two singleton dimensions at index 1, then feed the
        # result as a (time, batch=1, features) sequence.
        sequence = x.squeeze(1).squeeze(1)
        steps = sequence.shape[0]
        outputs, _state = self.lstm(sequence.view(steps, 1, -1))
        return self.linear1(outputs[-1])

In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class ConvAggregator(nn.Module):
    """Aggregates a variable-length stack of 768-wide vectors with a small CNN.

    The vectors are written into a zero-initialised ``1 x 1 x 100 x 768``
    buffer (longer inputs are cut to the first 100 rows, shorter ones stay
    zero padded), run through two conv/ReLU/max-pool stages, flattened and
    projected to two raw scores.
    """

    def __init__(self):
        super().__init__()
        self.max_vector_size = 100

        self.conv1 = nn.Conv2d(1, 4, kernel_size=(2,8), stride=(1, 4))
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=(4,8), stride=(2, 4))

        self.conv2 = nn.Conv2d(4, 1, kernel_size=4, stride=2)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=4, stride=2)

        self.dropout = nn.Dropout(p=0.05)
        self.head = nn.Linear(100, 2)

    def forward(self, x):
        """Return the two scores for ``x``.

        ``x`` is a 4-D tensor whose first dimension enumerates the vectors;
        it is moved to the second-to-last position before padding.
        """
        # Allocate the padding buffer directly on the input's device. The
        # previous `.to(x[0].get_device())` passed -1 for CPU tensors, which
        # is not a valid device index.
        x_2 = torch.zeros([1, 1, self.max_vector_size, 768], device=x.device)
        xx = x.movedim(0,-2)
        _, _, h, w = xx.shape
        if h > self.max_vector_size:
            xx = xx[:,:,0:self.max_vector_size,:]
            h = self.max_vector_size
        x_2[0, 0, 0:h, 0:w] = xx

        x_3 = self.conv1(x_2)
        x_4 = self.relu1(x_3)
        x_5 = self.pool1(x_4)

        x_6 = self.conv2(x_5)
        x_7 = self.relu2(x_6)
        x_8 = self.pool2(x_7)
        x_9 = torch.flatten(x_8, start_dim=1, end_dim=3)

        x99= self.dropout(x_9)
        x_10 = self.head(x99)
        return x_10

In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

class MeanAggregator(nn.Module):
    """Scores every 768-wide vector with a linear layer and averages the scores."""

    def __init__(self):
        super().__init__()
        projection = nn.Linear(768, 2)
        # Wrapped in a Sequential so the layer is registered as `linear3.0`.
        self.linear3 = nn.Sequential(projection)

    def forward(self, x):
        per_vector_scores = self.linear3(x.squeeze(1))
        # Average over the first dimension, i.e. over the individual vectors.
        return per_vector_scores.mean(dim=0)

Config

In [None]:
# Model config
# Model identifier handed to BertAndLinear (loaded via AutoModel.from_pretrained).
base_model = 'microsoft/graphcodebert-base'
# Batch size of the fine-tuning train/eval DataLoaders.
batch_size_ = 2
# Number of fine-tuning epochs (loop bound in train_model).
num_epochs_ = 3

# NOTE(review): not read in the visible part of the notebook; presumably the
# data_fraction passed to the file loaders - confirm.
fraction_of_data = 1

# Upper bounds on the number of samples kept by load_data for the training
# and evaluation datasets.
sample_limit = 1_000_000_000
eval_sample_limit = 1_000_000_000
# Share of repositories held out as the test split by load_files.
test_percentage = 0.15
# NOTE(review): presumably the share of repositories for the evaluation split -
# confirm where it is consumed.
eval_percentage = 0.05
# Number of cross-validation folds.
folds_count=5

# AdamW learning rate used for fine-tuning.
learning_rate = 1e-6
# Forwarded to BaseRawDataset.setup_ratios: repetitions of every positive
# sample and number of background samples kept per positive sample.
oversampling_ratio = 4
class_ratio = 2


In [None]:
# Aggregator training config
aggregator_num_epochs_ = 15
# Background items kept per positive item when undersampling the commit-level training set.
aggregator_class_ratio = 2
aggregator_learning_rate = 5e-6

# Write a checkpoint after every training epoch.
save_model_in_each_epoch = True
eval_model_in_each_epoch = True

# Every run gets a fresh random identifier that is used in directory and checkpoint names.
model_guid = str(uuid.uuid4())
model_name = model_guid

# Every backslash is escaped explicitly: the former '\s' was an invalid escape
# sequence that only worked because Python passes unknown escapes through.
work_dir = f'D:\\Projects\\aaa\\src\\rq5\\binaries\\{model_name}'
# NOTE(review): this yields '...\\binaries\\data<model_name>' (no separator
# after 'data') - confirm that is intended.
results_dir = f'D:\\Projects\\aaa\\src\\rq5\\binaries\\data{model_name}'

# Data config - Set to None if you want to use cached datasets
# raw_input_path = 'D:\\Projects\\aaa\\results\\dl\\java2\\CodeParserMiner_ast'
# raw_input_path = 'D:\\Projects\\aaa\\results\\dl\\java2\\CodeParserMiner_edit'
raw_input_path = 'D:\\Projects\\aaa\\results\\dl\\java2\\AddedCodeMiner'
# raw_input_path = 'D:\\Projects\\aaa\\results\\dl\\java2\\RollingWindowMiner'
# raw_input_path = None


In [None]:
# Create the output directories. makedirs also creates missing parent
# directories (os.mkdir raised FileNotFoundError in that case) and
# exist_ok=True keeps an already existing directory untouched.
os.makedirs(work_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Use one fixed seed for Python's, NumPy's and PyTorch's random number
# generators and pick the GPU when one is available.
seed = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)
torch.manual_seed(seed)

In [None]:
def train_model(model, optimizer, data_loader, loss_module, scheduler, eval_loader = None):
    """Fine-tune ``model`` for ``num_epochs_`` epochs.

    Args:
        model: Module to train; moved to the global ``device``.
        optimizer: Optimizer stepped once per batch.
        data_loader: Yields ``(inputs, labels)`` batches.
        loss_module: Called as ``loss_module(preds, labels.float())``.
        scheduler: Learning-rate scheduler stepped once per batch.
        eval_loader: When given, ``eval_model`` is run on it after every epoch.

    A checkpoint is written after every epoch when
    ``save_model_in_each_epoch`` is set.
    """
    torch.cuda.empty_cache()
    model.to(device)

    all_samples = 0
    positive_samples = 0

    for epoch in range(num_epochs_):
        print(f'Epoch {epoch}/{num_epochs_}')
        accumulated_loss = 0
        # eval_model() switches the model to eval mode at the end of an epoch,
        # so training mode (dropout etc.) has to be re-enabled every epoch.
        model.train()

        with tqdm.tqdm(total=len(data_loader)) as pbar:
            for data_inputs, data_labels in data_loader:
                # Step 0: Diagnostics - count how often each class was seen
                positive_samples += sum(1 for x in data_labels if x[0] == 1)
                all_samples += len(data_labels)

                # Step 1: Move data to device
                data_inputs = data_inputs.to(device)
                data_labels = data_labels.to(device)

                # Step 2: Calculate model output
                preds = model(data_inputs)
                preds = preds.squeeze(dim=0)

                # Step 3: Calculate loss
                loss = loss_module(preds, data_labels.float())
                accumulated_loss += loss.item()

                ## Step 4: Perform backpropagation
                optimizer.zero_grad()
                loss.backward()

                ## Step 5: Update the parameters
                optimizer.step()
                scheduler.step()

                # Step 6: Progress bar
                pbar.update()
        print('Loss in this epoch:', accumulated_loss)

        if save_model_in_each_epoch:
            torch.save(model.state_dict(), f'{work_dir}/model_{model_name}_epoch_{epoch}.pickle')

        if eval_loader is not None:
            eval_model(model, eval_loader)


    print(f'Model saw positive samples {positive_samples} times and background samples {all_samples-positive_samples}')
    if positive_samples > 0:
        print(f'Ratio 1:{(all_samples-positive_samples)/positive_samples}')
    else:
        # Avoid a ZeroDivisionError when the loader never produced a positive sample.
        print('Ratio: no positive samples seen')


def eval_model(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print precision/recall statistics.

    A sample counts as predicted (or labelled) positive when its first score is
    greater than its second one. The model is left in eval mode.

    Returns:
        ``(precission, recall)``; each is 0 when its denominator is 0.
    """
    torch.cuda.empty_cache()
    gc.collect()
    model.eval()
    model.to(device)

    all_labels = []
    all_predictions = []
    data_size = len(data_loader)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad(), tqdm.tqdm(total=data_size) as pbar:
        for data_inputs, data_labels in data_loader:
            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=0)

            labels_in_memory = data_labels.cpu().detach().numpy()
            if len(labels_in_memory.shape) == 1:
                all_labels.append(labels_in_memory)
            else:
                all_labels.extend(labels_in_memory)

            preds_in_memory = preds.cpu().detach().numpy()
            # For a batch of one the squeeze above already removed the batch
            # dimension, so the prediction is stored as a single row.
            if labels_in_memory.shape[0] == 1:
                all_predictions.append(preds_in_memory)
            else:
                all_predictions.extend(preds_in_memory)

            pbar.update()

    predictions_arr = [1 if x[0]>x[1] else 0 for x in all_predictions] #TODO softmax
    targets_arr = [1 if x[0]>x[1] else 0 for x in all_labels]
    outcomes = list(zip(predictions_arr, targets_arr))
    TP = outcomes.count((1, 1))
    FP = outcomes.count((1, 0))
    FN = outcomes.count((0, 1))
    TN = outcomes.count((0, 0))
    P = predictions_arr.count(1)
    N = predictions_arr.count(0)

    precission = TP/(TP+FP) if (TP+FP)!=0 else 0
    recall = TP/(TP+FN) if (TP+FN)!=0 else 0
    print('Precission:',f'{TP}/{TP+FP}', precission)
    print('Recall', f'{TP}/{TP+FN}', recall)
    print(f'P:{P},', f'TP:{TP},', f'FP:{FP},', f'FN:{FN},', f'TN:{TN},', f'N:{N}')

    return precission, recall


def load_files(input_path, data_fraction=1):
    """Find the encoding files below ``input_path`` and split them by repository.

    Repositories (derived from the positive file names) are randomly assigned
    to a test split (``test_percentage`` of all repositories), an evaluation
    split (``eval_percentage``) and a training split (the rest); every file
    follows its repository.

    Args:
        input_path: Directory searched recursively for ``positive-encodings*.json``
            and ``background-encodings*.json`` files.
        data_fraction: When below 1, only this random share of the files is used.

    Returns:
        Four ``(positive_files, background_files)`` tuples: all files, train,
        eval and test.
    """
    positive_json_files = get_files_in_from_directory(input_path, extension='.json', startswith='positive-encodings')
    background_json_files = get_files_in_from_directory(input_path, extension='.json', startswith='background-encodings')

    if data_fraction < 1:
        positive_json_files = random.sample(positive_json_files, int(len(positive_json_files)*data_fraction))
        background_json_files = random.sample(background_json_files, int(len(background_json_files)*data_fraction))

    repos_set = get_repo_seminames(positive_json_files)
    repos_count = len(repos_set)

    # Sets of strings have no stable order between interpreter runs, so sort
    # before sampling to make the split reproducible for a fixed seed.
    repos_test = set(random.sample(sorted(repos_set), int(repos_count*test_percentage)))
    repos_set.difference_update(repos_test)
    # The evaluation split has its own size setting (it previously reused test_percentage).
    repos_eval = set(random.sample(sorted(repos_set), int(repos_count*eval_percentage)))
    repos_set.difference_update(repos_eval)

    positive_train = get_files_in_set(positive_json_files, repos_set)
    positive_eval = get_files_in_set(positive_json_files, repos_eval)
    positive_test = get_files_in_set(positive_json_files, repos_test)

    background_train = get_files_in_set(background_json_files, repos_set)
    background_eval = get_files_in_set(background_json_files, repos_eval)
    background_test = get_files_in_set(background_json_files, repos_test)

    return (positive_json_files, background_json_files), (positive_train, background_train), (positive_eval, background_eval), (positive_test, background_test)

def divide_chunks(array, n):
    """Yield consecutive slices of ``array`` that hold at most ``n`` items each."""
    yield from (array[start:start + n] for start in range(0, len(array), n))

def chunks(array, number_of_chunks):
    """Yield ``number_of_chunks`` interleaved slices: chunk ``i`` holds items ``i, i + k, i + 2k, ...``."""
    yield from (array[offset::number_of_chunks] for offset in range(number_of_chunks))

def load_fold_data(input_path, fold_count = 5,  data_fraction=1):
    """Find the encoding files below ``input_path`` and distribute them over folds by repository.

    Args:
        input_path: Directory searched recursively for ``positive-encodings*.json``
            and ``background-encodings*.json`` files.
        fold_count: Number of folds to build.
        data_fraction: When below 1, only this random share of the files is used.

    Returns:
        List with one ``(positive_files, background_files)`` tuple per fold.
    """
    positive_json_files = get_files_in_from_directory(input_path, extension='.json', startswith='positive-encodings')
    background_json_files = get_files_in_from_directory(input_path, extension='.json', startswith='background-encodings')

    if data_fraction < 1:
        positive_json_files = random.sample(positive_json_files, int(len(positive_json_files)*data_fraction))
        background_json_files = random.sample(background_json_files, int(len(background_json_files)*data_fraction))

    # Sort before shuffling: the iteration order of a set of strings changes
    # between interpreter runs, which would make the folds irreproducible even
    # with a fixed random seed.
    repos_list = sorted(get_repo_seminames(positive_json_files))
    random.shuffle(repos_list)

    result = []
    for fold in chunks(repos_list, fold_count):
        fold_set = set(fold)
        fold_positive = get_files_in_set(positive_json_files, fold_set)
        fold_background = get_files_in_set(background_json_files, fold_set)
        result.append((fold_positive, fold_background))

    return result
    

def load_data(input_data, oversampling_ratio=None, class_ratio=None, sample_limit=None):
    """Build a ``SampleLevelRawDataset`` from ``(positive_files, background_files)``.

    With all three optional arguments set, the dataset is re-balanced through
    ``setup_ratios``; with only ``sample_limit`` set it is merely capped through
    ``limit_data``. Any other combination leaves the loaded data untouched.
    """
    positive_files, background_files = input_data[0], input_data[1]

    dataset = SampleLevelRawDataset()
    dataset.load_files(positive_files, background_files)

    has_limit = sample_limit is not None
    both_ratios_given = oversampling_ratio is not None and class_ratio is not None
    no_ratio_given = oversampling_ratio is None and class_ratio is None

    if has_limit and both_ratios_given:
        dataset.setup_ratios(oversampling_ratio, class_ratio, sample_limit)
    elif has_limit and no_ratio_given:
        dataset.limit_data(sample_limit)

    return dataset


def embed_files(tokenizer, data_files, marker):
    """Embed every ``commit_sample`` of the given JSON files and store the results.

    Args:
        tokenizer: Callable applied to a batched int tensor. Despite its name
            it is used like an encoder model: the first element of its result
            is indexed as ``[:, 0, :]`` to obtain the embedding.
        data_files: Paths of the JSON files to process.
        marker: Prefix of the output file names.

    For every input file with at least one usable sample, a file named
    ``<marker>-embedded-<original name>`` is written to ``results_dir``; it
    holds the original metadata and the embedding as ``commit_sample``.
    """
    with tqdm.tqdm(total=len(data_files)) as pbar:
        for data_file in data_files:
            with open(data_file, 'r') as f:
                records = json.load(f)

            embeddings = []
            for data_point in records:
                if 'commit_sample' not in data_point:
                    continue
                sample = data_point['commit_sample']
                if sample is None or len(sample) == 0:
                    continue

                tensor = torch.Tensor(sample).int()
                tensor = tensor[None, :] # Extend to a batch mode
                tensor = tensor.to(device)
                # Pure inference: never build an autograd graph here, whether
                # or not the caller froze the encoder's parameters.
                with torch.no_grad():
                    result = tokenizer(tensor)
                labels_in_memory = result[0][:,0,:].cpu()
                embeddings.append({
                    'commit_id': data_point['commit_id'],
                    'file_name': data_point['file_name'],
                    'is_security_related': data_point['is_security_related'],
                    'commit_sample': labels_in_memory.tolist()
                })

            if len(embeddings) > 0:
                file_name = os.path.basename(data_file)
                new_file = os.path.join(results_dir, marker + '-embedded-' + file_name)
                with open(new_file, 'w') as f:
                    json.dump(embeddings, f)

            pbar.update()


def map_files_to_new_repo(data_files, marker):
    """Translate raw file paths into their embedded counterparts inside ``results_dir``.

    Only embedded files that actually exist on disk are returned.
    """
    candidates = (
        os.path.join(results_dir, marker + '-embedded-' + os.path.basename(path))
        for path in data_files
    )
    return [candidate for candidate in candidates if os.path.exists(candidate)]



def save_file_datasets(file_dataset, dataset_type):
    """Store a ``(positive_files, background_files)`` pair as ``<dataset_type>-files.json`` in ``results_dir``."""
    payload = {
        'positive_files': file_dataset[0],
        'background_files': file_dataset[1],
    }
    target_path = os.path.join(results_dir, f'{dataset_type}-files.json')
    with open(target_path, 'w') as f:
        json.dump(payload, f)


def load_file_dataset(dataset_type):
    """Read ``<dataset_type>-files.json`` from ``results_dir`` and return ``(positive_files, background_files)``."""
    source_path = os.path.join(results_dir, f'{dataset_type}-files.json')
    with open(source_path, 'r') as f:
        stored = json.load(f)

    return (stored['positive_files'], stored['background_files'])




In [None]:
def fine_tune(train_data, eval_data):
    """Fine-tune a ``BertAndLinear`` model on sample-level data.

    Args:
        train_data: ``(positive_files, background_files)`` used for training;
            re-balanced with the global ``oversampling_ratio`` / ``class_ratio``
            and capped at ``sample_limit``.
        eval_data: ``(positive_files, background_files)`` evaluated after every
            epoch; capped at ``eval_sample_limit``.

    Returns:
        The trained model. Its weights are also saved as
        ``model_<model_name>_final.pickle`` in ``work_dir``.
    """
    train_dataset = load_data(train_data, oversampling_ratio, class_ratio, sample_limit)
    eval_dataset = load_data(eval_data, sample_limit=eval_sample_limit)

    # Define model
    model = BertAndLinear(base_model)
    loss_module = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

    # Prep the loaders
    train_data_loader = data.DataLoader(train_dataset, batch_size=batch_size_, drop_last=True, shuffle=True)
    eval_data_loader = data.DataLoader(eval_dataset, batch_size=batch_size_, drop_last=True, shuffle=True)

    # The scheduler is stepped once per batch, so its horizon has to be counted
    # in batches (loader length), not in samples (dataset length). Warm-up
    # covers the first quarter of an epoch.
    steps_per_epoch = len(train_data_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
        num_warmup_steps=int(steps_per_epoch*0.25),
        num_training_steps=steps_per_epoch*num_epochs_)

    # Train the model
    train_model(model, optimizer, train_data_loader, loss_module, scheduler, eval_loader=eval_data_loader)
    torch.save(model.state_dict(), f'{work_dir}/model_{model_name}_final.pickle')
    return model

def embed_files_2(model, files, marker):
    """Freeze the model's ``codebert`` encoder and use it to embed ``files``.

    The encoder's parameters stop requiring gradients, it is switched to eval
    mode and moved to the global ``device`` before ``embed_files`` is called.
    """
    encoder = model.codebert
    for weight in encoder.parameters():
        weight.requires_grad = False
    encoder.eval()
    encoder.to(device)

    print('Embedding files with transformer from', marker)
    embed_files(encoder, files, marker)

def make_commit_level_datesets(train_data, eval_data, test_data, marker):
    """Build the commit-level train/eval/test datasets from embedded files.

    Each argument is a ``(positive_files, background_files)`` pair of raw file
    paths; they are mapped to the embedded files created for ``marker``. The
    training set is additionally undersampled to ``aggregator_class_ratio``.

    Returns:
        ``(train, eval, test)``: an ``UnderSampledDataset`` followed by two
        ``CommitLevelRawDataset`` instances.
    """
    def _embedded_raw_dataset(split):
        positive_files = map_files_to_new_repo(split[0], marker)
        background_files = map_files_to_new_repo(split[1], marker)
        dataset = CommitLevelRawDataset()
        dataset.load_files(positive_files, background_files)
        return dataset

    train_dataset_embeded = UnderSampledDataset(_embedded_raw_dataset(train_data), aggregator_class_ratio)
    eval_dataset_embeded = _embedded_raw_dataset(eval_data)
    test_dataset_embeded = _embedded_raw_dataset(test_data)
    return train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded

def train_aggregator(model, optimizer, data_loader, loss_module, scheduler, test_loader = None):
    """Train an aggregator model for ``aggregator_num_epochs_`` epochs.

    Args:
        model: Aggregator module; moved to the global ``device``.
        optimizer: Optimizer stepped once per batch.
        data_loader: Yields ``(inputs, labels)`` where ``inputs`` is a sequence
            of tensors that is stacked into one tensor.
        loss_module: Called as ``loss_module(preds, labels.float())``.
        scheduler: Learning-rate scheduler stepped once per batch.
        test_loader: Loader evaluated after every epoch; when ``None`` the
            per-epoch evaluation is skipped.

    Returns:
        List with one dict per evaluated epoch holding ``epoch``,
        ``eval_set_loss``, ``precission`` and ``recall``.
    """
    torch.cuda.empty_cache()
    model.train()
    model.to(device)

    all_samples = 0
    positive_samples = 0
    results = []

    for epoch in range(aggregator_num_epochs_):
        print(f'Epoch {epoch}/{aggregator_num_epochs_}')
        accumulated_loss = 0
        # eval_aggregator() switches to eval mode, so re-enable training mode every epoch.
        model.train()

        with tqdm.tqdm(total=len(data_loader)) as pbar:
            for data_inputs, data_labels in data_loader:
                # Step 0: Diagnostics - count how often each class was seen
                positive_samples += sum(1 for x in data_labels if x[0] == 1)
                all_samples += len(data_labels)

                #TODO different commit mode and sample mode
                data_inputs = torch.stack(data_inputs)

                # Step 1: Move data to device
                data_inputs = data_inputs.to(device)
                data_labels = data_labels.to(device)

                # Step 2: Calculate model output
                preds = model(data_inputs)

                #TODO different commit mode and sample mode
                # preds = preds.squeeze(dim=0)

                # Step 3: Calculate loss
                loss = loss_module(preds, data_labels.float())
                accumulated_loss += loss.item()

                ## Step 4: Perform backpropagation
                optimizer.zero_grad()
                loss.backward()

                ## Step 5: Update the parameters
                optimizer.step()
                scheduler.step()

                # Step 6: Progress bar
                pbar.update()
        print('Loss in this epoch:', accumulated_loss)

        if save_model_in_each_epoch:
            torch.save(model.state_dict(), f'{work_dir}/model_{model_name}_epoch_{epoch}.pickle')

        # The parameter defaults to None; evaluating without a loader would crash.
        if test_loader is not None:
            eval_set_loss, precission, recall = eval_aggregator(model, test_loader, loss_module)
            results.append({
                'epoch':epoch,
                'eval_set_loss':eval_set_loss,
                'precission':precission,
                'recall':recall,
            })


    print(f'Model saw positive samples {positive_samples} times and background samples {all_samples-positive_samples}')
    if positive_samples > 0:
        print(f'Ratio 1:{(all_samples-positive_samples)/positive_samples}')
    else:
        # Avoid a ZeroDivisionError when the loader never produced a positive sample.
        print('Ratio: no positive samples seen')
    return results


def eval_aggregator(model, data_loader, loss_module):
    """Evaluate an aggregator model and print loss, precision and recall.

    A label counts as positive when its first entry is greater than its second
    one; a prediction is read from row 0 of the stored model output in the
    same way. The model is left in eval mode.

    Returns:
        ``(accumulated_loss, precission, recall)``; precision and recall are 0
        when their denominator is 0.
    """
    torch.cuda.empty_cache()
    model.eval()
    model.to(device)

    all_labels = []
    all_predictions = []
    data_size = len(data_loader)
    accumulated_loss = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad(), tqdm.tqdm(total=data_size) as pbar:
        for data_inputs, data_labels in data_loader:

            #TODO different commit mode and sample mode
            data_inputs = torch.stack(data_inputs)

            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)
            preds = model(data_inputs)

            loss = loss_module(preds, data_labels.float())
            accumulated_loss += float(loss.item())

            #TODO different commit mode and sample mode
            # preds = preds.squeeze(dim=0)

            labels_in_memory = data_labels.cpu().detach().numpy()
            if len(labels_in_memory.shape) == 1:
                all_labels.append(labels_in_memory)
            else:
                all_labels.extend(labels_in_memory)

            preds_in_memory = preds.cpu().detach().numpy()
            # With a batch of one the whole 2-D prediction is stored as one entry.
            if labels_in_memory.shape[0] == 1:
                all_predictions.append(preds_in_memory)
            else:
                all_predictions.extend(preds_in_memory)

            pbar.update()

    #TODO different commit mode and sample mode
    predictions_arr = [1 if x[0,0]>x[0,1] else 0 for x in all_predictions]
    targets_arr = [1 if x[0]>x[1] else 0 for x in all_labels]
    outcomes = list(zip(predictions_arr, targets_arr))
    TP = outcomes.count((1, 1))
    FP = outcomes.count((1, 0))
    FN = outcomes.count((0, 1))
    TN = outcomes.count((0, 0))
    P = predictions_arr.count(1)
    N = predictions_arr.count(0)

    precission = TP/(TP+FP) if (TP+FP)!=0 else 0
    recall = TP/(TP+FN) if (TP+FN)!=0 else 0
    print('Loss:', accumulated_loss)
    print('Precission:',f'{TP}/{TP+FP}', precission)
    print('Recall', f'{TP}/{TP+FN}', recall)
    print(f'P:{P},', f'TP:{TP},', f'FP:{FP},', f'FN:{FN},', f'TN:{TN},', f'N:{N}')

    return accumulated_loss, precission, recall


def train_and_eval_aggregator(model, train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded):
    """Train an aggregator, restore its best epoch and score it on the test set.

    The epoch with the lowest evaluation loss is reloaded from its checkpoint
    (when per-epoch checkpoints are enabled) before the test evaluation.

    Returns:
        ``(accumulated_loss, precission, recall)`` measured on the test set.
    """
    loss_module = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=aggregator_learning_rate, eps=1e-8)

    # Prep the loaders
    train_data_embeded_loader = data.DataLoader(train_dataset_embeded, batch_size=1, drop_last=True, shuffle=True)
    eval_data_embeded_loader = data.DataLoader(eval_dataset_embeded, batch_size=1, drop_last=True, shuffle=True)

    # The schedule has to span the aggregator's own number of epochs. Using the
    # fine-tuning epoch count made the learning rate hit zero long before
    # training finished.
    steps_per_epoch = len(train_data_embeded_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
        num_warmup_steps=int(steps_per_epoch*0.25),
        num_training_steps=steps_per_epoch*aggregator_num_epochs_)

    performance_results = train_aggregator(model, optimizer, train_data_embeded_loader, loss_module, scheduler, test_loader=eval_data_embeded_loader)

    lowest_loss = float('inf')
    lowest_loss_epoch = 0
    for res in performance_results:
        if lowest_loss > res['eval_set_loss']:
            lowest_loss = res['eval_set_loss']
            lowest_loss_epoch = res['epoch']

    # Checkpoints only exist when per-epoch saving is enabled and at least one
    # epoch ran; otherwise the model is evaluated as it is after training.
    if save_model_in_each_epoch and performance_results:
        model_path = f'{work_dir}/model_{model_name}_epoch_{lowest_loss_epoch}.pickle'
        model.load_state_dict(torch.load(model_path))
    test_data_embeded_loader = data.DataLoader(test_dataset_embeded, drop_last=True, batch_size=1)
    accumulated_loss, precission, recall = eval_aggregator(model, test_data_embeded_loader, loss_module)

    return accumulated_loss, precission, recall


def evaluate_aggregators(train_dataset_embeded, eval_dataset_embeded, test_dataset_embeded):
    """Train and score the LSTM, convolutional and mean aggregators in turn.

    Each aggregator is built fresh, passed through
    ``train_and_eval_aggregator`` with the same three datasets, and dropped
    before the next one is created.

    Returns:
        Tuple ``(lstm_results, conv_results, mean_results)`` holding the
        result of ``train_and_eval_aggregator`` for each aggregator.
    """
    outcomes = []
    for aggregator_cls in (LstmAggregator, ConvAggregator, MeanAggregator):
        aggregator = aggregator_cls()
        outcomes.append(
            train_and_eval_aggregator(
                aggregator,
                train_dataset_embeded,
                eval_dataset_embeded,
                test_dataset_embeded,
            )
        )
        # Drop the reference before the next aggregator is instantiated.
        del aggregator

    return tuple(outcomes)

def print_summary(resutls, folds):
    """Print loss, precision, recall and F1 averaged over the folds.

    Args:
        resutls: Sequence of per-fold ``(accumulated_loss, precission, recall)``
            triples.
        folds: Number of folds the sums are divided by.

    The F1 score is reported as 0.0 when precision and recall are both zero,
    instead of failing with a division by zero.
    """
    print('accumulated_loss', sum([x[0] for x in resutls])/folds)
    precission = sum([x[1] for x in resutls])/folds
    recall = sum([x[2] for x in resutls])/folds
    print('precission', precission)
    print('recall', recall)

    # Precision and recall can both be 0 (e.g. no true positives in any
    # fold), which would make the harmonic mean undefined.
    denominator = precission + recall
    f1 = 2*precission*recall/denominator if denominator != 0 else 0.0
    print('f1', f1)

Run

In [None]:
def do_a_fold(i):
    """Run one cross-validation fold and return the aggregator results.

    The saved fold datasets are reloaded and rotated by ``i``: the last two
    folds of the rotation become the evaluation and test sets, the rest are
    merged into the training set. A model is fine-tuned, every file is
    embedded with it, and the three aggregators are then trained and scored
    on the resulting commit-level datasets.

    Args:
        i: Index of the fold, used as the rotation offset and in the
            embedding tag ``epoch{i}``.

    Returns:
        Tuple ``(lstm_results, conv_results, mean_results)`` from
        ``evaluate_aggregators``.
    """
    def _report_cuda_memory():
        # Byte counts for device 0, scaled down to gigabytes.
        for template, probe in (
            ("torch.cuda.memory_allocated: %fGB", torch.cuda.memory_allocated),
            ("torch.cuda.memory_reserved: %fGB", torch.cuda.memory_reserved),
            ("torch.cuda.max_memory_reserved: %fGB", torch.cuda.max_memory_reserved),
        ):
            print(template % (probe(0)/1024/1024/1024))

    fold_sets = [load_file_dataset(f'fold-{n}') for n in range(folds_count)]

    # Both halves of every fold are embedded, regardless of its role.
    every_file = []
    for fold_set in fold_sets:
        every_file.extend(fold_set[0])
        every_file.extend(fold_set[1])

    rotation = [(n + i) % folds_count for n in range(folds_count)]

    # All but the last two rotated folds form the training data.
    train_first_half = []
    train_second_half = []
    for fold_index in rotation[:-2]:
        train_first_half.extend(fold_sets[fold_index][0])
        train_second_half.extend(fold_sets[fold_index][1])

    train_data = [train_first_half, train_second_half]
    eval_data = fold_sets[rotation[-2]]
    test_data = fold_sets[rotation[-1]]

    print('FOLD', i)
    for split in (train_data, eval_data, test_data):
        print(len(split[0]), len(split[1]))
    _report_cuda_memory()

    fold_tag = f'epoch{i}'
    model = fine_tune(train_data, eval_data)
    embed_files_2(model, every_file, fold_tag)

    # Release the fine-tuned model before the aggregators are trained.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print('FOLD', 'Embedded', i)
    _report_cuda_memory()

    train_dataset, eval_dataset, test_dataset = make_commit_level_datesets(
        train_data, eval_data, test_data, fold_tag)

    fold_results = evaluate_aggregators(train_dataset, eval_dataset, test_dataset)
    lstm_results, conv_results, mean_results = fold_results
    gc.collect()
    torch.cuda.empty_cache()
    return lstm_results, conv_results, mean_results

In [None]:
def do_stuff():
    """Run the whole cross-validation experiment and print per-model averages.

    The fold data is loaded with ``load_fold_data``, each fold is saved under
    the name ``fold-{i}``, and ``do_a_fold`` is run for every fold. The
    collected results are summarised with ``print_summary``.

    Returns:
        Tuple ``(lstm_folds_results, conv_folds_results, mean_folds_results)``,
        each a list with one entry per fold.
    """
    data_files = load_fold_data(raw_input_path, fold_count=folds_count, data_fraction=fraction_of_data)

    # Save each fold so do_a_fold can load it back by name.
    for fold_index in range(folds_count):
        save_file_datasets(data_files[fold_index], f'fold-{fold_index}')

    collected = {'LSTM': [], 'CONV': [], 'MEAN': []}
    for fold_index in range(folds_count):
        lstm_results, conv_results, mean_results = do_a_fold(fold_index)
        collected['LSTM'].append(lstm_results)
        collected['CONV'].append(conv_results)
        collected['MEAN'].append(mean_results)

    for label, per_fold in collected.items():
        print(label)
        print_summary(per_fold, folds_count)

    return collected['LSTM'], collected['CONV'], collected['MEAN']


# Execute the full cross-validation run; the three per-model result lists are
# bound at module level so the summary cell below can print them again.
lstm_folds_results, conv_folds_results, mean_folds_results = do_stuff()


Summary

In [None]:
# Print the fold-averaged metrics of each aggregator again, using the result
# lists returned by do_stuff().
print('done')
print('LSTM')
print_summary(lstm_folds_results, folds_count)
print('CONV')
print_summary(conv_folds_results, folds_count)
print('MEAN')
print_summary(mean_folds_results, folds_count)