1. early-stop --> epoch
2. hyper-prarm with list
- datasets * 8
- model * 3
- loss * 3
- use_ast
  
3. other hyper-prarm
- lr = 3e-5
- bsz = 8 (24g4090 + 2Bert)

4. TBC
- customized loss
- ast ckpt

In [1]:
import pandas as pd
from functools import partial

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_scheduler
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from tqdm import trange, tqdm

from model import *

In [2]:
# dataset utils
def get_map(labels):
    l_set = set()
    for label in labels:
        l_set.update(label)
    ids2token = list(l_set)
    token2ids = {ids2token[i] : i for i in range(len(ids2token))}
    return ids2token, token2ids

def onehot(labels, token2ids):
    vec = [0 for i in token2ids]
    for label in labels.split('| '):
        vec[token2ids[label]] = 1
    return vec

def lab(labels, token2ids):
    return [token2ids[label] for label in labels.split('| ')]

def label_vectorize(data):
    data = data.rename(columns={'Title_Description' : 'Context', 'AST' : 'AST', 'FixedByID' : 'Dev', 'Name' : 'Btype'})
    data = data[['Context', 'AST', 'Dev', 'Btype']]
    # avoid NaN in dataset
    data['Context'].fillna('[UNK]', inplace=True)
    data['AST'].fillna('[UNK]', inplace=True)
    data['Dev'].fillna('unknown', inplace=True)
    data['Btype'].fillna('unknown', inplace=True)
    
    D_labels = [label.split('| ') for label in data['Dev']]
    _D_ids2token, D_token2ids = get_map(D_labels)
    data['Dev_l'] = data['Dev'].map(partial(lab, token2ids = D_token2ids))
    data['Dev_vec'] = data['Dev'].map(partial(onehot, token2ids = D_token2ids))
    
    B_labels = [label.split('| ') for label in data['Btype']]
    _B_ids2token, B_token2ids = get_map(B_labels)
    data['Btype_l'] = data['Btype'].map(partial(lab, token2ids = B_token2ids))
    data['Btype_vec'] = data['Btype'].map(partial(onehot, token2ids = B_token2ids))
    
    return data, _D_ids2token, _B_ids2token

def tokenize_function(_tokenizer, example, max_seq_len = 512):
    example = example if type(example) == str else _tokenizer.unk_token
    return _tokenizer(example, padding='max_length',
                                truncation=True, max_length=max_seq_len, return_tensors="pt")

def tensor_func(example):
    return torch.tensor(example)

class TextCodeDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data
        
    def __getitem__(self, item):
        return (self.data['x_C'][item], self.data['x_A'][item]), self.data['y'][item]
    
    def __len__(self):
        return len(self.data)

In [3]:
# loss & metrics
class CustomizedBCELoss(nn.Module):
    """
    a flexible version of BCE,
    which enable the loss to focus more on the performance of positive samples' prediction
    """

    def __init__(self, weight_pos=0.8, weight_neg=0.2, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weight_pos = weight_pos
        self.weight_neg = weight_neg

    def forward(self, x, y):
        x = nn.Sigmoid()(x)
        loss_pos = y * torch.log(x)
        loss_neg = (1 - y) * torch.log(1 - x)
        # loss = 0.8*loss_pos + 0.2*loss_neg
        loss = self.weight_pos * loss_pos + self.weight_neg * loss_neg
        return -torch.sum(loss)

class AsymmetricLossOptimized(nn.Module):
    """
    AsymmetricLoss from https://github.com/Alibaba-MIIL/ASL/blob/main/src/loss_functions/losses.py

    Notice - optimized version, minimizes memory allocation and gpu uploading,
    favors inplace operations
    """

    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False):
        super(AsymmetricLossOptimized, self).__init__()

        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

        # prevent memory allocation and gpu uploading every iteration, and encourages inplace operations
        self.targets = self.anti_targets = self.xs_pos = self.xs_neg = self.asymmetric_w = self.loss = None

    def forward(self, x, y):
        """
        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """

        self.targets = y
        self.anti_targets = 1 - y

        # Calculating Probabilities
        self.xs_pos = torch.sigmoid(x)
        self.xs_neg = 1.0 - self.xs_pos

        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            self.xs_neg.add_(self.clip).clamp_(max=1)

        # Basic CE calculation
        self.loss = self.targets * torch.log(self.xs_pos.clamp(min=self.eps))
        self.loss.add_(self.anti_targets * torch.log(self.xs_neg.clamp(min=self.eps)))

        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            self.xs_pos = self.xs_pos * self.targets
            self.xs_neg = self.xs_neg * self.anti_targets
            self.asymmetric_w = torch.pow(1 - self.xs_pos - self.xs_neg,
                                          self.gamma_pos * self.targets + self.gamma_neg * self.anti_targets)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            self.loss *= self.asymmetric_w

        return -self.loss.sum()

def metrics(y: torch.Tensor, pred: torch.Tensor, split_pos: list, threshold: float = 0.5, from_logits=True):
    if from_logits:
        pred = nn.Sigmoid()(pred)
    pred = torch.where(pred > threshold, 1, 0)

    y_d, y_b = torch.split(y, split_pos, dim=1)
    pred_d, pred_b = torch.split(pred, split_pos, dim=1)

    TPd, TPb = torch.sum(y_d * pred_d, dim=1), torch.sum(y_b * pred_b, dim=1)
    TNd, TNb = torch.sum((1 - y_d) * (1 - pred_d), dim=1), torch.sum((1 - y_b) * (1 - pred_b), dim=1)
    FPd, FPb = torch.sum((1 - y_d) * pred_d, dim=1), torch.sum((1 - y_b) * pred_b, dim=1)
    FNd, FNb = torch.sum(y_d * (1 - pred_d), dim=1), torch.sum(y_b * (1 - pred_b), dim=1)

    acc = torch.mean((TPd + TNd) / (TPd + TNd + FPd + FNd + 1e-6)).item(), torch.mean(
        (TPb + TNb) / (TPb + TNb + FPb + FNb + 1e-6)).item()
    recall = torch.mean(TPd / (TPd + FNd + 1e-6)).item(), torch.mean(TPb / (TPb + FNb + 1e-6)).item()
    precision = torch.mean(TPd / (TPd + FPd + 1e-6)).item(), torch.mean(TPb / (TPb + FPb + 1e-6)).item()
    F1 = 2 * recall[0] * precision[0] / (recall[0] + precision[0] + 1e-6), 2 * recall[1] * precision[1] / (
            recall[1] + precision[1] + 1e-6)

    return {
        'acc': acc,
        'precision': precision,
        'recall': recall,
        'F1': F1
    }

In [4]:
def train_imm(_path, _logname, _loss_fn, _use_ast = True, _is_textcnn = False, _num_epochs = 20, _bsz = 8,
              _lr = 3e-5, _ckpt = 'bert-base-uncased', device = 'cuda' if torch.cuda.is_available() else 'cpu'):
    logname = '../res_log/' + _logname + '.txt'
    logstr = _logname + '\n' + '-'*60 + '\n'
    
    # dataset label vectorize
    dataset = pd.read_csv(_path)
    logstr += 'dataset shape:{}\n'.format(dataset.shape)
    print('dataset shape:{}'.format(dataset.shape))
    dataset, D_ids2token, B_ids2token = label_vectorize(dataset)
    n_classes = [len(D_ids2token), len(B_ids2token)]
    logstr += 'n_classes:{}\n'.format(n_classes) + '-'*60 + '\n'
    print('n_classes: ', n_classes)

    check_point = _ckpt
    tokenizer = AutoTokenizer.from_pretrained(check_point)
    # datset tensorize
    dataset['x_C'] = dataset['Context'].map(partial(tokenize_function, tokenizer))
    dataset['x_A'] = dataset['AST'].map(partial(tokenize_function, tokenizer))
    dataset['y'] = dataset['Dev_vec'] + dataset['Btype_vec']
    dataset['y'] = dataset['y'].map(tensor_func)

    # split datset
    t_dataset = dataset[:int(0.8*len(dataset))].reset_index(drop=True)
    train_dataset = t_dataset.sample(frac=0.8,random_state=0,axis=0).reset_index(drop=True)
    val_dataset = t_dataset[~t_dataset.index.isin(train_dataset.index)].reset_index(drop=True)
    test_dataset = dataset[int(0.8*len(dataset)):].reset_index(drop=True)

    # wrap dataset & dataloader
    train_dataset = TextCodeDataset(train_dataset)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=_bsz, drop_last=True)
    val_dataset = TextCodeDataset(val_dataset)
    val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=_bsz, drop_last=True)
    test_dataset = TextCodeDataset(test_dataset)
    test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=_bsz, drop_last=True)

    # load model
    if _is_textcnn:
        model = MetaModel(n_classes = n_classes, use_AST=_use_ast)
    else:
        # TODO: seperate ckpt
        model = PretrainModel(text_ckpt=_ckpt, code_ckpt=_ckpt, n_classes=n_classes, use_AST=_use_ast)
    model = model.to(device)

    # loss
    loss_fn = _loss_fn.to(device)

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=_lr)

    # lr_scheduler
    num_epochs = _num_epochs
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # train process
    val_loss_min, val_down = 100000.0, [1, 1, 1, 1, 1]
    for epoch in trange(num_epochs):
        # train
        model.train()
        train_loss = 0.0
        for x, y in train_dataloader:
            x_C = {k: v.to(device) for k, v in x[0].items()}
            x_A = {k: v.to(device) for k, v in x[1].items()}
            y = y.to(device)
            
            outputs = model(x_C, x_A)

            loss = loss_fn(outputs, y.float())
            train_loss += loss.item()/len(train_dataloader)
        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
        logstr += '{}th epoch\n train_loss: {}\n'.format(epoch, train_loss)
        # print('{}th epoch\n train_loss: {}\n'.format(epoch, train_loss))
    
        # val
        model.eval()
        val_loss, val_acc, val_f1 = 0.0, [0.0, 0.0], [0.0, 0.0]
        for x, y in val_dataloader:
            x_C = {k: v.to(device) for k, v in x[0].items()}
            x_A = {k: v.to(device) for k, v in x[1].items()}
            y = y.to(device)

            outputs = model(x_C, x_A)
        
            loss = loss_fn(outputs, y.float())
            val_loss += loss.item()/len(val_dataloader)
            metric = metrics(y, outputs, split_pos = n_classes)
            val_acc[0] += metric['acc'][0]/len(val_dataloader)
            val_acc[1] += metric['acc'][1]/len(val_dataloader)
            val_f1[0] += metric['F1'][0]/len(val_dataloader)
            val_f1[1] += metric['F1'][1]/len(val_dataloader)
        logstr += '{}th epoch\n val_loss: {}\n val_acc:{}\n val_f1: {}\n'.format(epoch, val_loss, val_acc, val_f1)
        # print('{}th epoch\n val_loss: {}\n val_acc:{}\n val_f1: {}'.format(epoch, val_loss, val_acc, val_f1))

        val_down.append(1 if val_loss_min - val_loss > 1e-10 else 0)
        val_loss_min = min(val_loss_min, val_loss)
        if val_down[-1] + val_down[-2] + val_down[-3] + val_down[-4] + val_down[-5] == 0:
            break

    # test
    model.eval()
    test_loss, test_acc, test_f1 = 0.0, [0.0, 0.0], [0.0, 0.0]
    for x, y in tqdm(test_dataloader):
        x_C = {k: v.to(device) for k, v in x[0].items()}
        x_A = {k: v.to(device) for k, v in x[1].items()}
        y = y.to(device)
            
        outputs = model(x_C, x_A)
                
        loss = loss_fn(outputs, y.float())
        test_loss += loss.item()/len(test_dataloader)
        metric = metrics(y, outputs, split_pos = n_classes)
        test_acc[0] += metric['acc'][0]/len(test_dataloader)
        test_acc[1] += metric['acc'][1]/len(test_dataloader)
        test_f1[0] += metric['F1'][0]/len(test_dataloader)
        test_f1[1] += metric['F1'][1]/len(test_dataloader)
    logstr += '-' * 60 + '\ntest_loss: {}\n test_acc:{}\n test_f1: {}'.format(test_loss, test_acc, test_f1)
    print('test_loss: {}\n test_acc:{}\n test_f1: {}'.format(test_loss, test_acc, test_f1))

    with open(logname, 'w') as f:
        f.write(logstr)

In [5]:
# 8 datasets
# path, name, dataset size
pathlist = [
    ('../Data/aspnet/aspnet_1.csv', 'aspnet', 7151),
    ('../Data/efcore/efcore_1.csv', 'efcore', ),
    ('../Data/elasticSearch/elasticSearch_1.csv', 'elasticSearch'),
    ('../Data/mixedRealityToolUnity/mixedRealityToolUnity_1.csv', 'mixedRealityToolUnity'),
    ('../Data/monoGame/monoGame_1.csv', 'monoGame'),
    ('../Data/powershell/powerShell_1.csv', 'powerShell'),
    ('../Data/realmJava/realmJava_1.csv', 'realmJava'),
    ('../Data/roslyn/roslyn_1.csv', 'roslyn'),
]
losslist = [
    (nn.BCEWithLogitsLoss(), 'BCE'),
    (CustomizedBCELoss(), 'CBCE'),
    (AsymmetricLossOptimized(), 'ASL'),
]

ckptlist = [
    ('bert-base-uncased', 'Multi-triage'),  # just for tokenize
    ('bert-base-uncased', ' Bert'),
    ('roberta-base', 'Robert'),
    ('albert-base-v2', 'albert'),
]

astlist = [
    'no_AST',
    # 'use_AST',
]

In [None]:
for path in pathlist:
    for ckpt in ckptlist:
        for loss in losslist:
            for ast in astlist:
                is_t, u_ast = (ckpt[1] == 'Multi-triage'), (ast == 'use_AST') 
                if loss[1] == 'CBCE':
                    loss[0] = CustomizedBCELoss(p_N=path[2]/2)

                logname = ' '.join([path[1], ckpt[1], loss[1], ast])
                print('-'*100, logname, '-'*100, sep='\n')
            
                train_imm(_path = path[0], _logname = '../res_log/' + logname + '.txt', 
                      _loss_fn = loss[0], _use_ast = u_ast, _is_textcnn = is_t, _ckpt = ckpt[0])

----------------------------------------------------------------------------------------------------
aspnet Multi-triage BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


  return F.conv1d(input, weight, bias, self.stride,
100%|██████████| 20/20 [00:34<00:00,  1.72s/it]
100%|██████████| 178/178 [00:00<00:00, 548.09it/s]


test_loss: 0.08315895236191459
 test_acc:[0.9914924565995686, 0.9841036927164257]
 test_f1: [0.7464875521381351, 0.0029260144551772676]
----------------------------------------------------------------------------------------------------
aspnet Multi-triage ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


100%|██████████| 20/20 [00:40<00:00,  2.04s/it]
100%|██████████| 178/178 [00:00<00:00, 491.38it/s]


test_loss: 14.192807698517706
 test_acc:[0.9911816257439314, 0.9730167064104195]
 test_f1: [0.7443590021458304, 0.3252574691984619]
----------------------------------------------------------------------------------------------------
aspnet  Bert BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [21:27<00:00, 64.39s/it]
100%|██████████| 178/178 [00:06<00:00, 28.71it/s]


test_loss: 0.04116710515044041
 test_acc:[0.9921141156319828, 0.9872106035773665]
 test_f1: [0.6769651465298621, 0.4813009334402033]
----------------------------------------------------------------------------------------------------
aspnet  Bert ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [21:37<00:00, 64.90s/it]
100%|██████████| 178/178 [00:06<00:00, 28.63it/s]


test_loss: 17.13844633906075
 test_acc:[0.9901225024394777, 0.984295212150958]
 test_f1: [0.7419702737434072, 0.5681322851241607]
----------------------------------------------------------------------------------------------------
aspnet Robert BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [21:44<00:00, 65.21s/it]
100%|██████████| 178/178 [00:06<00:00, 29.09it/s]


test_loss: 0.048497773204626664
 test_acc:[0.9919644580798205, 0.9841462541162281]
 test_f1: [0.690307830434486, 0.0]
----------------------------------------------------------------------------------------------------
aspnet Robert ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [21:54<00:00, 65.71s/it]
100%|██████████| 178/178 [00:06<00:00, 29.19it/s]


test_loss: 16.09969470742044
 test_acc:[0.9891324455148723, 0.9836461677979879]
 test_f1: [0.7334819687771934, 0.5725674298373816]
----------------------------------------------------------------------------------------------------
aspnet albert BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [23:25<00:00, 70.28s/it]
100%|██████████| 178/178 [00:07<00:00, 23.26it/s]


test_loss: 0.0445819696714955
 test_acc:[0.991998996627465, 0.9860880729857446]
 test_f1: [0.6804763791363463, 0.33973016652897636]
----------------------------------------------------------------------------------------------------
aspnet albert ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(7151, 7)
n_classes:  [61, 132]


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [23:34<00:00, 70.74s/it]
100%|██████████| 178/178 [00:07<00:00, 23.13it/s]


test_loss: 14.162022064241139
 test_acc:[0.9897886417555016, 0.981911832027221]
 test_f1: [0.7233821751890152, 0.5343408129934141]
----------------------------------------------------------------------------------------------------
efcore Multi-triage BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(6612, 7)
n_classes:  [25, 58]


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]
100%|██████████| 165/165 [00:00<00:00, 545.85it/s]


test_loss: 0.11704160900730072
 test_acc:[0.9703938487804292, 0.9714080344546926]
 test_f1: [0.6113625533222877, 0.0708456471918589]
----------------------------------------------------------------------------------------------------
efcore Multi-triage ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(6612, 7)
n_classes:  [25, 58]


100%|██████████| 20/20 [00:37<00:00,  1.88s/it]
100%|██████████| 165/165 [00:00<00:00, 485.92it/s]


test_loss: 10.80376569863522
 test_acc:[0.9619090091098434, 0.9374608177127265]
 test_f1: [0.6181393157525095, 0.3284657550165212]
----------------------------------------------------------------------------------------------------
efcore  Bert BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(6612, 7)
n_classes:  [25, 58]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [19:46<00:00, 59.33s/it]
100%|██████████| 165/165 [00:05<00:00, 28.75it/s]


test_loss: 0.091461328778303
 test_acc:[0.9715756654739375, 0.9743991631450083]
 test_f1: [0.555063576446371, 0.3636511525008343]
----------------------------------------------------------------------------------------------------
efcore  Bert ASL no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(6612, 7)
n_classes:  [25, 58]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 80%|████████  | 16/20 [16:56<04:14, 63.55s/it]
100%|██████████| 165/165 [00:05<00:00, 28.67it/s]


test_loss: 18.672516912402536
 test_acc:[0.9661514285838961, 0.9654127500273972]
 test_f1: [0.6344624430134864, 0.4512340457911784]
----------------------------------------------------------------------------------------------------
efcore Robert BCE no_AST
----------------------------------------------------------------------------------------------------
dataset shape:(6612, 7)
n_classes:  [25, 58]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 4/20 [03:59<16:00, 60.00s/it]