# DeepLineDP Model for python code

## Imports

In [50]:
# Import required libraries
import os, re, time

import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, roc_auc_score, matthews_corrcoef


import more_itertools

import pyarrow.parquet as pq

from gensim.models import Word2Vec

from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence


from sklearn.utils import compute_class_weight

## Loading Data

In [51]:
path_to_line_random = '.'

train_df = pd.read_parquet(f'{path_to_line_random}/train.parquet.gzip')
train_df = train_df.reset_index(drop=True)

test_df = pd.read_parquet(f'{path_to_line_random}/test.parquet.gzip')
test_df = test_df.reset_index(drop=True)

train_df['target'] = train_df['lines'].apply(lambda line : 0 if len(line) == 0 else 1)
test_df['target'] = test_df['lines'].apply(lambda line : 0 if len(line) == 0 else 1)

train_df.head()

KeyboardInterrupt: 

In [None]:
train_df_1 = train_df[train_df['target'] == 1].sample(75, random_state=42)
train_df_0 = train_df[train_df['target'] == 0].sample(75, random_state=42)

# Combine the DataFrames
train_df = pd.concat([train_df_1, train_df_0], ignore_index=True)

In [None]:
# data_root_dir = '../datasets/original/'
# save_dir = "../datasets/preprocessed_data/"

char_to_remove = ['+','-','*','/','=','++','--','\\','<str>','<char>','|','&','!']

# if not os.path.exists(save_dir):
#     os.makedirs(save_dir)

# file_lvl_dir = data_root_dir+'File-level/'
# line_lvl_dir = data_root_dir+'Line-level/'


def is_comment_line(code_line, comments_list):
    '''
        input
            code_line (string): source code in a line
            comments_list (list): a list that contains every comments
        output
            boolean value
    '''

    code_line = code_line.strip()

    if len(code_line) == 0:
        return False
    elif code_line.startswith('#'):
        return True
    elif code_line in comments_list:
        return True

    return False

def is_empty_line(code_line):
    '''
        input
            code_line (string)
        output
            boolean value
    '''

    if len(code_line.strip()) == 0:
        return True

    return False

def preprocess_code_line(code_line):
    '''
        input
            code_line (string)
    '''

    code_line = re.sub("\'\'", "\'", code_line)
    code_line = re.sub("\".*?\"", "<str>", code_line)
    code_line = re.sub("\'.*?\'", "<char>", code_line)
    code_line = re.sub('\b\d+\b','',code_line)
    code_line = re.sub("\\[.*?\\]", '', code_line)
    code_line = re.sub("[\\.|,|:|;|{|}|(|)]", ' ', code_line)

    for char in char_to_remove:
        code_line = code_line.replace(char,' ')

    code_line = code_line.strip()

    return code_line

def preprocess_code(code_str):
    '''
        input
            code_str (multi line str)
    '''
    if(code_str is None):
        return ''
    code_str = code_str.decode("latin-1")
    code_lines = code_str.splitlines()

    preprocess_code_lines = []
    is_comments = []
    is_blank_line = []

    # multi-line comments
    comments = re.findall(r'("""(.*?)""")|(\'\'\'(.*?)\'\'\')', code_str, re.DOTALL)
    comments_temp = []
    for tup in comments:
        temp = ''
        for s in tup:
            temp += s
        comments_temp.append(temp)
    comments_str = '\n'.join(comments_temp)
    comments_list = comments_str.split('\n')

    for l in code_lines:
        l = l.strip()
        is_comment = is_comment_line(l,comments_list)
        is_comments.append(is_comment)

        if not is_comment:
            l = preprocess_code_line(l)

        preprocess_code_lines.append(l)

    return ' \n '.join(preprocess_code_lines)

  code_line = re.sub('\b\d+\b','',code_line)


In [None]:
train_df['content'] = train_df['content'].apply(preprocess_code)
train_df.to_csv('./preprocessed_train_df.csv')

In [None]:
train_df.head()

Unnamed: 0,datetime,commit,repo,filepath,content,methods,lines,target
0,2020-02-23 23:09:24-01:00,a85808e3257c8b5ae906ecc7e816ffea19ce52b2,core,homeassistant\components\derivative\sensor.py,\n from decimal import Decimal DecimalExcept...,[],"[14, 15, 16, 17, 49, 50, 51, 52, 53, 54, 67]",1
1,2022-03-02 02:03:19-01:00,42e2b801fe4cdb9e6fcff3c53b7d732bde59282b,airflow,airflow\www\views.py,# \n # Licensed to the Apache Software Foundat...,"[_mark_task_instance_state, confirm]","[2224, 2286, 3599]",1
2,2021-03-31 20:48:32-01:00,6cf57da89f199c5749974e141b4dba536bd57ee3,core,homeassistant\components\blink\__init__.py,\n import asyncio \n from copy import deepcop...,[_reauth_flow_wrapper],"[19, 53]",1
3,2019-12-07 02:48:45-01:00,f7e1040236e088f4a0b5c725461cdf0eed80b068,lightning,pytorch_lightning\trainer\distrib_parts.py,""""""" \n Lightning makes multi-gpu training and ...",[parse_gpu_ids],"[167, 168, 169, 170, 171, 172, 173, 174, 175, ...",1
4,2019-02-07 10:51:16-01:00,c1e51ef486cec17b69727b47452a34a7796d5677,ansible,lib\ansible\modules\source_control\github_webh...,#!/usr/bin/python \n # \n # Copyright: (c) 201...,[main],"[89, 94, 97, 131, 132]",1


## Token Embedding Layer

In [None]:
max_seq_len = 50
def prepare_code2d(code_list):
    '''
        input
            code_list (list): list that contains code each line (in str format)
        output
            code2d (nested list): a list that contains list of tokens with padding by '<pad>'
    '''
    # content to list(content)
    code_list = str(code_list)
    code_list = code_list.splitlines()
    code2d = []

    for c in code_list:
        c = re.sub('\\s+',' ',c)

        c = c.lower()

        token_list = c.strip().split()
        total_tokens = len(token_list)

        token_list = token_list[:max_seq_len]

        if total_tokens < max_seq_len:
            token_list = token_list + ['<pad>']*(max_seq_len-total_tokens)

        code2d.append(token_list)

    return code2d

In [None]:
train_df['content'] = train_df['content'].apply(prepare_code2d)
train_df.to_csv('./token2d_train_df.csv')

In [None]:
test_df = test_df.head(100)
test_df['content'] = test_df['content'].apply(preprocess_code)
test_df['content'] = test_df['content'].apply(prepare_code2d)

In [None]:
def get_code3d_and_label(df):
    '''
        input
            df (DataFrame): a dataframe from get_df()
        output
            code3d (nested list): a list of code2d from prepare_code2d()
            all_file_label (list): a list of file-level label
    '''
    code_3d = prepare_code2d(train_df['content'].to_numpy())
    all_file_label = train_df['target'].to_numpy().tolist()
    return code_3d, all_file_label

In [None]:
def train_word2vec_model(embedding_dim = 50):

    w2v_path = './word2vec'

    save_path = w2v_path+'/'+'w2v'+str(embedding_dim)+'dim.bin'

    if os.path.exists(save_path):
        print('word2vec model at {} is already exists'.format(save_path))
        return

    if not os.path.exists(w2v_path):
        os.makedirs(w2v_path)

    # train_df = pd.read_csv('./token2d_train_df.csv') #uncomment to load saved file

    train_code_3d, _ = get_code3d_and_label(train_df)

    all_texts = list(more_itertools.collapse(train_code_3d[:],levels=2))

    word2vec = Word2Vec(all_texts,vector_size=embedding_dim, min_count=1,sorted_vocab=1)

    word2vec.save(save_path)
    print('save word2vec model at path {} done'.format(save_path))
    return word2vec

In [None]:
word2vec = train_word2vec_model()

word2vec model at ./word2vec/w2v50dim.bin is already exists


## Util functions

In [None]:


def get_x_vec(code_3d, word2vec):
    x_vec = [[[
        word2vec.wv.key_to_index[token] if token in word2vec.wv.key_to_index else len(word2vec.wv.key_to_index)
        for token in text
    ] for text in texts] for texts in code_3d]
    return x_vec


def pad_code(code_list_3d, max_sent_len, max_seq_len, limit_sent_len=True, mode='train'):
  padded = []

  for file in code_list_3d:
    sent_list = []
    for line in file:
      new_line = line
      # Truncate if line is longer than max_seq_len
      if len(line) > max_seq_len:
        new_line = line[:max_seq_len]
          #edited here ..just trying
      elif len(line) <= max_seq_len:
          new_line = line + [0] * (max_seq_len - len(new_line))
      sent_list.append(new_line)

    # Pad the entire file (all sentences) to max_sent_len with zeros
    padded_file = sent_list + [[0] * max_seq_len for _ in range(max_sent_len - len(sent_list))]

    # If in training mode and `limit_sent_len` is True, keep only the first max_sent_len sentences
    if mode == 'train' and limit_sent_len:
        padded_file = padded_file[:max_sent_len]
      
    padded.append(padded_file)
    # print(padded_file)
  return padded


def get_w2v_weight_for_deep_learning_models(word2vec_model, embed_dim):
    word2vec_weights = torch.FloatTensor(word2vec_model.wv.vectors)
    # add zero vector for unknown tokens
    word2vec_weights = torch.cat((word2vec_weights, torch.zeros(1,embed_dim)))
    return word2vec_weights
    
def get_dataloader(code_vec, label_list, batch_size, max_sent_len):
  y_tensor = torch.FloatTensor([label for label in label_list])
  
  # Ensure padding happens to max_sent_len
  code_vec_pad = pad_code(code_vec, max_sent_len,max_seq_len)
  
  # Print shapes for debugging (optional)
  print(f"code_vec shape: {len(code_vec)}")
  print(f"Y_tensor shape: {len(y_tensor)}")
  print(f"code_vec_pad shape: {len(code_vec_pad)}")
  
  tensor_dataset = TensorDataset(torch.tensor(code_vec_pad), y_tensor)
  dl = DataLoader(tensor_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
  return dl

## Model

In [None]:
# Model structure
class HierarchicalAttentionNetwork(nn.Module):
    def __init__(self, vocab_size, embed_dim, word_gru_hidden_dim, sent_gru_hidden_dim, word_gru_num_layers, sent_gru_num_layers, word_att_dim, sent_att_dim, use_layer_norm, dropout):
        """
        vocab_size: number of words in the vocabulary of the model
        embed_dim: dimension of word embeddings
        word_gru_hidden_dim: dimension of word-level GRU; biGRU output is double this size
        sent_gru_hidden_dim: dimension of sentence-level GRU; biGRU output is double this size
        word_gru_num_layers: number of layers in word-level GRU
        sent_gru_num_layers: number of layers in sentence-level GRU
        word_att_dim: dimension of word-level attention layer
        sent_att_dim: dimension of sentence-level attention layer
        use_layer_norm: whether to use layer normalization
        dropout: dropout rate; 0 to not use dropout
        """
        super(HierarchicalAttentionNetwork, self).__init__()

        self.sent_attention = SentenceAttention(
            vocab_size, embed_dim, word_gru_hidden_dim, sent_gru_hidden_dim,
            word_gru_num_layers, sent_gru_num_layers, word_att_dim, sent_att_dim, use_layer_norm, dropout)

        self.fc = nn.Linear(2 * sent_gru_hidden_dim, 1)
        self.sig = nn.Sigmoid()

        self.use_layer_nome = use_layer_norm
        self.dropout = dropout

    def forward(self, code_tensor):
        
        code_lengths = []
        sent_lengths = []

        for file in code_tensor:
            code_line = []
            code_lengths.append(len(file))
            for line in file:
                code_line.append(len(line))
            sent_lengths.append(code_line)
        
        code_tensor = code_tensor.type(torch.LongTensor)
        code_lengths = torch.tensor(code_lengths).type(torch.LongTensor)
        sent_lengths = torch.tensor(sent_lengths).type(torch.LongTensor)
        
        code_embeds, word_att_weights, sent_att_weights, sents = self.sent_attention(code_tensor, code_lengths, sent_lengths)

        scores = self.fc(code_embeds)
        final_scrs = self.sig(scores)

        return final_scrs, word_att_weights, sent_att_weights, sents

class SentenceAttention(nn.Module):
    """
    Sentence-level attention module. Contains a word-level attention module.
    """
    def __init__(self, vocab_size, embed_dim, word_gru_hidden_dim, sent_gru_hidden_dim,
                word_gru_num_layers, sent_gru_num_layers, word_att_dim, sent_att_dim, use_layer_norm, dropout):
        super(SentenceAttention, self).__init__()

        # Word-level attention module
        self.word_attention = WordAttention(vocab_size, embed_dim, word_gru_hidden_dim, word_gru_num_layers, word_att_dim, use_layer_norm, dropout)

        # Bidirectional sentence-level GRU
        self.gru = nn.GRU(2 * word_gru_hidden_dim, sent_gru_hidden_dim, num_layers=sent_gru_num_layers,
                          batch_first=True, bidirectional=True, dropout=dropout)

        self.use_layer_norm = use_layer_norm
        if use_layer_norm:
            self.layer_norm = nn.LayerNorm(2 * sent_gru_hidden_dim, elementwise_affine=True)
        self.dropout = nn.Dropout(dropout)

        # Sentence-level attention
        self.sent_attention = nn.Linear(2 * sent_gru_hidden_dim, sent_att_dim)

        # Sentence context vector u_s to take dot product with
        # This is equivalent to taking that dot product (Eq.10 in the paper),
        # as u_s is the linear layer's 1D parameter vector here
        self.sentence_context_vector = nn.Linear(sent_att_dim, 1, bias=False)

    def forward(self, code_tensor, code_lengths, sent_lengths):

        # Sort code_tensor by decreasing order in length
        code_lengths, code_perm_idx = code_lengths.sort(dim=0, descending=True)
        code_tensor = code_tensor[code_perm_idx]
        sent_lengths = sent_lengths[code_perm_idx]

        # Make a long batch of sentences by removing pad-sentences
        # i.e. `code_tensor` was of size (num_code_tensor, padded_code_lengths, padded_sent_length)
        # -> `packed_sents.data` is now of size (num_sents, padded_sent_length)
        packed_sents = pack_padded_sequence(code_tensor, lengths=code_lengths.tolist(), batch_first=True)

        # effective batch size at each timestep
        valid_bsz = packed_sents.batch_sizes

        # Make a long batch of sentence lengths by removing pad-sentences
        # i.e. `sent_lengths` was of size (num_code_tensor, padded_code_lengths)
        # -> `packed_sent_lengths.data` is now of size (num_sents)
        packed_sent_lengths = pack_padded_sequence(sent_lengths, lengths=code_lengths.tolist(), batch_first=True)

    
    
        # Word attention module
        sents, word_att_weights = self.word_attention(packed_sents.data, packed_sent_lengths.data)

        sents = self.dropout(sents)

        # Sentence-level GRU over sentence embeddings
        packed_sents, _ = self.gru(PackedSequence(sents, valid_bsz))

        if self.use_layer_norm:
            normed_sents = self.layer_norm(packed_sents.data)
        else:
            normed_sents = packed_sents

        # Sentence attention
        att = torch.tanh(self.sent_attention(normed_sents))
        att = self.sentence_context_vector(att).squeeze(1)

        val = att.max()
        att = torch.exp(att - val)

        # Restore as documents by repadding
        att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz), batch_first=True)

        sent_att_weights = att / torch.sum(att, dim=1, keepdim=True)

        # Restore as documents by repadding
        code_tensor, _ = pad_packed_sequence(packed_sents, batch_first=True)

        # Compute document vectors
        code_tensor = code_tensor * sent_att_weights.unsqueeze(2)
        code_tensor = code_tensor.sum(dim=1)

        # Restore as documents by repadding
        word_att_weights, _ = pad_packed_sequence(PackedSequence(word_att_weights, valid_bsz), batch_first=True)

        # Restore the original order of documents (undo the first sorting)
        _, code_tensor_unperm_idx = code_perm_idx.sort(dim=0, descending=False)
        code_tensor = code_tensor[code_tensor_unperm_idx]

        word_att_weights = word_att_weights[code_tensor_unperm_idx]
        sent_att_weights = sent_att_weights[code_tensor_unperm_idx]

        return code_tensor, word_att_weights, sent_att_weights, sents


class WordAttention(nn.Module):
    """
    Word-level attention module.
    """

    def __init__(self, vocab_size, embed_dim, gru_hidden_dim, gru_num_layers, att_dim, use_layer_norm, dropout):
        super(WordAttention, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embed_dim)

        # output (batch, hidden_size)
        self.gru = nn.GRU(embed_dim, gru_hidden_dim, num_layers=gru_num_layers, batch_first=True, bidirectional=True, dropout=dropout)

        self.use_layer_norm = use_layer_norm
        if use_layer_norm:
            self.layer_norm = nn.LayerNorm(2 * gru_hidden_dim, elementwise_affine=True)
        self.dropout = nn.Dropout(dropout)

        # Maps gru output to `att_dim` sized tensor
        self.attention = nn.Linear(2 * gru_hidden_dim, att_dim)

        # Word context vector (u_w) to take dot-product with
        self.context_vector = nn.Linear(att_dim, 1, bias=False)

    def init_embeddings(self, embeddings):
        """
        Initialized embedding layer with pretrained embeddings.
        embeddings: embeddings to init with
        """
        self.embeddings.weight = nn.Parameter(embeddings)

    def freeze_embeddings(self, freeze=False):
        """
        Set whether to freeze pretrained embeddings.
        """
        self.embeddings.weight.requires_grad = freeze

    def forward(self, sents, sent_lengths):
        """
        sents: encoded sentence-level data; LongTensor (num_sents, pad_len, embed_dim)
        return: sentence embeddings, attention weights of words
        """
        # Sort sents by decreasing order in sentence lengths
        sent_lengths, sent_perm_idx = sent_lengths.sort(dim=0, descending=True)
        sents = sents[sent_perm_idx]

        sents = self.embeddings(sents)

        packed_words = pack_padded_sequence(sents, lengths=sent_lengths.tolist(), batch_first=True)

        # effective batch size at each timestep
        valid_bsz = packed_words.batch_sizes

        # Apply word-level GRU over word embeddings
        packed_words, _ = self.gru(packed_words)

        if self.use_layer_norm:
            normed_words = self.layer_norm(packed_words.data)
        else:
            normed_words = packed_words

        # Word Attenton
        att = torch.tanh(self.attention(normed_words.data))
        att = self.context_vector(att).squeeze(1)

        val = att.max()
        att = torch.exp(att - val) # att.size: (n_words)

        # Restore as sentences by repadding
        att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz), batch_first=True)

        att_weights = att / torch.sum(att, dim=1, keepdim=True)

        # Restore as sentences by repadding
        sents, _ = pad_packed_sequence(packed_words, batch_first=True)

        # Compute sentence vectors
        sents = sents * att_weights.unsqueeze(2)
        sents = sents.sum(dim=1)

        # Restore the original order of sentences (undo the first sorting)
        _, sent_unperm_idx = sent_perm_idx.sort(dim=0, descending=False)
        sents = sents[sent_unperm_idx]

        att_weights = att_weights[sent_unperm_idx]

        return sents, att_weights

## Model Training

In [None]:
torch.manual_seed(0)

# model setting
batch_size = 32 #args.batch_size
num_epochs = 10 #args.num_epochs
max_grad_norm = 5
embed_dim = 50 #args.embed_dim
word_gru_hidden_dim = 64 #args.word_gru_hidden_dim
sent_gru_hidden_dim = 64 #args.sent_gru_hidden_dim
word_gru_num_layers = 2 #args.word_gru_num_layers
sent_gru_num_layers = 2 #args.sent_gru_num_layers
word_att_dim = 64
sent_att_dim = 64
use_layer_norm = True
dropout = 0.1 #args.dropout
lr = 0.001 #args.lr

save_every_epochs = 1
exp_name = ''#args.exp_name

max_train_LOC = 900

weight_dict = {}

def get_loss_weight(labels):
    '''
        input
            labels: a PyTorch tensor that contains labels
        output
            weight_tensor: a PyTorch tensor that contains weight of defect/clean class
    '''
    label_list = labels.cpu().numpy().squeeze().tolist()
    weight_list = []

    for lab in label_list:
        if lab == 0:
            weight_list.append(weight_dict['clean'])
        else:
            weight_list.append(weight_dict['defect'])

    weight_tensor = torch.tensor(weight_list).reshape(-1,1)
    return weight_tensor


def train_model():


    train_code3d, train_label = get_code3d_and_label(train_df)

    sample_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(train_label), y = train_label)

    weight_dict['defect'] = np.max(sample_weights)
    weight_dict['clean'] = np.min(sample_weights)

    word2vec = Word2Vec.load('./word2vec/w2v50dim.bin')
    print('load Word2Vec finished')

    word2vec_weights = get_w2v_weight_for_deep_learning_models(word2vec, embed_dim)

    vocab_size = len(word2vec.wv.key_to_index) + 1  # Use key_to_index
    # for unknown tokens

    x_train_vec = get_x_vec(train_code3d, word2vec)
    # x_valid_vec = get_x_vec(valid_code3d, word2vec)

    max_sent_len = min(max([len(sent) for sent in (x_train_vec)]), max_train_LOC)

    # print(x_train_vec[0])

    train_dl = get_dataloader(x_train_vec,train_label,batch_size,max_sent_len)

    # valid_dl = get_dataloader(x_valid_vec, valid_label,batch_size,max_sent_len)

    model = HierarchicalAttentionNetwork(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        word_gru_hidden_dim=word_gru_hidden_dim,
        sent_gru_hidden_dim=sent_gru_hidden_dim,
        word_gru_num_layers=word_gru_num_layers,
        sent_gru_num_layers=sent_gru_num_layers,
        word_att_dim=word_att_dim,
        sent_att_dim=sent_att_dim,
        use_layer_norm=use_layer_norm,
        dropout=dropout)

    # model = model.cuda()
    model.sent_attention.word_attention.freeze_embeddings(False)
    
    # print some word attention weights
    for inputs, labels in train_dl:
        output, word_att_weights, sent_att_weights, sents = model(inputs)
        print(word_att_weights)
        break
    

    optimizer = optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    criterion = nn.BCELoss()

    for epoch in range(num_epochs):
        start = time.time()
        train_losses = []

        model.train()

        for inputs, labels in train_dl:

            # # inputs_cuda, labels_cuda = inputs.cuda(), labels.cuda()
            # output, _, __, ___ = model(inputs)
            # i need outputs and attention weights from model
            output,word_att_weights,sent_att_weights,sents = model(inputs)

            weight_tensor = get_loss_weight(labels)

            criterion.weight = weight_tensor

            loss = criterion(output, labels.reshape(batch_size,1))

            train_losses.append(loss.item())
            
            

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            optimizer.step()
        print(f'Epoch {epoch}: {time.time()-start}')

    torch.save(model.state_dict(), './DeepLineDPReplication.pth')

In [None]:
train_model()

load Word2Vec finished
code_vec shape: 150
Y_tensor shape: 150
code_vec_pad shape: 150
tensor([[[0.0136, 0.0174, 0.0138,  ..., 0.0237, 0.0219, 0.0249],
         [0.0211, 0.0208, 0.0188,  ..., 0.0231, 0.0219, 0.0206],
         [0.0203, 0.0185, 0.0163,  ..., 0.0223, 0.0238, 0.0254],
         ...,
         [0.0215, 0.0236, 0.0186,  ..., 0.0211, 0.0221, 0.0227],
         [0.0229, 0.0207, 0.0185,  ..., 0.0203, 0.0212, 0.0213],
         [0.0205, 0.0222, 0.0191,  ..., 0.0220, 0.0218, 0.0250]],

        [[0.0166, 0.0195, 0.0155,  ..., 0.0233, 0.0234, 0.0262],
         [0.0198, 0.0232, 0.0197,  ..., 0.0207, 0.0234, 0.0236],
         [0.0201, 0.0200, 0.0176,  ..., 0.0209, 0.0230, 0.0216],
         ...,
         [0.0194, 0.0184, 0.0171,  ..., 0.0207, 0.0214, 0.0251],
         [0.0202, 0.0196, 0.0166,  ..., 0.0237, 0.0272, 0.0241],
         [0.0217, 0.0189, 0.0183,  ..., 0.0211, 0.0221, 0.0227]],

        [[0.0147, 0.0188, 0.0166,  ..., 0.0221, 0.0251, 0.0278],
         [0.0220, 0.0198, 0.0177,  .

In [None]:
batch_size = len(test_df)
torch.manual_seed(0)
embed_dim = 50 #args.embed_dim
word_gru_hidden_dim = 64 #args.word_gru_hidden_dim
sent_gru_hidden_dim = 64 #args.sent_gru_hidden_dim
word_gru_num_layers = 2 #args.word_gru_num_layers
sent_gru_num_layers = 2 #args.sent_gru_num_layers
word_att_dim = 64
sent_att_dim = 64
use_layer_norm = True
dropout = 0.1 #args.dropout
lr = 0.001 #args.lr
max_test_LOC = 900

def test_model():


    test_code3d, test_label = get_code3d_and_label(test_df)

    sample_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(test_label), y = test_label)

    weight_dict['defect'] = np.max(sample_weights)
    weight_dict['clean'] = np.min(sample_weights)

    word2vec = Word2Vec.load('./word2vec/w2v50dim.bin')
    print('load Word2Vec finished')

    word2vec_weights = get_w2v_weight_for_deep_learning_models(word2vec, embed_dim)

    vocab_size = len(word2vec.wv.key_to_index) + 1  # Use key_to_index
    # for unknown tokens

    x_test_vec = get_x_vec(test_code3d, word2vec)

    max_sent_len = min(max([len(sent) for sent in (x_test_vec)]), max_test_LOC)

    test_dl = get_dataloader(x_test_vec,test_label,batch_size,max_sent_len)

    loaded_dict = torch.load('./DeepLineDPReplication.pth')
    model = HierarchicalAttentionNetwork(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        word_gru_hidden_dim=word_gru_hidden_dim,
        sent_gru_hidden_dim=sent_gru_hidden_dim,
        word_gru_num_layers=word_gru_num_layers,
        sent_gru_num_layers=sent_gru_num_layers,
        word_att_dim=word_att_dim,
        sent_att_dim=sent_att_dim,
        use_layer_norm=use_layer_norm,
        dropout=dropout
    )
    model.load_state_dict(loaded_dict)

    start = time.time()

    model.eval()

    for inputs, labels in test_dl:
    #     outputs, _, __, ___ = model(inputs)
        outputs, word_att_weights, sent_att_weights, sents = model(inputs)
        return outputs,word_att_weights,sent_att_weights, labels


In [None]:
test_df.head()

Unnamed: 0,datetime,commit,repo,filepath,content,methods,lines,target
0,2021-07-13 15:35:10+04:00,000fbe63d390c59b9c1e29216c35fc52b991f2f3,lightning,pytorch_lightning\trainer\connectors\logger_co...,"[[#, copyright, the, pytorch, lightning, team....","[extract_batch_size, _extract_batch_size]","[17, 24, 593]",1
1,2022-07-11 13:12:55+04:00,038d5338530411bb47283fda1e84dec91137880b,localstack,localstack\aws\app.py,"[[import, logging, <pad>, <pad>, <pad>, <pad>,...",[__init__],[62],1
2,2014-07-16 08:51:12+04:00,0786e84a33155ebc8d8d3502e3a7f3060b86a4ec,scrapy,scrapy\utils\iterators.py,"[[import, re, csv, six, <pad>, <pad>, <pad>, <...",[csviter],"[3, 4, 5, 6, 7, 55]",1
3,2020-07-28 17:22:24+04:00,0094cb0d0472b08f92915e948907b237eea020e3,spaCy,spacy\cli\train.py,"[[from, typing, import, optional, dict, any, t...",[update_meta],[449],1
4,2020-01-18 21:30:15+04:00,03c6f4bf250edd18eb818ed65090f508636b0bff,localstack,localstack\services\awslambda\lambda_executors.py,"[[import, os, <pad>, <pad>, <pad>, <pad>, <pad...",[],"[30, 31, 33, 37, 39]",1


In [None]:
y_pred,word_att_weights,sent_att_weights, y_gt = test_model() # test --> changed to train
# y_pred, y_gt = train_model()

load Word2Vec finished
code_vec shape: 150
Y_tensor shape: 150
code_vec_pad shape: 150


In [None]:
print(sent_att_weights)

tensor([[0.2644, 0.0440, 0.0272,  ..., 0.0113, 0.0107, 0.0104],
        [0.5365, 0.0208, 0.0137,  ..., 0.0074, 0.0070, 0.0068],
        [0.5585, 0.0212, 0.0136,  ..., 0.0070, 0.0066, 0.0064],
        ...,
        [0.5578, 0.0783, 0.0183,  ..., 0.0059, 0.0055, 0.0054],
        [0.6976, 0.1461, 0.0348,  ..., 0.0015, 0.0014, 0.0014],
        [0.5365, 0.0208, 0.0137,  ..., 0.0074, 0.0070, 0.0068]],
       grad_fn=<IndexBackward0>)


In [None]:
y_pred = y_pred.detach().numpy()
word_att_weights = word_att_weights.detach().numpy()
sent_att_weights = sent_att_weights.detach().numpy()
y_gt = y_gt.detach().numpy()

In [None]:
y_probs = np.array([prob[0] for prob in y_pred])
word_att_weights = np.array([att[0] for att in word_att_weights])
sent_att_weights = np.array([att[0] for att in sent_att_weights])
y_pred = np.where(y_probs >= 0.45, 1, 0)

In [None]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
y_gt

array([1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1.],
      dtype=float32)

In [None]:
# precision, Recall, F1-score, Confusion matrix, False Alarm Rate, Distance-to-Heaven, AUC
import sklearn.metrics as metrics
import math

prec, rec, f1, _ = metrics.precision_recall_fscore_support(y_gt,y_pred,average='binary') # at threshold = 0.5
tn, fp, fn, tp = metrics.confusion_matrix(y_gt, y_pred, labels=[0, 1]).ravel()
FAR = fp/(fp+tn)
dist_heaven = math.sqrt((pow(1-rec,2)+pow(0-FAR,2))/2.0)
AUC = metrics.roc_auc_score(y_gt, y_probs)

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1-score: {f1}")
print(f"False Alarm Rate: {FAR}")
print(f"Distance to Heaven: {dist_heaven}")
print(f"AUC: {AUC}")

Precision: 0.49
Recall: 1.0
F1-score: 0.6577181208053692
False Alarm Rate: 1.0
Distance to Heaven: 0.7071067811865476
AUC: 0.49279711884753896


In [None]:
train_df.to_csv('./toscore_train_df.csv')

In [None]:
print(train_df.columns)

Index(['datetime', 'commit', 'repo', 'filepath', 'content', 'methods', 'lines',
       'target'],
      dtype='object')


In [None]:
# Set save.fig.dir
save_fig_dir = '../output/figure/'
os.makedirs(save_fig_dir, exist_ok=True)

def preprocess(x, reverse):
    x.columns = ["variable","value"]
    tmp = pd.concat([x["variable"], x["value"]], axis=1)
    tmp = tmp.pivot(columns="variable", values="value")
    tmp.columns = tmp.columns.str.replace(".value", "")
    df = tmp
    ranking = None
    
    if reverse == True:
        ranking = (max(df.columns) - df.columns) + 1
    else:
        ranking = df.columns
    
    df["rank"] = "Rank" + ranking.astype(str)
    return df

def get_top_k_tokens(df, k):
    top_k = df[(df["is.comment.line"] == "False") & (df["file.level.ground.truth"] == "True") & (df["prediction.label"] == "True")]
    top_k = top_k.groupby(["test", "filename"]).apply(lambda x: x.nlargest(k, "token.attention.score")).reset_index(drop=True)
    top_k = top_k[["project", "train", "test", "filename", "token"]].drop_duplicates()
    top_k["flag"] = "topk"
    return top_k

In [None]:
path_to_deep_random = '.'
test = pd.read_parquet(f'{path_to_deep_random}/test.parquet.gzip')
test = test.reset_index(drop=True)
test.head()

Unnamed: 0,datetime,commit,repo,filepath,content,methods,lines
0,2021-07-13 15:35:10+04:00,000fbe63d390c59b9c1e29216c35fc52b991f2f3,lightning,pytorch_lightning\trainer\connectors\logger_co...,b'# Copyright The PyTorch Lightning team.\n#\n...,"[extract_batch_size, _extract_batch_size]","[17, 24, 593]"
1,2022-07-11 13:12:55+04:00,038d5338530411bb47283fda1e84dec91137880b,localstack,localstack\aws\app.py,b'import logging\n\nfrom localstack.aws import...,[__init__],[62]
2,2014-07-16 08:51:12+04:00,0786e84a33155ebc8d8d3502e3a7f3060b86a4ec,scrapy,scrapy\utils\iterators.py,"b'import re, csv, six\n\ntry:\n from cStrin...",[csviter],"[3, 4, 5, 6, 7, 55]"
3,2020-07-28 17:22:24+04:00,0094cb0d0472b08f92915e948907b237eea020e3,spaCy,spacy\cli\train.py,"b'from typing import Optional, Dict, Any, Tupl...",[update_meta],[449]
4,2020-01-18 21:30:15+04:00,03c6f4bf250edd18eb818ed65090f508636b0bff,localstack,localstack\services\awslambda\lambda_executors.py,b'import os\nimport re\nimport glob\nimport js...,[],"[30, 31, 33, 37, 39]"


In [None]:
test.to_csv("./init_test.csv")

In [None]:

x=test['content'][0]

def process_byte_column(df, i):
    text_data = df['content'].iloc[i].decode('latin-1', errors='replace')
    lines = text_data.split('\n')
    return lines
print(test['lines'][0])

[ 17  24 593]


In [None]:
print(len(test))

10000


In [None]:
print(len(y_gt))

100


In [None]:
# create a new column which is a array label for comment lines
is_comment = False
columns = ['datetime', 'commit', 'repo', 'filepath', 'is_comment', 'line', 'is_buggy','attention_score','file_target','ypred_file']
tdf = pd.DataFrame(columns=columns)
# tdf.columns = ['datetime', 'commit', 'repo', 'filepath', 'is_comment', 'line', 'is_buggy']
row_dict = []

In [None]:
print(tdf.columns)

Index(['datetime', 'commit', 'repo', 'filepath', 'is_comment', 'line',
       'is_buggy', 'attention_score'],
      dtype='object')


In [None]:
print((word_att_weights[0][0]))

0.0124259265


In [None]:

# for i in range(len(train)):
print(len(test))
for i in range(len(test)):
    t_temp =[]
    
    t_temp.append(test['target'][i])
    temp = process_byte_column(test, i)
    j = 0  # line number in the file
    if i % 100:
        print(f'doc num:{i}')
    is_comment = False
    for line in temp:
        
        j += 1
        row_dict = []
        row_dict.append(test['datetime'][i])
        row_dict.append(test['commit'][i])
        row_dict.append(test['repo'][i])
        row_dict.append(test['filepath'][i])
        if line.startswith('#'):
            row_dict.append(True)
            row_dict.append(line)
            row_dict.append(False)
        elif line.startswith("'''"):
            is_comment = True
            row_dict.append(True)
            row_dict.append(line)
            row_dict.append(False)
        elif is_comment:
            row_dict.append(True)
            row_dict.append(line)
            row_dict.append(False)
        elif line.endswith("'''") and is_comment:
            is_comment = False
            row_dict.append(True)
            row_dict.append(line)
            row_dict.append(False)
        elif line == '':
            continue
        else:
            row_dict.append(False)
            row_dict.append(line)
            row_dict.append(False if j not in test['lines'][i] else True)
        # tdf[attention_score]. Each word score must be updated from word attention weights in the test model output
        if j < len(word_att_weights):
            row_dict.append([word_att_weights[j][k]
                            for k in range(len(word_att_weights[j]))])
        else:
            row_dict.append(None)  # or some other default value
        
        # fill tdf[file_target] with the y_gt from test_df
        row_dict.append(y_gt[i])
        # fill tdf[ypred_file] with the y_pred from test_df
        row_dict.append(y_pred[i])
        tdf.loc[len(tdf.index)] = row_dict
        

In [None]:
print(tdf['is_comment'])

0        True
1        True
2        True
3        True
4        True
        ...  
8550    False
8551    False
8552    False
8553     True
8554     True
Name: is_comment, Length: 8555, dtype: bool


In [None]:
tdf.head()

In [None]:
# tdf.to_csv("./line_lvl_eval.csv")
# load line_lvl_eval.csv to tdf dataframe
tdf.to_csv("./tdf_final.csv")

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, roc_auc_score, matthews_corrcoef

# Set save.fig.dir
save_fig_dir = './output/figure/'
os.makedirs(save_fig_dir, exist_ok=True)

def preprocess(x, reverse):
    x.columns = ["variable","value"]
    tmp = pd.concat([x["variable"], x["value"]], axis=1)
    tmp = tmp.pivot(columns="variable", values="value")
    tmp.columns = tmp.columns.str.replace(".value", "")
    df = tmp
    ranking = None
    
    if reverse == True:
        ranking = (max(df.columns) - df.columns) + 1
    else:
        ranking = df.columns
    
    df["rank"] = "Rank" + ranking.astype(str)
    return df

def get_top_k_tokens(df, k):
    top_k = df[(df["is_comment"] == "False") & (df["target"] == "True") & (df["is_buggy"] == "True")]
    top_k = top_k.groupby(["test", "filename"]).apply(lambda x: x.nlargest(k, "token.attention.score")).reset_index(drop=True)
    top_k = top_k[["project", "train", "test", "filename", "token"]].drop_duplicates()
    top_k["flag"] = "topk"
    return top_k

prediction_dir = '../output/prediction/DeepLineDP/within-release/'
all_files = os.listdir(prediction_dir)

df_all = pd.DataFrame() 

for f in all_files:
    df = pd.read_csv(os.path.join(prediction_dir, f))
    df_all = pd.concat([df_all, df])


FileNotFoundError: [WinError 3] The system cannot find the path specified: '../output/prediction/DeepLineDP/within-release/'

In [None]:
# ---------------- Code for RQ1 -----------------------#

##df_all will be tdf dataframe

# RQ1-1
df_to_plot = df_all[(df_all["is_comment"] == "False") & (df_all["target"] == "True") & (df_all["y_pred"] == "True")]
# df_to_plot = df_to_plot.groupby(["test", "filename", "token"]).agg({"token.attention.score": ["max", "min", "sd"]}).reset_index()
df_to_plot = df_to_plot.groupby(["datetime","commit", "repo", "filepath",]).agg({"token.attention.score": ["max", "min", "std"]}).reset_index()
df_to_plot.columns = ["datetime","commit", "repo", "filepath", "max", "min", "sd"]
# df_to_plot.columns = ["test", "filename", "token", "Range", "SD"]

# RQ1-2
df_all_copy = df_all.copy()
df_all_copy = df_all_copy[(df_all_copy["is_comment"] == "False") & (df_all_copy["target"] == "True") & (df_all_copy["y_pred"] == "True")]

clean_lines_df = df_all_copy[df_all_copy["is_buggy"] == "False"]
buggy_lines_df = df_all_copy[df_all_copy["is_buggy"] == "True"]

# clean_lines_token_score = clean_lines_df.groupby(["test", "filename", "token"]).agg({"token.attention.score": "min"}).reset_index()
clean_lines_token_score = clean_lines_df.groupby(["commit", "repo", "filepath"]).agg({"token.attention.score": "min"}).reset_index()
clean_lines_token_score["class"] = "Clean Lines"

# buggy_lines_token_score = buggy_lines_df.groupby(["test", "filename", "token"]).agg({"token.attention.score": "max"}).reset_index()
buggy_lines_token_score = buggy_lines_df.groupby(["commit", "repo", "filepath"]).agg({"token.attention.score": "max"}).reset_index()
buggy_lines_token_score["class"] = "Defective Lines"

all_lines_token_score = pd.concat([buggy_lines_token_score, clean_lines_token_score])
all_lines_token_score["class"] = pd.Categorical(all_lines_token_score["class"], categories=["Defective Lines", "Clean Lines"])

KeyError: 'is_comment'

In [None]:

def get_file_level_metrics(df_file):
    all_gt = df_file["target"]
    all_prob = df_file["y_pred"] #prediction.prob
    all_pred = df_file["target"] #prediction.label
    
    confusion_mat = confusion_matrix(all_pred, all_gt)
    
    bal_acc = confusion_mat[1, 1] / (confusion_mat[1, 1] + confusion_mat[0, 1])
    AUC = roc_auc_score(all_gt, all_prob)
    
    all_pred = np.where(all_pred == "False", 0, 1)
    all_gt = np.where(all_gt == "False", 0, 1)
    
    MCC = matthews_corrcoef(all_gt, all_pred)
    
    if np.isnan(MCC):
        MCC = 0
    
    eval_result = [AUC, MCC, bal_acc]
    
    return eval_result

def get_file_level_eval_result(prediction_dir, method_name):
    all_files = os.listdir(prediction_dir)

    all_auc = []
    all_mcc = []
    all_bal_acc = []
    all_test_rels = []

    for f in all_files:
        df = pd.read_csv(os.path.join(prediction_dir, f))

        if method_name == "DeepLineDP":
            df = df[["train", "test", "filename", "target", "y_pred", "target"]]
            df = df.drop_duplicates()

        file_level_result = get_file_level_metrics(df)

        AUC = file_level_result[0]
        MCC = file_level_result[1]
        bal_acc = file_level_result[2]

        all_auc.append(AUC)
        all_mcc.append(MCC)
        all_bal_acc.append(bal_acc)
        all_test_rels.append(f.replace(".csv", ""))

    result_df = pd.DataFrame({"AUC": all_auc, "MCC": all_mcc, "Balance.Accuracy": all_bal_acc})

    all_test_rels = [rel.replace(".csv", "") for rel in all_test_rels]

    result_df["release"] = all_test_rels
    result_df["technique"] = method_name

    return result_df

# bi_lstm_prediction_dir = "../output/prediction/Bi-LSTM/"
# cnn_prediction_dir = "../output/prediction/CNN/"
# dbn_prediction_dir = "../output/prediction/DBN/"
# lr_prediction_dir = "../output/prediction/LR/"

# bi_lstm_result = get_file_level_eval_result(bi_lstm_prediction_dir, "Bi.LSTM")
# cnn_result = get_file_level_eval_result(cnn_prediction_dir, "CNN")
# dbn_result = get_file_level_eval_result(dbn_prediction_dir, "DBN")
# lr_result = get_file_level_eval_result(lr_prediction_dir, "LR")
# deepline_dp_result = get_file_level_eval_result(prediction_dir, "DeepLineDP")

# all_result = pd.concat([bi_lstm_result, cnn_result, dbn_result, lr_result, deepline_dp_result])

# all_result.columns = ["AUC", "MCC", "Balance.Accuracy", "Release", "Technique"]

# auc_result = all_result[["Technique", "AUC"]]
# auc_result = preprocess(auc_result, False)
# auc_result.loc[auc_result["variable"] == "Bi.LSTM", "variable"] = "Bi-LSTM"

# mcc_result = all_result[["Technique", "MCC"]]
# mcc_result = preprocess(mcc_result, False)
# mcc_result.loc[mcc_result["variable"] == "Bi.LSTM", "variable"] = "Bi-LSTM"

# bal_acc_result = all_result[["Technique", "Balance.Accuracy"]]
# bal_acc_result = preprocess(bal_acc_result, False)
# bal_acc_result.loc[bal_acc_result["variable"] == "Bi.LSTM", "variable"] = "Bi-LSTM"


In [None]:
# ---------------- Code for RQ3 -----------------------#

def get_line_metrics_result(baseline_df, cur_df_file):
    baseline_df_with_ground_truth = pd.merge(baseline_df, cur_df_file, on=["filename", "line.number"])

    sorted_df = baseline_df_with_ground_truth.groupby("filename").apply(lambda x: x.sort_values("line.score", ascending=False)).reset_index(drop=True)
    sorted_df["order"] = sorted_df.groupby("filename").cumcount() + 1

    # IFA
    IFA = sorted_df[sorted_df["line.level.ground.truth"] == "True"].groupby("filename").apply(lambda x: x.nsmallest(1, "order")).reset_index(drop=True)

    total_true = sorted_df.groupby("filename").agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()

    # Recall20%LOC
    recall20LOC = sorted_df.groupby("filename").apply(lambda x: x[x["order"] <= int(0.2 * len(x))]).groupby("filename").agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()
    recall20LOC = recall20LOC.merge(total_true, on="filename")
    recall20LOC["recall20LOC"] = recall20LOC["line.level.ground.truth"] / recall20LOC["line.level.ground.truth_y"]

    # Effort20%Recall
    effort20Recall = sorted_df.merge(total_true, on="filename").groupby("filename").apply(lambda x: sum(x["line.level.ground.truth"].cumsum() / x["line.level.ground.truth_y"] <= 0.2) / len(x)).reset_index()

    ifa_list = IFA["order"].tolist()
    recall_list = recall20LOC["recall20LOC"].tolist()
    effort_list = effort20Recall[0].tolist()

    result_df = pd.DataFrame({"ifa_list": ifa_list, "recall_list": recall_list, "effort_list": effort_list})

    return result_df

all_eval_releases = ['activemq-5.2.0', 'activemq-5.3.0', 'activemq-5.8.0', 'camel-2.10.0', 'camel-2.11.0', 'derby-10.5.1.1', 'groovy-1_6_BETA_2', 'hbase-0.95.2', 'hive-0.12.0', 'jruby-1.5.0', 'jruby-1.7.0.preview1', 'lucene-3.0.0', 'lucene-3.1', 'wicket-1.5.3']

error_prone_result_dir = '../output/ErrorProne_result/'
ngram_result_dir = '../output/n_gram_result/'
rf_result_dir = '../output/RF-line-level-result/'

n_gram_result_df = pd.DataFrame()
error_prone_result_df = pd.DataFrame()
rf_result_df = pd.DataFrame()

for rel in all_eval_releases:
    error_prone_result = pd.read_csv(os.path.join(error_prone_result_dir, rel + '-line-lvl-result.txt'), quotechar="")
    error_prone_result["EP_prediction_result"] = error_prone_result["EP_prediction_result"].replace({"False": 0, "True": 1})

    n_gram_result = pd.read_csv(os.path.join(ngram_result_dir, rel + '-line-lvl-result.txt'), quotechar="")
    rf_result = pd.read_csv(os.path.join(rf_result_dir, rel + '-line-lvl-result.csv'))

    n_gram_result = n_gram_result[["filename", "line.number", "line.score"]]
    rf_result = rf_result[["filename", "line_number", "line.score.pred"]]

    cur_df_file = df_all[df_all["test"] == rel]
    cur_df_file = cur_df_file[["filename", "line.number", "line.level.ground.truth"]]

    n_gram_eval_result = get_line_metrics_result(n_gram_result, cur_df_file)
    error_prone_eval_result = get_line_metrics_result(error_prone_result, cur_df_file)
    rf_eval_result = get_line_metrics_result(rf_result, cur_df_file)

    n_gram_result_df = pd.concat([n_gram_result_df, n_gram_eval_result])
    error_prone_result_df = pd.concat([error_prone_result_df, error_prone_eval_result])
    rf_result_df = pd.concat([rf_result_df, rf_eval_result])

# Force attention score of comment line is 0
df_all.loc[df_all["is.comment.line"] == "True", "token.attention.score"] = 0

tmp_top_k = get_top_k_tokens(df_all, 1500)

merged_df_all = pd.merge(df_all, tmp_top_k, on=["project", "train", "test", "filename", "token"], how="left")
merged_df_all.loc[merged_df_all["flag"].isna(), "token.attention.score"] = 0

sum_line_attn = merged_df_all[(merged_df_all["file.level.ground.truth"] == "True") & (merged_df_all["prediction.label"] == "True")]
sum_line_attn = sum_line_attn.groupby(["test", "filename", "is.comment.line", "file.level.ground.truth", "prediction.label", "line.number", "line.level.ground.truth"]).agg({"token.attention.score": "sum", "num_tokens": "count"}).reset_index()

sorted_df = sum_line_attn.groupby(["test", "filename"]).apply(lambda x: x.sort_values("token.attention.score", ascending=False)).reset_index(drop=True)
sorted_df["order"] = sorted_df.groupby(["test", "filename"]).cumcount() + 1

# get result from DeepLineDP
# calculate IFA
IFA = sorted_df[sorted_df["line.level.ground.truth"] == "True"].groupby(["test", "filename"]).apply(lambda x: x.nsmallest(1, "order")).reset_index(drop=True)

total_true = sorted_df.groupby(["test", "filename"]).agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()

# calculate Recall20%LOC
recall20LOC = sorted_df.groupby(["test", "filename"]).apply(lambda x: x[x["order"] <= int(0.2 * len(x))]).groupby(["test", "filename"]).agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()
recall20LOC = recall20LOC.merge(total_true, on=["test", "filename"])
recall20LOC["recall20LOC"] = recall20LOC["line.level.ground.truth"] / recall20LOC["line.level.ground.truth_y"]

# calculate Effort20%Recall
effort20Recall = sorted_df.merge(total_true, on=["test", "filename"]).groupby(["test", "filename"]).apply(lambda x: sum(x["line.level.ground.truth"].cumsum() / x["line.level.ground.truth_y"] <= 0.2) / len(x)).reset_index()

# prepare data for plotting
deeplinedp_ifa = IFA["order"].tolist()
deeplinedp_recall = recall20LOC["recall20LOC"].tolist()
deeplinedp_effort = effort20Recall[0].tolist()

deepline_dp_line_result = pd.DataFrame({"IFA": deeplinedp_ifa, "Recall20%LOC": deeplinedp_recall, "Effort@20%Recall": deeplinedp_effort})

rf_result_df.columns = ["IFA", "Recall20%LOC", "Effort@20%Recall"]
n_gram_result_df.columns = ["IFA", "Recall20%LOC", "Effort@20%Recall"]
error_prone_result_df.columns = ["IFA", "Recall20%LOC", "Effort@20%Recall"]
deepline_dp_line_result.columns = ["IFA", "Recall20%LOC", "Effort@20%Recall"]

rf_result_df["technique"] = "RF"
n_gram_result_df["technique"] = "N.gram"
error_prone_result_df["technique"] = "ErrorProne"
deepline_dp_line_result["technique"] = "DeepLineDP"

all_line_result = pd.concat([rf_result_df, n_gram_result_df, error_prone_result_df, deepline_dp_line_result])

recall_result_df = all_line_result[["technique", "Recall20%LOC"]]
ifa_result_df = all_line_result[["technique", "IFA"]]
effort_result_df = all_line_result[["technique", "Effort@20%Recall"]]

recall_result_df = preprocess(recall_result_df, False)
ifa_result_df = preprocess(ifa_result_df, True)
effort_result_df = preprocess(effort_result_df, True)


In [None]:

# ---------------- Code for RQ4 -----------------------#

# get within-project result
deepline_dp_result["project"] = ['activemq', 'activemq', 'activemq', 'camel', 'camel', 'derby', 'groovy', 'hbase', 'hive', 'jruby', 'jruby', 'lucene', 'lucene', 'wicket']

file_level_by_project = deepline_dp_result.groupby("project").agg({"all.auc": "mean", "all.mcc": "mean", "all.bal.acc": "mean"}).reset_index()
file_level_by_project.columns = ["project", "AUC", "MCC", "Balance Accurracy"]

# get cross-project result
prediction_dir = '../output/prediction/DeepLineDP/cross-release/'

projs = ['activemq', 'camel', 'derby', 'groovy', 'hbase', 'hive', 'jruby', 'lucene', 'wicket']

def get_line_level_metrics(df_all):
    sum_line_attn = df_all[(df_all["file.level.ground.truth"] == "True") & (df_all["prediction.label"] == "True")]
    sum_line_attn = sum_line_attn.groupby("filename").agg({"token.attention.score": "sum", "num_tokens": "count"}).reset_index()

    sorted_df = sum_line_attn.groupby("filename").apply(lambda x: x.sort_values("token.attention.score", ascending=False)).reset_index(drop=True)
    sorted_df["order"] = sorted_df.groupby("filename").cumcount() + 1

    # calculate IFA
    IFA = sorted_df[sorted_df["line.level.ground.truth"] == "True"].groupby("filename").apply(lambda x: x.nsmallest(1, "order")).reset_index(drop=True)

    total_true = sorted_df.groupby("filename").agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()

    # calculate Recall20%LOC
    recall20LOC = sorted_df.groupby("filename").apply(lambda x: x[x["order"] <= int(0.2 * len(x))]).groupby("filename").agg({"line.level.ground.truth": lambda x: sum(x == "True")}).reset_index()
    recall20LOC = recall20LOC.merge(total_true, on="filename")
    recall20LOC["recall20LOC"] = recall20LOC["line.level.ground.truth"] / recall20LOC["line.level.ground.truth_y"]

    # calculate Effort20%Recall
    effort20Recall = sorted_df.merge(total_true, on="filename").groupby("filename").apply(lambda x: sum(x["line.level.ground.truth"].cumsum() / x["line.level.ground.truth_y"] <= 0.2) / len(x)).reset_index()

    all_ifa = IFA["order"].tolist()
    all_recall = recall20LOC["recall20LOC"].tolist()
    all_effort = effort20Recall[0].tolist()

    result_df = pd.DataFrame({"all.ifa": all_ifa, "all.recall": all_recall, "all.effort": all_effort})

    return result_df

all_line_result = pd.DataFrame()
all_file_result = pd.DataFrame()

for p in projs:
    actual_pred_dir = os.path.join(prediction_dir, p)

    all_files = os.listdir(actual_pred_dir)

    all_auc = []
    all_mcc = []
    all_bal_acc = []
    all_src_projs = []
    all_tar_projs = []

    for f in all_files:
        df = pd.read_csv(os.path.join(actual_pred_dir, f))

        f = f.replace(".csv", "")
        f_split = f.split("-")
        target = f_split[-2]

        df_file = df[["train", "test", "filename", "file.level.ground.truth", "prediction.prob", "prediction.label"]]
        df_file = df_file.drop_duplicates()

        file_level_result = get_file_level_metrics(df_file)

        AUC = file_level_result[0]
        MCC = file_level_result[1]
        bal_acc = file_level_result[2]

        all_auc.append(AUC)
        all_mcc.append(MCC)
        all_bal_acc.append(bal_acc)

        all_src_projs.append(p)
        all_tar_projs.append(target)

        tmp_top_k = get_top_k_tokens(df, 1500)

        merged_df_all = pd.merge(df, tmp_top_k, on=["project", "train", "test", "filename", "token"], how="left")
        merged_df_all.loc[merged_df_all["flag"].isna(), "token.attention.score"] = 0

        line_level_result = get_line_level_metrics(merged_df_all)
        line_level_result["src"] = p
        line_level_result["target"] = target

        all_line_result = pd.concat([all_line_result, line_level_result])

    file_level_result = pd.DataFrame({"all.auc": all_auc, "all.mcc": all_mcc, "all.bal.acc": all_bal_acc})
    file_level_result["src"] = p
    file_level_result["target"] = all_tar_projs

    all_file_result = pd.concat([all_file_result, file_level_result])

final_file_level_result = all_file_result.groupby("target").agg({"all.auc": "mean", "all.bal.acc": "mean", "all.mcc": "mean"}).reset_index()
final_line_level_result = all_line_result.groupby("target").agg({"all.recall": "mean", "all.effort": "mean", "all.ifa": "mean"}).reset_index()


