## PREPROCESSING DATA

In [1]:
from preprocessing import Parser, ReportPreprocessor, SrcPreprocessor
from datasets import DATASETS
import os

from collections import defaultdict

In [2]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
def preprocessing_data(dataset):
    parser = Parser(dataset)
    bug_reports = parser.report_parser()
    src_files = parser.src_parser()

    os.makedirs(f'outputs/{dataset.name}', exist_ok=True)

    SrcPreprocessor(src_files).preprocess_and_export(dataset.name)
    ReportPreprocessor(bug_reports).preprocess_and_export(dataset.name)

In [4]:
for DATASET in DATASETS:
    preprocessing_data(DATASET)

## INDEXING SOURCE CODE FILES


In [3]:
dataset_name = 'tomcat'

In [4]:
import pandas as pd
import ast

In [5]:
def prepare_dataframe_src_code(name):
    return pd.read_csv(f"outputs/{name}/source_code_data.csv")

In [6]:
def fix_and_fetch_src_code_infor(data_src_code):
    def extract_stemmed(column, field = 'stemmed'):
        return column.apply(ast.literal_eval).apply(lambda x: x[field])

    stem_columns = ['pos_tagged_comments']
    for col in stem_columns:
        data_src_code[col] = extract_stemmed(data_src_code[col])

    un_stem_columns = ['file_name', 'class_names', 'method_names']
    for col in un_stem_columns:
        data_src_code[col] = extract_stemmed(data_src_code[col], 'unstemmed')

    data_src_code['natural_language'] = data_src_code['pos_tagged_comments']
    data_src_code['code_entities'] = data_src_code.apply(lambda row: row['file_name'] + row['class_names']+ row['method_names'], axis=1)

    return data_src_code[['key', 'natural_language', 'code_entities', 'total_lines']]


In [7]:
def build_inverted_index(dataset, column_data_string):
    inverted_index = defaultdict(set)
    for data in dataset.iterrows():
        for content in data[1][column_data_string]:
            inverted_index[content].add(data[1]['key'])
    return inverted_index

In [8]:
df_dataset = fix_and_fetch_src_code_infor(prepare_dataframe_src_code(dataset_name))
inverted_index_natural_language_src_codes = build_inverted_index(df_dataset, 'natural_language')
inverted_index_code_entities_src_codes = build_inverted_index(df_dataset, 'code_entities')

## BUILD VSM

In [9]:
import math

In [10]:
def compute_ltc(documents, inverted_index, field):
    tf_idf = defaultdict(lambda: defaultdict(float))
    total_documents = len(documents)

    for data in documents.iterrows():
        doc_id = data[1].key
        field_content = data[1][field]

        content_frequency = defaultdict(int)

        if isinstance(field_content, list) and len(field_content) > 0:
            for content in field_content:
                content_frequency[content] += 1

            for content, count in content_frequency.items():
                tf = 1 + math.log(count, 10)
                df = len(inverted_index.get(content, []))
                idf = math.log(total_documents / df, 10) if df != 0 else 0

                tf_idf[doc_id][content] = tf * idf
        else:
            tf_idf[doc_id] = {}

    for doc_id in tf_idf:
        norm = math.sqrt(sum(weight ** 2 for weight in tf_idf[doc_id].values()))
        if norm > 0:
            for content in tf_idf[doc_id]:
                tf_idf[doc_id][content] /= norm

    return tf_idf


In [11]:
natural_lang_vsm_src_codes = compute_ltc(df_dataset, inverted_index_natural_language_src_codes, 'natural_language')
code_entities_vsm_src_codes = compute_ltc(df_dataset, inverted_index_code_entities_src_codes, 'code_entities')

## CALCULATE COEFFICIENT FILE SIZE

In [12]:
min_max_size_dataset = (df_dataset.total_lines.min(), df_dataset.total_lines.max())
min_max_size_dataset

(50, 138)

In [13]:
def calculate_coefficient_size(size, size_data):
    min_size, max_size = size_data
    value_standardization = (size - min_size) / (max_size - min_size)
    return 1/(1 + math.exp(-value_standardization))

In [14]:
df_dataset['coefficient_size'] = df_dataset['total_lines'].apply(lambda x: calculate_coefficient_size(x, min_max_size_dataset))

## HANDLE REPORT DATA

In [15]:
def prepare_dataframe_bug_reports(name):
    return pd.read_csv(f"outputs/{name}/bug_reports.csv")

In [16]:
def fix_and_fetch_bug_report(report):
    report.pos_tagged_description = report.pos_tagged_description.apply(ast.literal_eval)
    report.pos_tagged_description = report.pos_tagged_description.apply(lambda x: x['unstemmed'])

    report.pos_tagged_summary = report.pos_tagged_summary.apply(ast.literal_eval)
    report.pos_tagged_summary = report.pos_tagged_summary.apply(lambda x: x['stemmed'])

    report['fixed_files'] = report['fixed_files'].apply(lambda x: [f for f in x.split() if f != '.'] if isinstance(x, str) else x)

    report['natural_language'] = report.pos_tagged_summary
    report['code_entities'] = report.pos_tagged_description

    return report[['key', 'natural_language', 'code_entities', 'report_time', 'fixed_files']]

In [17]:
def compute_lnc(query, field):
    tf_idf = defaultdict(float)

    term_freq = defaultdict(int)
    for term in query[field]:
        term_freq[term] += 1

    for term, freq in term_freq.items():
        tf = 1 + math.log10(freq)
        tf_idf[term] = tf * 1

    norm = math.sqrt(sum(weight ** 2 for weight in tf_idf.values()))
    if norm > 0:
        for term in tf_idf:
            tf_idf[term] /= norm

    return tf_idf

In [18]:
def compute_bnc(query, field):
    tf_idf = defaultdict(float)

    unique_terms = set(query[field])

    for term in unique_terms:
        tf = 1
        tf_idf[term] = tf * 1

    norm = math.sqrt(sum(weight ** 2 for weight in tf_idf.values()))
    if norm > 0:
        for term in tf_idf:
            tf_idf[term] /= norm

    return tf_idf


In [19]:
bug_report_dataset = fix_and_fetch_bug_report(prepare_dataframe_bug_reports(dataset_name))

## CALCULATE rVSM

In [20]:
def compute_indexing_score(query, vsm, compute_query, field):
    query_vec = compute_query(query, field)
    scores = {}

    for directory, doc_vec in vsm.items():
        dot_product = 0.0
        for term in query_vec:
            dot_product += query_vec[term] * doc_vec.get(term, 0.0)

        scores[directory] = dot_product * df_dataset.loc[df_dataset['key'] == directory, 'coefficient_size'].iloc[0]

    return scores

In [21]:
def compute_rank_files_indexing(query, coefficient):
    scores_indexing = defaultdict(float)
    scores_nl = compute_indexing_score(query, natural_lang_vsm_src_codes, compute_lnc,'natural_language')
    scores_ce = compute_indexing_score(query, code_entities_vsm_src_codes, compute_lnc,'code_entities')

    for score_key in scores_ce:
        scores_indexing[score_key] = coefficient * scores_nl[score_key] + (1 - coefficient)* scores_ce[score_key]

    return scores_indexing

## GET PREVIOUS BUG REPORT

In [23]:
def get_previous_bug_fixed_report(df, current_bug_id):
    df_sorted = df.sort_values(by='report_time')
    previous_bugs = df_sorted[df_sorted['report_time'] < df_sorted[df_sorted['key'] == current_bug_id]['report_time'].iloc[0]]
    return previous_bugs

def calculate_bug_query_similarity(df, current_bug_id):
    previous_bugs = get_previous_bug_fixed_report(df, current_bug_id)
    inverted_index = build_inverted_index(previous_bugs, 'natural_language')
    pre_bug_reports_vector = compute_ltc(previous_bugs, inverted_index, 'natural_language')

    current_bug_vector = compute_lnc(df[df.key == current_bug_id].iloc[0], 'natural_language')

    scores = {}

    for bug_id, words in pre_bug_reports_vector.items():
        dot_product = 0.0
        for term in current_bug_vector:
            dot_product += current_bug_vector[term] * words.get(term, 0.0)

        scores[bug_id] = dot_product

    return scores

In [24]:
def get_related_files(df, similarity_dict):
    relevant_bugs = {bug_id: score for bug_id, score in similarity_dict.items()}

    if not relevant_bugs:
        return {}

    file_scores = {}

    for bug_id, score in relevant_bugs.items():
        files = df[df['key'] == bug_id]['fixed_files'].iloc[0]
        for file in files:
            if file in file_scores:
                file_scores[file].append(score)
            else:
                file_scores[file] = [score]

    average_file_scores = {file: sum(scores) / len(scores) for file, scores in file_scores.items()}

    return average_file_scores

## CO-CHANGE MATRIX

In [25]:
import networkx as nx

G = nx.Graph()

for files in bug_report_dataset['fixed_files'].dropna():
    if len(files) < 2:
        continue
    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            G.add_edge(files[i], files[j])

clusters = list(nx.connected_components(G))


In [26]:
file_to_cluster = {}
for cid, cluster in enumerate(clusters):
    for f_name in cluster:
        file_to_cluster[f_name] = cid

## FLOW

In [27]:
def calculate_final_score(report, alpha, top_k):
    rvsm_score = compute_rank_files_indexing(report, 0.4)

    similarity_bug_scores = calculate_bug_query_similarity(bug_report_dataset, report.key)
    simi_previous_bugs_score = get_related_files(bug_report_dataset, similarity_bug_scores)

    final_score = rvsm_score

    for directory in simi_previous_bugs_score:
        if directory in final_score:
            final_score[directory] = (1 - alpha) * final_score[directory] + alpha * simi_previous_bugs_score[directory]

    return dict(sorted(final_score.items(), key=lambda item: item[1], reverse=True)[:top_k])

In [28]:
import copy
import numpy as np

list_prediction = {}

for idx, bug_report in bug_report_dataset.iterrows():
    scores = calculate_final_score(bug_report, 0.2, 100)

    boosted_scores = copy.deepcopy(scores)
    cluster_scores = defaultdict(list)

    for file in scores:
        c_id = file_to_cluster.get(file)
        if c_id is not None:
            cluster_scores[c_id].append(scores[file])

    cluster_avg = {cid: np.mean(scores) for cid, scores in cluster_scores.items()}

    for file_name in scores:
        c_id = file_to_cluster.get(file_name)
        if c_id is not None:
            boosted_scores[file_name] += 0.3 * cluster_avg[c_id]

    list_prediction[bug_report.key] = boosted_scores

## EVALUATE

In [29]:
def fetch_ground_truth():
    ground_truth = {}
    for report_data in bug_report_dataset.itertuples():
        ground_truth[report_data.key] = report_data.fixed_files
    return ground_truth

In [30]:
ground_truth_data = fetch_ground_truth()

In [31]:
import numpy as np

def top_k_accuracy(predictions, ground_truth, k=1):
    correct = 0
    total_queries = len(predictions)

    for bug_id in predictions:
        temp_prediction = dict(list(predictions[bug_id].items())[:k])
        predict_top_k = temp_prediction.keys()
        true_directory = ground_truth[bug_id]

        if any(doc in true_directory for doc in predict_top_k):
            correct += 1

    return correct / total_queries if total_queries > 0 else 0

def average_precision(predicted, relevant):
    if not relevant:
        return 0.0

    relevant_set = set(relevant)
    hits = 0
    precision_sum = 0.0

    for i, (doc_id, _) in enumerate(predicted):
        if doc_id in relevant_set:
            hits += 1
            precision_sum += hits / (i + 1)

    return precision_sum / len(relevant) if hits > 0 else 0.0

def mean_average_precision(predictions, ground_truth):
    ap_scores = []
    total_queries = len(predictions)

    for query_idx in range(total_queries):
        pred_docs = predictions[query_idx]
        true_docs = ground_truth[query_idx]
        ap = average_precision(pred_docs, true_docs)
        ap_scores.append(ap)

    return np.mean(ap_scores) if ap_scores else 0.0

def mean_reciprocal_rank(predictions, ground_truth):
    rr_scores = []
    total_queries = len(predictions)

    for query_idx in range(total_queries):
        pred_docs = [doc_id for doc_id, _ in predictions[query_idx]]
        true_docs = set(ground_truth[query_idx])

        for rank, doc_id in enumerate(pred_docs, 1):
            if doc_id in true_docs:
                rr_scores.append(1.0 / rank)
                break

    return np.mean(rr_scores) if rr_scores else 0.0

In [32]:
# Tính các chỉ số
top1_acc = top_k_accuracy(list_prediction, ground_truth_data, k=1)
top5_acc = top_k_accuracy(list_prediction, ground_truth_data, k=5)
top10_acc = top_k_accuracy(list_prediction, ground_truth_data, k=10)
top20_acc = top_k_accuracy(list_prediction, ground_truth_data, k=20)

# In kết quả
print(f"Top-1 Accuracy: {top1_acc:.4f}")
print(f"Top-5 Accuracy: {top5_acc:.4f}")
print(f"Top-10 Accuracy: {top10_acc:.4f}")
print(f"Top-20 Accuracy: {top20_acc:.4f}")

Top-1 Accuracy: 0.1903
Top-5 Accuracy: 0.3892
Top-10 Accuracy: 0.4972
Top-20 Accuracy: 0.6013
