## PREPROCESSING DATA

In [32]:
from preprocessing import Parser, ReportPreprocessor, SrcPreprocessor
from datasets import DATASET

from collections import defaultdict

In [33]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
parser = Parser(DATASET)
bug_reports = parser.report_parser()
src_files = parser.src_parser()

In [4]:
src_processor = SrcPreprocessor(src_files, parser.name).preprocess()
report_processor = ReportPreprocessor(bug_reports, parser.name).preprocess()

## INDEXING SOURCE CODE FILES


In [34]:
import pandas as pd
import ast

In [35]:
file_path = "outputs/tomcat_source_code_data.csv"
src_df = pd.read_csv(file_path)

In [36]:
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    for key, contents in documents:
        for content in contents:
            inverted_index[content].append(key)
    return inverted_index

In [37]:
src_df['all_content'] = src_df['all_content'].apply(ast.literal_eval)
src_df['all_content'] = src_df['all_content'].apply(lambda x: x['stemmed'])

In [49]:
data_src_codes_content = src_df[['key', 'all_content']].values
data_src_codes_size = src_df[['key', 'total_lines']].values.tolist()

In [50]:
data_src_codes_content

array([['Generated',
        list(['sinc', 'common', 'annot', 'target', 'elementtyp', 'elementtyp', 'constructor', 'elementtyp', 'field', 'elementtyp', 'elementtyp', 'method', 'elementtyp', 'elementtyp', 'paramet', 'elementtyp', 'type', 'retent', 'retentionpolici', 'sourc', 'gener', 'string', 'valu', 'string', 'date', 'string', 'comment', 'element', 'type', 'annot', 'type', 'element', 'type', 'element', 'type', 'element', 'type', 'local', 'variabl', 'element', 'type', 'element', 'type', 'element', 'type', 'element', 'type', 'retent', 'polici'])],
       ['ManagedBean',
        list(['sinc', 'common', 'annot', 'target', 'elementtyp', 'type', 'retent', 'retentionpolici', 'runtim', 'managedbean', 'string', 'valu', 'element', 'type', 'retent', 'polici', 'manag', 'bean'])],
       ['PostConstruct',
        list(['sinc', 'common', 'annot', 'target', 'elementtyp', 'method', 'retent', 'retentionpolici', 'runtim', 'postconstruct', 'attribut', 'element', 'type', 'retent', 'polici', 'post', 'cons

In [39]:
inverted_index_source_codes = build_inverted_index(data_src_codes_content)

In [40]:
len(inverted_index_source_codes)

23097

## BUILD VSM

In [41]:
import math

In [42]:
def compute_ltc(documents, inverted_index):
    tf_idf = defaultdict(lambda: defaultdict(float))
    total_documents = len(documents)

    for key, contents in documents:
        content_frequency = defaultdict(int)
        for content in contents:
            content_frequency[content] += 1

        for content, count in content_frequency.items():
            tf = 1 + math.log(count, 10)
            df = len(inverted_index.get(content, []))
            idf = math.log((total_documents + 1) / (df + 1), 10)


            tf_idf[key][content] = tf * idf

    for doc_id in tf_idf:
        norm = math.sqrt(sum(weight ** 2 for weight in tf_idf[doc_id].values()))
        if norm > 0:
            for content in tf_idf[doc_id]:
                tf_idf[doc_id][content] /= norm

    return tf_idf

In [43]:
vsm = compute_ltc(data_src_codes_content, inverted_index_source_codes)

## CALCULATE COEFFICIENT FILE SIZE

In [14]:
max_size = 0
min_size = math.inf
for data_size in data_src_codes_size:
    if data_size[1] > max_size:
        max_size = data_size[1]
    if data_size[1] < min_size:
        min_size = data_size[1]

In [15]:
def calculate_coefficient_size(size):
    value_standardization = (size - min_size) / (max_size - min_size)
    return 1/(1 + math.exp(-value_standardization))

In [16]:
coefficient_size = defaultdict(float)
for data_size in data_src_codes_size:
    coefficient_size[data_size[0]] = calculate_coefficient_size(data_size[1])

## HANDLE REPORT DATA

In [81]:
file_path = "outputs/tomcat_bug_reports.csv"
report_df = pd.read_csv(file_path)

In [82]:
report_df.pos_tagged_description = report_df.pos_tagged_description.apply(ast.literal_eval)
report_df.pos_tagged_description = report_df.pos_tagged_description.apply(lambda x: x['stemmed'])

report_df.pos_tagged_summary = report_df.pos_tagged_summary.apply(ast.literal_eval)
report_df.pos_tagged_summary = report_df.pos_tagged_summary.apply(lambda x: x['stemmed'])

report_df['combined'] = report_df.apply(lambda row: row['pos_tagged_summary'] + row['pos_tagged_description'] if isinstance(row['pos_tagged_summary'], list) and isinstance(row['pos_tagged_description'], list) else [], axis=1)

In [83]:
report_data = report_df[['key', 'combined']].values.tolist()

In [84]:
report_data[0]

[56012, ['bug', 'statement', 'gener', 'use', 'attribut', 'page', 'direct']]

In [88]:
def compute_lnc(query):
    tf_idf = defaultdict(float)

    term_freq = defaultdict(int)
    for term in query[1]:
        term_freq[term] += 1

    for term, freq in term_freq.items():
        tf = 1 + math.log10(freq)
        tf_idf[term] = tf * 1

    norm = math.sqrt(sum(weight ** 2 for weight in tf_idf.values()))
    if norm > 0:
        for term in tf_idf:
            tf_idf[term] /= norm

    return tf_idf


In [89]:
compute_lnc(report_data[0])

defaultdict(float,
            {'bug': 0.3779644730092272,
             'statement': 0.3779644730092272,
             'gener': 0.3779644730092272,
             'use': 0.3779644730092272,
             'attribut': 0.3779644730092272,
             'page': 0.3779644730092272,
             'direct': 0.3779644730092272})

## RANKING RESULTS

In [91]:
def calculate_score(query, documents, inverted_index):
    query_vec = compute_lnc(query)
    doc_vecs = compute_ltc(documents, inverted_index)

    cosine_scores = {}

    for key, doc_vec in doc_vecs.items():
        dot_product = 0.0
        for term in query_vec:
            dot_product += query_vec[term] * doc_vec.get(term, 0.0) * coefficient_size[key]

        cosine_scores[key] = dot_product

    return cosine_scores

In [92]:
data = calculate_score(report_data[0], data_src_codes_content, inverted_index_source_codes)

In [93]:
sorted_data = dict(sorted(data.items(), key=lambda item: item[1], reverse=True))

In [94]:
sorted_data

{'QueryTimeoutInterceptor': 0.03668263558869763,
 'StatementFinalizerTest': 0.03649682040116023,
 'Bug53545': 0.027849572309249823,
 'StatementFinalizer': 0.026091523205476592,
 'TesterFunctions': 0.02347411824799349,
 'TwoDataSources': 0.021953815510510892,
 'Direction': 0.02106176365515753,
 'TestQueryTimeoutInterceptor': 0.02077175673994028,
 'TestStatementCache': 0.020111585225147097,
 'StatementCounterInterceptor': 0.019894210863469725,
 'SocketNioValidateSend': 0.01781356498308935,
 'CreateTestTable': 0.0173958226185588,
 'AbstractCreateStatementInterceptor': 0.017380692056548408,
 'TestJspReader': 0.01700284812879122,
 'TestException': 0.01652224575394712,
 'Bug50571': 0.016493886684341222,
 'SimplePOJOExample': 0.016213310864855234,
 'SocketNioSend': 0.01580515309212115,
 'SimplePOJOAsyncExample': 0.014996285299592706,
 'MultiPointSender': 0.01499614551226367,
 'TagFileInfo': 0.014708904885522964,
 'TestJspDocumentParser': 0.014522821657068552,
 'Bug51582': 0.013571112961719407