## PREPROCESSING DATA

In [1]:
from preprocessing import Parser, ReportPreprocessor, SrcPreprocessor
from datasets import DATASET

from collections import defaultdict, OrderedDict

In [2]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nhatm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
parser = Parser(DATASET)
bug_reports = parser.report_parser()
src_files = parser.src_parser()

In [4]:
src_processor = SrcPreprocessor(src_files, parser.name).preprocess()
report_processor = ReportPreprocessor(bug_reports, parser.name).preprocess()

## INDEXING SOURCE CODE FILES


In [3]:
import pandas as pd

In [4]:
file_path = "outputs/tomcat_source_code_data.csv"
src_df = pd.read_csv(file_path)

In [20]:
def prepare_data(input_data, key_string, value_string):
    output_data = {}
    for index, row in input_data.iterrows():
        all_content = eval(row[value_string])
        output_data[row[key_string]] = all_content['stemmed']
    return output_data

In [9]:
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    for key, contents in documents.items():
        for content in contents:
            inverted_index[content].append(key)
    return inverted_index

In [10]:
data_src_codes_content = prepare_data(src_df, 'key','all_content')
data_src_codes_size = src_df[['key', 'total_lines']].values.tolist()
inverted_index_source_codes = build_inverted_index(data_src_codes_content)

## BUILD VSM

In [11]:
import math

In [72]:
def compute_ltc(documents, inverted_index):
    tf_idf = defaultdict(lambda: defaultdict(float))
    total_documents = len(documents)

    for key, contents in documents.items():
        content_frequency = defaultdict(int)
        for content in contents:
            content_frequency[content] += 1

        for content, count in content_frequency.items():
            tf = 1 + math.log(count, 10)
            df = len(inverted_index.get(content, []))
            idf = math.log((total_documents + 1) / (df + 1), 10)


            tf_idf[key][content] = tf * idf

    for doc_id in tf_idf:
        norm = math.sqrt(sum(weight ** 2 for weight in tf_idf[doc_id].values()))
        if norm > 0:
            for content in tf_idf[doc_id]:
                tf_idf[doc_id][content] /= norm

    return tf_idf

In [73]:
vsm = compute_ltc(data_src_codes_content, inverted_index_source_codes)

In [78]:
vsm_iter = list(vsm)

## CALCULATE COEFFICIENT FILE SIZE

In [14]:
max_size = 0
min_size = math.inf
for data_size in data_src_codes_size:
    if data_size[1] > max_size:
        max_size = data_size[1]
    if data_size[1] < min_size:
        min_size = data_size[1]

In [15]:
def calculate_coefficient_size(size):
    value_standardization = (size - min_size) / (max_size - min_size)
    return 1/(1 + math.exp(-value_standardization))

In [16]:
coefficient_size = defaultdict(float)
for data_size in data_src_codes_size:
    coefficient_size[data_size[0]] = calculate_coefficient_size(data_size[1])

## HANDLE REPORT DATA

In [18]:
file_path = "outputs/tomcat_bug_reports.csv"
report_df = pd.read_csv(file_path)

In [28]:
def prepare_report_data():
    summary_contents = prepare_data(report_df, 'key', 'pos_tagged_summary')
    description_contents = prepare_data(report_df, 'key', 'pos_tagged_description')

    output_data = summary_contents
    for key, contents in description_contents.items():
        output_data[key].extend(contents)

    return output_data

In [29]:
report_data = prepare_report_data()

In [67]:
def compute_lnc(query):
    tf_idf = defaultdict(float)

    term_freq = defaultdict(int)
    for term in query:
        term_freq[term] += 1

    for term, freq in term_freq.items():
        tf = 1 + math.log10(freq)
        tf_idf[term] = tf

    norm = math.sqrt(sum(weight ** 2 for weight in tf_idf.values()))
    if norm > 0:
        for term in tf_idf:
            tf_idf[term] /= norm

    return tf_idf


## RANKING RESULTS

In [64]:
def calculate_cosine(query, documents, inverted_index):
    query_vec = compute_lnc(query)
    doc_vecs = compute_ltc(documents, inverted_index)

    cosine_scores = {}

    for key, doc_vec in doc_vecs.items():
        dot_product = 0.0
        for term in query_vec:
            dot_product += query_vec[term] * doc_vec.get(term, 0.0)

        cosine_scores[key] = dot_product

    return cosine_scores

In [92]:
data = calculate_cosine(report_data[56025], data_src_codes_content, inverted_index_source_codes)

In [93]:
sorted_data = dict(sorted(data.items(), key=lambda item: item[1], reverse=True))

In [94]:
sorted_data

{'ServerEndpoint': 0.10334454594719258,
 'WsMappingResult': 0.07345751369347404,
 'Heartbeat': 0.06629390015625677,
 'ServerContainer': 0.06381174442325215,
 'InstanceManager': 0.05926917771489239,
 'TestWsServerContainer': 0.05840202453344987,
 'ServerApplicationConfig': 0.05640820683041428,
 'ExamplesConfig': 0.05328884035706412,
 'WsPerSessionServerEndpointConfig': 0.050920651279297104,
 'DefaultServerEndpointConfigurator': 0.04853583194883345,
 'ExceptionUtils': 0.048391899043786664,
 'FilterChain': 0.04769394492030366,
 'Bug53545': 0.040317122438562225,
 'CometFilterChain': 0.04022320463114813,
 'PageData': 0.036806276043473106,
 'TesterFunctions': 0.03515098644252456,
 'DefaultServerEndpointConfig': 0.033887079608387966,
 'JspEngineInfo': 0.033167968825189964,
 'ServerEndpointConfig': 0.031368901800488715,
 'DateFormatCache': 0.029276005409629116,
 'TestJspReader': 0.02451817429501884,
 'SlowQueryReport': 0.02388590053433614,
 'TestStandardContextValve': 0.02286275513430326,
 'Te