In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Read the Eclipse bug-report dataset (eclipse_small_new.csv).
# The path is relative to the notebook's working directory. Forward slashes are
# used because they resolve on Windows, Linux and macOS alike, whereas the
# backslash-separated form only works on Windows.
df = pd.read_csv("../new_dataset/eclipse/eclipse_small_new.csv")


# Print the first 5 rows of the dataframe (preprocessed_description column only)
print(df['preprocessed_description'].head())

0    description regression group toc created autom...
1    output column page data set editor used result...
2    description regression failed preview chart vi...
3    description exception thrown link label anothe...
4    build id step reproduce start eclipse click he...
Name: preprocessed_description, dtype: object


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test = train_test_split(df, random_state=42, test_size=0.2)

# Show the first row of the training split, then the first row of the testing split
for split_frame in (X_train, X_test):
    print(split_frame.head(1))

     bug_id                                        description dup_id  \
762  214940  If my memory don't fail me, content.xml and ar...     []   

                              preprocessed_description  
762  memory dont fail content xml artifact xml curr...  
     bug_id                                        description dup_id  \
394  214519  It would be useful to allow SQL to be appended...     []   

                              preprocessed_description  
394  would useful allow sql appended end create tab...  


In [9]:
# Report the (rows, columns) shape of each split, one per line
print(
    f"Training data size: {X_train.shape}",
    f"Testing data size: {X_test.shape}",
    sep="\n",
)

Training data size: (696, 4)
Testing data size: (174, 4)


In [10]:
# Turn the preprocessed descriptions into bag-of-words count vectors.
# NOTE: despite the `_tfidf` suffix these matrices hold raw term counts produced by
# CountVectorizer, not TF-IDF weights; the names are kept because later cells use them.
train_descriptions = X_train['preprocessed_description']
test_descriptions = X_test['preprocessed_description']

vectorizer = CountVectorizer()
# The vocabulary is learned from the training split only and then reused for the test split
X_train_tfidf = vectorizer.fit_transform(train_descriptions)
X_test_tfidf = vectorizer.transform(test_descriptions)

In [None]:
# Minimum cosine similarity for a training report to be flagged as a duplicate
SIMILARITY_THRESHOLD = 0.9


def _parse_dup_ids(raw):
    """Parse one raw ``dup_id`` cell into a list of integer bug ids.

    Accepted forms:
      * a bracketed list string such as ``"[123, 456]"`` (ids may be quoted),
      * a separator-delimited string such as ``"123;456"``,
      * a bare number,
      * a missing value (NaN / None), which yields an empty list.

    Returns:
        list[int]: the parsed ids; empty when the cell holds no ids.

    Raises:
        ValueError: if a non-empty token is not an integer literal.
    """
    if not isinstance(raw, str):
        # pandas reads empty CSV cells as NaN; a numeric cell holds a single id
        return [] if pd.isna(raw) else [int(raw)]

    text = raw.strip()
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]

    # Treat ';' and ',' alike, and drop surrounding whitespace/quotes and empty tokens
    tokens = (tok.strip().strip("'\"") for tok in text.replace(';', ',').split(','))
    return [int(tok) for tok in tokens if tok]


# Maps each test bug_id to the training bug_ids predicted as its duplicates ([-1] if none)
predicted_duplicates_dict = {}
# Maps each test bug_id to its labelled duplicates from the dup_id column ([-1] if none)
true_duplicates_dict = {}

# Pull the columns out once instead of rebuilding the arrays on every iteration
test_bug_ids = X_test['bug_id'].values
test_dup_ids = X_test['dup_id'].values
train_bug_ids = X_train['bug_id'].values

# Iterate over each example of the test data
for i in range(X_test_tfidf.shape[0]):
    test_bug_id = test_bug_ids[i]

    # Cosine similarity between this test example and every training example (shape 1 x n_train)
    similarity = cosine_similarity(X_test_tfidf[i], X_train_tfidf)

    # Training bug_ids whose similarity reaches the threshold; index [1] selects the column positions
    matched_train_ids = train_bug_ids[np.where(similarity >= SIMILARITY_THRESHOLD)[1]]

    # -1 is the sentinel for "no duplicate predicted"
    bug_ids = matched_train_ids if len(matched_train_ids) > 0 else [-1]
    predicted_duplicates_dict[test_bug_id] = list(bug_ids)
    print("Predicted Duplicates for Bug ID {}: {}".format(test_bug_id, bug_ids))

    # Print the bug ID of the test example
    print("Test Bug ID:", test_bug_id)

    # Along with the bug_ids of the matching training examples (may be empty)
    print("Training Bug IDs:", matched_train_ids)

    # Labelled duplicates of the test example; -1 is the sentinel for "no known duplicate"
    dup_ids = _parse_dup_ids(test_dup_ids[i]) or [-1]
    true_duplicates_dict[test_bug_id] = dup_ids
    print("True Duplicates for Bug ID {}: {}".format(test_bug_id, dup_ids))

    print("\n")

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Confusion-matrix counts, one bucket per test report so that the four counts
# always sum to the number of evaluated reports.
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0

for key, predicted in predicted_duplicates_dict.items():
    actual = true_duplicates_dict[key]
    # -1 is the sentinel stored when a report has no predicted / no labelled duplicates
    nothing_predicted = -1 in predicted
    nothing_labelled = -1 in actual

    if nothing_predicted and nothing_labelled:
        true_negative += 1
    elif nothing_predicted:
        false_negative += 1
    elif nothing_labelled:
        false_positive += 1
    elif any(bug_id in actual for bug_id in predicted):
        # At least one predicted duplicate is a labelled duplicate
        true_positive += 1
    else:
        # Duplicates were predicted, but none of them is a labelled duplicate.
        # Previously such reports were counted in no bucket at all, which
        # inflated every metric; they are scored as a wrong positive prediction.
        false_positive += 1

# Calculate the precision, recall, f1 score and accuracy (0 when undefined)
precision = _safe_ratio(true_positive, true_positive + false_positive)
recall = _safe_ratio(true_positive, true_positive + false_negative)
f1 = _safe_ratio(2 * precision * recall, precision + recall)
accuracy = _safe_ratio(
    true_positive + true_negative,
    true_positive + true_negative + false_positive + false_negative,
)

# Write the values to results.txt file (overwrites any previous run)
with open("results.txt", "w") as f:
    f.write(f"Accuracy Score: {accuracy}\n")
    f.write(f"Precision: {precision}\n")
    f.write(f"Recall: {recall}\n")
    f.write(f"F1 Score: {f1}\n")
