In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [2]:
# Read the file eclipse_small_new.csv (path is relative to the notebook's working directory).
# Forward slashes are used because they resolve on Windows, Linux and macOS alike,
# whereas backslash separators only work on Windows.
df = pd.read_csv("../new_dataset/eclipse/eclipse_small_new.csv")


# Print the first 5 rows of the dataframe (preprocessed_description column only)
print(df['preprocessed_description'].head())

0    description regression group toc created autom...
1    output column page data set editor used result...
2    description regression failed preview chart vi...
3    description exception thrown link label anothe...
4    build id step reproduce start eclipse click he...
Name: preprocessed_description, dtype: object


In [3]:
# Hold out 20% of the rows as test data; the fixed seed keeps the partition stable
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Show the first row of the training partition, then of the testing partition
for partition in (X_train, X_test):
    print(partition.head(1))

     bug_id                                        description dup_id  \
762  214940  If my memory don't fail me, content.xml and ar...     []   

                              preprocessed_description  
762  memory dont fail content xml artifact xml curr...  
     bug_id                                        description dup_id  \
394  214519  It would be useful to allow SQL to be appended...     []   

                              preprocessed_description  
394  would useful allow sql appended end create tab...  


In [4]:
# Report the (rows, columns) shape of both partitions, one per line
print(
    f"Training data size: {X_train.shape}",
    f"Testing data size: {X_test.shape}",
    sep="\n",
)

Training data size: (696, 4)
Testing data size: (174, 4)


In [5]:
# Use Latent Semantic Analysis: TF-IDF weights over the 1000 most frequent terms,
# reduced to 100 dense components with a truncated SVD.
# NOTE(review): both the vectorizer and the SVD are fitted on the full dataframe,
# i.e. including the rows that are later held out as test data. Consider fitting
# on the training rows only and calling transform() on the test rows.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['preprocessed_description'])
# TruncatedSVD defaults to a randomized solver; pinning the seed makes the
# embedding (and every similarity score derived from it) reproducible across runs.
lsa = TruncatedSVD(n_components=100, random_state=42)
X_lsa = lsa.fit_transform(X)


In [7]:
# Split the LSA transformed data into training and testing data.
# The same test_size and random_state as the dataframe split are used on an input
# with the same number of rows, so row i of X_train_lsa / X_test_lsa is the
# embedding of row i of X_train / X_test.
X_train_lsa, X_test_lsa = train_test_split(X_lsa, test_size=0.2, random_state=42)

# The embeddings and the dataframe rows are paired purely by position,
# so fail loudly if the two splits ever drift apart.
assert X_train_lsa.shape[0] == len(X_train), "LSA and dataframe training splits differ in size"
assert X_test_lsa.shape[0] == len(X_test), "LSA and dataframe testing splits differ in size"

# Print the first row of the LSA training data
print(X_train_lsa[0])

[ 0.07362097 -0.15141954 -0.07436443 -0.10251979  0.0217279  -0.09031283
  0.01396534 -0.17144325  0.00387314  0.20601978  0.16086724 -0.00818873
  0.04187784 -0.00436555  0.07027703 -0.06821416  0.00393371  0.00311686
  0.05898488  0.00574588 -0.09679636  0.10118397 -0.00454336  0.05949887
  0.06730524  0.02513401  0.01154922 -0.00488237 -0.01227337 -0.01954594
  0.21701561 -0.0056095   0.04790338  0.01996441 -0.01976221 -0.03331464
 -0.05086255  0.00259736  0.03020372  0.09977146 -0.06863719  0.05401869
 -0.006327    0.02225704 -0.04710008  0.01016613 -0.01962845  0.00712673
 -0.15783228  0.04741512 -0.04813148  0.0097571  -0.02319158  0.06597655
  0.05067605 -0.08362086 -0.08307749 -0.00039901  0.02356209  0.16990979
 -0.04431828 -0.0566475  -0.04300568  0.03347948 -0.12454569 -0.00514543
  0.02446015 -0.04915059  0.02503544  0.03187724 -0.09437575 -0.01061891
 -0.04405069  0.06047827 -0.00423513  0.1156071  -0.0754232   0.04987352
  0.00469303  0.02919723 -0.02876616  0.09168144  0

In [None]:
'''
Meaning of the Numbers:
Latent Dimensions (Topics):
Each number is the document's coordinate along one of the 100 latent dimensions (n_components=100) found by the LSA model; these dimensions are often read loosely as "topics".

Document Representation:
The vector [ 0.07362097, -0.15141954, ..., 0.04352631] is the representation of the first document in the latent semantic space. 
Each element of the vector indicates the extent to which the document is associated with the corresponding topic.

Topic Strengths:
The magnitude and sign (positive or negative) of each number indicate the strength and direction of the association with the corresponding latent dimension. 
Larger absolute values suggest a stronger association with that dimension.

Practical Example:
Assume X_train_lsa[0] corresponds to a document about "machine learning algorithms."

0.07362097 might indicate a weak association with the first latent topic.
-0.15141954 might indicate a moderate negative association with the second latent topic.
0.20601978 might indicate a strong positive association with the tenth latent topic.
'''

In [None]:
# Minimum cosine similarity for a training report to be flagged as a duplicate
SIMILARITY_THRESHOLD = 0.9
# Sentinel stored in the dictionaries when a report has no predicted / true duplicates
NO_DUPLICATE = -1


def parse_dup_ids(raw):
    """Convert one raw ``dup_id`` cell into a list of integer bug ids.

    Accepted inputs:
      * a missing value (NaN / None)            -> []
      * a single numeric value                  -> [int(value)]
      * a bracketed list such as "[1, 2]" or "['1', '2']"
      * a delimited string such as "1;2" or "1;2;"

    Empty tokens (produced by trailing or repeated separators) are ignored.

    Raises:
        ValueError: if a non-empty token is not an integer literal.
    """
    if not isinstance(raw, str):
        # pandas reads empty CSV cells as NaN, which has no string methods
        return [] if pd.isna(raw) else [int(raw)]
    text = raw.strip()
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    # Treat ';' and ',' alike, then drop surrounding blanks and quote characters
    tokens = (token.strip().strip('\'"') for token in text.replace(';', ',').split(','))
    return [int(token) for token in tokens if token]


def count_outcomes(predicted, actual):
    """Classify every test report and return ``(tp, fp, tn, fn)``.

    Both arguments map a test bug id to a list of bug ids, where the list
    ``[NO_DUPLICATE]`` means "none". All counters use the test report as unit:

      * nothing predicted, nothing expected      -> true negative
      * nothing predicted, duplicates expected   -> false negative
      * duplicates predicted, nothing expected   -> false positive
      * at least one predicted id is expected    -> true positive
      * predicted and expected ids are disjoint  -> false positive AND false
        negative (a wrong report was flagged and the real duplicate was missed)
    """
    tp = fp = tn = fn = 0
    for key, predicted_ids in predicted.items():
        expected_ids = actual[key]
        has_prediction = NO_DUPLICATE not in predicted_ids
        has_expected = NO_DUPLICATE not in expected_ids
        if not has_prediction and not has_expected:
            tn += 1
        elif not has_prediction:
            fn += 1
        elif not has_expected:
            fp += 1
        elif any(bug_id in expected_ids for bug_id in predicted_ids):
            tp += 1
        else:
            fp += 1
            fn += 1
    return tp, fp, tn, fn


# Make a dictionary to store the test example bug_id as a key and the predicted duplicates as values
predicted_duplicates_dict = {}
# Make a dictionary to store the test example bug_id as a key and the corrected duplicates as values
true_duplicates_dict = {}

# Column lookups do not change inside the loop, so do them once
train_bug_ids = X_train['bug_id'].values
test_bug_ids = X_test['bug_id'].values
test_dup_ids = X_test['dup_id'].values

# Iterate over each example of the test data.
# Row i of X_test_lsa is assumed to be the embedding of row i of X_test.
for i in range(X_test_lsa.shape[0]):
    test_bug_id = test_bug_ids[i]

    # Calculate the cosine similarity between the test example and all the training examples
    similarity = cosine_similarity(X_test_lsa[i].reshape(1, -1), X_train_lsa)

    # Bug ids of the training examples whose similarity reaches the threshold
    # (np.where on the 1 x n_train matrix returns the column indices at position 1)
    matched_bug_ids = train_bug_ids[np.where(similarity >= SIMILARITY_THRESHOLD)[1]]
    # If nothing matched, store the sentinel instead of an empty list
    bug_ids = matched_bug_ids if len(matched_bug_ids) > 0 else [NO_DUPLICATE]

    predicted_duplicates_dict[test_bug_id] = list(bug_ids)
    print("Predicted Duplicates for Bug ID {}: {}".format(test_bug_id, bug_ids))

    # Print the bug ID of the test example
    print("Test Bug ID:", test_bug_id)

    # Along with the bug_id of the training example
    print("Training Bug IDs:", matched_bug_ids)

    # Parse the recorded duplicates of the test example; the sentinel marks "none"
    dup_ids = parse_dup_ids(test_dup_ids[i])
    if len(dup_ids) == 0:
        dup_ids = [NO_DUPLICATE]

    true_duplicates_dict[test_bug_id] = list(dup_ids)
    print("True Duplicates for Bug ID {}: {}".format(test_bug_id, dup_ids))

    print("\n")

# Calculate the true_positive, false_positive, true_negative, false_negative
true_positive, false_positive, true_negative, false_negative = count_outcomes(
    predicted_duplicates_dict, true_duplicates_dict
)

# Calculate the precision, recall, and f1 score
precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
# Accuracy is the share of test reports handled correctly. The number of evaluated
# reports is the denominator because a disjoint prediction counts as both a false
# positive and a false negative, so summing the four counters would double-count it.
evaluated_reports = len(predicted_duplicates_dict)
accuracy = (true_positive + true_negative) / evaluated_reports if evaluated_reports > 0 else 0

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy Score:", accuracy)

# Write the values to results.txt file
with open("results.txt", "w") as f:
    f.write("Accuracy Score: " + str(accuracy) + "\n")
    f.write("Precision: " + str(precision) + "\n")
    f.write("Recall: " + str(recall) + "\n")
    f.write("F1 Score: " + str(f1) + "\n")