In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import csv
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the NLTK resources this notebook relies on (the English stop-word list
# and the 'punkt' tokenizer models), then build the stop-word lookup set that
# a later cell uses to filter the preprocessed_description column.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Define the directory paths
# Dataset locations, one tuple of two directory paths per project.
# Only the Eclipse dataset is enabled; the remaining projects are kept
# here, commented out, so they can be switched on again.
directories = [
    (r"..\new_dataset\eclipse", r"..\new_dataset\eclipse"),
    # (r"..\new_dataset\eclipse_test", r"..\new_dataset\eclipse_test"),
    # (r"..\new_dataset\firefox", r"..\new_dataset\firefox"),
    # (r"..\new_dataset\netbeans", r"..\new_dataset\netbeans"),
    # (r"..\new_dataset\openoffice", r"..\new_dataset\openoffice"),
]

In [4]:
# Load the small Eclipse bug-report dataset (eclipse_small_new_updated.csv)
df = pd.read_csv(r"..\new_dataset\eclipse\eclipse_small_new_updated.csv")

# Preview the first 5 values of the preprocessed_description column
print(df.loc[:, 'preprocessed_description'].head())

0    description regression group toc are created a...
1    output column page in data set editor used res...
2    description regression failed to preview chart...
3    description exception is thrown out when link ...
4    build id m step to reproduce start eclipse cli...
Name: preprocessed_description, dtype: object


In [5]:


def _drop_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)


# Filter the stop words out of every preprocessed description
df['preprocessed_description'] = df['preprocessed_description'].apply(_drop_stop_words)

# Preview the first 5 filtered descriptions
print(df['preprocessed_description'].head())

0    description regression group toc created autom...
1    output column page data set editor used result...
2    description regression failed preview chart vi...
3    description exception thrown link label anothe...
4    build id step reproduce start eclipse click he...
Name: preprocessed_description, dtype: object


In [6]:
# Hold out 20% of the bug reports for testing; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Show the first row of the training split
print(X_train.iloc[:1])

# Show the first row of the testing split
print(X_test.iloc[:1])

     bug_id bug_severity                                        description  \
762  214940       normal  If my memory don't fail me, content.xml and ar...   

    dup_id priority                              short_desc  \
762     []       P3  [prov] Use ECF to get repository files   

                              preprocessed_description  
762  memory dont fail , content xml artifact xml cu...  
     bug_id bug_severity                                        description  \
394  214519  enhancement  It would be useful to allow SQL to be appended...   

    dup_id priority                                         short_desc  \
394     []       P5  Allow appending strings to CREATE TABLE statem...   

                              preprocessed_description  
394  would useful allow sql appended end create tab...  


In [7]:
# TF-IDF feature extraction: the vectorizer is fitted on the training
# descriptions only, and the test descriptions are then transformed with
# that same fitted vectorizer.
text_column = 'preprocessed_description'
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train[text_column])
X_test_tfidf = vectorizer.transform(X_test[text_column])

In [51]:
# Duplicate-detection evaluation.
#
# For every test bug report, the training reports whose TF-IDF cosine
# similarity reaches SIMILARITY_THRESHOLD are predicted as its duplicates.
# The predictions are compared with the ids listed in the report's `dup_id`
# column and summarised as precision / recall / F1 / accuracy.
#
# The sklearn metric functions are already imported in the first cell, so
# they are not re-imported here.

# Minimum cosine similarity for a training report to count as a duplicate.
SIMILARITY_THRESHOLD = 0.9
# Sentinel stored instead of an empty id list, meaning "no duplicate".
NO_DUPLICATE = -1


def _parse_dup_ids(raw_dup_id):
    """Turn one `dup_id` cell into a list of integer bug ids.

    The cell is split on ';'. Empty tokens and the literal '[]' (which the
    dataset uses for "no duplicates") are ignored. A missing value (NaN) or
    a cell without any id yields [NO_DUPLICATE].

    Raises:
        ValueError: if a remaining token is not an integer.
    """
    if pd.isna(raw_dup_id):
        return [NO_DUPLICATE]
    tokens = (token.strip() for token in str(raw_dup_id).split(';'))
    parsed_ids = [int(token) for token in tokens if token and token != '[]']
    return parsed_ids if parsed_ids else [NO_DUPLICATE]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Pull the id columns out once instead of re-indexing the frames every iteration.
test_bug_ids = X_test['bug_id'].values
test_dup_ids = X_test['dup_id'].values
train_bug_ids = X_train['bug_id'].values

# test bug_id -> predicted duplicate ids ([NO_DUPLICATE] when nothing matched)
predicted_duplicates_dict = {}
# test bug_id -> true duplicate ids ([NO_DUPLICATE] when the report has none)
true_duplicates_dict = {}

# Iterate over each example of the test data
for i in range(X_test_tfidf.shape[0]):
    test_bug_id = test_bug_ids[i]

    # Cosine similarity between this test report and all training reports
    # (result has shape (1, n_train), so column indices identify training rows).
    similarity = cosine_similarity(X_test_tfidf[i], X_train_tfidf)
    matched_train_ids = train_bug_ids[np.where(similarity >= SIMILARITY_THRESHOLD)[1]]

    # Use the sentinel when no training report is similar enough.
    bug_ids = matched_train_ids if len(matched_train_ids) > 0 else [NO_DUPLICATE]
    predicted_duplicates_dict[test_bug_id] = list(bug_ids)

    print ("Predicted Duplicates for Bug ID {}: {}".format(test_bug_id, bug_ids))
    print("Test Bug ID:", test_bug_id)
    print("Training Bug IDs:", matched_train_ids)

    # Ground-truth duplicates of the test report.
    dup_ids = _parse_dup_ids(test_dup_ids[i])
    true_duplicates_dict[test_bug_id] = list(dup_ids)
    print("True Duplicates for Bug ID {}: {}".format(test_bug_id, dup_ids))

    print("\n")

# Confusion-matrix counts. Every test report contributes exactly one outcome,
# so the four counts always sum to the number of evaluated test reports:
#   true negative  - nothing predicted, and the report has no true duplicate
#   false negative - nothing predicted, but the report has a true duplicate
#   false positive - something predicted, but the report has no true duplicate,
#                    or none of the predicted ids is a true duplicate
#   true positive  - at least one predicted id is a true duplicate
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0

for key, predicted_ids in predicted_duplicates_dict.items():
    actual_ids = true_duplicates_dict[key]
    nothing_predicted = NO_DUPLICATE in predicted_ids
    nothing_expected = NO_DUPLICATE in actual_ids

    if nothing_predicted and nothing_expected:
        true_negative += 1
    elif nothing_predicted:
        false_negative += 1
    elif nothing_expected:
        false_positive += 1
    elif any(bug_id in actual_ids for bug_id in predicted_ids):
        true_positive += 1
    else:
        # Duplicates were predicted, but every predicted id is wrong.
        false_positive += 1

# Precision, recall, F1 and accuracy; each falls back to 0.0 instead of
# raising ZeroDivisionError when its denominator is zero.
precision = _safe_ratio(true_positive, true_positive + false_positive)
recall = _safe_ratio(true_positive, true_positive + false_negative)
f1 = _safe_ratio(2 * precision * recall, precision + recall)
# Named `accuracy` (not `accuracy_score`) so the sklearn function imported
# in the first cell is not overwritten by a float.
accuracy = _safe_ratio(
    true_positive + true_negative,
    true_positive + true_negative + false_positive + false_negative,
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy Score:", accuracy)

# confusion matrix: rows are the actual class, columns the predicted class
conf_matrix = [[true_negative, false_positive],
               [false_negative, true_positive]]

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)



Predicted Duplicates for Bug ID 214519: [-1]
Test Bug ID: 214519
Training Bug IDs: []
True Duplicates for Bug ID 214519: [-1]


Predicted Duplicates for Bug ID 214139: [-1]
Test Bug ID: 214139
Training Bug IDs: []
True Duplicates for Bug ID 214139: [-1]


Predicted Duplicates for Bug ID 214639: [-1]
Test Bug ID: 214639
Training Bug IDs: []
True Duplicates for Bug ID 214639: [-1]


Predicted Duplicates for Bug ID 214142: [-1]
Test Bug ID: 214142
Training Bug IDs: []
True Duplicates for Bug ID 214142: [-1]


Predicted Duplicates for Bug ID 215046: [-1]
Test Bug ID: 215046
Training Bug IDs: []
True Duplicates for Bug ID 215046: [-1]


Predicted Duplicates for Bug ID 214943: [-1]
Test Bug ID: 214943
Training Bug IDs: []
True Duplicates for Bug ID 214943: [-1]


Predicted Duplicates for Bug ID 214160: [-1]
Test Bug ID: 214160
Training Bug IDs: []
True Duplicates for Bug ID 214160: [-1]


Predicted Duplicates for Bug ID 214989: [214988 215040 214990]
Test Bug ID: 214989
Training Bug IDs: [21