In [94]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split

In [95]:
# Remove the stop words from the preprocessed_description column using nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [96]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse", "..\\new_dataset\\eclipse"),
    #("..\\new_dataset\\eclipse_test", "..\\new_dataset\\eclipse_test"),
    #("..\\new_dataset\\firefox", "..\\new_dataset\\firefox"),
    #("..\\new_dataset\\netbeans", "..\\new_dataset\\netbeans"),
    #("..\\new_dataset\\openoffice", "..\\new_dataset\\openoffice")
]

In [97]:
# Read the file eclipse_new_updated.csv
df = pd.read_csv("..\\new_dataset\\eclipse\\eclipse_small_new_updated.csv")

# Print the first 5 rows of the dataframe (preprocessed_description column only)
print(df['preprocessed_description'].head())

0    description regression group toc are created a...
1    output column page in data set editor used res...
2    description regression failed to preview chart...
3    description exception is thrown out when link ...
4    build id m step to reproduce start eclipse cli...
Name: preprocessed_description, dtype: object


In [98]:
import numpy as np

# Check for NaN values and replace them with an empty string
df['preprocessed_description'] = df['preprocessed_description'].fillna('')

# Apply the lambda function only to non-NaN values
df['preprocessed_description'] = df['preprocessed_description'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop_words)]))


In [99]:
# Split the data into training and testing data
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# print the first 1 rows of the training data
print(X_train.head(1))

# print the first 1 rows of the testing data
print(X_test.head(1))

# Drop rows where bug_id is not numeric in X_train
X_train = X_train[X_train['bug_id'].apply(lambda x: str(x).isdigit())]

# Drop rows where bug_id is not numeric in X_test
X_test = X_test[X_test['bug_id'].apply(lambda x: str(x).isdigit())]


      bug_id bug_severity                                        description  \
2279    2540       normal  The contents of the Open>Perspective menu depe...   

     dup_id priority                                         short_desc  \
2279     []       P3  Contents of Perspective>Open menu should not c...   

                               preprocessed_description  
2279  content open perspective menu depend current p...  
      bug_id bug_severity                                        description  \
3500    3781  enhancement  0) On J platform, save a method with J charact...   

     dup_id priority                                         short_desc  \
3500     []       P4  J - LOW PRIORITY - Eclipse methods saved with ...   

                               preprocessed_description  
3500  j platform , save method j character invoke e ...  


In [100]:
# Use TF-IDF as a feature extraction technique
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['preprocessed_description'])
X_test_tfidf = vectorizer.transform(X_test['preprocessed_description'])    

In [101]:
# print the size of X_train_tfidf
print(X_train_tfidf.shape)

# print the size of X_test_tfidf
print(X_test_tfidf.shape)



(3246, 14231)
(812, 14231)


In [102]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re

# make a dictionary to store the test example bug_id as a key and the predicted duplicates as values
predicted_duplicates_dict = {}
# make a dictionary to store the test example bug_id as a key and the corrected duplicates as values
true_duplicates_dict = {}

# Loop through the test examples
for i in range(X_test_tfidf.shape[0]):
    print("Processing test example", i)
    
    # Check if X_test['bug_id'].values is empty
    if len(X_test['bug_id'].values) == 0:
        continue
    
    predicted_duplicates_dict[X_test['bug_id'].values[i]] = []
    true_duplicates_dict[X_test['bug_id'].values[i]] = []
    
    # Calculate the cosine similarity between the test example and all the training examples
    similarity = cosine_similarity(X_test_tfidf[i], X_train_tfidf)

    # Get the bug_ids of the training examples that have similarity greater than or equal to 0.9
    bug_ids = X_train['bug_id'].values[np.where(similarity >= 0.9)[1]]
    # if bug_ids is empty, replace it with -1
    if len(bug_ids) == 0:
        bug_ids = [-1]

    # Append the bug_ids to the predicted_duplicates list
    predicted_duplicates_dict[X_test['bug_id'].values[i]].extend(bug_ids)

    # Get the dup_id of the test example
    dup_id_value = X_test['dup_id'].values[i]
    if not pd.isnull(dup_id_value):  # Check if the value is not NaN
        # split it by semicolons
        dup_ids = dup_id_value.split(';')
        # after splitting check if there is an empty string in the list and remove it
        dup_ids = [dup_id for dup_id in dup_ids if dup_id]  # Filter out empty strings
        # if the dup_ids list is ['[]'], replace it with -1
        if dup_ids == ['[]']:
            dup_ids = [-1]
        # make the dup_ids list as integers
        dup_ids = [int(dup_id) for dup_id in dup_ids]
    else:
        dup_ids = [-1]  # Assign -1 if the value is NaN
        
    # Append the dup_ids to the true_duplicates list
    true_duplicates_dict[X_test['bug_id'].values[i]].extend(dup_ids)


Processing test example 0
Processing test example 1
Processing test example 2
Processing test example 3
Processing test example 4
Processing test example 5
Processing test example 6
Processing test example 7
Processing test example 8
Processing test example 9
Processing test example 10
Processing test example 11
Processing test example 12
Processing test example 13
Processing test example 14
Processing test example 15
Processing test example 16
Processing test example 17
Processing test example 18
Processing test example 19
Processing test example 20
Processing test example 21
Processing test example 22
Processing test example 23
Processing test example 24
Processing test example 25
Processing test example 26
Processing test example 27
Processing test example 28
Processing test example 29
Processing test example 30
Processing test example 31
Processing test example 32
Processing test example 33
Processing test example 34
Processing test example 35
Processing test example 36
Processing 

In [103]:
# calculate the true_positive, false_positive, true_negative, false_negative
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0

# There are 4 cases to consider:
# 1. The predicted duplicate is in the true duplicates list (true positive)
# 2. The predicted duplicate is not in the true duplicates list (false positive)
# 3. The true duplicate is not predicted as a duplicate (false negative)
# 4. The true duplicate is predicted as a duplicate (true negative)

# Note: if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains -1, and true_duplicates_dict[X_test['bug_id'].values[i]] contains -1, then it is a true negative
# if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains -1, and true_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, then it is a false negative
# if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, and true_duplicates_dict[X_test['bug_id'].values[i]] contains -1, then it is a false positive
# if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, and true_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, then it is a true positive

for key in predicted_duplicates_dict.keys():
    if -1 in predicted_duplicates_dict[key] and -1 in true_duplicates_dict[key]:
        true_negative += 1
    elif -1 in predicted_duplicates_dict[key] and -1 not in true_duplicates_dict[key]:
        false_negative += 1
    elif -1 not in predicted_duplicates_dict[key] and -1 in true_duplicates_dict[key]:
        false_positive += 1
    elif -1 not in predicted_duplicates_dict[key] and -1 not in true_duplicates_dict[key]:
        for bug_id in predicted_duplicates_dict[key]:
            if bug_id in true_duplicates_dict[key]:
                true_positive += 1

# print the true_positive, false_positive, true_negative, false_negative
print("True Positive:", true_positive)
print("False Positive:", false_positive)
print("True Negative:", true_negative)
print("False Negative:", false_negative)

True Positive: 20
False Positive: 24
True Negative: 778
False Negative: 2


In [104]:


# calculate the precision, recall, and f1 score
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1 = 2 * precision * recall / (precision + recall)
accuracy_score = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy Score:", accuracy_score)

# confusion matrix
conf_matrix = [[true_negative, false_positive],
               [false_negative, true_positive]]



Precision: 0.45454545454545453
Recall: 0.9090909090909091
F1 Score: 0.6060606060606061
Accuracy Score: 0.9684466019417476


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Define the range of n-grams to iterate through
# (1,1) , (1,2) , (1,3) , (1,4) , (1,5) , (1,6) , (1,7)
# (2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7)
# (3,3) , (3,4) , (3,5) , (3,6) , (3,7)
# (4,4) , (4,5) , (4,6) , (4,7)
# (5,5) , (5,6) , (5,7)
# (6,6) , (6,7)
# (7,7)
ngram_ranges = [(i, j) for i in range(1, 8) for j in range(i, 8)]

# Iterate through each n-gram range
for ngram_range in ngram_ranges:
    print("Processing n-gram range:", ngram_range)

    # Use TF-IDF as a feature extraction technique with the current n-gram range
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train_tfidf = vectorizer.fit_transform(X_train['preprocessed_description'])
    X_test_tfidf = vectorizer.transform(X_test['preprocessed_description'])

    # make a dictionary to store the test example bug_id as a key and the predicted duplicates as values
    predicted_duplicates_dict = {}
    # make a dictionary to store the test example bug_id as a key and the corrected duplicates as values
    true_duplicates_dict = {}

    # Loop through the test examples
    for i in range(X_test_tfidf.shape[0]):
        print("Processing test example", i)

        # Check if X_test['bug_id'].values is empty
        if len(X_test['bug_id'].values) == 0:
            continue

        predicted_duplicates_dict[X_test['bug_id'].values[i]] = []
        true_duplicates_dict[X_test['bug_id'].values[i]] = []

        # Calculate the cosine similarity between the test example and all the training examples
        similarity = cosine_similarity(X_test_tfidf[i], X_train_tfidf)

        # Get the bug_ids of the training examples that have similarity greater than or equal to 0.9
        bug_ids = X_train['bug_id'].values[np.where(similarity >= 0.9)[1]]
        # if bug_ids is empty, replace it with -1
        if len(bug_ids) == 0:
            bug_ids = [-1]

        # Append the bug_ids to the predicted_duplicates list
        predicted_duplicates_dict[X_test['bug_id'].values[i]].extend(bug_ids)

        # Get the dup_id of the test example
        dup_id_value = X_test['dup_id'].values[i]
        if not pd.isnull(dup_id_value):  # Check if the value is not NaN
            # split it by semicolons
            dup_ids = dup_id_value.split(';')
            # after splitting check if there is an empty string in the list and remove it
            dup_ids = [dup_id for dup_id in dup_ids if dup_id]  # Filter out empty strings
            # if the dup_ids list is ['[]'], replace it with -1
            if dup_ids == ['[]']:
                dup_ids = [-1]
            # make the dup_ids list as integers
            dup_ids = [int(dup_id) for dup_id in dup_ids]
        else:
            dup_ids = [-1]  # Assign -1 if the value is NaN

        # Append the dup_ids to the true_duplicates list
        true_duplicates_dict[X_test['bug_id'].values[i]].extend(dup_ids)

    # calculate the true_positive, false_positive, true_negative, false_negative
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0

    # There are 4 cases to consider:
    # 1. The predicted duplicate is in the true duplicates list (true positive)
    # 2. The predicted duplicate is not in the true duplicates list (false positive)
    # 3. The true duplicate is not predicted as a duplicate (false negative)
    # 4. The true duplicate is predicted as a duplicate (true negative)

    # Note: if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains -1, and true_duplicates_dict[X_test['bug_id'].values[i]] contains -1, then it is a true negative
    # if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains -1, and true_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, then it is a false negative
    # if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, and true_duplicates_dict[X_test['bug_id'].values[i]] contains -1, then it is a false positive
    # if predicted_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, and true_duplicates_dict[X_test['bug_id'].values[i]] contains a bug_id, then it is a true positive

    for key in predicted_duplicates_dict.keys():
        if -1 in predicted_duplicates_dict[key] and -1 in true_duplicates_dict[key]:
            true_negative += 1
        elif -1 in predicted_duplicates_dict[key] and -1 not in true_duplicates_dict[key]:
            false_negative += 1
        elif -1 not in predicted_duplicates_dict[key] and -1 in true_duplicates_dict[key]:
            false_positive += 1
        elif -1 not in predicted_duplicates_dict[key] and -1 not in true_duplicates_dict[key]:
            for bug_id in predicted_duplicates_dict[key]:
                if bug_id in true_duplicates_dict[key]:
                    true_positive += 1

    # print the true_positive, false_positive, true_negative, false_negative
    print("True Positive:", true_positive)
    print("False Positive:", false_positive)
    print("True Negative:", true_negative)
    print("False Negative:", false_negative)

    # calculate the precision, recall, and f1 score
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f1 = 2 * precision * recall / (precision + recall)
    accuracy_score = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy Score:", accuracy_score)

    # confusion matrix
    conf_matrix = [[true_negative, false_positive],
                   [false_negative, true_positive]]

    # Print the confusion matrix
    print("Confusion Matrix:")
    print(conf_matrix)


Processing n-gram range: (1, 1)
Processing test example 0
Processing test example 1
Processing test example 2
Processing test example 3
Processing test example 4
Processing test example 5
Processing test example 6
Processing test example 7
Processing test example 8
Processing test example 9
Processing test example 10
Processing test example 11
Processing test example 12
Processing test example 13
Processing test example 14
Processing test example 15
Processing test example 16
Processing test example 17
Processing test example 18
Processing test example 19
Processing test example 20
Processing test example 21
Processing test example 22
Processing test example 23
Processing test example 24
Processing test example 25
Processing test example 26
Processing test example 27
Processing test example 28
Processing test example 29
Processing test example 30
Processing test example 31
Processing test example 32
Processing test example 33
Processing test example 34
Processing test example 35
Proces