In [2]:
import re
import os
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


In [3]:
# Download NLTK stopwords corpus if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse", "..\\BOW_dataset\\eclipse"),
    ("..\\new_dataset\\firefox", "..\\BOW_dataset\\firefox"),
    ("..\\new_dataset\\netbeans", "..\\BOW_dataset\\netbeans"),
    ("..\\new_dataset\\openoffice", "..\\BOW_dataset\\openoffice")
]


In [26]:
# Function to preprocess bug description text
def preprocess_bug_description(description):

    # convert description to string
    description = str(description)
    
    # Convert to lowercase
    description = description.lower()

    # Remove punctuation and special characters
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)

    # Remove numbers
    description = re.sub(r'\b\d+\b', '', description)

    # Remove extra whitespace
    description = re.sub(r'\s+', ' ', description)

    return description

In [27]:
# Preprocess Bug Reports
def preprocess_bug_reports(bug_reports):
    preprocessed_reports = []
    for report in bug_reports:
        # Implement your preprocessing steps here
        preprocessed_report = preprocess_bug_description(report)
        preprocessed_reports.append(preprocessed_report)
    return preprocessed_reports

In [28]:
# Calculate Similarity
def calculate_similarity(bow_vectors):
    # Calculate cosine similarity between each pair of bug reports
    num_reports = bow_vectors.shape[0]
    similarities = []
    for i in range(num_reports):
        for j in range(i+1, num_reports):
            similarity_score = cosine_similarity(bow_vectors[i], bow_vectors[j])[0][0]
            similarities.append(similarity_score)
    return similarities

In [29]:
# Define the tfidf_similarity function
def tfidf_similarity(bug1, bug2):
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Apply TF-IDF to bug reports
    tfidf_matrix = tfidf_vectorizer.fit_transform([bug1, bug2])

    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return similarity[0][0]

In [30]:
# Function to get bug descriptions based on bug IDs
def get_bug_description(bug_id, file_data):

    # convert to string
    bug_id = int(bug_id)
    bug_id = str(bug_id)
    
    # if id is not found in the dataset, return None
    # convert the bug_id column to string
    file_data['bug_id'] = file_data['bug_id'].astype(str)
    
    if bug_id not in file_data['bug_id'].values:
        return None
    
    return file_data[file_data['bug_id'] == bug_id]['description'].values[0]


In [33]:
# Iterate over each directory
for source_dir, target_dir in directories:

    # Iterate over each file in the directory
    for file_name in os.listdir(target_dir):

        # Check if the file is a CSV file and contains "_new_similarity" in its name
        if file_name.endswith(".csv") and "_new_similarity" in file_name:

            # Load the CSV file with similarity data
            pairs_data = pd.read_csv(os.path.join(target_dir, file_name))
            print("Pairs file loaded:", file_name)

            # Load the original bug reports file
            original_file_name = file_name.replace("_new_similarity.csv", "_new.csv")
            bug_reports_file = pd.read_csv(os.path.join(source_dir, original_file_name))
            print("Original bug reports file loaded:", original_file_name)

            # Check if the DataFrame is empty (end of file reached)
            if pairs_data.empty:
                print("End of file reached for:", file_name)
                continue

            # Iterate through each pair of bug reports
            for index, row in pairs_data.iterrows():
                issue_id = row['issue_id']
                duplicate_id = row['duplicate_id']
                
                #print ("issue_id is:", type(issue_id), issue_id)
                #print ("duplicate_id is:", type(duplicate_id), duplicate_id)
                
                if pd.isnull(duplicate_id):
                    continue

                # Get descriptions of the bugs
                description1 = get_bug_description(issue_id, bug_reports_file)
                description2 = get_bug_description(duplicate_id, bug_reports_file)
                
                #print("description1 is:", description1)
                #print("description2 is:", description2)

                # if one of the descriptions is None, skip the comparison
                if description1 is None or description2 is None:
                    print("One of the descriptions is not found in the dataset. Skipping the comparison.")
                    continue

                # Preprocess bug descriptions
                preprocessed_desc1 = preprocess_bug_description(description1)
                preprocessed_desc2 = preprocess_bug_description(description2)

                # Compute TF-IDF similarity between bug descriptions
                similarity = tfidf_similarity(preprocessed_desc1, preprocessed_desc2)

                # Insert the TF-IDF similarity into the 4th column
                pairs_data.loc[index, 'tfidf_similarity'] = similarity

                #print the number of iterations
                print("Iterations:", index)

            # Write the updated DataFrame with TF-IDF similarity into the existing CSV file
            pairs_data.to_csv(os.path.join(target_dir, file_name), index=False)
            print("Updated file saved:", file_name)


Pairs file loaded: eclipse_new_similarity.csv
Original bug reports file loaded: eclipse_new.csv
214301
214611
Iterations: 0
214445
214451
Iterations: 1
214466
214452
Iterations: 2
214577
217601
Iterations: 3
214862
214759
Iterations: 4
215052
214414
Iterations: 5
215052
213299
Iterations: 6
214411
216725
Iterations: 7
214411
216663
Iterations: 8
214411
210304
Iterations: 9
214305
214303
Iterations: 10
214305
214611
Iterations: 11
214305
214301
Iterations: 12
214306
214303
Iterations: 13
214306
214304
Iterations: 14
214306
214305
Iterations: 15
214306
214611
Iterations: 16
214306
214301
Iterations: 17
215040
214988
Iterations: 18
215040
214990


KeyboardInterrupt: 