In [105]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords


In [106]:
# Download NLTK stopwords corpus if not already downloaded
try:
    # Look for the corpus on disk first so an already-installed copy is used
    # as-is and the cell does not depend on the downloader being reachable.
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Raised by nltk.data.find when the resource is not installed locally.
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [107]:
# Define the directory paths
# Each entry is a (source_dir, target_dir) pair: pairs/report CSVs are read from
# source_dir and the similarity CSV is written to target_dir.
# Paths are built with os.path.join so they resolve on any OS, not only Windows.
# Un-comment an entry to process that dataset as well.
directories = [
    #(os.path.join("..", "new_dataset", "eclipse"), os.path.join("..", "BOW_dataset", "eclipse")),
    #(os.path.join("..", "new_dataset", "firefox"), os.path.join("..", "BOW_dataset", "firefox")),
    #(os.path.join("..", "new_dataset", "netbeans"), os.path.join("..", "BOW_dataset", "netbeans")),
    (os.path.join("..", "new_dataset", "openoffice"), os.path.join("..", "BOW_dataset", "openoffice")),
]


In [108]:
# Function to preprocess bug description text
def preprocess_bug_description(description):
    """Normalize a bug description for bag-of-words comparison.

    Steps: lowercase, drop every character that is not an ASCII letter, digit
    or whitespace, drop tokens made only of digits, collapse whitespace runs to
    a single space, and trim the ends.

    Parameters
    ----------
    description : object
        Raw description; non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or nothing survives.
    """
    # A missing description must not turn into the literal token "nan"/"none":
    # two reports with missing text would otherwise look identical.
    if description is None or (pd.api.types.is_scalar(description) and pd.isna(description)):
        return ""

    text = str(description).lower()

    # Remove punctuation and special characters (they are deleted, not
    # replaced by a space, so "start-up" becomes "startup").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove standalone numbers; digits inside a word such as "v2" are kept.
    text = re.sub(r'\b\d+\b', '', text)

    # Collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

In [109]:
# Preprocess Bug Reports
def preprocess_bug_reports(bug_reports):
    """Clean every report in ``bug_reports`` with ``preprocess_bug_description``.

    Returns a new list holding the cleaned text of each report, in input order.
    """
    return [preprocess_bug_description(raw_text) for raw_text in bug_reports]

In [110]:
# Create a Vocabulary and Generate BoW Vectors
def generate_bow_vectors(bug_reports):
    """Build bag-of-words count vectors for the given (preprocessed) reports.

    A fresh ``CountVectorizer`` learns its vocabulary from ``bug_reports`` and
    the same reports are then transformed with it; the transformed matrix
    (one row per report) is returned.
    """
    count_model = CountVectorizer()
    # fit() returns the vectorizer itself, so the vocabulary is learned first
    # and the reports are encoded right after.
    return count_model.fit(bug_reports).transform(bug_reports)

In [111]:
# Calculate Similarity
def calculate_similarity(bow_vectors):
    """Return the cosine similarity of every unordered pair of rows.

    Pairs are visited as (0,1), (0,2), ..., (1,2), ... and the scores are
    returned as a flat list in that order.
    """
    total = bow_vectors.shape[0]
    return [
        cosine_similarity(bow_vectors[first], bow_vectors[second])[0][0]
        for first in range(total)
        for second in range(first + 1, total)
    ]

In [112]:
# Function to get bug descriptions based on bug IDs
def get_bug_description(bug_id, file_data):
    """Look up the description of a bug report by its ID.

    IDs are compared as strings, so ``123`` and ``"123"`` refer to the same
    report.

    Parameters
    ----------
    bug_id : object
        ID to look up; converted with ``str`` before comparison.
    file_data : pandas.DataFrame
        Frame with ``bug_id`` and ``description`` columns. It is not modified.

    Returns
    -------
    object or None
        The ``description`` value of the first matching row, or ``None`` when
        no row has that ID.
    """
    wanted_id = str(bug_id)

    # Compare against a temporary string view of the column rather than
    # overwriting file_data['bug_id'], so the caller's frame keeps its dtype.
    id_strings = file_data['bug_id'].astype(str)

    # One boolean mask serves both the "does it exist" test and the lookup.
    matching = file_data.loc[id_strings == wanted_id, 'description']
    if matching.empty:
        return None

    return matching.values[0]


In [115]:
# Get the common English stopwords and use them as the custom stop-word list
common_stop_words = stopwords.words('english')
custom_stop_words = common_stop_words

# Initialize CountVectorizer with custom stop words
vectorizer = CountVectorizer(stop_words=custom_stop_words)

# The vectorizer's own analysis step (tokenization + stop-word removal). A text
# for which it yields no tokens would add nothing to the vocabulary.
analyzer = vectorizer.build_analyzer()

# Rows of a pairs file whose index is >= this value are not processed.
# NOTE(review): 14327 is carried over from the original cell; the reason for
# this particular cut-off is not visible here.
MAX_PAIR_ROWS = 14327

RESULT_COLUMNS = ["issue_id", "duplicate_id", "similarity"]

# Iterate over each (source, target) directory pair
for source_dir, target_dir in directories:

    for file_name in os.listdir(source_dir):

        # Only CSV files with "pairs" in their name are pair files
        if not (file_name.endswith(".csv") and "pairs" in file_name):
            continue

        pairs_data = pd.read_csv(os.path.join(source_dir, file_name))
        print("pairs file Loaded:", file_name)

        # The bug reports live in the sibling file without the "_pairs" part.
        # A separate name is used so the loop variable is not overwritten.
        reports_file_name = file_name.replace("_pairs_new.csv", "_new.csv")
        bug_reports_file = pd.read_csv(os.path.join(source_dir, reports_file_name))
        print("original file Loaded:", reports_file_name)

        # Nothing to compare when the pairs file has no rows
        if pairs_data.empty:
            print("End of file reached for:", reports_file_name)
            continue

        # Collect result rows in a list and build the DataFrame once at the
        # end; concatenating inside the loop copies the frame on every row.
        records = []

        # Iterating the two columns directly keeps each column's own dtype
        # (iterrows may upcast integer IDs to float when mixing columns).
        for index, issue_id, raw_duplicates in zip(
            pairs_data.index, pairs_data['issue_id'], pairs_data['duplicate']
        ):
            # Enforce the row cap before anything else so a row with a
            # missing duplicate at the cut-off cannot skip past it.
            if index >= MAX_PAIR_ROWS:
                break

            if pd.isnull(raw_duplicates):
                continue

            # A column holding only single IDs is parsed as numbers, not
            # strings; render 123.0 as "123" so it can match a bug_id.
            if isinstance(raw_duplicates, float) and raw_duplicates.is_integer():
                raw_duplicates = int(raw_duplicates)

            # The issue's description is the same for all of its duplicates
            description1 = get_bug_description(issue_id, bug_reports_file)

            # Several duplicate IDs may be listed, separated by ';'
            for duplicate_id in str(raw_duplicates).split(';'):
                duplicate_id = duplicate_id.strip()  # Remove any spaces left or right

                description2 = get_bug_description(duplicate_id, bug_reports_file)

                # if one of the descriptions is None, skip the comparison
                if description1 is None or description2 is None:
                    print("One of the descriptions is not found in the dataset. Skipping the comparison.")
                    continue

                # Preprocess bug descriptions into a two-document corpus
                bow_corpus = [
                    preprocess_bug_description(description1),
                    preprocess_bug_description(description2),
                ]

                # Skip pairs where a document yields no tokens after analysis
                # (e.g. only stop words): nothing can be compared, and
                # fit_transform would fail if the vocabulary ended up empty.
                if not all(analyzer(document) for document in bow_corpus):
                    print("Documents contain only stop words. Skipping the comparison.")
                    continue

                bow_matrix = vectorizer.fit_transform(bow_corpus)

                # Get BOW vectors for each bug report
                bow1 = bow_matrix[0].toarray()
                bow2 = bow_matrix[1].toarray()

                # Compute cosine similarity between BOW representations
                similarity = cosine_similarity(bow1, bow2)[0][0]

                records.append({
                    "issue_id": issue_id,
                    "duplicate_id": duplicate_id,
                    "similarity": similarity,
                })

            # print count of iterations
            print(f"Count of iterations: {index + 1}")

        # Passing columns explicitly keeps the header even with zero records
        results_df = pd.DataFrame(records, columns=RESULT_COLUMNS)

        # Save results to a new CSV file in the target directory, creating
        # the directory first if it does not exist yet.
        os.makedirs(target_dir, exist_ok=True)
        new_file_path = os.path.join(target_dir, reports_file_name.replace(".csv", "_similarity.csv"))
        results_df.to_csv(new_file_path, index=False)


pairs file Loaded: openoffice_pairs_new.csv
