In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Load the data from eclipse.csv
eclipse_data = pd.read_csv("eclipse_small.csv")

# Load the pairs data from eclipse_pairs.csv
pairs_data = pd.read_csv("eclipse_small_pairs.csv")

In [35]:
# Function to preprocess bug description text
def preprocess_bug_description(description):

    # Convert to lowercase
    description = description.lower()

    # Remove punctuation and special characters
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)

    # Remove numbers
    description = re.sub(r'\b\d+\b', '', description)

    # Remove extra whitespace
    description = re.sub(r'\s+', ' ', description)

    return description

In [36]:

# Function to apply TF-IDF to bug reports
def tfidf_similarity(bug1, bug2):
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Preprocess the bug descriptions
    preprocessed_bug1 = preprocess_bug_description(bug1)
    preprocessed_bug2 = preprocess_bug_description(bug2)
    
    # Apply TF-IDF
    tfidf_vectors = tfidf_vectorizer.fit_transform([preprocessed_bug1, preprocessed_bug2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_vectors[0], tfidf_vectors[1])
    
    return similarity[0][0]



In [41]:
# Function to get bug descriptions based on bug IDs
def get_bug_description(bug_id):
    # if id is not found in the dataset, return None
    
    return eclipse_data[eclipse_data['bug_id'] == bug_id]['description'].values[0]



In [48]:
# Iterate through pairs data and perform preprocessing, TF-IDF, and cosine similarity
for index, row in pairs_data.iterrows():
    issue_id = row['issue_id']
    duplicate_id = row['duplicate']
    
    print(f"Comparing Bug {issue_id} and Bug {duplicate_id}")

    # Get descriptions of the bugs
    description1 = get_bug_description(issue_id)
    print(f"Bug {issue_id} ==>  description: {description1}")
    
    # remove any spaces left or right the duplicate_id
    duplicate_id = duplicate_id.strip()
    print(f"Bug {duplicate_id} ==> description: {description2}")
    
    description2 = get_bug_description(duplicate_id)

    print(f"Bug {duplicate_id} ==> description: {description2}")

    # if one of the descriptions is None, skip the comparison    
    if description1 is None or description2 is None:
        print("One of the descriptions is not found in the dataset. Skipping the comparison.")
        continue
    
    # Preprocess and apply TF-IDF
    similarity = tfidf_similarity(description1, description2)
    
    print(f"Similarity between Bug {issue_id} and Bug {duplicate_id}: {similarity}")


Comparing Bug 214301 and Bug 214611
Bug 214301 ==>  description: -- Error Log --
Date: Fri Jan 04 07:44:55 CET 2008
Message: Could not load tasklist hyperlink detector extension
Severity: Info
Plugin ID: org.eclipse.mylyn
Stack Trace:
org.eclipse.core.runtime.CoreException: Plug-in org.eclipse.mylyn.java.ui was unable to load class org.eclipse.mylyn.internal.java.ui.JavaStackTraceHyperlinkDetector.
at org.eclipse.core.internal.registry.osgi.RegistryStrategyOSGI.throwException(RegistryStrategyOSGI.java:180)
at org.eclipse.core.internal.registry.osgi.RegistryStrategyOSGI.createExecutableExtension(RegistryStrategyOSGI.java:162)
at org.eclipse.core.internal.registry.ExtensionRegistry.createExecutableExtension(ExtensionRegistry.java:788)
at org.eclipse.core.internal.registry.ConfigurationElement.createExecutableExtension(ConfigurationElement.java:243)
at org.eclipse.core.internal.registry.ConfigurationElementHandle.createExecutableExtension(ConfigurationElementHandle.java:51)
at org.eclipse

IndexError: index 0 is out of bounds for axis 0 with size 0