In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [9]:
def preprocess_bug_description(description):

    # Convert to lowercase
    description = description.lower()

    # Remove punctuation and special characters
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)

    # Remove numbers
    description = re.sub(r'\b\d+\b', '', description)

    # Remove extra whitespace
    description = re.sub(r'\s+', ' ', description)

    return description

In [10]:
# Preprocess Bug Reports
def preprocess_bug_reports(bug_reports):
    preprocessed_reports = []
    for report in bug_reports:
        # Implement your preprocessing steps here
        preprocessed_report = preprocess_bug_description(report)
        preprocessed_reports.append(preprocessed_report)
    return preprocessed_reports

In [11]:
# Create a Vocabulary and Generate BoW Vectors
def generate_bow_vectors(bug_reports):
    # Initialize CountVectorizer to create BoW vectors
    vectorizer = CountVectorizer()
    # Fit the vectorizer on preprocessed bug reports to build the vocabulary
    vectorizer.fit(bug_reports)
    # Transform bug reports into BoW vectors
    bow_vectors = vectorizer.transform(bug_reports)
    return bow_vectors

In [12]:
# Calculate Similarity
def calculate_similarity(bow_vectors):
    # Calculate cosine similarity between each pair of bug reports
    num_reports = bow_vectors.shape[0]
    similarities = []
    for i in range(num_reports):
        for j in range(i+1, num_reports):
            similarity_score = cosine_similarity(bow_vectors[i], bow_vectors[j])[0][0]
            similarities.append(similarity_score)
    return similarities

In [13]:
# Sample bug report data
bug_reports = [
    "Bug report 1: UnknownHostException is not caught when the host name is invalid",
    "Bug report 2: After having synchronized and released successfully with teamstream (on zrhcvs), I attempted to version the project org.eclipse.jdt.core and got a dialog saying 'CVS communication error'",
    "Bug report 3: When the connection to the repositories fails, InterruptedIOException is not caught"
]

# Preprocess bug reports
preprocessed_bug_reports = preprocess_bug_reports(bug_reports)

# Generate BoW vectors
bow_vectors = generate_bow_vectors(preprocessed_bug_reports)

# Calculate similarity
similarities = calculate_similarity(bow_vectors)

# Print similarity scores
for i, similarity in enumerate(similarities, start=1):
    print(f"Similarity between Bug Report {i} and other bug reports: {similarity:.4f}")


Similarity between Bug Report 1 and other bug reports: 0.1543
Similarity between Bug Report 2 and other bug reports: 0.6211
Similarity between Bug Report 3 and other bug reports: 0.2485
