In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Load the bug reports dataset
data = pd.read_excel("train.xlsx")
bug_reports = data['report']

# Tokenize bug reports and remove empty strings or tokens consisting only of spaces
stop_words = set(stopwords.words('english'))
tokenized_bug_reports = [[word.lower() for word in word_tokenize(report) if word.strip() and word.strip() not in stop_words] for report in bug_reports]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_bug_reports, vector_size=100, window=5, min_count=1, sg=1)





In [8]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_bug_reports, vector_size=100, window=5, min_count=1, sg=1)

In [9]:
tokenized_bug_reports[:3]


[['``',
  'for',
  'event',
  'bookmarked',
  'projects',
  "''",
  'option',
  'sending',
  'notifications',
  'non-member',
  'bookmarked',
  'projects'],
 ['switch', 'using', 'full', 'l10n', 'id', "'s", 'urlbar'],
 ['consider', 'removing', 'hasicon', 'property', 'simplify', 'styling']]

In [10]:
# Save or load Word2Vec model (optional)
# word2vec_model.save("word2vec_model.bin")
# word2vec_model = Word2Vec.load("word2vec_model.bin")

In [11]:
# Get the word vectors for a specific word
test_word = "email"
word_vector = word2vec_model.wv[test_word]

# Print similar words to a given word
similar_words = word2vec_model.wv.most_similar(test_word)

print(similar_words)

[('mails', 0.9240984916687012), ('security', 0.9203272461891174), ('ignoring', 0.9187041521072388), ('e-mail', 0.9171749949455261), ('blockquote', 0.9165360927581787), ('sending', 0.9158561825752258), ('asynchronously', 0.9139227271080017), ('send', 0.91126948595047), ('header', 0.9048137068748474), ('alert', 0.9007105231285095)]


In [12]:
# Print the word embeddings
print("Word Embeddings:")
for word in word2vec_model.wv.index_to_key[:10]:
    print(f"{word}: {word2vec_model.wv[word]}")

Word Embeddings:
]: [-0.53750074  0.26458997  0.2769119   0.2481331  -0.62226945 -0.47160646
  0.21903442  1.0568821  -0.20258772 -0.2385448   0.23223323 -0.2623591
 -0.19035095  0.3412698  -0.27751714 -0.19158882  0.456042    0.12780492
 -0.50588655 -1.0060073   0.38981664  0.14307466  0.5397854  -0.3658165
  0.23994422 -0.25789723 -0.25722277  0.19848445 -0.79220027  0.03981844
 -0.45189622  0.11274085  0.03886789 -0.74818295  0.09493242  0.22749187
  0.3724694  -0.4515283   0.00637098 -0.42331997 -0.20814139  0.18694863
 -0.57525724 -0.24606927  0.40416214 -0.02105139 -0.1035418   0.4530266
  0.13843572  0.48384696  0.01100396 -0.0442043  -0.17185774 -0.02776158
  0.06437865 -0.25580293  0.6165826  -0.4148008  -0.5109503   0.60494375
 -0.13290438 -0.08423628 -0.5988021  -0.3222611  -0.17728297  0.35969967
 -0.19168982  0.16278338 -0.11884812  0.28734705 -0.02211548  0.26191425
  0.05627747 -0.46681976  0.28354335  0.13490966  0.25544313  0.32421553
 -0.01333705 -0.13778739 -0.067629

In [13]:
# Derive document embeddings by averaging word embeddings
document_embeddings = np.array([np.mean([word2vec_model.wv[word] for word in report if word in word2vec_model.wv], axis=0) for report in tokenized_bug_reports])

print(document_embeddings.shape)

(14710, 100)


In [14]:
#  visualize clusters or perform further analysis
import matplotlib.pyplot as plt

# Visualize clusters
def visualize_clusters(embeddings, clusters):
    # Define colors for each cluster (including noise points)
    unique_clusters = np.unique(clusters)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_clusters))]

    # Plot data points with cluster assignments
    for cluster_id, color in zip(unique_clusters, colors):
        if cluster_id == -1:
            # Noise points (cluster_id = -1)
            plt.scatter(embeddings[clusters == cluster_id][:, 0], embeddings[clusters == cluster_id][:, 1], color='black', label='Noise')
        else:
            # Core and border points
            plt.scatter(embeddings[clusters == cluster_id][:, 0], embeddings[clusters == cluster_id][:, 1], color=color, label=f'Cluster {cluster_id}')

    # Add legend and labels
    plt.legend()
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('Clustering')

    # Show plot
    plt.show()


In [15]:
import csv

def store_cluster_assignments(clusters, bug_reports, output_file):
    # Create a dictionary to store bug reports for each cluster
    cluster_reports = {}
    for i, cluster_label in enumerate(clusters):
        if cluster_label not in cluster_reports:
            cluster_reports[cluster_label] = []
        cluster_reports[cluster_label].append(bug_reports[i])

    # Write cluster assignments to CSV file
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Cluster', 'Bug Report'])
        for cluster_label, reports in cluster_reports.items():
            if cluster_label == -1:
                cluster_name = 'Noise'
            else:
                cluster_name = f'Cluster {cluster_label}'
            for report in reports:
                writer.writerow([cluster_name, report])

    print(f"Cluster assignments saved to '{output_file}'")


In [None]:
from sklearn.cluster import MeanShift

#  the bandwidth parameter controls the size of the kernel used for density estimation
# A larger bandwidth implies a larger neighborhood, resulting in smoother density estimation and potentially fewer clusters.
meanshift = MeanShift(bandwidth=0.7)

# Fit MeanShift clustering on document embeddings
meanshift.fit(document_embeddings)

# Predict cluster labels
meanshift_clusters = meanshift.predict(document_embeddings)



In [None]:
# Visualize clusters (optional)
visualize_clusters(document_embeddings, meanshift_clusters)


In [None]:
# Store cluster assignments in CSV file (optional)
store_cluster_assignments(meanshift_clusters, bug_reports, "meanshift_cluster_assignments.csv")


Cluster assignments saved to 'gmm_cluster_assignments.csv'


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Compute Silhouette Score
silhouette_avg = silhouette_score(document_embeddings, meanshift_clusters)
print(f"Silhouette Score: {silhouette_avg}")

# Compute Davies–Bouldin Index
db_index = davies_bouldin_score(document_embeddings, meanshift_clusters)
print(f"Davies–Bouldin Index: {db_index}")


#the Silhouette Score is 0.080 and the Davies–Bouldin Index is 2.053.
# These values suggest that the clustering quality may not be optimal,
# as the Silhouette Score is relatively low and the Davies–Bouldin Index is higher than expected.

Silhouette Score: 0.0804663673043251
Davies–Bouldin Index: 2.0533898200278315
