# In [None]:
import pandas as pd
from gensim import corpora, models
from gensim import matutils
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Pipeline: TF-IDF -> LDA topic mixture -> k-means cluster -> printed label.
# Every model is fitted on the training reports and then applied to the test
# reports; previously the training descriptions were overwritten by the test
# descriptions before they were ever used.

# Load your bug report datasets (replace with your actual data).
# Missing descriptions become empty strings so the text preprocessing below
# never receives a NaN float.
TrainingData = pd.read_csv("bug_reports.csv")
training_reports = TrainingData["Description"].fillna("").astype(str).tolist()

Testdata = pd.read_csv("test_data.csv")
# `bug_reports` are the reports that get clustered and printed at the end.
bug_reports = Testdata["Description"].fillna("").astype(str).tolist()

# Preprocess text (lowercase, tokenize, remove stopwords).
# NOTE(review): presumably the NLTK "stopwords" and tokenizer resources are
# already downloaded in this environment - confirm before running.
stop_words = set(stopwords.words("english"))


def _clean_reports(reports):
    """Return each report lowercased, tokenized and with stopwords removed."""
    return [
        " ".join(
            token
            for token in word_tokenize(report.lower())
            if token not in stop_words
        )
        for report in reports
    ]


def _to_gensim_corpus(matrix):
    """Wrap a (documents x terms) SciPy sparse matrix as a gensim corpus."""
    # documents_columns=False because scikit-learn puts documents in rows.
    return matutils.Sparse2Corpus(matrix, documents_columns=False)


def _topic_matrix(model, matrix):
    """Return a dense (documents x topics) array of LDA topic weights."""
    doc_topics = model.get_document_topics(
        _to_gensim_corpus(matrix),
        minimum_probability=0.0,
    )
    # gensim yields a variable-length list of (topic_id, weight) pairs per
    # document; corpus2dense fills omitted topics with 0 and returns
    # (topics x documents), hence the transpose.
    return matutils.corpus2dense(
        doc_topics,
        num_terms=model.num_topics,
        num_docs=matrix.shape[0],
    ).T


training_reports_cleaned = _clean_reports(training_reports)
bug_reports_cleaned = _clean_reports(bug_reports)

# Create a TF-IDF vectorizer: vocabulary and IDF weights are learned from the
# training reports only and then reused for the test reports.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2)
training_tfidf_matrix = vectorizer.fit_transform(training_reports_cleaned)
tfidf_matrix = vectorizer.transform(bug_reports_cleaned)

# LDA topic modeling.
# NOTE(review): the model is trained on TF-IDF weights (as the original code
# intended) rather than raw term counts - confirm this is the desired input.
num_topics = 3  # Adjust the number of topics
lda_model = models.LdaModel(
    corpus=_to_gensim_corpus(training_tfidf_matrix),
    # Built from vocabulary_ (term -> column index) so the mapping does not
    # depend on get_feature_names(), which newer scikit-learn no longer has.
    id2word={int(index): term for term, index in vectorizer.vocabulary_.items()},
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

# Get topic distributions for each bug report as dense arrays for k-means.
training_topic_distributions = _topic_matrix(lda_model, training_tfidf_matrix)
topic_distributions = _topic_matrix(lda_model, tfidf_matrix)

# Clustering (k-means): fit on the training reports, label the test reports.
num_clusters = 2  # Adjust the number of clusters
# n_init is pinned so results do not change with the scikit-learn default.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(training_topic_distributions)
cluster_labels = kmeans.predict(topic_distributions)

# Assign labels to clusters (manually).
# k-means cluster ids are arbitrary, so inspect the clusters before trusting
# which id corresponds to which name.
cluster_labels_to_labels = {
    0: "Frontend",
    1: "Backend",
}

# Print results
for i, (report, cluster_id) in enumerate(zip(bug_reports, cluster_labels), start=1):
    # Fall back to a generic name when num_clusters exceeds the named clusters.
    label = cluster_labels_to_labels.get(int(cluster_id), f"Cluster {cluster_id}")
    print(f"Bug Report {i}: {report} (Cluster: {label})")
