In [54]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Attention, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [100]:
# Hyperparameters
num_topics = 10
num_clusters = 2
epochs = 50
batch_size = 16

In [101]:
from sklearn.cluster import KMeans

# Step 1: Taxonomy Building

# Load and preprocess the bug data
Train_bug_data = pd.read_csv("/content/preprocessed_train_data2.csv")  # Replace with your train bug data file
Test_bug_data = pd.read_csv("/content/preprocessed_test_data2.csv")  # Replace with your test bug data file

# Handling missing values in bug descriptions
Train_bug_data = Train_bug_data.dropna(subset=["bug_description"])
Test_bug_data = Test_bug_data.dropna(subset=["bug_description"])

train_bug_descriptions = Train_bug_data["bug_description"]
test_bug_descriptions = Test_bug_data["bug_description"]


In [110]:

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)
train_X = vectorizer.fit_transform(train_bug_descriptions)
test_X = vectorizer.transform(test_bug_descriptions)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans

# LDA topic modeling
lda_model = LatentDirichletAllocation(
    n_components=2,  # Adjust the number of topics
    random_state=42,
)
topic_distributions = lda_model.fit_transform(train_X)

# Clustering (k-means)
num_clusters = 2  # Adjust the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(topic_distributions)





In [103]:
# Step 2: Attention-based Classification

# NMF for topic modeling on train data
nmf_model = NMF(n_components=num_topics)
topic_vectors_train = nmf_model.fit_transform(train_X)

# Hierarchical Clustering (Agglomerative) for topic vectors
cluster_model_topics = AgglomerativeClustering(n_clusters=num_clusters)
cluster_labels_topics = cluster_model_topics.fit_predict(topic_vectors_train)

# Scale the input data to make it non-negative
scaler_topics = MinMaxScaler()
scaled_topic_vectors_train = scaler_topics.fit_transform(topic_vectors_train)


In [104]:
# Attention-based classification model
input_layer = Input(shape=(scaled_topic_vectors_train.shape[1],))
x = Dense(128, activation="relu")(input_layer)
x = Dropout(0.5)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(32, activation="relu")(x)
attention = Attention()([x, x])
output_layer = Dense(num_clusters, activation="softmax")(attention)
model = Model(inputs=input_layer, outputs=output_layer)


In [105]:
# Compile the model
model.compile(optimizer=Adam(), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [106]:
# Train the model
model.fit(scaled_topic_vectors_train, cluster_labels_topics, epochs=epochs, batch_size=batch_size, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7db72b1c2b60>

## ---------------------------------

In [69]:
from sklearn.cluster import KMeans

# Step 1: Taxonomy Building

# Load and preprocess the bug data
Train_bug_data = pd.read_csv("/content/preprocessed_train_data2.csv")  # Replace with your train bug data file
Test_bug_data = pd.read_csv("/content/preprocessed_test_data2.csv")  # Replace with your test bug data file

# Handling missing values in bug descriptions
Train_bug_data = Train_bug_data.dropna(subset=["bug_description"])
Test_bug_data = Test_bug_data.dropna(subset=["bug_description"])

train_bug_descriptions = Train_bug_data["bug_description"]
test_bug_descriptions = Test_bug_data["bug_description"]


In [70]:

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)
train_X = vectorizer.fit_transform(train_bug_descriptions)
test_X = vectorizer.transform(test_bug_descriptions)

# Convert sparse matrix to dense numpy array
train_X_dense = train_X.toarray()
test_X_dense = test_X.toarray()

# KMeans clustering for train data
kmeans_model = KMeans(n_clusters=2, random_state=42)  # Adjust number of clusters as needed
cluster_labels_train = kmeans_model.fit_predict(train_X_dense)

# Evaluate clustering using silhouette score
silhouette_avg = silhouette_score(train_X_dense, cluster_labels_train)
print(f"Silhouette Score: {silhouette_avg}")




Silhouette Score: 0.005275638634996257


In [71]:
# Step 2: Attention-based Classification

# NMF for topic modeling on train data
nmf_model = NMF(n_components=2)  # Adjust number of topics as needed
topic_vectors_train = nmf_model.fit_transform(train_X)

# Scale the input data to make it non-negative
scaler = MinMaxScaler()
scaled_topic_vectors_train = scaler.fit_transform(topic_vectors_train)

In [72]:
# Attention-based classification model
input_layer = Input(shape=(scaled_topic_vectors_train.shape[1],))
x = Dense(64, activation="relu")(input_layer)
x = Dropout(0.2)(x)
x = Dense(32, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(8, activation="relu")(x)
attention = Attention()([x, x])
output_layer = Dense(5, activation="softmax")(attention)  # Adjust output classes based on taxonomy
model = Model(inputs=input_layer, outputs=output_layer)

In [73]:
# Compile the model
model.compile(optimizer=Adam(), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [74]:

# Train the model
model.fit(scaled_topic_vectors_train, cluster_labels_train, epochs=40, batch_size=32, validation_split=0.2)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7db72c20bcd0>

In [75]:
# Save the trained model
model.save("attention_based_classification_model.h5")

  saving_api.save_model(


In [76]:
# Load the saved model
loaded_model = tf.keras.models.load_model("attention_based_classification_model.h5")

In [77]:
# Test the model on test data
test_topic_vectors = nmf_model.transform(test_X)
test_scaled_topic_vectors = scaler.transform(test_topic_vectors)
test_predictions = np.argmax(loaded_model.predict(test_scaled_topic_vectors), axis=-1)




In [107]:
# Map Backend to 0 and Frontend to 1
Test_bug_data["class_name"] = Test_bug_data["class_name"].map({"Backend": 0, "Frontend": 1})

# Print the prediction results one by one
for i in range(len(test_predictions)):
    print("Prediction:", test_predictions[i], "Actual:", Test_bug_data["class_name"].iloc[i])


Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 1 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: nan
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: nan
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: 1.0
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: nan
Prediction: 0 Actual: 0.0
Prediction: 0 Actual: nan
Prediction: 

In [108]:
# Drop rows with NaN values in "class_name"
Test_bug_data.dropna(subset=["class_name"], inplace=True)

# Ensure that both arrays have the same number of samples
num_samples = min(len(Test_bug_data["class_name"]), len(test_predictions))
Test_bug_data = Test_bug_data[:num_samples]
test_predictions = test_predictions[:num_samples]

# Calculate accuracy
accuracy = accuracy_score(Test_bug_data["class_name"], test_predictions)

# Calculate precision
precision = precision_score(Test_bug_data["class_name"], test_predictions)

# Calculate recall
recall = recall_score(Test_bug_data["class_name"], test_predictions)

# Calculate F1-score
f1 = f1_score(Test_bug_data["class_name"], test_predictions)

# Calculate confusion matrix
conf_matrix = confusion_matrix(Test_bug_data["class_name"], test_predictions)

# Print the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.5746781115879829
Precision: 0.4090909090909091
Recall: 0.00911854103343465
F1 Score: 0.017839444995044598
Confusion Matrix:
[[1330   13]
 [ 978    9]]


In [80]:
# count the number of correct predictions
correct = 0
for i in range(len(test_predictions)):
    if test_predictions[i] == Test_bug_data["class_name"].iloc[i]:
        correct += 1

# Calculate the accuracy
accuracy = correct / len(test_predictions)
print("Accuracy:", accuracy)

print (correct)
print (len(test_predictions))

Accuracy: 0.5549356223175965
1293
2330
