In [1]:
pip install nimfa



In [2]:
!pip install nltk



In [3]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# path of training data
train_path = '/content/train.xlsx'

# path of testing data
test_path = '/content/test.xlsx'

In [5]:
# show the first 5 rows of the training data
train_df = pd.read_excel(train_path)
print(train_df.head())

# show the first 5 rows of the testing data
test_df = pd.read_excel(test_path)
print(test_df.head())

                                              report class_name  class_index
0  "For any event on my bookmarked projects" opti...    Backend            1
1           Switch to using full l10n id's in urlbar   Frontend            2
2  Consider removing hasicon property to simplify...   Frontend            2
3  Method to obtain current URL from WebBrowserEd...   Frontend            2
4              Fix: migration fails in MS SQL-Server    Backend            1
                                              report class_name  class_index
0  REST API - ability to list sub projects for a ...    Backend            1
1  support selective text on right if set in GNOM...   Frontend            2
2  [meta][userstory] Ship v1 of Pre-populated top...   Frontend            2
3  Include updated_on and passwd_changed_on colum...    Backend            1
4    Problem with email integration to MS Office 365    Backend            1


In [6]:
def convert_lower_case(data):
    return str(data).lower()

In [7]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [8]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [9]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [10]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [11]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [12]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [13]:
# print the first report of the training data
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [14]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('/content/preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [15]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('/content/preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [16]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(train_df['report'][0])))

# preprocess the first report of the testing data
print(preprocess(test_df['report'][0]))

for any event on my bookmarked project option not sending notification for non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability to list sub project for a project


In [18]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [19]:
# keep only the reports that has class_name of Frontend, Backend
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') |
    (preprocessed_train_df['class_name'] == 'Backend')
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') |
    (preprocessed_test_df['class_name'] == 'Backend')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [20]:
# print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())

['Backend' 'Frontend']
['Backend' 'Frontend']


## Feature Exraction

In [21]:
print(len(filtered_train_df))

13236


In [22]:
cluster_class_mapping = {
    1: 'Frontend',
    0: 'Backend',
}


In [23]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # Specify the desired BERT model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define batch size
batch_size = 512

# Tokenize and pad sequences for training data
tokenized_bug_reports_train = [tokenizer.encode(text, add_special_tokens=True) for text in filtered_train_df['bug_description']]

# Tokenize and pad sequences for test data
tokenized_bug_reports_test = [tokenizer.encode(text, add_special_tokens=True) for text in filtered_test_df['bug_description']]

# Function to process a batch and obtain embeddings
def process_batch(batch):
    # Pad sequences to a fixed length within the batch
    max_length = max(len(seq) for seq in batch)
    padded_sequences = [seq + [0]*(max_length-len(seq)) for seq in batch]

    # Convert to PyTorch tensor
    batch_tensors = torch.tensor(padded_sequences)

    # Obtain the embeddings
    with torch.no_grad():
        model.eval()
        embeddings = model(batch_tensors)[0]

    # Apply ReLU activation to make embeddings non-negative
    embeddings = torch.relu(embeddings)

    # Convert embeddings to numpy array
    return embeddings.numpy()

# Define a function to pad sequences within a batch
def pad_batch(batch):
    # Find the maximum length within the batch
    max_length = max(len(seq) for seq in batch)
    # Pad sequences to the maximum length
    padded_sequences = [seq + [0]*(max_length-len(seq)) for seq in batch]
    return padded_sequences

# Process the training data in batches
num_batches_train = len(tokenized_bug_reports_train) // batch_size
if len(tokenized_bug_reports_train) % batch_size != 0:
    num_batches_train += 1

X_train_batches = []
for i in range(num_batches_train):
    print("Processing training batch", i)
    batch_start = i * batch_size
    batch_end = min((i + 1) * batch_size, len(tokenized_bug_reports_train))
    batch = tokenized_bug_reports_train[batch_start:batch_end]
    # Pad the last batch if it has fewer samples
    if len(batch) < batch_size:
        batch = pad_batch(batch)
    batch_embeddings = process_batch(batch)
    X_train_batches.append(batch_embeddings)

# Print the shapes of arrays in X_train_batches
for i, arr in enumerate(X_train_batches):
    print(f"Shape of array {i}: {arr.shape}")

# Find the maximum size along axis 1
max_size_train = max(arr.shape[1] for arr in X_train_batches)

# Pad arrays to have the same size along axis 1 for training data
padded_batches_train = [np.pad(arr, ((0, 0), (0, max_size_train - arr.shape[1]), (0, 0)), mode='constant', constant_values=0) for arr in X_train_batches]

# Concatenate the padded arrays along axis 0 for training data
X_train = np.concatenate(padded_batches_train, axis=0)

# Print the shape of the concatenated training array
print("Shape of X_train:", X_train.shape)

# Process the test data in batches
num_batches_test = len(tokenized_bug_reports_test) // batch_size
if len(tokenized_bug_reports_test) % batch_size != 0:
    num_batches_test += 1

X_test_batches = []
for i in range(num_batches_test):
    print("Processing test batch", i)
    batch_start = i * batch_size
    batch_end = min((i + 1) * batch_size, len(tokenized_bug_reports_test))
    batch = tokenized_bug_reports_test[batch_start:batch_end]
    # Pad the last batch if it has fewer samples
    if len(batch) < batch_size:
        batch = pad_batch(batch)
    batch_embeddings = process_batch(batch)
    X_test_batches.append(batch_embeddings)

# Print the shapes of arrays in X_test_batches
for i, arr in enumerate(X_test_batches):
    print(f"Shape of array {i}: {arr.shape}")

# Find the maximum size along axis 1 for test data
max_size_test = max(arr.shape[1] for arr in X_test_batches)

# Pad arrays to have the same size along axis 1 for test data
padded_batches_test = [np.pad(arr, ((0, 0), (0, max_size_test - arr.shape[1]), (0, 0)), mode='constant', constant_values=0) for arr in X_test_batches]

# Concatenate the padded arrays along axis 0 for test data
X_test = np.concatenate(padded_batches_test, axis=0)

# Print the shape of the concatenated test array
print("Shape of X_test:", X_test.shape)

# Use the same cluster class mapping as before
cluster_class_mapping = {
    1: 'Frontend',
    0: 'Backend',
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing training batch 0


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Processing training batch 1
Processing training batch 2
Processing training batch 3
Processing training batch 4
Processing training batch 5
Processing training batch 6
Processing training batch 7
Processing training batch 8
Processing training batch 9
Processing training batch 10
Processing training batch 11
Processing training batch 12
Processing training batch 13
Processing training batch 14
Processing training batch 15
Processing training batch 16
Processing training batch 17
Processing training batch 18
Processing training batch 19
Processing training batch 20
Processing training batch 21
Processing training batch 22
Processing training batch 23
Processing training batch 24
Processing training batch 25
Shape of array 0: (512, 30, 768)
Shape of array 1: (512, 33, 768)
Shape of array 2: (512, 46, 768)
Shape of array 3: (512, 31, 768)
Shape of array 4: (512, 37, 768)
Shape of array 5: (512, 30, 768)
Shape of array 6: (512, 38, 768)
Shape of array 7: (512, 29, 768)
Shape of array 8: (5

In [24]:
import numpy as np

# Save X_train and X_test to files
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)


In [23]:
import numpy as np

# Load X_train and X_test from files
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')


In [24]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.decomposition import NMF
import joblib

def nmf_topic_modeling(X_train, X_test, class_name, n_components=2, random_state=42, save_path=None):
    """
    Perform Non-negative Matrix Factorization (NMF) topic modeling on the given data and evaluate on test data.

    Parameters:
    - X_train: Array containing the BERT embeddings for training data.
    - X_test: Array containing the BERT embeddings for test data.
    - class_name: Series containing the class names corresponding to the data.
    - n_components: Number of topics to be generated (default is 2).
    - random_state: Random seed for reproducibility (default is 42).
    - save_path: Path to save the trained model (default is None).

    Returns:
    - None
    """
    # Reshape X_train and X_test to have two dimensions
    num_samples_train, num_tokens_train, embedding_size_train = X_train.shape
    X_train_2d = X_train.reshape(num_samples_train * num_tokens_train, embedding_size_train)

    num_samples_test, num_tokens_test, embedding_size_test = X_test.shape
    X_test_2d = X_test.reshape(num_samples_test * num_tokens_test, embedding_size_test)

    # Scale the input data to make it non-negative
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_2d)
    X_test_scaled = scaler.transform(X_test_2d)

    # Initialize the NMF model
    nmf_model = NMF(n_components=n_components, random_state=random_state)

    # Fit the NMF model on the training data
    nmf_model.fit(X_train_scaled)

    # Save the trained model if save_path is provided
    if save_path:
        joblib.dump(nmf_model, save_path)

    # Predict the topics for the test data
    topic_predictions_test = nmf_model.transform(X_test_scaled)

    # Reshape the predictions back to the shape of the original test data
    topic_predictions_test = topic_predictions_test.reshape(num_samples_test, num_tokens_test, n_components)

    # Take only the first 1024 samples from the class_name series
    class_name_subset = class_name[:num_samples_test]

    # Flatten the predictions and class names to align for evaluation
    flat_topic_predictions_test = topic_predictions_test.reshape(-1, n_components)
    flat_class_name_subset = class_name_subset.repeat(num_tokens_test)

    # Map numerical indices to class names
    predicted_class_names_test = flat_class_name_subset.unique()[flat_topic_predictions_test.argmax(axis=1)]

    # Print the classification report for the test data
    print("Classification Report for Test Data:")
    print(classification_report(flat_class_name_subset, predicted_class_names_test))

    # Print the confusion matrix for the test data
    print("Confusion Matrix for Test Data:")
    print(confusion_matrix(flat_class_name_subset, predicted_class_names_test))

    # Calculate and print the accuracy for the test data
    accuracy_test = np.mean(flat_class_name_subset == predicted_class_names_test)
    print("Accuracy for Test Data:", accuracy_test)

    # Calculate and print the precision, recall, and F1-score for the test data
    precision_test, recall_test, f1_score_test, _ = precision_recall_fscore_support(flat_class_name_subset, predicted_class_names_test, average='weighted')
    print("Precision for Test Data:", precision_test)
    print("Recall for Test Data:", recall_test)
    print("F1 Score for Test Data:", f1_score_test)

# Usage example:
nmf_topic_modeling(X_train, X_test, filtered_test_df['class_name'], n_components=2, random_state=42, save_path="nmf_model.pkl")


Classification Report for Test Data:
              precision    recall  f1-score   support

     Backend       0.59      0.76      0.66     57835
    Frontend       0.46      0.28      0.35     42441

    accuracy                           0.56    100276
   macro avg       0.53      0.52      0.51    100276
weighted avg       0.54      0.56      0.53    100276

Confusion Matrix for Test Data:
[[43937 13898]
 [30520 11921]]
Accuracy for Test Data: 0.5570425625274243
Precision for Test Data: 0.5357611873317091
Recall for Test Data: 0.5570425625274243
F1 Score for Test Data: 0.530938236720551


In [47]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.decomposition import LatentDirichletAllocation

def lda_topic_modeling(X_train, X_test, class_name, n_components=2, random_state=42):
    """
    Perform Latent Dirichlet Allocation (LDA) topic modeling on the given data and evaluate on test data.

    Parameters:
    - X_train: Array containing the BERT embeddings for training data.
    - X_test: Array containing the BERT embeddings for test data.
    - class_name: Series containing the class names corresponding to the data.
    - n_components: Number of topics to be generated (default is 2).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Reshape X_train and X_test to have two dimensions
    num_samples_train, num_tokens_train, embedding_size_train = X_train.shape
    X_train_2d = X_train.reshape(num_samples_train * num_tokens_train, embedding_size_train)

    num_samples_test, num_tokens_test, embedding_size_test = X_test.shape
    X_test_2d = X_test.reshape(num_samples_test * num_tokens_test, embedding_size_test)

    # Scale the input data to make it non-negative
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_2d)
    X_test_scaled = scaler.transform(X_test_2d)

    # Initialize the LDA model
    lda_model = LatentDirichletAllocation(n_components=n_components, random_state=random_state)

    # Fit the LDA model on the training data
    lda_model.fit(X_train_scaled)

    # Predict the topics for the test data
    topic_predictions_test = lda_model.transform(X_test_scaled)

    # Reshape the predictions back to the shape of the original test data
    topic_predictions_test = topic_predictions_test.reshape(num_samples_test, num_tokens_test, n_components)

    # Take only the first 1024 samples from the class_name series
    class_name_subset = class_name[:num_samples_test]

    # Flatten the predictions and class names to align for evaluation
    flat_topic_predictions_test = topic_predictions_test.reshape(-1, n_components)
    flat_class_name_subset = class_name_subset.repeat(num_tokens_test)

    # Map numerical indices to class names
    predicted_class_names_test = flat_class_name_subset.unique()[flat_topic_predictions_test.argmax(axis=1)]

    # Print the classification report for the test data
    print("Classification Report for Test Data:")
    print(classification_report(flat_class_name_subset, predicted_class_names_test))

    # Print the confusion matrix for the test data
    print("Confusion Matrix for Test Data:")
    print(confusion_matrix(flat_class_name_subset, predicted_class_names_test))

    # Calculate and print the accuracy for the test data
    accuracy_test = np.mean(flat_class_name_subset == predicted_class_names_test)
    print("Accuracy for Test Data:", accuracy_test)

    # Calculate and print the precision, recall, and F1-score for the test data
    precision_test, recall_test, f1_score_test, _ = precision_recall_fscore_support(flat_class_name_subset, predicted_class_names_test, average='weighted')
    print("Precision for Test Data:", precision_test)
    print("Recall for Test Data:", recall_test)
    print("F1 Score for Test Data:", f1_score_test)

# Usage example:
lda_topic_modeling(X_train, X_test, filtered_test_df['class_name'], n_components=2, random_state=42)


KeyboardInterrupt: 

In [51]:
# Create a list of tuples containing the range of n-grams to consider
ngram_ranges = [(i, j) for i in range(1, 16) for j in range(i, 16)]

# Iterate over each n-gram range
for ngram_range in ngram_ranges:
    print("N-gram Range:", ngram_range)
    # Perform topic modeling using LDA
    nmf_topic_modeling(X_train, X_test, filtered_test_df['class_name'], n_components=2, random_state=42)
    print()


N-gram Range: (1, 1)


TypeError: 'tuple' object is not callable