In [1]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # Specify the desired BERT model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Number of samples sent through the model per forward pass
batch_size = 1024

# Longest sequence the model can embed: one position embedding exists per token,
# so anything longer makes the forward pass fail.
max_sequence_length = model.config.max_position_embeddings

# Tokenize each bug report into a list of token ids (special tokens included).
# Sequences are truncated to the model limit here; padding happens later, per batch.
tokenized_bug_reports = [
    tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_sequence_length,
    )
    for text in filtered_train_df['bug_description']
]

In [None]:
# Function to process a batch and obtain embeddings
def process_batch(batch):
    """
    Embed one batch of token-id sequences with the notebook-level BERT `model`.

    Parameters:
    - batch: Non-empty sequence of token-id lists (one per text). The lists may have
      different lengths and may already contain trailing padding ids.

    Returns:
    - NumPy array with the first element of the model output (the last hidden state
      for a `BertModel`), one row of token embeddings per input sequence, padded to
      the longest sequence in this batch.

    Raises:
    - ValueError: If `batch` is empty.
    """
    if len(batch) == 0:
        raise ValueError("process_batch() requires a non-empty batch")

    # 0 is the [PAD] id in the bert-base-uncased vocabulary.
    pad_token_id = 0

    # Pad every sequence to the longest one in this batch so they stack into a tensor.
    max_length = max(len(seq) for seq in batch)
    padded_sequences = [
        list(seq) + [pad_token_id] * (max_length - len(seq)) for seq in batch
    ]

    # Convert to PyTorch tensor
    batch_tensors = torch.tensor(padded_sequences)

    # Tell the model which positions are padding. Without a mask the real tokens
    # attend to the padding, so an embedding would depend on how long the other
    # sequences in the batch happen to be. Deriving the mask from the ids (rather
    # than from len(seq)) also covers batches that were padded before this call.
    attention_mask = (batch_tensors != pad_token_id).long()

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        embeddings = model(batch_tensors, attention_mask=attention_mask)[0]

    # Convert embeddings to numpy array
    return embeddings.numpy()

In [None]:
# Define a function to pad sequences within a batch
def pad_batch(batch, pad_value=0):
    """
    Right-pad every sequence in a batch to the length of the longest one.

    Parameters:
    - batch: Sequence of token-id sequences (lists or tuples); may be empty.
    - pad_value: Value appended to shorter sequences (default is 0).

    Returns:
    - List of new lists, all of equal length. The input sequences are not modified.
      An empty batch yields an empty list.
    """
    # max() raises on an empty iterable, so handle the empty batch explicitly.
    if len(batch) == 0:
        return []

    # Find the maximum length within the batch
    max_length = max(len(seq) for seq in batch)

    # list(seq) copies the sequence and lets tuples be padded as well.
    return [list(seq) + [pad_value] * (max_length - len(seq)) for seq in batch]

# Process the data in consecutive slices of at most `batch_size` samples;
# the final slice simply holds whatever is left over.
num_samples = len(tokenized_bug_reports)
num_batches = (num_samples + batch_size - 1) // batch_size  # ceiling division

X_train_batches = []
for i in range(num_batches):
    print(f"Processing batch {i + 1}/{num_batches}")
    batch_start = i * batch_size
    # Slicing past the end of a list is clamped, so no explicit min() is needed.
    batch = tokenized_bug_reports[batch_start:batch_start + batch_size]
    # The last batch needs no special treatment: process_batch pads every batch
    # to its own longest sequence before running the model.
    X_train_batches.append(process_batch(batch))


In [None]:
# Concatenate the per-batch embeddings into one (samples, tokens, embedding) array.
# Each batch was padded only to its own longest sequence, so the token dimension
# (axis 1) differs between batches and a plain np.concatenate along axis 0 fails.
# Pad axis 1 of every batch with zeros up to the longest batch first.
if not X_train_batches:
    raise ValueError("X_train_batches is empty; run the batching cell first.")

max_tokens = max(batch_embeddings.shape[1] for batch_embeddings in X_train_batches)

# A single concatenate call avoids re-copying the growing array once per batch.
X_train = np.concatenate(
    [
        np.pad(
            batch_embeddings,
            ((0, 0), (0, max_tokens - batch_embeddings.shape[1]), (0, 0)),
            mode="constant",
        )
        for batch_embeddings in X_train_batches
    ],
    axis=0,
)



In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.decomposition import NMF

def nmf_topic_modeling(X_train, X_test, class_name, n_components=2, random_state=42):
    """
    Fit Non-negative Matrix Factorization (NMF) on token-level embeddings and print
    token-level evaluation metrics for the test data.

    Every (sample, token) embedding becomes one row for NMF. Each test row is assigned
    the topic with the largest weight, and that topic index is turned into a class name.

    Parameters:
    - X_train: 3-D array (samples, tokens, embedding size) with the training embeddings.
    - X_test: 3-D array (samples, tokens, embedding size) with the test embeddings.
    - class_name: pandas Series of class names; its first X_test.shape[0] entries are
      taken as the labels of the test samples.
    - n_components: Number of topics to be generated (default is 2).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None (all results are printed).
    """
    # Flatten (samples, tokens, embedding) into (samples * tokens, embedding)
    num_samples_train, num_tokens_train, embedding_size_train = X_train.shape
    X_train_2d = X_train.reshape(num_samples_train * num_tokens_train, embedding_size_train)

    num_samples_test, num_tokens_test, embedding_size_test = X_test.shape
    X_test_2d = X_test.reshape(num_samples_test * num_tokens_test, embedding_size_test)

    # Scale the input data to make it non-negative
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_2d)
    # The scaler is fitted on the training data only, so a test value below the
    # training minimum scales to a negative number, which NMF rejects. Clip at 0.
    X_test_scaled = np.clip(scaler.transform(X_test_2d), 0.0, None)

    # Fit the NMF model on the training data
    nmf_model = NMF(n_components=n_components, random_state=random_state)
    nmf_model.fit(X_train_scaled)

    # Topic weights for every test token row: (samples * tokens, n_components)
    topic_predictions_test = nmf_model.transform(X_test_scaled)

    # Keep one label per test sample, then repeat it once per token so the labels
    # line up with the sample-major order of the flattened rows.
    class_name_subset = class_name[:num_samples_test]
    flat_class_name_subset = class_name_subset.repeat(num_tokens_test)

    # Map numerical topic indices to class names.
    # NOTE(review): topic i is paired with the i-th distinct class name in order of
    # appearance. NMF is unsupervised, so this pairing is arbitrary, and it raises an
    # IndexError when there are fewer distinct classes than n_components.
    predicted_class_names_test = flat_class_name_subset.unique()[topic_predictions_test.argmax(axis=1)]

    # Print the classification report for the test data
    print("Classification Report for Test Data:")
    print(classification_report(flat_class_name_subset, predicted_class_names_test))

    # Print the confusion matrix for the test data
    print("Confusion Matrix for Test Data:")
    print(confusion_matrix(flat_class_name_subset, predicted_class_names_test))

    # Calculate and print the accuracy for the test data
    accuracy_test = np.mean(flat_class_name_subset == predicted_class_names_test)
    print("Accuracy for Test Data:", accuracy_test)

    # Calculate and print the precision, recall, and F1-score for the test data
    precision_test, recall_test, f1_score_test, _ = precision_recall_fscore_support(
        flat_class_name_subset, predicted_class_names_test, average='weighted'
    )
    print("Precision for Test Data:", precision_test)
    print("Recall for Test Data:", recall_test)
    print("F1 Score for Test Data:", f1_score_test)

# Example run: two NMF topics, evaluated against the test-set class names.
nmf_topic_modeling(
    X_train,
    X_test,
    filtered_test_df['class_name'],
    n_components=2,
    random_state=42,
)


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import joblib

def lda_topic_modeling(X_train, X_test, class_name, n_components=2, random_state=42, model_save_path='lda_model.pkl', cluster_class_mapping=None):
    """
    Fit Latent Dirichlet Allocation (LDA) on token-level embeddings, save the fitted
    model, and print token-level evaluation metrics for the test data.

    Every (sample, token) embedding becomes one row for LDA. Each test row is assigned
    the topic with the largest weight, and that topic index is turned into a class name.

    Parameters:
    - X_train: 3-D array (samples, tokens, embedding size) with the training embeddings.
    - X_test: 3-D array (samples, tokens, embedding size) with the test embeddings.
    - class_name: pandas Series of class names; its first X_test.shape[0] entries are
      taken as the labels of the test samples.
    - n_components: Number of topics to be generated (default is 2).
    - random_state: Random seed for reproducibility (default is 42).
    - model_save_path: Path to save the trained LDA model (default is 'lda_model.pkl').
    - cluster_class_mapping: Optional mapping (dict or sequence) from topic index to
      class name. If omitted, a notebook-level variable named `cluster_class_mapping`
      is used when one exists; otherwise topic i is paired with the i-th distinct
      class name in order of appearance.

    Returns:
    - None (all results are printed; the model is written to `model_save_path`).
    """
    # Flatten (samples, tokens, embedding) into (samples * tokens, embedding)
    num_samples_train, num_tokens_train, embedding_size_train = X_train.shape
    X_train_2d = X_train.reshape(num_samples_train * num_tokens_train, embedding_size_train)

    num_samples_test, num_tokens_test, embedding_size_test = X_test.shape
    X_test_2d = X_test.reshape(num_samples_test * num_tokens_test, embedding_size_test)

    # Softplus, log(1 + exp(x)), written as logaddexp(0, x) so that large inputs do
    # not overflow inside exp().
    X_train_2d_softplus = np.logaddexp(0.0, X_train_2d)
    X_test_2d_softplus = np.logaddexp(0.0, X_test_2d)

    # Scale the input data to make it non-negative
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_2d_softplus)
    # The scaler is fitted on the training data only, so a test value below the
    # training minimum scales to a negative number, which LDA rejects. Clip at 0.
    X_test_scaled = np.clip(scaler.transform(X_test_2d_softplus), 0.0, None)

    # Create an LDA model with desired parameters
    lda_model = LatentDirichletAllocation(
        n_components=n_components,
        random_state=random_state
    )

    # Fit the LDA model on the training data
    lda_model.fit(X_train_scaled)

    # Save the trained LDA model
    joblib.dump(lda_model, model_save_path)
    print("Trained LDA model saved at:", model_save_path)

    # Topic weights for every test token row: (samples * tokens, n_components)
    topic_predictions_test = lda_model.transform(X_test_scaled)

    # Predictions are per token, labels are per sample: keep one label per test
    # sample and repeat it once per token so both sides have the same length and
    # the same sample-major order.
    flat_class_names = class_name[:num_samples_test].repeat(num_tokens_test)

    # Resolve the topic-index -> class-name mapping.
    if cluster_class_mapping is None:
        # Backward compatibility: earlier versions read this name from the notebook
        # namespace instead of taking it as an argument.
        cluster_class_mapping = globals().get('cluster_class_mapping')
    if cluster_class_mapping is None:
        # NOTE(review): pairing by order of appearance is arbitrary for an
        # unsupervised model; pass an explicit mapping for meaningful scores.
        cluster_class_mapping = list(flat_class_names.unique())

    # Map numerical indices to class names
    predicted_class_names_test = np.asarray(
        [cluster_class_mapping[prediction] for prediction in np.argmax(topic_predictions_test, axis=1)]
    )

    # Print the classification report for the test data
    print("Classification Report for Test Data:")
    print(classification_report(flat_class_names, predicted_class_names_test))

    # Print the confusion matrix for the test data
    print("Confusion Matrix for Test Data:")
    print(confusion_matrix(flat_class_names, predicted_class_names_test))

    # Calculate and print the accuracy for the test data
    accuracy_test = np.mean(flat_class_names == predicted_class_names_test)
    print("Accuracy for Test Data:", accuracy_test)

    # Calculate and print the precision, recall, and F1-score for the test data
    precision_test, recall_test, f1_score_test, _ = precision_recall_fscore_support(
        flat_class_names, predicted_class_names_test, average='weighted'
    )
    print("Precision for Test Data:", precision_test)
    print("Recall for Test Data:", recall_test)
    print("F1 Score for Test Data:", f1_score_test)

# Example run with the default topic count, seed and save path.
lda_topic_modeling(
    X_train,
    X_test,
    filtered_test_df['class_name'],
)


In [None]:
# Sweep every n-gram range (low, high) with 1 <= low <= high <= 15:
#   (1, 1), (1, 2), ..., (1, 15)
#   (2, 2), (2, 3), ..., (2, 15)
#   ...
#   (15, 15)
#
# NOTE(review): ngram_range is only printed below; it is not passed to
# lda_topic_modeling, so every iteration makes the same call with the same
# arguments and writes to the same 'lda_model.pkl' path.

# Build the pairs as (low, low + width), which yields the same tuples in the same order.
ngram_ranges = [(low, low + width) for low in range(1, 16) for width in range(16 - low)]

# Run the LDA evaluation once per n-gram range
for ngram_range in ngram_ranges:
    print("N-gram Range:", ngram_range)
    lda_topic_modeling(
        X_train,
        X_test,
        filtered_test_df['class_name'],
        n_components=2,
        random_state=42,
        model_save_path='lda_model.pkl',
    )
    print()
    