In [None]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Input spreadsheets: the training split first, then the testing split
train_path, test_path = 'train.xlsx', 'test.xlsx'

In [None]:
# Load the training spreadsheet and preview its first 5 rows
train_df = pd.read_excel(io=train_path)
print(train_df.head(n=5))

# Load the testing spreadsheet and preview its first 5 rows
test_df = pd.read_excel(io=test_path)
print(test_df.head(n=5))

In [None]:
def convert_lower_case(data):
    """Return the string form of ``data`` with every character lower-cased.

    ``data`` is passed through ``str()`` first, so non-string values
    (numbers, ``None``, ...) are accepted and converted.
    """
    text = str(data)
    return text.lower()

In [None]:
def remove_punctuation(data):
    """Replace every punctuation symbol (and newline) in ``data`` with a space.

    The symbol set deliberately leaves out the comma and the apostrophe; the
    apostrophe is handled separately by ``remove_apostrophe``.

    Parameters:
    - data: value to clean; it is converted with ``str()`` first.

    Returns:
    - str: the text with each listed symbol replaced by a single space.
    """
    # The backslash is written as ``\\``: the previous ``\]`` was an invalid
    # escape sequence, which newer Python versions warn about at compile time.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    # One translation pass instead of one replace call per symbol.
    to_space = str.maketrans(symbols, ' ' * len(symbols))
    return str(data).translate(to_space)


In [None]:
def remove_apostrophe(data):
    """Delete every apostrophe from ``data``.

    The work is delegated to ``np.char.replace``, so the value handed back is
    whatever NumPy returns for the input (a NumPy string object rather than a
    built-in ``str``); wrap it in ``str()`` when a plain string is needed.
    """
    apostrophe, nothing = "'", ""
    stripped = np.char.replace(data, apostrophe, nothing)
    return stripped

In [None]:
def remove_numbers(data):
    """Return the string form of ``data`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    text = str(data)
    return digit_run.sub('', text)

In [None]:
def remove_single_characters(tokens):
    """Join the tokens longer than one character into a single string.

    Every kept token is preceded by a space, so a non-empty result starts
    with a leading space; an input with no kept tokens yields ``""``.
    """
    kept_words = [word for word in tokens if len(word) > 1]
    return "".join(" " + word for word in kept_words)

In [None]:
def lemmatization(data):
    """Tokenize ``data``, drop single-character tokens and lemmatize the rest.

    Parameters:
    - data: text to process (a string accepted by ``word_tokenize``).

    Returns:
    - str: the lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # Previously the single-character filter was computed and then thrown
    # away, so one-letter tokens still reached the output. Filter the token
    # list itself so the lemmatizer only sees tokens longer than one character.
    tokens = [token for token in word_tokenize(data) if len(token) > 1]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
def preprocess(data):
    """Run ``data`` through the full text-cleaning pipeline.

    Steps, in order: lower-casing, punctuation removal, apostrophe removal,
    digit removal, lemmatization. Each step receives the previous step's
    output; the result of the last step is returned.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_numbers,
        lemmatization,
    )
    for step in pipeline:
        data = step(data)
    return data

In [None]:
# Show the raw text of the first training report (row label 0)
first_report = train_df['report'][0]
print(first_report)


In [None]:
# preprocess() tokenizes with word_tokenize and lemmatizes with
# WordNetLemmatizer, which rely on the NLTK 'punkt' and 'wordnet' resources.
# Fetch them before this first call so the notebook runs top-to-bottom on a
# fresh environment; the download is skipped when they are already present.
for resource in ('punkt', 'wordnet'):
    nltk.download(resource, quiet=True)

# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))


In [None]:
# Load the previously saved, preprocessed training split
preprocessed_train_df = pd.read_csv(filepath_or_buffer='preprocessed_train_data2.csv')

# Preview the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head(n=5))

In [None]:
# Load the previously saved, preprocessed testing split
preprocessed_test_df = pd.read_csv(filepath_or_buffer='preprocessed_test_data2.csv')

# Preview the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head(n=5))

In [None]:
# NLTK resources needed for stop-word removal and tokenization
import nltk
from nltk.corpus import stopwords

for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# WordNet corpus (presumably for the WordNetLemmatizer used in lemmatization)
nltk.download('wordnet')

In [None]:
def remove_stop_words(data):
    """Tokenize ``data`` and return the tokens that are not in ``stop_words``.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(data):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# For the first report of the training split and then of the testing split:
# print the text after preprocess(), followed by the same text with stop
# words removed. preprocess() now runs once per report and its result is
# reused, instead of being recomputed for the second print.
for split_df in (preprocessed_train_df, preprocessed_test_df):
    cleaned_report = preprocess(split_df['bug_description'][0])
    print(cleaned_report)
    print(remove_stop_words(cleaned_report))


In [None]:
# Normalise the 'bug_description' column of both splits, then strip stop words.
# Missing values are replaced with '' *before* the str() cast: casting first
# turns NaN into the literal text "nan", which would then survive stop-word
# removal and be treated as a real word by later steps.
preprocessed_train_df['bug_description'] = (
    preprocessed_train_df['bug_description']
    .fillna('')
    .apply(str)
    .apply(remove_stop_words)
)
preprocessed_test_df['bug_description'] = (
    preprocessed_test_df['bug_description']
    .fillna('')
    .apply(str)
    .apply(remove_stop_words)
)

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


In [None]:
# Keep only the reports whose class_name is Frontend, Backend or Security.
# Training split
filtered_train_df = preprocessed_train_df.loc[
    preprocessed_train_df['class_name'].isin(['Frontend', 'Backend', 'Security'])
]

# Testing split
filtered_test_df = preprocessed_test_df.loc[
    preprocessed_test_df['class_name'].isin(['Frontend', 'Backend', 'Security'])
]

# Preview the first 5 rows of the filtered training data
print("Filtered Training Data:", filtered_train_df.head(), sep="\n")

# Preview the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:", filtered_test_df.head(), sep="\n")


In [None]:
# Integer label assigned to each class name
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2
}

# Attach the numeric label and order the rows by it.
# assign() returns a new DataFrame, so no column is written into the frame
# produced by the boolean filter (which triggered SettingWithCopyWarning and
# is not guaranteed to update the intended object).
filtered_train_df = (
    filtered_train_df
    .assign(class_label=filtered_train_df['class_name'].map(class_name_mapping))
    .sort_values(by=['class_label'])
)
filtered_test_df = (
    filtered_test_df
    .assign(class_label=filtered_test_df['class_name'].map(class_name_mapping))
    .sort_values(by=['class_label'])
)

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# Print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())


## Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN values with an empty string. The column is reassigned rather
# than filled through a chained ``inplace=True`` call, which is not guaranteed
# to modify the DataFrame itself.
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].fillna('')

# filtered_train_df was derived from preprocessed_train_df before this cell,
# so it does not see the fill above. Clean the exact column that is handed to
# the vectorizer instead.
train_descriptions = filtered_train_df['bug_description'].fillna('')

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vocabulary on the filtered training reports and vectorize them
X_train = vectorizer.fit_transform(train_descriptions)

# Print the shape of X_train
print(X_train.shape)

In [None]:
# Show the TF-IDF row computed for the first training report
first_report_vector = X_train[0]
print(first_report_vector)

In [None]:
# Class column of the filtered training data
train_class_names = filtered_train_df['class_name']

# How many distinct classes there are
print(train_class_names.nunique())

# Which classes they are
print(train_class_names.unique())

# How many reports fall into each class
print(train_class_names.value_counts())



In [None]:
# Cluster label -> class name lookup.
# Example assignment only: adjust it to match the clusters actually produced.
cluster_class_mapping = dict(
    zip(
        (1, 0, 2),
        ('Backend', 'Frontend', 'Security'),
    )
)


In [None]:
from itertools import permutations
from sklearn.decomposition import NMF
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

def evaluate_cluster_mapping(data, class_name, cluster_class_mappings, n_components=2, ngram_range=(3, 3), random_state=42, test_data=None):
    """
    Evaluate NMF topic modeling on labelled test documents under one or more
    topic-to-class mappings, printing the metrics for each mapping.

    Parameters:
    - data: Iterable of training documents (e.g. a Series of strings) used to
      fit the TF-IDF vectorizer and the NMF model.
    - class_name: True class names of the test documents, in the same order as
      the test documents.
    - cluster_class_mappings: Dictionary of {mapping name: {topic index: class name}}.
    - n_components: Number of topics to be generated (default is 2).
    - ngram_range: Tuple specifying the range of n-grams to consider (default is (3, 3)).
    - random_state: Random seed for reproducibility (default is 42).
    - test_data: Iterable of test documents. When None (default), the
      notebook-level ``filtered_test_df['bug_description']`` is used, which is
      what this function always read before the parameter existed.

    Returns:
    - None

    Raises:
    - KeyError: if the dominant topic of a test document has no entry in a mapping.
    """
    if test_data is None:
        # Backward-compatible fallback to the notebook's global test frame.
        test_data = filtered_test_df['bug_description']

    # Vectorize once: fit the vocabulary on the training documents and reuse
    # it for the test documents.
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train = vectorizer.fit_transform(data)
    X_test = vectorizer.transform(test_data)

    # The model and its predictions do not depend on the mapping being
    # evaluated, so fit and predict once instead of once per mapping.
    nmf_model = NMF(n_components=n_components, random_state=random_state)
    nmf_model.fit(X_train)
    topic_predictions = nmf_model.transform(X_test)

    # Index of the strongest topic for each test document.
    dominant_topics = np.argmax(topic_predictions, axis=1)

    # Plain array so the element-wise accuracy comparison below also works
    # when class_name is given as a list.
    true_labels = np.asarray(class_name)

    for mapping_name, cluster_class_mapping in cluster_class_mappings.items():
        print(f"Evaluating cluster mapping: {mapping_name}")

        # Map numerical topic indices to class names
        predicted_class_names = np.array([cluster_class_mapping[topic] for topic in dominant_topics])

        # Print evaluation metrics
        print("Classification Report:")
        print(classification_report(true_labels, predicted_class_names))

        print("Confusion Matrix:")
        print(confusion_matrix(true_labels, predicted_class_names))

        accuracy = np.mean(true_labels == predicted_class_names)
        print("Accuracy:", accuracy)

        precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predicted_class_names, average='weighted')
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1_score)

# Topic-index -> class-name assignments to evaluate, keyed by a display name.
# Add more mappings if needed.
cluster_class_mappings = {
    "Mapping 1": dict(enumerate(['Frontend', 'Backend', 'Security'])),
}

# Usage example: fit on the training descriptions with three topics and score
# the predictions against the test class names.
evaluate_cluster_mapping(
    filtered_train_df['bug_description'],
    filtered_test_df['class_name'],
    cluster_class_mappings,
    n_components=3,
)


In [None]:
# Sweep the upper n-gram bound while the lower bound stays at 3:
# (3, 3), (3, 4), (3, 5), ... up to (3, 9).
# Topic index -> class: 0 => Frontend, 1 => Backend, 2 => Security

# Mapping evaluated for every n-gram range
cluster_class_mappings = {
    "Mapping 2": {0: 'Frontend', 1: 'Backend', 2: 'Security'},
}

# All n-gram ranges to try
ngram_ranges = [(3, i) for i in range(3, 10)]

# Evaluate each n-gram range.
# n_components is 3 so that every class in the mapping can be predicted; with
# only 2 topics the dominant-topic index is always 0 or 1 and 'Security'
# could never be assigned.
for ngram_range in ngram_ranges:
    print(f"Evaluating ngram_range: {ngram_range}")
    evaluate_cluster_mapping(filtered_train_df['bug_description'], filtered_test_df['class_name'], cluster_class_mappings, n_components=3, ngram_range=ngram_range)
    