In [1]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# path of training data
train_path = 'train.xlsx'

# path of testing data
test_path = 'test.xlsx'

In [3]:
# show the first 5 rows of the training data
train_df = pd.read_excel(train_path)
print(train_df.head())

# show the first 5 rows of the testing data
test_df = pd.read_excel(test_path)
print(test_df.head())

                                              report class_name  class_index
0  "For any event on my bookmarked projects" opti...    Backend            1
1           Switch to using full l10n id's in urlbar   Frontend            2
2  Consider removing hasicon property to simplify...   Frontend            2
3  Method to obtain current URL from WebBrowserEd...   Frontend            2
4              Fix: migration fails in MS SQL-Server    Backend            1
                                              report class_name  class_index
0  REST API - ability to list sub projects for a ...    Backend            1
1  support selective text on right if set in GNOM...   Frontend            2
2  [meta][userstory] Ship v1 of Pre-populated top...   Frontend            2
3  Include updated_on and passwd_changed_on colum...    Backend            1
4    Problem with email integration to MS Office 365    Backend            1


In [4]:
def convert_lower_case(data):
    return str(data).lower()

In [5]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [6]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [7]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [8]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [11]:
# print the first report of the training data
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [12]:
# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))


for any event on my bookmarked project option not sending notification for non member bookmarked project


In [None]:
import pandas as pd

# Assuming train_df is your DataFrame containing training data
# Assuming train_df has columns 'id', 'report', and 'class_name'

# Initialize counter
counter = 1

# Iterate over each row in the DataFrame
with open('preprocessed_train_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in train_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(train_df['report'][0])


In [None]:
# FOR TESTING DATA

# Iterate over each row in the DataFrame
with open('preprocessed_test_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in test_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(test_df['report'][0])


In [None]:
# Open the preprocessed data file for reading
with open('preprocessed_train_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_train_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [None]:
# FOR TESTING DATA

# Open the preprocessed data file for reading
with open('preprocessed_test_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_test_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [13]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [14]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [15]:
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(preprocessed_train_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_train_df['bug_description'][0])))

# preprocess the first report of the testing data
print(preprocess(preprocessed_test_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_test_df['bug_description'][0])))


for any event on my bookmarked project option not sending notification for non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability to list sub project for a project
rest api ability list sub project project


In [17]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [60]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') | 
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security') 
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') | 
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [62]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2
}

# Map class names in both training and testing data to the desired order
filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)

# order them based on the number of class_label
filtered_train_df = filtered_train_df.sort_values(by=['class_label'])
filtered_test_df = filtered_test_df.sort_values(by=['class_label'])

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# Print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())


['Frontend' 'Backend' 'Security']
['Frontend' 'Backend' 'Security']


## Feature Exraction

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN values with an empty string
preprocessed_train_df['bug_description'].fillna('', inplace=True)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed data
X_train = vectorizer.fit_transform(filtered_train_df['bug_description'])

# Print the shape of X_train
print(X_train.shape)

(13603, 7483)


In [64]:
# print the vector representation of the first report
print(X_train[0])

  (0, 6321)	0.39810213343122414
  (0, 1005)	0.48891468317581693
  (0, 6672)	0.5876471675976369
  (0, 3503)	0.507097555059254


In [65]:
# Print the number of unique class_name in the training data
print(filtered_train_df['class_name'].nunique())

# print their unique values
print(filtered_train_df['class_name'].unique())

# print the number of reports in each class
print(filtered_train_df['class_name'].value_counts())



3
['Frontend' 'Backend' 'Security']
Backend     7437
Frontend    5799
Security     367
Name: class_name, dtype: int64


In [66]:
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0
}

In [72]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Define a function to compute classification metrics
def compute_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1

# Define the class name mapping
class_name_mapping = {
    0: 'Frontend',
    1: 'Backend',
    2: 'Security'
}

# Define the n-grams
ngram_ranges = [(3,3), (3,4), (3,5), (3,6), (3,7), (3,8), (3,9), (3,10)]

# Initialize lists to store data and labels
X_data = filtered_train_df['bug_description']
y_labels = filtered_train_df['class_name']

# Initialize arrays to store augmented features
X_train_augmented = None
X_test_augmented = None

# Loop over each n-gram range
for ngram_range in ngram_ranges:
    print(f"Processing n-gram range: {ngram_range}")
    
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    
    # Vectorize the data
    X = vectorizer.fit_transform(X_data)
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.8, random_state=42)
    
    # Train the supervised model (Random Forest) on labeled data
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Use the predictions from the supervised model as additional features for NMF
    labeled_predictions_train = rf_model.predict(X_train)
    labeled_predictions_test = rf_model.predict(X_test)
    
    if X_train_augmented is None:
        X_train_augmented = labeled_predictions_train.reshape(-1, 1)
        X_test_augmented = labeled_predictions_test.reshape(-1, 1)
    else:
        X_train_augmented = np.hstack((X_train_augmented, labeled_predictions_train.reshape(-1, 1)))
        X_test_augmented = np.hstack((X_test_augmented, labeled_predictions_test.reshape(-1, 1)))

# Train NMF on the augmented feature set
nmf_model = NMF(n_components=3, random_state=42)
nmf_model.fit(X_train_augmented)

# Predictions using NMF
test_predictions = nmf_model.transform(X_test_augmented)

# Convert predictions to class names
predicted_class_names = []
for prediction in np.argmax(test_predictions, axis=1):
    if prediction in class_name_mapping:
        predicted_class_names.append(class_name_mapping[prediction])
    else:
        print(f"Unexpected prediction: {prediction}")
        predicted_class_names.append('Unknown')

# Compute metrics
accuracy, precision, recall, f1 = compute_metrics(y_test, predicted_class_names)

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Processing n-gram range: (3, 3)
Processing n-gram range: (3, 4)
Processing n-gram range: (3, 5)
Processing n-gram range: (3, 6)
Processing n-gram range: (3, 7)
Processing n-gram range: (3, 8)
Processing n-gram range: (3, 9)
Processing n-gram range: (3, 10)


ValueError: could not convert string to float: 'Backend'

In [None]:
# Define all possible mappings
cluster_class_mappings = {
#    "Mapping 1": {0: 'Backend', 1: 'Frontend', 2: 'Security'},
    "Mapping 2": {0: 'Frontend', 1: 'Backend', 2: 'Security'},
#    "Mapping 3": {0: 'Security', 1: 'Frontend', 2: 'Backend'},
#    "Mapping 4": {0: 'Backend', 1: 'Security', 2: 'Frontend'},
#    "Mapping 5": {0: 'Frontend', 1: 'Security', 2: 'Backend'},
#    "Mapping 6": {0: 'Security', 1: 'Backend', 2: 'Frontend'},
}

# Create a list of tuples containing the range of n-grams to consider
ngram_ranges = [(i, j) for i in range(1, 16) for j in range(i, 16)]

# Define the number of labeled examples
labeled_data_size = 100  # Adjust as needed

# Define a list of different numbers of components for ensemble models
n_components_list = [2]  # Adjust as needed

# Iterate over each n-gram range
for ngram_range in ngram_ranges:
    # Iterate over each number of components
    for n_components in n_components_list:
        print(f"Processing ngram_range={ngram_range}, n_components={n_components}")

        # Split the data into labeled and unlabeled parts
        labeled_data = filtered_train_df['bug_description'][:labeled_data_size]
        labeled_labels = filtered_train_df['class_name'][:labeled_data_size]
        test_data = filtered_test_df['bug_description']
        test_labels = filtered_test_df['class_name']

        # Usage example:
        evaluate_cluster_mapping(labeled_data, labeled_labels, test_data, test_labels, cluster_class_mappings, n_components=n_components, ngram_range=ngram_range)
        