In [1]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# path of training data
train_path = 'train.xlsx'

# path of testing data
test_path = 'test.xlsx'

In [3]:
# show the first 5 rows of the training data
train_df = pd.read_excel(train_path)
print(train_df.head())

# show the first 5 rows of the testing data
test_df = pd.read_excel(test_path)
print(test_df.head())

                                              report class_name  class_index
0  "For any event on my bookmarked projects" opti...    Backend            1
1           Switch to using full l10n id's in urlbar   Frontend            2
2  Consider removing hasicon property to simplify...   Frontend            2
3  Method to obtain current URL from WebBrowserEd...   Frontend            2
4              Fix: migration fails in MS SQL-Server    Backend            1
                                              report class_name  class_index
0  REST API - ability to list sub projects for a ...    Backend            1
1  support selective text on right if set in GNOM...   Frontend            2
2  [meta][userstory] Ship v1 of Pre-populated top...   Frontend            2
3  Include updated_on and passwd_changed_on colum...    Backend            1
4    Problem with email integration to MS Office 365    Backend            1


In [4]:
def convert_lower_case(data):
    return str(data).lower()

In [5]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [6]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [7]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [8]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [11]:
# print the first report of the training data
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [12]:
# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))


for any event on my bookmarked project option not sending notification for non member bookmarked project


In [14]:
import pandas as pd

# Assuming train_df is your DataFrame containing training data
# Assuming train_df has columns 'id', 'report', and 'class_name'

# Initialize counter
counter = 1

# Iterate over each row in the DataFrame
with open('preprocessed_train_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in train_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [15]:
# FOR TESTING DATA

# Iterate over each row in the DataFrame
with open('preprocessed_test_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in test_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(test_df['report'][0])


REST API - ability to list sub projects for a project


In [16]:
# Open the preprocessed data file for reading
with open('preprocessed_train_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_train_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [17]:
# FOR TESTING DATA

# Open the preprocessed data file for reading
with open('preprocessed_test_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_test_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [13]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [14]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [15]:
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(train_df['report'][0])))

# preprocess the first report of the testing data
print(preprocess(test_df['report'][0]))

for any event on my bookmarked project option not sending notification for non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability to list sub project for a project


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [29]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') | 
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security') 
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') | 
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security') 
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [30]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2
}

# Map class names in both training and testing data to the desired order
filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)

# order them based on the number of class_label
filtered_train_df = filtered_train_df.sort_values(by=['class_label'])
filtered_test_df = filtered_test_df.sort_values(by=['class_label'])

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# Print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())


['Frontend' 'Backend' 'Security']
['Frontend' 'Backend' 'Security']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)


## Feature Exraction

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN values with an empty string
preprocessed_train_df['bug_description'].fillna('', inplace=True)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed data
X_train = vectorizer.fit_transform(filtered_train_df['bug_description'])

# Print the shape of X_train
print(X_train.shape)

(13603, 7483)


In [32]:
# print the vector representation of the first report
print(X_train[0])

  (0, 6321)	0.39810213343122414
  (0, 1005)	0.48891468317581693
  (0, 6672)	0.5876471675976369
  (0, 3503)	0.507097555059254


In [33]:
# Print the number of unique class_name in the training data
print(filtered_train_df['class_name'].nunique())

# print their unique values
print(filtered_train_df['class_name'].unique())

# print the number of reports in each class
print(filtered_train_df['class_name'].value_counts())



3
['Frontend' 'Backend' 'Security']
Backend     7437
Frontend    5799
Security     367
Name: class_name, dtype: int64


In [35]:
# Create a mapping between cluster labels and class names
cluster_class_mapping = {
    1: 'Backend',  # Example mapping, adjust based on your actual clusters
    0: 'Frontend',
    2: 'Security'
}


In [40]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def lda_topic_modeling(data, class_name, n_components=2, ngram_range=(3, 5), random_state=42):
    """
    Perform Latent Dirichlet Allocation (LDA) topic modeling on the given data and evaluate on test data.

    Parameters:
    - data: DataFrame containing the text data to be modeled.
    - class_name: Series containing the class names corresponding to the data.
    - n_components: Number of topics to be generated (default is 2).
    - ngram_range: Tuple specifying the range of n-grams to consider (default is (1, 2)).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)

    # Fit and transform the data
    X_train = vectorizer.fit_transform(data)

    # Initialize the LDA model
    lda_model = LatentDirichletAllocation(n_components=n_components, random_state=random_state)

    # Fit the LDA model
    lda_model.fit(X_train)

    # Print the top words for each topic
    for i, topic in enumerate(lda_model.components_):
        print(f"Top words for topic #{i}:")
        print([vectorizer.get_feature_names_out()[index] for index in topic.argsort()[-10:]])
        print("\n")

    # Transform the preprocessed test data
    X_test = vectorizer.transform(filtered_test_df['bug_description'])

    # Predict the topics for the test data
    topic_predictions = lda_model.transform(X_test)

    # Map numerical indices to class names
    predicted_class_names = [cluster_class_mapping[prediction] for prediction in np.argmax(topic_predictions, axis=1)]

    # Print the classification report
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    # Print the confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    # Calculate and print the accuracy
    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    # precision
    precision = np.mean(class_name == predicted_class_names)
    print("Precision:", precision)

    # recall
    recall = np.mean(class_name == predicted_class_names)
    print("Recall:", recall)

    # f1-score
    f1_score = np.mean(class_name == predicted_class_names)
    print("F1-Score:", f1_score)

# Usage example:
lda_topic_modeling(filtered_train_df['bug_description'], filtered_test_df['class_name'])

Top words for topic #0:
['issue email notification', 'due date redmine', 'email configuration error', 'cant delete account', 'unify email notification', 'smtp authentication failed', 'optional email header', 'abstraction layer notification', 'internal server error', 'mention user username']


Top words for topic #1:
['author name mail', 'picture email notification', 'modify email template', 'changed email template', 'issue assigned group', 'notification removing assignment', 'option project manager', 'email dropped timeout', 'email notification error', 'send email notification']


Classification Report:
              precision    recall  f1-score   support

     Backend       0.78      0.32      0.46      1345
    Frontend       0.47      0.89      0.62       987
    Security       0.00      0.00      0.00        70

    accuracy                           0.55      2402
   macro avg       0.42      0.40      0.36      2402
weighted avg       0.63      0.55      0.51      2402

Confusio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# iterate to cover all possible combinations of ngram_range:
# 1- (2, 2) , (2, 3) , (2, 4)and so on till (2,8)
# 2- (3, 3) , (3, 4) , (3, 5)and so on till (3,8)
# and so on till (5, 8)

# Define the n-gram ranges to be considered
ngram_ranges = [(2, i) for i in range(2, 9)] + [(3, i) for i in range(3, 9)] + [(4, i) for i in range(4, 9)] + [(5, i) for i in range(5, 9)]

# Iterate over each n-gram range
for ngram_range in ngram_ranges:
    print(f"Processing ngram_range: {ngram_range}")
    lda_topic_modeling(filtered_train_df['bug_description'], filtered_test_df['class_name'], ngram_range=ngram_range)
    print("------------------------------------------------------------------")


Processing ngram_range: (2, 2)
Top words for topic #0:
['search engine', 'via rest', 'content assist', 'user api', 'internal server', 'custom field', 'server error', 'email notification', 'new tab', 'rest api']


Top words for topic #1:
['doesnt work', 'user group', 'attachment rest', 'wiki page', 'address bar', 'org eclipse', 'intermittent browser', 'send email', 'email notification', 'rest api']


Top words for topic #2:
['doe work', 'due date', 'send notification', 'error message', 'context menu', 'notification email', 'new tab', 'rest api', 'custom field', 'email notification']


Classification Report:
              precision    recall  f1-score   support

     Backend       0.59      0.32      0.42      1345
    Frontend       0.43      0.43      0.43       987
    Security       0.03      0.30      0.06        70

    accuracy                           0.37      2402
   macro avg       0.35      0.35      0.30      2402
weighted avg       0.51      0.37      0.41      2402

Confu