In [1]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# path of training data
train_path = 'train.xlsx'

# path of testing data
test_path = 'test.xlsx'

In [3]:
# show the first 5 rows of the training data
train_df = pd.read_excel(train_path)
print(train_df.head())

# show the first 5 rows of the testing data
test_df = pd.read_excel(test_path)
print(test_df.head())

                                              report class_name  class_index
0  "For any event on my bookmarked projects" opti...    Backend            1
1           Switch to using full l10n id's in urlbar   Frontend            2
2  Consider removing hasicon property to simplify...   Frontend            2
3  Method to obtain current URL from WebBrowserEd...   Frontend            2
4              Fix: migration fails in MS SQL-Server    Backend            1
                                              report class_name  class_index
0  REST API - ability to list sub projects for a ...    Backend            1
1  support selective text on right if set in GNOM...   Frontend            2
2  [meta][userstory] Ship v1 of Pre-populated top...   Frontend            2
3  Include updated_on and passwd_changed_on colum...    Backend            1
4    Problem with email integration to MS Office 365    Backend            1


In [4]:
def convert_lower_case(data):
    return str(data).lower()

In [5]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [6]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [7]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [8]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [11]:
# print the first report of the training data
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [12]:
# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))


for any event on my bookmarked project option not sending notification for non member bookmarked project


In [14]:
import pandas as pd

# Assuming train_df is your DataFrame containing training data
# Assuming train_df has columns 'id', 'report', and 'class_name'

# Initialize counter
counter = 1

# Iterate over each row in the DataFrame
with open('preprocessed_train_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in train_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [15]:
# FOR TESTING DATA

# Iterate over each row in the DataFrame
with open('preprocessed_test_data.csv', 'w', encoding='utf-8') as f:
    f.write('bug_description , class_name\n')  # Write header

    for _, row in test_df.iterrows():
        # Preprocess the 'report' column
        preprocessed_report = preprocess(row['report'])

        # Write data to the file with incremented counter
        f.write(f"{preprocessed_report},{row['class_name']}\n")

# Print the first preprocessed report
print(test_df['report'][0])


REST API - ability to list sub projects for a project


In [16]:
# Open the preprocessed data file for reading
with open('preprocessed_train_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_train_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [17]:
# FOR TESTING DATA

# Open the preprocessed data file for reading
with open('preprocessed_test_data.csv', 'r', encoding='utf-8') as f:
    # Open the new file for writing preprocessed data with 2 columns
    with open('preprocessed_test_data2.csv', 'w', encoding='utf-8') as f_out:
        # Iterate over each line in the file
        for i, line in enumerate(f):
            # Skip the header
            if i == 0:
                continue
            
            # Split the line into columns based on comma delimiter
            columns = line.strip().split(',')
            
            # Check the number of columns
            if len(columns) == 2:
                # Get the preprocessed report and class name
                preprocessed_report = columns[0]
                class_name = columns[1]
                
                # Write the preprocessed report and class name to the new file
                f_out.write(f"{preprocessed_report},{class_name}\n")
            else:
                # Skip the line if it doesn't have exactly 2 columns
                continue


In [13]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [14]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


## Feature Exraction

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN values with an empty string
preprocessed_train_df['bug_description'].fillna('', inplace=True)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed data
X_train = vectorizer.fit_transform(preprocessed_train_df['bug_description'])

# Print the shape of the transformed data
print(X_train.shape)


(13866, 7663)


In [16]:
# print the vector representation of the first report
print(X_train[0])

  (0, 3991)	0.2538660078657135
  (0, 4326)	0.22188781122260037
  (0, 4360)	0.16200943672815057
  (0, 5895)	0.26840990743892756
  (0, 4344)	0.12359962887671232
  (0, 4562)	0.18653498271477986
  (0, 5086)	0.3360730020928219
  (0, 692)	0.6032988290912195
  (0, 4234)	0.2431528345726809
  (0, 4482)	0.13533242661691539
  (0, 2189)	0.24482610182777964
  (0, 299)	0.25719129168101634
  (0, 2527)	0.2398225552777558


In [17]:
# Print the number of unique class_name in the training data
print(preprocessed_train_df['class_name'].nunique())

# print their unique values
print(preprocessed_train_df['class_name'].unique())

# print the number of reports in each class
print(preprocessed_train_df['class_name'].value_counts())



5
['Backend' 'Frontend' 'Security' 'Documentation' 'Performance']
Backend          7437
Frontend         5799
Security          367
Documentation     174
Performance        89
Name: class_name, dtype: int64


In [20]:
# let's try hierarchical clustering
from sklearn.cluster import AgglomerativeClustering

# Initialize the AgglomerativeClustering
agg_clustering = AgglomerativeClustering(n_clusters=5)

# Fit the model
clusters = agg_clustering.fit_predict(X_train.toarray())

# Print the cluster labels
print(clusters)

# Print the number of reports in each cluster
print(pd.Series(clusters).value_counts())



[3 0 0 ... 0 0 0]
0    10188
3     2723
2      489
1      327
4      139
dtype: int64
[3 0 0 0 3]


In [25]:
# print the number of reports that belong to each class based on class_name
print(preprocessed_train_df.groupby('class_name').size())


class_name
Backend          7437
Documentation     174
Frontend         5799
Performance        89
Security          367
dtype: int64


In [26]:
# Create a mapping between cluster labels and class names
cluster_class_mapping = {
    0: 'Backend',  # Example mapping, adjust based on your actual clusters
    3: 'Frontend',
    2: 'Security',
    1: 'Documentation',
    4: 'Performance'
}


In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Map numerical indices to class names
predicted_class_names = [cluster_class_mapping[label] for label in clusters]

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(preprocessed_train_df['class_name'], predicted_class_names))

# Print the accuracy
accuracy = accuracy_score(preprocessed_train_df['class_name'], predicted_class_names)
print("Accuracy:", accuracy)

# Print the precision
precision = precision_score(preprocessed_train_df['class_name'], predicted_class_names, average='weighted')
print("Precision:", precision)

# Print the recall
recall = recall_score(preprocessed_train_df['class_name'], predicted_class_names, average='weighted')
print("Recall:", recall)

# Print the F1 score
f1 = f1_score(preprocessed_train_df['class_name'], predicted_class_names, average='weighted')
print("F1 Score:", f1)


Confusion Matrix:
[[4029  327 2469  132  480]
 [ 143    0   28    2    1]
 [5584    0  205    3    7]
 [  73    0   13    2    1]
 [ 359    0    8    0    0]]
Accuracy: 0.30549545651233234
Precision: 0.24368463378857327
Recall: 0.30549545651233234
F1 Score: 0.26544705592175916
