In [1]:
pip install nimfa

Collecting nimfa
  Downloading nimfa-1.4.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nimfa
Successfully installed nimfa-1.4.0


In [2]:
!pip install nltk



In [3]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used in this notebook:
#   stopwords -> stopwords.words('english')
#   punkt     -> word_tokenize
#   wordnet   -> WordNetLemmatizer (lemmatize() fails without this corpus)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
def convert_lower_case(data):
    """Return the string form of ``data`` converted to lower case.

    Non-string inputs are first passed through ``str()``.
    """
    text = str(data)
    return text.lower()

In [9]:
def remove_punctuation(data):
    """Replace punctuation symbols and newlines in ``data`` with spaces.

    Every character of the symbol set below is mapped to a single space.
    The comma and the apostrophe are not part of the set and are left
    untouched.

    Args:
        data: Text to clean; it is converted with ``str()`` before processing.

    Returns:
        str: The text with each listed symbol replaced by a space.
    """
    # The backslash is written as an explicit escape: a bare backslash before
    # "]" is an invalid escape sequence and triggers a warning on recent
    # Python versions. The resulting character set is unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    # One translation pass instead of one array-replace call per symbol.
    to_space = str.maketrans(symbols, ' ' * len(symbols))
    return str(data).translate(to_space)


In [10]:
def remove_apostrophe(data):
    """Strip every apostrophe from ``data``.

    The result is whatever ``np.char.replace`` produces (a NumPy string
    value, not a plain ``str``); callers that need a ``str`` must convert it.
    """
    apostrophe = "'"
    stripped = np.char.replace(data, apostrophe, "")
    return stripped

In [11]:
def remove_numbers(data):
    """Delete every run of digits from the string form of ``data``."""
    digit_run = re.compile(r'\d+')
    text = str(data)
    return digit_run.sub('', text)

In [12]:
def remove_single_characters(tokens):
    """Join the tokens longer than one character into a single string.

    Each kept token is preceded by a space, so a non-empty result starts
    with a leading space; an input with no kept tokens yields ``""``.
    """
    kept = [token for token in tokens if len(token) > 1]
    return "".join(" " + token for token in kept)

In [13]:
def lemmatization(data):
    """Tokenize ``data``, drop single-character tokens and lemmatize the rest.

    Args:
        data: Text to process (passed to ``word_tokenize``).

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    # Previously the filtered text was computed and then discarded, so
    # single-character tokens were still lemmatized and returned. Feed the
    # filtered tokens into the lemmatizer instead.
    kept_tokens = remove_single_characters(tokens).split()
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [14]:
def preprocess(data):
    """Run the full text-cleaning pipeline on ``data`` and return the result.

    Steps are applied in order: lower-casing, punctuation removal,
    apostrophe removal, digit removal and lemmatization.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_numbers,
        lemmatization,
    )
    for step in pipeline:
        data = step(data)
    return data

In [16]:
# Load the already-preprocessed training split from CSV.
# The printed preview shows two columns: bug_description and class_name.
preprocessed_train_df = pd.read_csv('/content/preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [17]:
# Load the already-preprocessed testing split from CSV.
preprocessed_test_df = pd.read_csv('/content/preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [24]:
# Build the English stop-word set used by remove_stop_words.
# NOTE(review): the imports and downloads below repeat the notebook's first
# import cell; only the stop_words assignment is new here.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
def remove_stop_words(data):
    """Tokenize ``data`` and return it with all stop words dropped.

    Tokens found in the module-level ``stop_words`` set are removed; the
    remaining tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(data):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

# Sanity check on one sample per split: print the cleaned text, then the
# cleaned text with stop words removed.

# preprocess the first report of the training data
print(preprocess(preprocessed_train_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_train_df['bug_description'][0])))

# preprocess the first report of the testing data
print(preprocess(preprocessed_test_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_test_df['bug_description'][0])))


event bookmarked project option sending notification non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability list sub project project
rest api ability list sub project project


In [26]:
# Convert non-string values to strings in 'bug_description' column.
# Missing descriptions are replaced by an empty string first: str(NaN) would
# otherwise turn them into the literal word "nan", which survives tokenization
# and ends up in the features as if it were real report text.
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].fillna('').astype(str)
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].fillna('').astype(str)

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(remove_stop_words)
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(remove_stop_words)

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [29]:
# Restrict both splits to the four target classes:
# Frontend, Backend, Security and Documentation.
filtered_train_df = preprocessed_train_df[
    preprocessed_train_df['class_name'].isin(['Frontend', 'Backend', 'Security', 'Documentation'])
]
filtered_test_df = preprocessed_test_df[
    preprocessed_test_df['class_name'].isin(['Frontend', 'Backend', 'Security', 'Documentation'])
]

# Preview the first 5 rows of the filtered training split
print("Filtered Training Data:")
print(filtered_train_df.head())

# Preview the first 5 rows of the filtered testing split
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [30]:
# Confirm the filter left only the expected labels in the training split
print(filtered_train_df['class_name'].unique())

# Confirm the filter left only the expected labels in the testing split
print(filtered_test_df['class_name'].unique())

['Backend' 'Frontend' 'Security' 'Documentation']
['Backend' 'Frontend' 'Documentation' 'Security']


## Feature Extraction

In [31]:
# Number of training reports remaining after the class filter
print(len(filtered_train_df))

13777


In [32]:
# Map each class name to an integer id (Frontend=0, Backend=1, Security=2,
# Documentation=3).
# NOTE(review): this mapping is not referenced by any cell visible here; the
# SVM below is fitted on the string labels directly - confirm it is still needed.
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2,
    'Documentation': 3
}

In [34]:
import torchtext.vocab as vocab

# Load pre-trained GloVe embeddings ('6B' release, 300-dimensional vectors).
# The recorded output shows a large archive being fetched into .vector_cache on
# this run, so expect this cell to be slow when the cache is empty.
glove = vocab.GloVe(name='6B', dim=300)


.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [01:05<00:00, 6118.55it/s]


In [35]:
def tokenize_and_map_to_glove(text):
    """Split ``text`` on whitespace and look each token up in GloVe.

    Tokens are lower-cased before the lookup; tokens absent from the GloVe
    vocabulary (``glove.stoi``) are skipped. Returns the list of embedding
    vectors for the tokens that were found, in order (possibly empty).
    """
    embeddings = []
    for token in text.split():
        key = token.lower()
        if key in glove.stoi:
            embeddings.append(glove[key])
    return embeddings

In [36]:
# Embed every report in both splits: each entry is the list of per-token GloVe
# vectors for one bug description (empty when no token is in the vocabulary).
tokenized_bug_reports_train = [tokenize_and_map_to_glove(text) for text in filtered_train_df['bug_description']]
tokenized_bug_reports_test = [tokenize_and_map_to_glove(text) for text in filtered_test_df['bug_description']]

In [37]:
def aggregate_embeddings(embeddings):
    """Collapse a list of token embeddings into one vector by averaging.

    When ``embeddings`` is empty, a zero vector with the GloVe embedding
    width (``glove.vectors.shape[1]``) is returned instead.
    """
    if not embeddings:
        # No token was found in the vocabulary: fall back to all zeros.
        embedding_width = glove.vectors.shape[1]
        return torch.zeros(embedding_width)

    stacked = torch.stack(embeddings)
    return stacked.mean(dim=0)

In [39]:
# NOTE(review): torch is imported here although aggregate_embeddings, defined
# in an earlier cell, already refers to it; this import must run before the
# calls below.
import torch

# Build one feature row per report by averaging its token embeddings, then
# stack the rows into a single tensor per split.
X_train = torch.stack([aggregate_embeddings(embeddings) for embeddings in tokenized_bug_reports_train])
X_test = torch.stack([aggregate_embeddings(embeddings) for embeddings in tokenized_bug_reports_test])

# Now you can use X_train and X_test as features for your classification model


In [40]:
import numpy as np

# Persist the feature matrices so the embedding step need not be repeated.
# Files are written to the current working directory.
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)


In [None]:
import numpy as np

# Reload the cached feature matrices written by the save cell.
# After this cell X_train / X_test are whatever np.load returns (NumPy arrays),
# replacing the values built in the earlier feature cell.
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')


In [41]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the SVM classifier (linear kernel, fixed random_state)
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier on the averaged embeddings; the targets are the string
# class names taken straight from the filtered training frame.
svm_classifier.fit(X_train, filtered_train_df['class_name'])

# Predict class labels for the test data
predicted_labels = svm_classifier.predict(X_test)

# Evaluate the classifier: per-class precision/recall/F1 followed by the
# confusion matrix, both computed against the filtered test labels.
print("Classification Report:")
print(classification_report(filtered_test_df['class_name'], predicted_labels))

print("Confusion Matrix:")
print(confusion_matrix(filtered_test_df['class_name'], predicted_labels))


Classification Report:
               precision    recall  f1-score   support

      Backend       0.81      0.84      0.83      1345
Documentation       0.56      0.24      0.33        21
     Frontend       0.76      0.77      0.77       987
     Security       0.60      0.26      0.36        70

     accuracy                           0.79      2423
    macro avg       0.68      0.53      0.57      2423
 weighted avg       0.78      0.79      0.78      2423

Confusion Matrix:
[[1129    1  211    4]
 [  12    5    4    0]
 [ 216    3  760    8]
 [  33    0   19   18]]
