In [55]:
import pandas as pd
import os
import numpy as np
import string
import re
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [56]:
def convert_lower_case(data):
    return str(data).lower()

In [57]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [58]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [59]:

def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [60]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [61]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [62]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [63]:
def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

In [64]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('train.csv')
preprocessed_test_df = pd.read_csv('test.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())
print(preprocessed_test_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend
                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [65]:
import csv


input_file = 'test.txt'
output_file = 'test_backend_preprocessed.csv'

# Read the contents of the text file
with open(input_file, 'r') as txt_file:
    lines = txt_file.read().splitlines()

# Preprocess each line and write to a CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    for line in lines:
        preprocessed_data = preprocess(line)
        csv_writer.writerow([preprocessed_data, 'Backend'])

In [66]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

print( preprocessed_test_df['bug_description'][0] )
print( preprocessed_train_df['bug_description'][0] )


rest api ability list sub project project
event bookmarked project option sending notification non member bookmarked project


In [67]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') |
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security') |
    (preprocessed_train_df['class_name'] == 'Documentation')
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') |
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security') |
    (preprocessed_test_df['class_name'] == 'Documentation')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [68]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Frontend': 0,
    'Backend': 1,
    'Security': 2,
    'Documentation' : 3,
}

# Map class names in both training and testing data to the desired order
filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)

# order them based on the number of class_label
filtered_train_df = filtered_train_df.sort_values(by=['class_label'])
filtered_test_df = filtered_test_df.sort_values(by=['class_label'])

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# print(filtered_train_df.head())


['Frontend' 'Backend' 'Security' 'Documentation']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)


In [69]:
import csv
from collections import defaultdict

# Specify the path to your preprocessed CSV file
input_file = 'train.csv'

# Dictionary to store counts of each category
category_counts = defaultdict(int)

# Read the CSV file and count occurrences of each category
with open(input_file, 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        if len(row) == 2:  # Ensure the row has both report and category
            _, category = row
            category_counts[category] += 1

# Print the counts of each category
for category, count in category_counts.items():
    print(f"Category: {category}, Count: {count}")

Category: class_name, Count: 1
Category: Backend, Count: 8788
Category: Frontend, Count: 6792
Category: Security, Count: 3573
Category: Documentation, Count: 3148
Category: Performance, Count: 89


## Feature Exraction

In [70]:
num_features = 1000  # Number of features to select

def try_ngram_combinations(data, ngram_range):
    results = []

    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)

    # Fit and transform the data
    X_transformed = vectorizer.fit_transform(data)

    # Apply TruncatedSVD to reduce to 1000 dimensions
    # svd = TruncatedSVD(n_components=num_features)
    # X_transformed = svd.fit_transform(X_transformed)


    return X_transformed, vectorizer


data1, vectorizer1 = try_ngram_combinations(filtered_train_df['bug_description'], (1, 2))


In [71]:
print(data1.shape)

(22301, 61416)


In [72]:
# SVM model
model = SVC(C = 100)


print(f"Model trained using n-gram range: {vectorizer1.ngram_range}")

# Perform cross-validation to evaluate the model
scores = cross_val_score(model, data1, filtered_train_df['class_name'], cv=5)
print(f"Cross-Validation Scores: {scores}")

# Fit the model on the entire training data
model.fit(data1, filtered_train_df['class_name'])



Model trained using n-gram range: (1, 2)
Cross-Validation Scores: [0.92154226 0.90717489 0.89484305 0.85403587 0.91928251]


In [73]:
# Predict the class labels for the testing data
X_test_transformed = vectorizer1.transform(filtered_test_df['bug_description'])
y_pred = model.predict(X_test_transformed)

# Print the classification report
print(classification_report(filtered_test_df['class_name'], y_pred, target_names=filtered_test_df['class_name'].unique()))

# Print the confusion matrix
print(pd.crosstab(filtered_test_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))



               precision    recall  f1-score   support

     Frontend       0.94      0.94      0.94      1794
      Backend       1.00      0.95      0.97       449
     Security       0.90      0.93      0.92      1244
Documentation       0.98      0.96      0.97       704

     accuracy                           0.94      4191
    macro avg       0.96      0.95      0.95      4191
 weighted avg       0.94      0.94      0.94      4191

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           1679              1       105         9
Documentation       13            427         9         0
Frontend            79              0      1162         3
Security            11              0        15       678


In [74]:
# Print the accuracy
accuracy = model.score(X_test_transformed, filtered_test_df['class_name'])
print(f"Accuracy: {round(accuracy, 3) * 100}%")

Accuracy: 94.19999999999999%


In [75]:
# Save the model and the vectorizer
joblib.dump(model, 'svm_model.pkl')
joblib.dump(vectorizer1, 'vectorizer.pkl')
print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.


In [76]:
# Load the model and the vectorizer
loaded_model = joblib.load('svm_model.pkl')
loaded_vectorizer = joblib.load('vectorizer.pkl')

# Example new data for prediction
new_data = [
    "We have some problems in api and it slows down the system.",       # Backend
    "Manual guide of the installation is very bad.",           # Documentation
    "customer wants to add button on the main page to show products",   # Frontend
    "add warning when there is an error within the certificate"         # Security
    ]

# Transform the new data using the loaded vectorizer
new_data_transformed = loaded_vectorizer.transform(new_data)

# Predict the class label for the new data
new_pred = loaded_model.predict(new_data_transformed)

# Print the prediction
print(f"Predicted class for the new input: {new_pred}")


Predicted class for the new input: ['Backend' 'Documentation' 'Frontend' 'Security']


## ***SVM from scratch***

In [None]:

import numpy as np                  # for basic operations over arrays
from scipy.spatial import distance  # to compute the Gaussian kernel
import cvxopt                       # to solve the dual optimization problem
import copy                         # to copy numpy arrays
from scipy.sparse import csr_matrix

class SVM:
    linear = lambda x, xࠤ , c=0: x @ xࠤ .T
    polynomial = lambda x, xࠤ , Q=5: (1 + x @ xࠤ.T)**Q
    rbf = lambda x, xࠤ , γ=10: np.exp(-γ * distance.cdist(x, xࠤ,'sqeuclidean'))
    kernel_funs = {'linear': linear, 'polynomial': polynomial, 'rbf': rbf}

    def __init__(self, kernel='rbf', C=1, k=2):
        # set the hyperparameters
        self.kernel_str = kernel
        self.kernel = SVM.kernel_funs[kernel]
        self.C = C                  # regularization parameter
        self.k = k                  # kernel parameter

        # training data and support vectors
        self.X, y = None, None
        self.αs = None

        # for multi-class classification
        self.multiclass = False
        self.clfs = []

    def fit(self, X, y, eval_train=False):
      if len(np.unique(y)) > 2:
          self.multiclass = True
          return self.multi_fit(X, y, eval_train)

      # relabel if needed
      if set(np.unique(y)) == {0, 1}: y[y == 0] = -1


      # ensure y has dimensions Nx1
      self.y = y.reshape(-1, 1).astype(np.double) # Has to be a column vector



      self.X = X
      N = X.shape[0]

      # compute the kernel over all possible pairs of (x, x') in the data
      self.K = self.kernel(X, X, self.k)
      # For 1/2 x^T P x + q^T x
      P = cvxopt.matrix(self.y @ self.y.T * self.K)
      q = cvxopt.matrix(-np.ones((N, 1)))
      # For Ax = b
      A = cvxopt.matrix(self.y.T)
      b = cvxopt.matrix(np.zeros(1))
      # For Gx <= h
      G = cvxopt.matrix(np.vstack((-np.identity(N), np.identity(N))))
      h = cvxopt.matrix(np.vstack((np.zeros((N,1)), np.ones((N,1)) * self.C)))

      # Solve
      cvxopt.solvers.options['show_progress'] = True
      sol = cvxopt.solvers.qp(P, q, G, h, A, b)
      self.αs = np.array(sol["x"])

      # Maps into support vectors
      self.is_sv = ((self.αs > 1e-3) & (self.αs <= self.C)).squeeze()
      self.margin_sv = np.argmax((1e-3 < self.αs) & (self.αs < self.C - 1e-3))

      if eval_train:
        print(f"Finished training with accuracy {self.evaluate(X, y)}")

    def multi_fit(self, X, y, eval_train=False):
        self.k = len(np.unique(y))      # number of classes
        y = np.array(y)
        # for each pair of classes
        for i in range(self.k):
            # get the data for the pair
            Xs, Ys = X, copy.copy(y)

            # change the labels to -1 and 1
            Ys[Ys!=i], Ys[Ys==i] = -1, +1
            # fit the classifier
            clf = SVM(kernel=self.kernel_str, C=self.C, k=self.k)
            print('class : ,' , i)
            clf.fit(Xs, Ys)
            # save the classifier
            self.clfs.append(clf)
            print('Appended class : ' , i)
        if eval_train:
          print(f"Finished training with accuracy {self.evaluate(X, y)}")


    def predict(self, X_t):
        if self.multiclass: return self.multi_predict(X_t)
        xₛ, yₛ = self.X[self.margin_sv, np.newaxis], self.y[self.margin_sv]
        αs, y, X= self.αs[self.is_sv], self.y[self.is_sv], self.X[self.is_sv]

        b = yₛ - np.sum(αs * y * self.kernel(X, xₛ, self.k), axis=0)
        score = np.sum(αs * y * self.kernel(X, X_t, self.k), axis=0) + b
        return np.sign(score).astype(int), score

    def multi_predict(self, X):
        # get the predictions from all classifiers
        preds = np.zeros((X.shape[0], self.k))
        for i, clf in enumerate(self.clfs):
            _, preds[:, i] = clf.predict(X)

        # get the argmax and the corresponding score
        return np.argmax(preds, axis=1)

    def evaluate(self, X,y):
      outputs, _ = self.predict(X)
      accuracy = np.sum(outputs == y) / len(y)
      return round(accuracy, 2)


# Initialize the SVM model
model = SVM(C = 100)


# Fit the model on the entire training data
model.fit(data1, filtered_train_df['class_label'])

# Predict the class labels for the testing data
X_test_transformed = vectorizer1.transform(filtered_test_df['bug_description'])




class : , 0
     pcost       dcost       gap    pres   dres
 0:  3.6174e+06 -8.2146e+07  2e+08  6e-01  2e-11


In [None]:
# Apply TruncatedSVD to reduce to 1000 dimensions
svd = TruncatedSVD(n_components=num_features)
X_test_transformed = svd.fit_transform(X_test_transformed)
y_pred = model.predict(X_test_transformed)

In [None]:
filtered_test = np.array(filtered_test_df['class_label'])

# Print the classification report
# print(classification_report(filtered_test, y_pred, target_names=[0 ,1 ,2, 3]))

# Print the confusion matrix
# print(pd.crosstab(filtered_test_df['class_label'], y_pred, rownames=['Actual'], colnames=['Predicted']))

# print(X_test_transformed.shape)
# print(filtered_test_df['class_label'])

# Print the accuracy
# accuracy = model.score(X_test_transformed, filtered_test_df['class_label'])
# print(f"Accuracy: {accuracy}")

from sklearn.metrics import accuracy_score


# Calculate accuracy
accuracy = accuracy_score(filtered_test, y_pred)

# Print the accuracy
print(f"Accuracy: {accuracy}")