In [None]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict

import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Fetch the NLTK resources used by the preprocessing helpers below.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the word_tokenize tables from that package; on older releases the extra
# request only reports that the package is unknown and returns False.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# English stop words, kept as a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))


In [40]:
def convert_lower_case(data):
    """
        Returns the lowercase form of the input text.

        Parameters:
        -----------
        data : str
            The text to normalise. Any other value is converted with ``str()`` first.

        Returns:
        --------
        str
            ``str(data)`` with every cased character lowered.
    """
    as_text = str(data)
    return str.lower(as_text)

In [41]:
def remove_punctuation(data):
    """
        Replaces punctuation characters in the input text with spaces.

        The characters replaced are ``!"#$%&()*+-./:;<=>?@[\\]^_`{|}~`` and the
        newline character. The comma and the apostrophe are not in this set and
        are left untouched.
        NOTE(review): the comma is not removed anywhere in this pipeline -
        confirm whether that is intentional.

        Parameters:
        -----------
        data : str
            The input text data from which punctuation will be removed.
            Non-string values are converted with ``str()`` first.

        Returns:
        --------
        str
            The text data with punctuation replaced by spaces.
    """
    # The backslash is escaped explicitly: the previous literal relied on "\]",
    # which is an invalid escape sequence and triggers a warning on newer Pythons.
    symbols = '!"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n'

    # One translation pass replaces every listed character with a single space,
    # instead of one NumPy string call per character.
    to_space = str.maketrans(symbols, ' ' * len(symbols))
    return str(data).translate(to_space)


In [42]:
def remove_apostrophe(data):
    """
        Removes apostrophes from the input text data.

        Parameters:
        -----------
        data : str
            The input text data from which apostrophes will be removed.
            Non-string values are converted with ``str()`` first.

        Returns:
        --------
        str
            The text data with apostrophes removed.

    """
    # Return a real ``str`` (as documented and as remove_punctuation does)
    # rather than the 0-d NumPy array that np.char.replace produces.
    return str(data).replace("'", "")

In [43]:

def remove_numbers(data):
    """
        Strips every run of digits out of the input text.

        Parameters:
        -----------
        data : str
            The text to clean. Any other value is converted with ``str()`` first.

        Returns:
        --------
        str
            The text with all digit characters deleted (nothing is inserted in their place).
    """
    digit_run = re.compile(r'\d+')
    text = str(data)
    return digit_run.sub('', text)

In [44]:
def remove_single_characters(tokens):
    """
        Drops one-character tokens and joins the remaining ones into a string.

        Parameters:
        -----------
        tokens : list of str
            The tokens (words) to filter.

        Returns:
        --------
        str
            The tokens longer than one character, each preceded by a single
            space (so a non-empty result starts with a space); an empty string
            when no token qualifies.
    """
    return "".join(" " + token for token in tokens if len(token) > 1)

In [45]:
def lemmatization(data):
    """
        Tokenizes the input text, drops single-character tokens and lemmatizes
        the remaining words with NLTK's WordNetLemmatizer.

        Parameters:
        -----------
        data : str
            The input text data to be lemmatized.

        Returns:
        --------
        str
            The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)

    # Previously the filtered text was computed but then discarded, so the
    # single-character tokens were lemmatized and kept. Splitting the helper's
    # space-joined output gives back exactly the tokens that survived the filter.
    kept_tokens = remove_single_characters(tokens).split()

    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [46]:
def preprocess(data):
    """
        Runs the full text-cleaning pipeline on the input, in this order:
        lowercase conversion, punctuation removal, apostrophe removal,
        number removal and lemmatization.

        Parameters:
        -----------
        data : str
            The raw text to clean.

        Returns:
        --------
        str
            The text after every pipeline step has been applied.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_numbers,
        lemmatization,
    )
    # Each step receives the output of the previous one.
    for step in pipeline:
        data = step(data)
    return data

In [47]:
def remove_stop_words(data):
    """
        Filters stop words out of the input text.

        The text is tokenized with ``word_tokenize`` and every token found in
        the module-level ``stop_words`` set is dropped. The comparison is an
        exact match, so it is case-sensitive.

        Parameters:
        -----------
        data : str
            The text to filter.

        Returns:
        --------
        str
            The remaining tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(data):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Load the training and testing splits from their CSV files (train first, then test).
preprocessed_train_df, preprocessed_test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Preview the first 5 rows of each split.
for split_df in (preprocessed_train_df, preprocessed_test_df):
    print(split_df.head())

In [None]:
# For both splits: coerce every 'bug_description' value to a string, then strip
# the stop words from it.
for split_df in (preprocessed_train_df, preprocessed_test_df):
    split_df['bug_description'] = (
        split_df['bug_description'].apply(str).apply(remove_stop_words)
    )

# Show the first cleaned description of each split as a sanity check.
print( preprocessed_test_df['bug_description'][0] )
print( preprocessed_train_df['bug_description'][0] )


In [None]:
# Keep only the reports whose class_name is Frontend, Backend, Security or Documentation.
target_classes = ['Frontend', 'Backend', 'Security', 'Documentation']

# .copy() gives each filtered frame its own data, so columns can be added to it
# later without pandas warning about writing to a slice of the unfiltered frame.
filtered_train_df = preprocessed_train_df[
    preprocessed_train_df['class_name'].isin(target_classes)
].copy()

filtered_test_df = preprocessed_test_df[
    preprocessed_test_df['class_name'].isin(target_classes)
].copy()

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


In [None]:
# Define the mapping of class names to integer labels (this also fixes their order)
class_name_mapping = {
    'Frontend': 0,
    'Backend': 1,
    'Security': 2,
    'Documentation' : 3,
}

# Add the integer label and order the rows by it. .assign returns a new frame
# instead of writing a column into the filtered frame, which avoids pandas'
# SettingWithCopyWarning when that frame is a slice of a larger one.
filtered_train_df = (
    filtered_train_df
    .assign(class_label=filtered_train_df['class_name'].map(class_name_mapping))
    .sort_values(by=['class_label'])
)
filtered_test_df = (
    filtered_test_df
    .assign(class_label=filtered_test_df['class_name'].map(class_name_mapping))
    .sort_values(by=['class_label'])
)

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())


In [None]:


# Specify the path to your preprocessed CSV file
input_file = 'train.csv'

# Dictionary to store counts of each category
category_counts = defaultdict(int)

# Read the CSV file and count occurrences of each category
with open(input_file, 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)

    # The first row holds the column names (the same file is loaded with
    # pd.read_csv and accessed by column name), so skip it; otherwise the
    # header text itself is reported as a category with a count of 1.
    next(csv_reader, None)

    for row in csv_reader:
        if len(row) == 2:  # Ensure the row has both report and category
            category_counts[row[1]] += 1

# Print the counts of each category
for category, count in category_counts.items():
    print(f"Category: {category}, Count: {count}")

## Feature Extraction

In [53]:
def try_ngram_combinations(data, ngram_range):
    """
        Fits a TF-IDF vectorizer with the given n-gram range on the input
        documents and returns the transformed matrix together with the
        fitted vectorizer.

        Parameters:
        -----------
        data : iterable of str
            The documents to vectorize.

        ngram_range : tuple (min_n, max_n)
            The lower and upper boundary of the range of n-values for the
            n-grams to be extracted; passed straight to ``TfidfVectorizer``.

        Returns:
        --------
        X_transformed : sparse matrix of shape (n_samples, n_features)
            The result of ``vectorizer.fit_transform(data)``.

        vectorizer : TfidfVectorizer
            The fitted vectorizer, to be reused to transform other text with
            the same vocabulary and IDF values.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)

    # Learn the vocabulary / IDF weights and vectorize the documents in one pass.
    X_transformed = vectorizer.fit_transform(data)

    return X_transformed, vectorizer


# Vectorize the training bug descriptions with the (1, 2) n-gram range, i.e.
# unigrams and bigrams. data1 is the transformed training matrix; vectorizer1
# is the fitted vectorizer, reused later to transform other text.
data1, vectorizer1 = try_ngram_combinations(filtered_train_df['bug_description'], (1, 2))


## ***SVM from scratch***

In [54]:
import numpy as np
from scipy.spatial import distance  # to compute the Gaussian kernel
import cvxopt                       # to solve the dual optimization problem
import copy
from scipy.sparse import csr_matrix

def _as_dense(X):
    """Return ``X`` as a dense float ndarray, expanding SciPy sparse input first."""
    if hasattr(X, "toarray"):  # SciPy sparse matrices / arrays
        X = X.toarray()
    return np.asarray(X, dtype=np.double)


def _linear_kernel(x, x_dash, c=0):
    """Linear kernel ``x @ x_dash.T``; ``c`` only keeps the call signature uniform and is unused."""
    return x @ x_dash.T


def _polynomial_kernel(x, x_dash, Q=5):
    """Polynomial kernel ``(1 + x @ x_dash.T) ** Q``."""
    return (1 + x @ x_dash.T) ** Q


def _rbf_kernel(x, x_dash, gamma=10):
    """Gaussian kernel ``exp(-gamma * ||x - x_dash||^2)`` over all row pairs."""
    return np.exp(-gamma * distance.cdist(x, x_dash, 'sqeuclidean'))


class SVM:
    """
        Kernel support vector machine trained by solving the dual problem with
        ``cvxopt.solvers.qp``. Inputs with more than two distinct labels are
        handled one-vs-rest: one binary SVM is trained per class.

        Sparse feature matrices are converted to dense arrays before any
        computation, because the kernel and QP set-up below rely on NumPy
        element-wise arithmetic and indexing.

        The kernels are named module-level functions (not lambdas) so that a
        fitted model, which stores its kernel in ``self.kernel``, can be pickled.
    """

    # Kernels take (x, x_dash, hyperparameter); the hyperparameter passed is self.k.
    linear = staticmethod(_linear_kernel)
    polynomial = staticmethod(_polynomial_kernel)
    rbf = staticmethod(_rbf_kernel)
    kernel_functions = {'linear': _linear_kernel, 'polynomial': _polynomial_kernel, 'rbf': _rbf_kernel}

    # Multipliers at or below this value are treated as zero when picking support vectors.
    _ALPHA_TOL = 1e-3

    def __init__(self, kernel='linear', C=1, k=2):
        """
            Parameters:
            -----------
            kernel : str
                Key into ``SVM.kernel_functions`` ('linear', 'polynomial' or 'rbf').
                An unknown key raises ``KeyError``.
            C : float
                Upper bound on the dual variables (regularization parameter).
            k : float
                Kernel hyperparameter, passed as the third kernel argument
                (``Q`` for polynomial, ``gamma`` for rbf, ignored by linear).
        """
        # setting the hyperparameters
        self.kernel_str = kernel
        self.kernel = SVM.kernel_functions[kernel]
        self.C = C                  # regularization parameter
        self.k = k                  # kernel hyperparameter

        # training data and support vectors
        self.X, self.y = None, None
        self.alpha = None
        self.multiclass = False
        self.classifiers = []       # one binary SVM per class (multiclass only)
        self.classes_ = None        # sorted distinct labels (multiclass only)

    def fit(self, X, y, eval_train=False):
        """
            Trains the model on ``X`` (n_samples x n_features, dense or sparse)
            and labels ``y``. Returns None.

            With more than two distinct labels the work is delegated to
            ``multi_fit``. Otherwise the labels are expected to be {-1, +1};
            labels {0, 1} are relabelled to {-1, +1}. The caller's ``y`` is
            never modified. ``eval_train`` is accepted but not used.
        """
        y = np.array(y)  # private copy; also accepts lists and pandas Series

        if len(np.unique(y)) > 2:
            self.multiclass = True
            return self.multi_fit(X, y, eval_train)

        self.multiclass = False
        X = _as_dense(X)
        y = y.astype(np.double)

        # relabel if needed
        if set(np.unique(y)) == {0, 1}:
            y[y == 0] = -1

        # ensure y has dimensions Nx1 (has to be a column vector)
        self.y = y.reshape(-1, 1)
        self.X = X
        N = X.shape[0]

        # compute the kernel over all possible pairs of (x, x') in the data
        self.K = self.kernel(X, X, self.k)

        # For 1/2 x^T P x + q^T x  (P is the element-wise product y_i y_j K_ij)
        P = cvxopt.matrix((self.y @ self.y.T) * self.K)
        q = cvxopt.matrix(-np.ones((N, 1)))

        # For Ax = b
        A = cvxopt.matrix(self.y.T)
        b = cvxopt.matrix(np.zeros(1))

        # For Gx <= h  (encodes 0 <= alpha_i <= C)
        G = cvxopt.matrix(np.vstack((-np.identity(N), np.identity(N))))
        h = cvxopt.matrix(np.vstack((np.zeros((N, 1)), np.ones((N, 1)) * self.C)))

        # Solve
        cvxopt.solvers.options['show_progress'] = False
        sol = cvxopt.solvers.qp(P, q, G, h, A, b)
        self.alpha = np.array(sol["x"])

        # Boolean mask of the support vectors (non-negligible multiplier).
        tol = self._ALPHA_TOL
        self.isSupportVector = ((self.alpha > tol) & (self.alpha <= self.C)).ravel()

        # Row index of the first vector strictly inside (0, C); used for the bias.
        # NOTE: np.argmax yields 0 when no such vector exists.
        self.marginSupportVector = np.argmax((tol < self.alpha) & (self.alpha < self.C - tol))

    def multi_fit(self, X, y, eval_train=False):
        """
            Trains one binary SVM per distinct label in ``y`` (that label as +1,
            every other label as -1) and stores them in ``self.classifiers``,
            in the order of ``self.classes_``. ``eval_train`` is accepted but
            not used.
        """
        X = _as_dense(X)            # densify once; every sub-classifier reuses it
        y = np.array(y)

        self.multiclass = True
        self.classes_ = np.unique(y)
        self.classifiers = []       # start fresh so refitting does not accumulate models

        for label in self.classes_:
            # +1 for the current class, -1 for all the others
            Ys = np.where(y == label, 1, -1)

            # self.k stays the kernel hyperparameter and is handed down unchanged
            classifier = SVM(kernel=self.kernel_str, C=self.C, k=self.k)
            classifier.fit(X, Ys)

            self.classifiers.append(classifier)

    def predict(self, X_t):
        """
            Predicts for the rows of ``X_t`` (dense or sparse).

            Returns:
            --------
            Binary model: a tuple ``(labels, score)`` where ``score`` is the
            decision value per sample and ``labels`` is its sign as int.
            Multiclass model: the result of ``multi_predict`` (labels only).
        """
        if self.multiclass:
            return self.multi_predict(X_t)

        X_t = _as_dense(X_t)

        # Reference margin vector (kept 2-D for the kernel call) and its label.
        x_s, y_s = self.X[self.marginSupportVector, np.newaxis], self.y[self.marginSupportVector]
        alpha, y, X = self.alpha[self.isSupportVector], self.y[self.isSupportVector], self.X[self.isSupportVector]

        # Bias from the margin vector, then the decision values of the new samples.
        b = y_s - np.sum(alpha * y * self.kernel(X, x_s, self.k), axis=0)
        score = np.sum(alpha * y * self.kernel(X, X_t, self.k), axis=0) + b
        return np.sign(score).astype(int), score

    def multi_predict(self, X):
        """
            Scores ``X`` with every one-vs-rest classifier and returns, per
            sample, the label of the classifier with the highest score.
        """
        X = _as_dense(X)

        # one column of decision values per classifier
        preds = np.zeros((X.shape[0], len(self.classifiers)))
        for i, classifier in enumerate(self.classifiers):
            _, preds[:, i] = classifier.predict(X)

        best = np.argmax(preds, axis=1)

        # Map column indices back to the original labels; without recorded
        # labels the column index itself is returned.
        classes = getattr(self, 'classes_', None)
        return best if classes is None else classes[best]


# Initialize the SVM model with the default (linear) kernel and C = 100
model = SVM(C = 100)


# Fit the model on the TF-IDF features of the filtered training data and their integer class labels
model.fit(data1, filtered_train_df['class_label'])

# Vectorize the test descriptions with the vectorizer fitted on the training data
# (no prediction happens here; the prediction is made in a later cell)
X_test_transformed = vectorizer1.transform(filtered_test_df['bug_description'])

In [None]:
# Ground-truth labels and model predictions for the test data
true_labels = filtered_test_df['class_label']
predictions = model.predict(X_test_transformed)

# Accuracy, plus precision / recall / F1 averaged with the 'weighted' scheme
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1 = (
    metric(true_labels, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report every metric with four decimals
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Save the model and the vectorizer
# Persist the trained model and the fitted vectorizer to files in the current
# working directory, so predictions can be made later without retraining.
joblib.dump(model, 'svm_model.pkl')
joblib.dump(vectorizer1, 'vectorizer.pkl')
print("Model and vectorizer saved successfully.")

In [None]:
# Load the model and the vectorizer from the files written by the previous cell.
# NOTE: joblib.load unpickles the file contents, which can execute arbitrary
# code - only load files that come from a trusted source.
loaded_model = joblib.load('svm_model.pkl')
loaded_vectorizer = joblib.load('vectorizer.pkl')

# Example new data for prediction (the trailing comment on each line is the expected class)
new_data = [
    "We have some problems in api and it slows down the system.",       # Backend
    "Manual guide of the installation is very bad.",           # Documentation
    "customer wants to add button on the main page to show products",   # Frontend
    "add warning when there is an error within the certificate"         # Security
    ]

# Transform the new data using the loaded vectorizer
new_data_transformed = loaded_vectorizer.transform(new_data)

# Predict the class label for the new data
new_pred = loaded_model.predict(new_data_transformed)

# Print the predictions (the model's output for each input, in input order)
print(f"Predicted class for the new input: {new_pred}")
