**Import Libraries**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict, Counter
import math
from scipy.sparse import csr_matrix

**Load The Dataset**

In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the preprocessed CSV from the mounted Drive and keep only its first MAX_ROWS rows
MAX_ROWS = 50000
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path).head(MAX_ROWS)

In [None]:
# Preview the loaded frame (bare last expression -> rich notebook display)
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
49995,"['css', 'setter', 'css', 'runtim', 'notat', 'a...",css setter css runtim notat api dont take cons...,mdt-papyrus-inbox
49996,"['ocl', 'support', 'calcul', 'tag', 'valu', 'd...",ocl support calcul tag valu deriv properti def...,mdt-papyrus-inbox
49997,"['layer', 'layer', 'explor', 'isnt', 'refresh'...",layer layer explor isnt refresh layersstack at...,mdt-papyrus-inbox
49998,"['cssappear', 'style', 'changeevolut', 'need',...",cssappear style changeevolut need propag relat...,mdt-papyrus-inbox


**Filter rare assignees and encode the labels**



In [None]:
# Count how many rows each value of the 'Assignee' column has
value_counts = dataset['Assignee'].value_counts()

# Keep only rows whose assignee occurs at least MIN_ISSUES_PER_ASSIGNEE times.
# .copy() turns the filtered slice into an independent frame, so adding columns
# to it afterwards does not raise pandas' SettingWithCopyWarning.
MIN_ISSUES_PER_ASSIGNEE = 10
frequent_assignees = value_counts[value_counts >= MIN_ISSUES_PER_ASSIGNEE].index
dataset = dataset[dataset['Assignee'].isin(frequent_assignees)].copy()

In [None]:
# Encoder that maps every distinct assignee string to an integer class id
label_encoder = LabelEncoder()

# Add the integer labels as a new 'Assignee_Class' column.
# DataFrame.assign returns a new frame instead of writing into `dataset`, which may
# be a filtered slice of the original frame (avoids SettingWithCopyWarning).
dataset = dataset.assign(
    Assignee_Class=label_encoder.fit_transform(dataset['Assignee'])
)

In [None]:
# Preview the frame again, now including the 'Assignee_Class' column
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,58
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,18
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,899
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,840
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,959
...,...,...,...,...
49995,"['css', 'setter', 'css', 'runtim', 'notat', 'a...",css setter css runtim notat api dont take cons...,mdt-papyrus-inbox,587
49996,"['ocl', 'support', 'calcul', 'tag', 'valu', 'd...",ocl support calcul tag valu deriv properti def...,mdt-papyrus-inbox,587
49997,"['layer', 'layer', 'explor', 'isnt', 'refresh'...",layer layer explor isnt refresh layersstack at...,mdt-papyrus-inbox,587
49998,"['cssappear', 'style', 'changeevolut', 'need',...",cssappear style changeevolut need propag relat...,mdt-papyrus-inbox,587


In [None]:
# Count the number of distinct values in each column of `dataset`
dataset.nunique()

Summary_Stemmed      46953
processed_summary    46953
Assignee              1000
Assignee_Class        1000
dtype: int64

**Split the dataset**

In [None]:
# Sanity-check the data that is about to be split
print(dataset.shape)  # (number of rows, number of columns)
print(dataset['Assignee_Class'].value_counts())  # rows per encoded assignee class

(46989, 4)
Assignee_Class
690    1377
280     851
732     845
87      470
587     436
       ... 
256      10
183      10
485      10
187      10
490      10
Name: count, Length: 1000, dtype: int64


In [None]:
# Stratified 80/20 split on 'Assignee_Class'; a single split with a fixed seed
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(dataset, dataset['Assignee_Class']))

# Materialise both partitions with a fresh 0..n-1 index
train_df, test_df = (
    dataset.iloc[rows].reset_index(drop=True) for rows in (train_idx, test_idx)
)

In [None]:
# Second stratified 80/20 split, this time carving a validation set out of train_df
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, validation) index pair
train_idx, val_idx = next(sss_val.split(train_df, train_df['Assignee_Class']))
final_train_df = train_df.iloc[train_idx].reset_index(drop=True)
val_df = train_df.iloc[val_idx].reset_index(drop=True)


In [None]:
# Report the (rows, columns) of the final train, validation and test partitions
print(final_train_df.shape)
print(val_df.shape)
print(test_df.shape)

(30072, 4)
(7519, 4)
(9398, 4)


In [None]:
# Features are the 'processed_summary' text, labels the encoded 'Assignee_Class'
X_train = final_train_df['processed_summary']
y_train = final_train_df['Assignee_Class']

X_val = val_df['processed_summary']
y_val = val_df['Assignee_Class']

X_test = test_df['processed_summary']
y_test = test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [None]:
def compute_tf(doc):
    """Return the relative term frequency of every token in one document.

    Args:
        doc: sequence of tokens making up the document.

    Returns:
        dict mapping each distinct token to (occurrences in doc) / len(doc).
        An empty document yields an empty dict.
    """
    n_terms = len(doc)
    frequencies = {}
    for term, occurrences in Counter(doc).items():
        frequencies[term] = occurrences / n_terms
    return frequencies

def compute_idf(corpus):
    """Return the inverse document frequency of every term in a corpus.

    Args:
        corpus: sequence of documents, each a sequence of tokens.

    Returns:
        dict mapping each term to log(number of documents / number of documents
        containing the term), using the natural logarithm.
    """
    # Document frequency: each document counts a term at most once
    doc_freq = Counter()
    for doc in corpus:
        doc_freq.update(set(doc))

    n_docs = len(corpus)
    return {term: math.log(n_docs / df) for term, df in doc_freq.items()}

def compute_tfidf(tf, idf, default_idf):
    """Weight one document's term frequencies by inverse document frequency.

    Args:
        tf: dict term -> term frequency for a single document.
        idf: dict term -> inverse document frequency.
        default_idf: IDF value used for terms that are missing from `idf`.

    Returns:
        dict term -> tf * idf, with one entry per term in `tf`.
    """
    weighted = {}
    for term, freq in tf.items():
        weighted[term] = freq * idf.get(term, default_idf)
    return weighted

def custom_tokenize(text):
    """Split a text on whitespace and drop single-character tokens.

    Args:
        text: the text to tokenize. None and NaN (which pandas.read_csv produces
            for empty cells) are treated as an empty document; any other
            non-string value is converted with str() first.

    Returns:
        list of tokens that are longer than one character.
    """
    # Missing summaries arrive as None / float NaN, which have no .split()
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []
    return [token for token in str(text).split() if len(token) > 1]

# Tokenize every split with the same tokenizer
X_train_tokens, X_val_tokens, X_test_tokens = (
    [custom_tokenize(doc) for doc in split] for split in (X_train, X_val, X_test)
)

# IDF statistics come from the training split only
idf = compute_idf(X_train_tokens)
# Terms never seen in training get the IDF of a term that occurs in a single document
default_idf = math.log(len(X_train_tokens) / 1)

# TF-IDF dict per document; validation and test reuse the training IDF table
X_train_tfidf, X_val_tfidf, X_test_tfidf = (
    [compute_tfidf(compute_tf(doc), idf, default_idf) for doc in tokens]
    for tokens in (X_train_tokens, X_val_tokens, X_test_tokens)
)

def tfidf_to_sparse_matrix(tfidf_list, vocab, default_idx):
    """Pack per-document TF-IDF dicts into a CSR matrix.

    Args:
        tfidf_list: list of dicts term -> TF-IDF weight, one dict per document.
        vocab: dict term -> column index.
        default_idx: column used for every term that is not in `vocab`.

    Returns:
        scipy.sparse.csr_matrix of shape (len(tfidf_list), len(vocab) + 1).
        Weights that land on the same (row, column) pair are summed.
    """
    row_ids, col_ids, weights = [], [], []
    for doc_no, doc_weights in enumerate(tfidf_list):
        for term in doc_weights:
            row_ids.append(doc_no)
            col_ids.append(vocab.get(term, default_idx))
            weights.append(doc_weights[term])

    n_rows, n_cols = len(tfidf_list), len(vocab) + 1
    return csr_matrix((weights, (row_ids, col_ids)), shape=(n_rows, n_cols))

# Give every training-set term a column index, in the iteration order of `idf`
vocab = dict(zip(idf.keys(), range(len(idf))))

# One extra column after the vocabulary is shared by all unseen terms
default_idx = len(vocab)

# Pack the TF-IDF dicts of each split into a sparse matrix
X_train_tfidf_matrix, X_val_tfidf_matrix, X_test_tfidf_matrix = (
    tfidf_to_sparse_matrix(split, vocab, default_idx)
    for split in (X_train_tfidf, X_val_tfidf, X_test_tfidf)
)

print("Training TF-IDF Sparse Matrix Shape:", X_train_tfidf_matrix.shape)
print("Validation TF-IDF Sparse Matrix Shape:", X_val_tfidf_matrix.shape)
print("Test TF-IDF Sparse Matrix Shape:", X_test_tfidf_matrix.shape)

Training TF-IDF Sparse Matrix Shape: (75123, 37433)
Validation TF-IDF Sparse Matrix Shape: (18781, 37433)
Test TF-IDF Sparse Matrix Shape: (23477, 37433)


**SVM from Scratch**

In [None]:
import numpy as np                  # for basic operations over arrays
from scipy.spatial import distance  # to compute the Gaussian kernel
import cvxopt                       # to solve the dual optimization problem
import copy                         # to copy numpy arrays
from scipy.sparse import csr_matrix

class SVM:
    """Soft-margin kernel SVM trained by solving the dual QP with cvxopt.

    Binary problems are solved directly. When `fit` sees more than two distinct
    labels it trains one one-vs-rest binary SVM per class and predicts the class
    whose classifier gives the highest score.

    Inputs may be dense arrays or SciPy sparse matrices; sparse inputs are
    converted to dense arrays, so memory grows with n_samples * n_features.
    """

    # Kernels: take two sample matrices (one sample per row) plus one kernel
    # parameter and return the matrix of kernel values between their rows.
    linear = lambda x, x_other, c=0: x @ x_other.T
    polynomial = lambda x, x_other, Q=5: (1 + x @ x_other.T)**Q
    rbf = lambda x, x_other, γ=10: np.exp(-γ * distance.cdist(x, x_other, 'sqeuclidean'))
    kernel_funs = {'linear': linear, 'polynomial': polynomial, 'rbf': rbf}

    def __init__(self, kernel='rbf', C=1, k=2):
        """Store the hyperparameters; no training happens here.

        Args:
            kernel: key of `SVM.kernel_funs` ('linear', 'polynomial' or 'rbf').
            C: regularization parameter (upper bound on every dual variable).
            k: value passed as the kernel's third argument (ignored by 'linear',
                the exponent Q for 'polynomial', γ for 'rbf').

        Raises:
            KeyError: if `kernel` is not a key of `SVM.kernel_funs`.
        """
        # set the hyperparameters
        self.kernel_str = kernel
        self.kernel = SVM.kernel_funs[kernel]
        self.C = C                  # regularization parameter
        self.k = k                  # kernel parameter

        # training data and dual variables (set by fit)
        self.X, self.y = None, None
        self.αs = None

        # for multi-class classification
        self.multiclass = False
        self.clfs = []              # one binary SVM per class, in classes_ order
        self.classes_ = None        # sorted distinct labels seen by multi_fit

    @staticmethod
    def _to_dense(X):
        """Return X as a dense float array, densifying SciPy sparse matrices."""
        if hasattr(X, 'toarray'):
            X = X.toarray()
        return np.asarray(X, dtype=np.double)

    def fit(self, X, y, eval_train=False):
        """Train on samples X with labels y.

        Args:
            X: (N, d) dense array or SciPy sparse matrix.
            y: N labels. More than two distinct values triggers one-vs-rest
                training; binary labels {0, 1} are relabelled to {-1, +1}
                internally. The caller's array is never modified.
            eval_train: if True, print the accuracy on the training data.
        """
        labels = np.asarray(y)
        if len(np.unique(labels)) > 2:
            self.multiclass = True
            return self.multi_fit(X, labels, eval_train)
        self.multiclass = False

        # astype() copies, so the relabelling below cannot touch the caller's y
        labels = labels.astype(np.double)
        if set(np.unique(labels)) == {0, 1}:
            labels[labels == 0] = -1

        # y has to be an Nx1 column vector for the products below
        self.y = labels.reshape(-1, 1)
        X = self._to_dense(X)
        self.X = X
        N = X.shape[0]

        # compute the kernel over all possible pairs of (x, x') in the data
        self.K = self.kernel(X, X, self.k)
        # For 1/2 x^T P x + q^T x
        P = cvxopt.matrix(self.y @ self.y.T * self.K)
        q = cvxopt.matrix(-np.ones((N, 1)))
        # For Ax = b
        A = cvxopt.matrix(self.y.T)
        b = cvxopt.matrix(np.zeros(1))
        # For Gx <= h  (stacks -α <= 0 on top of α <= C)
        G = cvxopt.matrix(np.vstack((-np.identity(N), np.identity(N))))
        h = cvxopt.matrix(np.vstack((np.zeros((N, 1)), np.ones((N, 1)) * self.C)))

        # Solve
        cvxopt.solvers.options['show_progress'] = True
        sol = cvxopt.solvers.qp(P, q, G, h, A, b)
        self.αs = np.array(sol["x"])

        # Support vectors: samples whose dual variable is clearly above zero
        self.is_sv = ((self.αs > 1e-3) & (self.αs <= self.C)).ravel()
        # Index of the first sample with 0 < α < C, used by predict to compute
        # the bias. np.argmax returns 0 when no sample satisfies the condition.
        self.margin_sv = np.argmax((1e-3 < self.αs) & (self.αs < self.C - 1e-3))

        if eval_train:
            print(f"Finished training with accuracy {self.evaluate(X, self.y.ravel())}")

    def multi_fit(self, X, y, eval_train=False):
        """Train one one-vs-rest binary SVM per distinct label in y.

        The kernel parameter `self.k` is left untouched and handed unchanged to
        every binary classifier. Classifiers from a previous fit are discarded.

        Args:
            X: (N, d) dense array or SciPy sparse matrix.
            y: N class labels.
            eval_train: if True, print the accuracy on the training data.
        """
        X = self._to_dense(X)       # convert once instead of once per class
        y = np.asarray(y)
        self.multiclass = True
        self.classes_ = np.unique(y)
        self.clfs = []

        for label in self.classes_:
            # current class -> +1, every other class -> -1
            Ys = np.where(y == label, 1, -1)
            # fit the classifier
            clf = SVM(kernel=self.kernel_str, C=self.C, k=self.k)
            print('class : ,' , label)
            clf.fit(X, Ys)
            # save the classifier
            self.clfs.append(clf)
            print('Appended class : ' , label)
        if eval_train:
            print(f"Finished training with accuracy {self.evaluate(X, y)}")

    def predict(self, X_t):
        """Predict labels for the rows of X_t.

        Returns:
            Binary model: tuple (labels, score), where labels is the sign of the
            decision score as an int array and score is the raw decision value.
            Multi-class model: array of predicted class labels only.
        """
        if self.multiclass:
            return self.multi_predict(X_t)
        X_t = self._to_dense(X_t)

        # one margin support vector, kept 2-D so the kernel sees a (1, d) matrix
        x_margin, y_margin = self.X[[self.margin_sv]], self.y[self.margin_sv]
        alphas, y_sv, X_sv = self.αs[self.is_sv], self.y[self.is_sv], self.X[self.is_sv]

        # bias from the margin support vector, then the decision score per sample
        b = y_margin - np.sum(alphas * y_sv * self.kernel(X_sv, x_margin, self.k), axis=0)
        score = np.sum(alphas * y_sv * self.kernel(X_sv, X_t, self.k), axis=0) + b
        return np.sign(score).astype(int), score

    def multi_predict(self, X):
        """Return, for each row of X, the class whose binary SVM scores highest."""
        X = self._to_dense(X)
        # column i holds the decision scores of the i-th one-vs-rest classifier
        preds = np.zeros((X.shape[0], len(self.clfs)))
        for i, clf in enumerate(self.clfs):
            _, preds[:, i] = clf.predict(X)

        winners = np.argmax(preds, axis=1)
        if self.classes_ is None:
            return winners
        # map classifier positions back to the original labels
        return np.asarray(self.classes_)[winners]

    def evaluate(self, X, y):
        """Return the accuracy of the model on (X, y), rounded to two decimals."""
        result = self.predict(X)
        # binary predict returns (labels, score); multi-class returns labels only
        outputs = result if self.multiclass else result[0]
        y = np.asarray(y).ravel()
        accuracy = np.sum(outputs == y) / len(y)
        return round(accuracy, 2)

**Train an SVM Classifier**

**Evaluate the Classifier on the Validation Set**

**Test the Final Model on the Test Set**