**Import Libraries**

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, top_k_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
import joblib
import numpy as np
from collections import defaultdict, Counter
import math
from scipy.sparse import csr_matrix
import pickle

**Load The Dataset**

In [3]:
# Mount Google Drive at /content/drive; the dataset CSV and the model
# checkpoint paths used by later cells all live under this mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the preprocessed dataset CSV from the mounted Drive into a DataFrame.
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

In [5]:
# Display the loaded DataFrame (rendered as a truncated head/tail preview).
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [6]:
# Create the encoder that maps each distinct assignee string to an integer id.
label_encoder = LabelEncoder()

# Fit on the whole 'Assignee' column and store the resulting integer labels in
# a new 'Assignee_Class' column (this adds the column to `dataset` in place).
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

In [7]:
# Display the DataFrame again to confirm the new 'Assignee_Class' column.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,124
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,52
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,2118
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,1972
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,2282
...,...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa,278
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa,278
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa,278
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis,1685


**Show the number of unique classes**

In [8]:
# Count the distinct values in each column of `dataset`; the 'Assignee' /
# 'Assignee_Class' counts give the number of target classes.
dataset.nunique()

Summary_Stemmed      117103
processed_summary    117103
Assignee               2370
Assignee_Class         2370
dtype: int64

**Split the dataset**

In [9]:
# Inspect the dataset size and how many samples each class has before splitting.
print(dataset.shape)  # Check the shape of the dataset
print(dataset['Assignee_Class'].value_counts())  # Check class distribution

(117381, 4)
Assignee_Class
1014    2478
1408    2412
1009    1467
1643    1377
1013    1162
        ... 
1704       5
607        5
899        5
351        5
947        5
Name: count, Length: 2370, dtype: int64


In [10]:
# Hold out 20% of the rows as a test set, stratified by class, with a fixed seed
sss = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) pair of positions
train_idx, test_idx = next(iter(sss.split(dataset, dataset['Assignee_Class'])))
# Materialize both partitions, each with a fresh 0..n-1 index
train_df, test_df = (
    dataset.iloc[positions].reset_index(drop=True)
    for positions in (train_idx, test_idx)
)

In [11]:
# Sanity-check the sizes of the two splits (no validation split is created).
print(train_df.shape)
print(test_df.shape)

(93904, 4)
(23477, 4)


**Separate Features And Labels**

In [12]:
# Model input is the cleaned summary text; the target is the encoded assignee id
X_train = train_df['processed_summary']
y_train = train_df['Assignee_Class']
X_test = test_df['processed_summary']
y_test = test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [13]:
# Term Frequency of a single tokenized document
def compute_tf(doc):
    """Return the relative frequency of every token in ``doc``.

    Args:
        doc: Sized collection of hashable tokens for one document.

    Returns:
        dict mapping each distinct token to ``count / len(doc)``; an empty
        dict when ``doc`` has no tokens.
    """
    n_tokens = len(doc)
    frequencies = {}
    for token, occurrences in Counter(doc).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

# Inverse Document Frequency over a tokenized corpus
def compute_idf(corpus):
    """Return ``log(N / df)`` for every term that occurs in ``corpus``.

    Args:
        corpus: Sized collection of tokenized documents (each an iterable of
            hashable tokens).

    Returns:
        dict mapping each term to ``math.log(len(corpus) / df)``, where ``df``
        is the number of documents containing the term at least once.
    """
    doc_freq = Counter()
    for tokens in corpus:
        # set() makes a term count once per document, however often it repeats
        doc_freq.update(set(tokens))

    n_docs = len(corpus)
    return {term: math.log(n_docs / df) for term, df in doc_freq.items()}

# TF-IDF weights for one document
def compute_tfidf(tf, idf, default_idf):
    """Scale each term frequency by its inverse document frequency.

    Args:
        tf: dict of term -> term frequency for one document.
        idf: dict of term -> IDF weight.
        default_idf: IDF weight used for terms missing from ``idf``.

    Returns:
        dict of term -> ``tf[term] * idf.get(term, default_idf)``.
    """
    weighted = {}
    for term in tf:
        weighted[term] = tf[term] * idf.get(term, default_idf)
    return weighted

def custom_tokenize(text):
    """Split ``text`` on whitespace and drop single-character tokens.

    Args:
        text: String to tokenize.

    Returns:
        list of the whitespace-separated tokens that are longer than one
        character, in their original order.
    """
    return [piece for piece in text.split() if len(piece) >= 2]

# Tokenize both splits with the custom tokenizer
X_train_tokens = list(map(custom_tokenize, X_train))
X_test_tokens = list(map(custom_tokenize, X_test))

# Fit the IDF weights on the training documents only
idf = compute_idf(X_train_tokens)
# Unseen words get the IDF of a term that occurs in a single training document
default_idf = math.log(len(X_train_tokens) / 1)

# Weight every training document
X_train_tfidf = [
    compute_tfidf(compute_tf(tokens), idf, default_idf)
    for tokens in X_train_tokens
]

# Weight the test documents with the same training-set IDF
X_test_tfidf = [
    compute_tfidf(compute_tf(tokens), idf, default_idf)
    for tokens in X_test_tokens
]

# Pack per-document TF-IDF dicts into one CSR matrix
def tfidf_to_sparse_matrix(tfidf_list, vocab, default_idx):
    """Convert a list of ``{word: weight}`` dicts into a sparse matrix.

    Args:
        tfidf_list: One dict of word -> TF-IDF weight per document (row).
        vocab: dict of word -> column index.
        default_idx: Column used for words that are not in ``vocab``.

    Returns:
        scipy ``csr_matrix`` of shape ``(len(tfidf_list), len(vocab) + 1)``.
        Weights that land on the same (row, column) pair are summed when the
        matrix is built.
    """
    row_ids, col_ids, values = [], [], []
    for doc_no, weights in enumerate(tfidf_list):
        row_ids.extend([doc_no] * len(weights))
        col_ids.extend(vocab.get(term, default_idx) for term in weights)
        values.extend(weights.values())
    n_rows, n_cols = len(tfidf_list), len(vocab) + 1
    return csr_matrix((values, (row_ids, col_ids)), shape=(n_rows, n_cols))

# Build the vocabulary in sorted order so every term gets the same column
# index on every run. Iterating idf.keys() directly inherits the order in which
# compute_idf walked set(doc), and str hashing is randomized per interpreter
# process, so an unsorted layout can differ between sessions and silently
# mismatch a model that was trained and saved in an earlier one.
vocab = {word: idx for idx, word in enumerate(sorted(idf))}

# Use an index for unseen words (one extra column after the known vocabulary)
default_idx = len(vocab)

# Convert the TF-IDF vectors to a sparse matrix
X_train_tfidf_matrix = tfidf_to_sparse_matrix(X_train_tfidf, vocab, default_idx)
X_test_tfidf_matrix = tfidf_to_sparse_matrix(X_test_tfidf, vocab, default_idx)

print("Training TF-IDF Sparse Matrix Shape:", X_train_tfidf_matrix.shape)
print("Test TF-IDF Sparse Matrix Shape:", X_test_tfidf_matrix.shape)

Training TF-IDF Sparse Matrix Shape: (93904, 43649)
Test TF-IDF Sparse Matrix Shape: (23477, 43649)


**Save the Computed IDF and Vocabulary**

In [14]:
# Save the IDF dictionary and vocabulary, together with the fallback IDF value
# and fallback column index used for unseen words.
# NOTE(review): the relative path writes to the current working directory, not
# to the mounted Drive — confirm the file is meant to outlive the session.
with open('custom_tfidf.pkl', 'wb') as f:
    pickle.dump({'idf': idf, 'vocab': vocab, 'default_idf': default_idf, 'default_idx': default_idx}, f)

**Load the Saved Data**

In [15]:
# Load the saved IDF dictionary and vocabulary and rebind the module-level
# names to the stored values.
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# this notebook produced itself.
with open('custom_tfidf.pkl', 'rb') as f:
    tfidf_data = pickle.load(f)

idf = tfidf_data['idf']
vocab = tfidf_data['vocab']
default_idf = tfidf_data['default_idf']
default_idx = tfidf_data['default_idx']


**Train SVM Classifier**

**Initialize the model**

In [17]:
# Initialize the SVM classifier.
# - probability=True enables predict_proba, which the top-k evaluation needs.
# - random_state pins the shuffling used by the internal probability
#   calibration, so predict_proba (and every top-k score) is repeatable.
svm_classifier = SVC(C=10, kernel='linear', probability=True, random_state=42)

**Perform 5-fold cross-validation**

In [None]:
# Perform 5-fold cross-validation on the training split.
# The estimator needs the numeric sparse matrix; X_train_tfidf is only the
# intermediate list of per-document {word: weight} dicts and cannot be fitted.
cv_scores = cross_val_score(svm_classifier, X_train_tfidf_matrix, y_train, cv=5, scoring='accuracy')

# Print cross-validation accuracy for each fold
print("Cross-validation accuracies for each fold:", cv_scores)

# Print mean cross-validation accuracy
mean_cv_accuracy = cv_scores.mean()
print("Mean cross-validation accuracy:", mean_cv_accuracy)

**Perform 5-fold cross-validation with top-k accuracy**

In [None]:
# Number of highest-ranked classes that count as a hit
k = 5


def top_k_scorer(estimator, X, y_true):
    """Score a fitted ``estimator`` by top-k accuracy on ``(X, y_true)``.

    cross_val_score calls a callable scorer as ``scorer(estimator, X, y)``.
    Ranking needs per-class scores, so this uses ``predict_proba`` rather than
    the hard labels from ``predict``. ``labels=estimator.classes_`` ties each
    probability column to its class id, which keeps the metric valid when a
    validation fold does not contain every class.

    Args:
        estimator: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X: Feature matrix of the validation fold.
        y_true: True class ids of the validation fold.

    Returns:
        float: fraction of samples whose true class is among the ``k`` classes
        with the highest predicted probability.
    """
    y_score = estimator.predict_proba(X)
    return top_k_accuracy_score(y_true, y_score, k=k, labels=estimator.classes_)


# Perform 5-fold cross-validation with top-k accuracy on the sparse TF-IDF
# matrix (X_train_tfidf is the list of dicts and is not a valid model input)
cv_scores = cross_val_score(svm_classifier, X_train_tfidf_matrix, y_train, cv=5, scoring=top_k_scorer)

# Print cross-validation top-k accuracies for each fold
print(f"Top-{k} cross-validation accuracies for each fold:", cv_scores)

# Print mean cross-validation top-k accuracy
mean_cv_accuracy = cv_scores.mean()
print(f"Mean top-{k} cross-validation accuracy:", mean_cv_accuracy)



**Save The SVM model**

In [None]:
# Fit the final model on the full training split before persisting it.
# cross_val_score only fits internal clones of the estimator, so without this
# call the object saved below would be unfitted and predict() would fail
# after it is loaded back.
svm_classifier.fit(X_train_tfidf_matrix, y_train)

# Save the fitted model to Google Drive
model_filename = '/content/drive/My Drive/checkpoints/svm_classifier_model_with_probability.joblib'
joblib.dump(svm_classifier, model_filename)

**Load The SVM model**

In [None]:
# Load the model from Google Drive, replacing the in-memory svm_classifier.
# NOTE(review): this path spells the folder 'MyDrive' while the save cell and
# the dataset path use 'My Drive' — presumably both resolve to the same Drive
# folder in Colab; confirm, otherwise the saved file will not be found.
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_with_probability.joblib'
svm_classifier = joblib.load(model_filename)

**Evaluate the Final Model on the Test Set**

In [None]:
# Predict labels on the test set. The model takes the sparse TF-IDF matrix;
# X_test_tfidf is the intermediate list of per-document dicts and is not a
# valid input for predict().
test_predictions = svm_classifier.predict(X_test_tfidf_matrix)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.25778421433743665


In [None]:
# Predict class probabilities on the test set from the sparse TF-IDF matrix
# (X_test_tfidf is the list of per-document dicts, not a model input).
# Columns follow the order of svm_classifier.classes_.
test_probabilities = svm_classifier.predict_proba(X_test_tfidf_matrix)

In [None]:
# Calculate top-3 accuracy.
# labels= ties each probability column to its class id, so the score is still
# computable if some class happens to be absent from y_test.
top_k = 3
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k, labels=svm_classifier.classes_)
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)

In [None]:
# Calculate top-5 accuracy.
# labels= ties each probability column to its class id, so the score is still
# computable if some class happens to be absent from y_test.
top_k = 5
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k, labels=svm_classifier.classes_)
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)

In [None]:
# Calculate top-10 accuracy.
# labels= ties each probability column to its class id, so the score is still
# computable if some class happens to be absent from y_test.
top_k = 10
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k, labels=svm_classifier.classes_)
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)