**Import Libraries**

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import top_k_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from collections import defaultdict, Counter
import math
from scipy.sparse import csr_matrix
import pickle

**Load The Dataset**

In [3]:
# Mount Google Drive at /content/drive so later cells can read the dataset from it
# (and write outputs to it). This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Location of the already-preprocessed dataset (CSV) on the mounted Google Drive
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'

# Parse the CSV into the working DataFrame used by every later cell
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Show the loaded DataFrame as the cell's output.
# For a frame this large the notebook renders a truncated view (leading and trailing rows
# plus the column headers), which is enough for a quick sanity check of the preprocessing.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [6]:
# Create the encoder and let it learn the set of distinct assignees.
# scikit-learn's LabelEncoder orders the unique values alphabetically (lexicographically
# for strings): the first assignee in that order becomes class 0, the second class 1, and so on.
label_encoder = LabelEncoder().fit(dataset['Assignee'])

# Add the integer class id of every row as a new 'Assignee_Class' column.
# The classifier is trained on this numeric target instead of the raw assignee string.
dataset['Assignee_Class'] = label_encoder.transform(dataset['Assignee'])

In [7]:
# Show the DataFrame again now that the 'Assignee_Class' column has been added.
# The notebook renders a truncated view (leading and trailing rows plus the column headers),
# which lets the reader check the assignee -> class id mapping at a glance.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,124
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,52
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,2118
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,1972
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,2282
...,...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa,278
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa,278
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa,278
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis,1685


**Show the number of unique classes**

In [8]:
# Count the distinct values in every column of the dataset.
# The count for 'Assignee_Class' is the number of target classes the classifier must separate.
dataset.nunique()

Summary_Stemmed      117103
processed_summary    117103
Assignee               2370
Assignee_Class         2370
dtype: int64

**Split the dataset**

In [9]:
# Dataset dimensions as (number of rows, number of columns)
print(dataset.shape)

# Number of rows per encoded assignee, most frequent class first.
# Comparing the top and bottom of this listing shows how imbalanced the classes are.
assignee_class_counts = dataset['Assignee_Class'].value_counts()
print(assignee_class_counts)

(117381, 4)
Assignee_Class
1014    2478
1408    2412
1009    1467
1643    1377
1013    1162
        ... 
1704       5
607        5
899        5
351        5
947        5
Name: count, Length: 2370, dtype: int64


In [10]:
# Splitter configuration:
# - n_splits=1      -> produce a single train/test partition
# - test_size=0.2   -> hold out 20% of the rows for testing
# - random_state=42 -> fixed seed so the partition is reproducible
sss = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=42)

# Stratify on 'Assignee_Class' so the train and test sets keep a similar class distribution.
# With n_splits=1 the splitter yields exactly one (train indices, test indices) pair, so it
# can be unpacked directly; the indices are positional.
(train_idx, test_idx), = sss.split(dataset, dataset['Assignee_Class'])

# Select each partition by position ('iloc') and renumber its rows from 0
# ('reset_index(drop=True)' discards the old index instead of keeping it as a column).
train_df, test_df = (
    dataset.iloc[rows].reset_index(drop=True) for rows in (train_idx, test_idx)
)

In [11]:
# Report the (rows, columns) of the training and test partitions, one per line
print(train_df.shape, test_df.shape, sep='\n')

(93904, 4)
(23477, 4)


**Separate Features And Labels**

In [12]:
# Model input is the cleaned summary text; the target is the encoded assignee id.
X_train = train_df['processed_summary']
y_train = train_df['Assignee_Class']

# Same two columns taken from the held-out partition
X_test = test_df['processed_summary']
y_test = test_df['Assignee_Class']

**Apply TF-IDF Transformation**

Training TF-IDF Sparse Matrix Shape: (93904, 43649)
Test TF-IDF Sparse Matrix Shape: (23477, 43649)


**Train SVM Classifier**

In [None]:
# Build a TF-IDF vectorizer.
# - ngram_range=(1, 2): extract both unigrams and bigrams as features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the training text and transform that text in one step.
# X_train_tfidf: TF-IDF matrix of the training data (rows = documents, columns = TF-IDF features).
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test text with the vectorizer fitted on the training data. It is deliberately
# not re-fitted, so the test matrix uses the training-derived features and weights.
# X_test_tfidf: TF-IDF matrix of the test data, with the same columns as X_train_tfidf.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# NOTE: the explanations above are '#' comments rather than bare triple-quoted strings.
# A string literal as the last statement of a cell is an expression, and the notebook
# would echo it as the cell's output.

**Initialize the model**

In [17]:
# Initialize the Support Vector Machine (SVM) classifier.
# - C=10:              penalty parameter C of the error term.
# - kernel='linear':   use a linear kernel.
# - probability=True:  enable predict_proba. The probabilities are calibrated with Platt scaling,
#                      which scikit-learn fits through an internal cross-validation, so every
#                      fit is considerably more expensive.
# - random_state=42:   seeds the data shuffling of that internal cross-validation, so the
#                      probability estimates are reproducible across runs (same seed as the
#                      train/test split).
svm_classifier = SVC(C=10, kernel='linear', probability=True, random_state=42)

# NOTE: documented with '#' comments instead of a trailing triple-quoted string, which the
# notebook would otherwise echo as this cell's output.

**Perform 5-fold cross-validation**

In [None]:
# Assess the SVM classifier on the training data with 5-fold cross-validation.
# cross_val_score clones and refits the classifier once per fold and returns one accuracy
# value per fold.
cv_scores = cross_val_score(svm_classifier, X_train_tfidf, y_train, cv=5, scoring='accuracy')

# Accuracy obtained on each individual fold
print("Cross-validation accuracies for each fold:", cv_scores)

# Mean accuracy across all folds, used as the overall performance figure
mean_cv_accuracy = cv_scores.mean()
print("Mean cross-validation accuracy:", mean_cv_accuracy)

# NOTE: the cell now ends with print() rather than a bare triple-quoted string; a trailing
# string literal is an expression and would be echoed as the cell's output.

**Perform 5-fold cross-validation with top-k accuracy**

In [None]:
# Number of highest-ranked assignees that count as a correct prediction
k = 5


def top_k_scorer(estimator, X, y):
    """Return the top-k accuracy of a fitted ``estimator`` on ``(X, y)``.

    Top-k accuracy needs one score per class for every sample, so the class
    probabilities from ``predict_proba`` are used. A scorer built with
    ``make_scorer(top_k_accuracy_score, k=k)`` would pass the hard labels
    returned by ``predict`` instead, which is not a valid ``y_score`` for a
    multiclass problem.

    ``labels=estimator.classes_`` tells the metric which class each probability
    column belongs to. Without it the metric derives the classes from ``y``
    alone and fails whenever a validation fold does not contain every class,
    which is likely with very rare classes.

    Parameters:
    - estimator: fitted classifier exposing ``predict_proba`` and ``classes_``.
    - X: feature matrix of the validation fold.
    - y: true class ids of the validation fold.

    Returns:
    - Fraction of samples whose true class is among the ``k`` most probable
      classes. ``k`` is read from the notebook-level variable at call time.
    """
    return top_k_accuracy_score(
        y, estimator.predict_proba(X), k=k, labels=estimator.classes_
    )


# Perform 5-fold cross-validation, scoring each fold with top-k accuracy.
# cross_val_score accepts any callable with the signature scorer(estimator, X, y).
cv_scores = cross_val_score(svm_classifier, X_train_tfidf, y_train, cv=5, scoring=top_k_scorer)

# Top-k accuracy obtained on each individual fold
print(f"Top-{k} cross-validation accuracies for each fold:", cv_scores)

# Mean top-k accuracy across all folds, used as the aggregate performance measure
mean_cv_accuracy = cv_scores.mean()
print(f"Mean top-{k} cross-validation accuracy:", mean_cv_accuracy)

