**Import Libraries**

In [4]:
# Import necessary libraries
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.preprocessing import LabelEncoder  # LabelEncoder for encoding categorical target labels
from sklearn.svm import SVC # SVC (Support Vector Classifier) for SVM classification
from sklearn.metrics import accuracy_score, top_k_accuracy_score # Metrics for evaluating model performance
from sklearn.model_selection import StratifiedShuffleSplit # StratifiedShuffleSplit for train-test splitting
import joblib # Joblib for saving and loading models
import tf_idf_utils # Importing the custom module tf_idf_utils
import math # Importing the math module for mathematical functions  

**Load The Dataset**

In [5]:
# Mount Google Drive to access files and save outputs
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Specify the file path in Google Drive and load the dataset after preprocessing
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

In [7]:
# Display the DataFrame 'dataset'
# This will print the first and last 5 rows of the DataFrame along with the column names and index
# Useful for a quick overview of the data after processing
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [8]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the LabelEncoder to the 'Assignee' column and transform it to numeric labels
# This step converts categorical labels in the 'Assignee' column to numeric labels,
# which is necessary for training machine learning models.
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

# The LabelEncoder in scikit-learn assigns numeric labels to the unique categories in alphabetical order
# (or lexicographical order for strings).
# This means that the first unique category in alphabetical order is labeled as 0, the second as 1, and so on.

In [9]:
# Display the DataFrame 'dataset'
# This will print the first and last 5 rows of the DataFrame along with the column names and index
# Useful for a quick overview of the data after labeling
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,124
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,52
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,2118
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,1972
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,2282
...,...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa,278
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa,278
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa,278
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis,1685


**Train only on 20 classes of the data**

In [10]:
dataset= dataset[dataset['Assignee_Class']<20]

In [11]:
# Display the number of unique values in each column of the dataset
dataset.nunique()

Summary_Stemmed      2747
processed_summary    2747
Assignee               20
Assignee_Class         20
dtype: int64

**Split the dataset**

In [12]:
# Print the shape of the dataset
print(dataset.shape)
# This line prints the shape of the dataset, which includes the number of rows and columns.
# It helps to understand the dimensions of the dataset.

# Print the class distribution of 'Assignee_Class'
print(dataset['Assignee_Class'].value_counts())
# This line prints the count of each unique value in the 'Assignee_Class' column.
# It provides insight into the distribution of classes, which is useful for understanding class imbalance.

(2748, 4)
Assignee_Class
11    779
0     641
19    560
1     251
16    115
13    113
8      80
5      42
4      40
7      31
17     24
18     15
12     13
15      9
6       7
3       7
10      6
14      5
2       5
9       5
Name: count, dtype: int64


In [13]:
# Create the StratifiedShuffleSplit object
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# This line creates a StratifiedShuffleSplit object with the following parameters:
# - n_splits=1: Specifies that there will be only one split.
# - test_size=0.2: Indicates that 20% of the dataset will be used as the test set.
# - random_state=42: Ensures reproducibility by using a fixed seed for the random number generator.

# Split the dataset into train and test sets
train_idx, test_idx = next(sss.split(dataset, dataset['Assignee_Class']))

# This line performs the split based on the 'Assignee_Class' column to ensure that the train and test sets
# have a similar class distribution. 'sss.split' returns the indices of the train and test samples.

# Create the training DataFrame
train_df = dataset.iloc[train_idx].reset_index(drop=True)

# This line creates a training DataFrame using the indices obtained from the split.
# 'iloc' is used to select the rows corresponding to the train indices.
# 'reset_index(drop=True)' resets the index of the training DataFrame.

# Create the test DataFrame
test_df = dataset.iloc[test_idx].reset_index(drop=True)

# This line creates a test DataFrame using the indices obtained from the split.
# 'iloc' is used to select the rows corresponding to the test indices.
# 'reset_index(drop=True)' resets the index of the test DataFrame.

In [14]:
# Output the shapes of hathe resulting DataFrames
print(train_df.shape)
print(test_df.shape)

(2198, 4)
(550, 4)


In [15]:
# Separate features (X) and labels (y) for train, validation, and test sets
X_train, y_train = train_df['processed_summary'], train_df['Assignee_Class']
X_test, y_test = test_df['processed_summary'], test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [16]:
# Tokenize the dataset using the custom tokenizer from tf_idf_utils module
X_train_tokens = [tf_idf_utils.custom_tokenize(doc) for doc in X_train]
X_test_tokens = [tf_idf_utils.custom_tokenize(doc) for doc in X_test]

# Compute IDF (Inverse Document Frequency) using the training data tokens
idf = tf_idf_utils.compute_idf(X_train_tokens)
default_idf = math.log(len(X_train_tokens) / 1)  # Default IDF for unseen words

# Compute TF-IDF (Term Frequency-Inverse Document Frequency) for each document in the training set
X_train_tfidf = [tf_idf_utils.compute_tfidf(tf_idf_utils.compute_tf(doc), idf, default_idf) for doc in X_train_tokens]

# Transform test sets using the IDF from the training set
X_test_tfidf = [tf_idf_utils.compute_tfidf(tf_idf_utils.compute_tf(doc), idf, default_idf) for doc in X_test_tokens]
"""
When transforming test data (X_test) into TF-IDF vectors, 
it's essential to maintain the same scale and feature representation as the training data (X_train). 
Using IDF values from X_train ensures that the test set is processed in the same context as the training set, 
preventing any bias or information leakage that could occur if test set statistics were used.
"""

# Build vocabulary using unique words from IDF dictionary keys
vocab = {word: idx for idx, word in enumerate(idf.keys())}

# Use an index for unseen words in the vocabulary
default_idx = len(vocab)

# Convert the TF-IDF vectors to a sparse matrix format
X_train_tfidf_matrix = tf_idf_utils.tfidf_to_sparse_matrix(X_train_tfidf, vocab, default_idx)
X_test_tfidf_matrix = tf_idf_utils.tfidf_to_sparse_matrix(X_test_tfidf, vocab, default_idx)

# Print the shapes of the TF-IDF sparse matrices
print("Training TF-IDF Sparse Matrix Shape:", X_train_tfidf_matrix.shape)
print("Test TF-IDF Sparse Matrix Shape:", X_test_tfidf_matrix.shape)

"""
Purpose: Tokenizes text data, computes TF-IDF values, builds vocabulary, and converts TF-IDF vectors to sparse matrices.

Uses the custom_tokenize function from the tf_idf_utils module to tokenize each document in X_train and X_test.
Calculates IDF using the compute_idf function from tf_idf_utils based on X_train_tokens.
Computes TF-IDF for each document in X_train_tokens and X_test_tokens using compute_tf and compute_tfidf functions from tf_idf_utils.
Constructs a vocabulary (vocab) mapping each word to its index based on the IDF keys.
Assigns default_idx to the length of vocab to handle unseen words during matrix conversion.
Converts TF-IDF vectors (X_train_tfidf and X_test_tfidf) to sparse matrices (X_train_tfidf_matrix and X_test_tfidf_matrix) using tfidf_to_sparse_matrix function from tf_idf_utils.
Prints the shapes of the resulting sparse matrices (X_train_tfidf_matrix and X_test_tfidf_matrix).
"""

Training TF-IDF Sparse Matrix Shape: (2198, 15888)
Test TF-IDF Sparse Matrix Shape: (550, 15888)


'\nPurpose: Tokenizes text data, computes TF-IDF values, builds vocabulary, and converts TF-IDF vectors to sparse matrices.\n\nUses the custom_tokenize function from the tf_idf_utils module to tokenize each document in X_train and X_test.\nCalculates IDF using the compute_idf function from tf_idf_utils based on X_train_tokens.\nComputes TF-IDF for each document in X_train_tokens and X_test_tokens using compute_tf and compute_tfidf functions from tf_idf_utils.\nConstructs a vocabulary (vocab) mapping each word to its index based on the IDF keys.\nAssigns default_idx to the length of vocab to handle unseen words during matrix conversion.\nConverts TF-IDF vectors (X_train_tfidf and X_test_tfidf) to sparse matrices (X_train_tfidf_matrix and X_test_tfidf_matrix) using tfidf_to_sparse_matrix function from tf_idf_utils.\nPrints the shapes of the resulting sparse matrices (X_train_tfidf_matrix and X_test_tfidf_matrix).\n'

In [25]:
# Save the components
joblib.dump((idf, default_idf, vocab, default_idx), '/content/drive/MyDrive/checkpoints/custom_tfidf_vectorizer_final.pkl')

['/content/drive/MyDrive/checkpoints/custom_tfidf_vectorizer_final.pkl']

**Train an SVM Classifier**

In [17]:
# Initialize the SVM classifier
svm_classifier = SVC(C=10, kernel='linear',gamma = 'scale', class_weight = 'balanced' ,probability=True)
"""
Purpose:
- Initializes a Support Vector Machine (SVM) classifier with the specified parameters.

Parameters:
- C=10: Penalty parameter C of the error term.
- kernel='linear': Specifies the linear kernel for the SVM.
- probability=True: Enables probability estimation, necessary for Platt scaling.

Outputs:
- svm_classifier: Initialized SVM classifier object configured with specified parameters.
"""

# Train the classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf_matrix, y_train)
"""
Purpose:
- Trains the SVM classifier on the TF-IDF transformed training data.

Parameters:
- X_train_tfidf_matrix: Training data features transformed into TF-IDF format.
- y_train: Training data labels.

Outputs:
- Trained svm_classifier: SVM classifier fitted to the training data.
"""

# Save the trained model to Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_with_tf_idf_implementation_final.joblib'
joblib.dump(svm_classifier, model_filename)
"""
Purpose:
- Saves the trained SVM classifier model to a specified file location on Google Drive.

Parameters:
- svm_classifier: Trained SVM classifier object.
- model_filename: File path where the trained model will be saved.

Outputs:
- Saved model file: Persists the trained SVM classifier to the specified file location.
"""

'\nPurpose:\n- Saves the trained SVM classifier model to a specified file location on Google Drive.\n\nParameters:\n- svm_classifier: Trained SVM classifier object.\n- model_filename: File path where the trained model will be saved.\n\nOutputs:\n- Saved model file: Persists the trained SVM classifier to the specified file location.\n'

In [None]:
# Load the model from Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_with_tf_idf_implementation_final.joblib'
svm_classifier = joblib.load(model_filename)
"""
Purpose: Loads a pre-trained SVM classifier model from a specified file path in Google Drive.
"""

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


'\nPurpose: Loads a pre-trained SVM classifier model from a specified file path in Google Drive.\n'

**Test the Final Model on the Test Set**

In [18]:
# Predict labels on the test set
test_predictions = svm_classifier.predict(X_test_tfidf_matrix)
"""
Purpose:
- Predicts the class labels for the test set using the trained SVM classifier.

Parameters:
- svm_classifier: Trained SVM classifier object.
- X_test_tfidf_matrix: Test set features transformed into TF-IDF format.

Outputs:
- test_predictions: Predicted class labels for the test set.
"""

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)
"""
Purpose:
- Computes the accuracy of the SVM classifier on the test set.

Parameters:
- y_test: True class labels of the test set.
- test_predictions: Predicted class labels obtained from the SVM classifier.

Outputs:
- Prints the Test accuracy score.
"""

Test Accuracy: 0.7418181818181818


'\nPurpose:\n- Computes the accuracy of the SVM classifier on the test set.\n\nParameters:\n- y_test: True class labels of the test set.\n- test_predictions: Predicted class labels obtained from the SVM classifier.\n\nOutputs:\n- Prints the Test accuracy score.\n'

In [19]:
# Predict probabilities on the test set
test_probabilities = svm_classifier.predict_proba(X_test_tfidf_matrix)
"""
Purpose:
- Predicts class probabilities for each sample in the test set using the trained SVM classifier.

Parameters:
- svm_classifier: SVM classifier model previously trained and loaded.
- X_test_tfidf_matrix: TF-IDF transformed Test set features.

Outputs:
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
"""

'\nPurpose:\n- Predicts class probabilities for each sample in the test set using the trained SVM classifier.\n\nParameters:\n- svm_classifier: SVM classifier model previously trained and loaded.\n- X_test_tfidf_matrix: TF-IDF transformed Test set features.\n\nOutputs:\n- test_probabilities: Predicted probabilities of each class for each sample in the test set.\n'

In [20]:
# Calculate top-3 accuracy
top_k = 3
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-3 Accuracy: 0.9381818181818182


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'

In [21]:
# Calculate top-5 accuracy
top_k = 5
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-5 Accuracy: 0.9654545454545455


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'

In [22]:
# Calculate top-10 accuracy
top_k = 10
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-10 Accuracy: 0.990909090909091


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'