**Import Libraries**

In [1]:
# Import necessary libraries
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.preprocessing import LabelEncoder  # LabelEncoder for encoding categorical target labels
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfVectorizer for converting text data to TF-IDF features
from sklearn.svm import SVC  # SVC (Support Vector Classifier) for SVM classification
from sklearn.metrics import accuracy_score, top_k_accuracy_score  # Metrics for evaluating model performance
from sklearn.model_selection import StratifiedShuffleSplit  # StratifiedShuffleSplit for train-test splitting
import joblib  # Joblib for saving and loading models
from sklearn.decomposition import TruncatedSVD  # TruncatedSVD for dimensionality reduction using Singular Value Decomposition (SVD)

**Load The Dataset**

In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# and outputs can be saved back to it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime;
# confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [141]:
# Path of the preprocessed CSV inside the mounted Google Drive
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'

# Parse the CSV into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [142]:
# Render the loaded DataFrame as this cell's output for a quick sanity check.
# For a frame this large the display is truncated to the leading and trailing rows,
# shown together with the column names and the index.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [143]:
# Encoder that maps every assignee name to an integer class id
label_encoder = LabelEncoder()

# First learn the set of distinct assignees ...
label_encoder.fit(dataset['Assignee'])

# ... then store each row's numeric class id in a new 'Assignee_Class' column.
# Classifiers need numeric targets, hence the conversion from names to ids.
dataset['Assignee_Class'] = label_encoder.transform(dataset['Assignee'])

# LabelEncoder numbers the classes in sorted (lexicographical) order:
# the first assignee alphabetically becomes 0, the next one 1, and so on.

In [144]:
# Render the DataFrame again to confirm the new 'Assignee_Class' column was added.
# The display is truncated to the leading and trailing rows, together with the
# column names and the index.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,124
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,52
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,2118
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,1972
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,2282
...,...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa,278
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa,278
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa,278
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis,1685


In [145]:
# Restrict the experiment to a subset of assignees: keep only the rows whose encoded
# label is below NUM_ASSIGNEE_CLASSES. Because LabelEncoder numbers classes in sorted
# order, this keeps the first 20 assignees alphabetically, not the 20 most frequent.
NUM_ASSIGNEE_CLASSES = 20

# .copy() makes the subset an independent DataFrame, so later column assignments
# (for example re-running the encoding cell) do not write into a slice of the full
# frame and trigger SettingWithCopyWarning.
dataset = dataset[dataset['Assignee_Class'] < NUM_ASSIGNEE_CLASSES].copy()

In [146]:
# Count the distinct values in every column of the filtered dataset.
# 'Assignee' and 'Assignee_Class' are expected to agree, since the second column
# is the numeric encoding of the first.
dataset.nunique()

Summary_Stemmed      2747
processed_summary    2747
Assignee               20
Assignee_Class         20
dtype: int64

In [147]:
# Report the dataset dimensions as a (rows, columns) tuple
print(dataset.shape)

# Tally how many samples each encoded assignee has, most frequent first.
# The counts make any class imbalance visible before the data is split.
class_counts = dataset['Assignee_Class'].value_counts()
print(class_counts)

(2748, 4)
Assignee_Class
11    779
0     641
19    560
1     251
16    115
13    113
8      80
5      42
4      40
7      31
17     24
18     15
12     13
15      9
6       7
3       7
10      6
14      5
2       5
9       5
Name: count, dtype: int64


**Split the dataset**

In [148]:
# Single stratified shuffle split that holds out 20% of the rows as the test set:
# - n_splits=1: only one partition is generated
# - test_size=0.2: fraction of rows assigned to the test set
# - random_state=42: fixed seed so the partition is reproducible
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Stratify on the encoded assignee so both parts keep a similar class distribution.
# split() yields (train, test) arrays of positional indices; with n_splits=1 there
# is exactly one such pair, so it is taken with next().
split_pairs = sss.split(dataset, dataset['Assignee_Class'])
train_idx, test_idx = next(split_pairs)

# Training part: select the rows by position, then renumber the index from 0
train_df = dataset.iloc[train_idx]
train_df = train_df.reset_index(drop=True)

# Test part: same procedure with the held-out positions
test_df = dataset.iloc[test_idx]
test_df = test_df.reset_index(drop=True)

In [149]:
# Second stratified split: carve a validation set out of the training portion.
# - n_splits=1: a single train/validation partition
# - test_size=0.2: 20% of train_df becomes the validation set
# - random_state=42: fixed seed so the partition is reproducible
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Only one split is generated, so take it directly with next() instead of looping.
# Stratifying on 'Assignee_Class' keeps the class distribution similar in both parts.
# Note: from here on train_idx holds positions into train_df (not into dataset).
train_idx, val_idx = next(sss_val.split(train_df, train_df['Assignee_Class']))

# Final training set: rows selected by position, index renumbered from 0
final_train_df = train_df.iloc[train_idx].reset_index(drop=True)

# Validation set: the remaining rows of train_df, index renumbered from 0
val_df = train_df.iloc[val_idx].reset_index(drop=True)

# Sanity check: the two parts together must account for every row of train_df
assert len(final_train_df) + len(val_df) == len(train_df)

In [150]:
# Output the shapes of the resulting DataFrames (train, validation, test — in that order)
for split_df in (final_train_df, val_df, test_df):
    print(split_df.shape)

(1758, 4)
(440, 4)
(550, 4)


In [151]:
# Features (X) are the preprocessed summary text; labels (y) are the encoded assignee.

# Training set
X_train = final_train_df['processed_summary']
y_train = final_train_df['Assignee_Class']

# Validation set
X_val = val_df['processed_summary']
y_val = val_df['Assignee_Class']

# Test set
X_test = test_df['processed_summary']
y_test = test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [152]:
# TF-IDF vectorizer that extracts unigrams and bigrams (ngram_range=(1, 2))
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the training text only and transform it in the same step.
# X_train_tfidf: TF-IDF matrix with one row per document and one column per feature.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation text with the vectorizer fitted on the training data,
# so validation documents are expressed in the training vocabulary.
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Transform the test text with the same fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

'\nPurpose:\n- Transforms the test data using the same TF-IDF vectorizer fitted on the training data.\n\nOutputs:\n- X_test_tfidf: TF-IDF matrix for test data using the fitted TF-IDF vectorizer.\n'

**Apply SVD**

In [None]:
# Dimensionality reduction of the TF-IDF vectors with truncated SVD.
# - n_components=1100: number of dimensions kept after the reduction
# - random_state=42: fixed seed for reproducible results
# NOTE(review): this cell has no execution count, and the classifier cells that
# follow are fitted and evaluated on the X_*_tfidf matrices, so the X_*_svd outputs
# appear to be unused — confirm whether this step is still needed.
svd = TruncatedSVD(n_components=1100, random_state=42)

# Fit the SVD on the training TF-IDF matrix and project it in the same step
X_train_svd = svd.fit_transform(X_train_tfidf)

# Project the validation and test matrices with the SVD fitted on the training data
X_val_svd = svd.transform(X_val_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

**Train an SVM Classifier**

In [153]:
# Support Vector Machine classifier.
# - C=10: penalty parameter of the error term
# - kernel='linear': linear kernel
# - probability=True: enables probability estimates (predict_proba), which the
#   top-k accuracy cells rely on
# - random_state=42: the probability estimates are calibrated on internally shuffled
#   data; fixing the seed makes them reproducible across runs
svm_classifier = SVC(C=10, kernel='linear', probability=True, random_state=42)

# Train the classifier on the TF-IDF features and encoded labels of the training set
svm_classifier.fit(X_train_tfidf, y_train)

# Location of the model checkpoint on Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_with_probability.joblib'

# Saving is disabled by default so an existing checkpoint is not overwritten.
# Set SAVE_MODEL to True to persist the model that was just trained.
SAVE_MODEL = False
if SAVE_MODEL:
    joblib.dump(svm_classifier, model_filename)

'\nPurpose:\n- Saves the trained SVM classifier model to a specified file location on Google Drive.\n\nParameters:\n- svm_classifier: Trained SVM classifier object.\n- model_filename: File path where the trained model will be saved.\n\nOutputs:\n- Saved model file: Persists the trained SVM classifier to the specified file location.\n'

**Evaluate the Classifier on the Validation Set**

In [154]:
# Predict labels on the validation set
val_predictions = svm_classifier.predict(X_val_tfidf)
"""
Purpose:
- Predicts the class labels for the validation set using the trained SVM classifier.

Parameters:
- svm_classifier: Trained SVM classifier object.
- X_val_tfidf: Validation set features transformed into TF-IDF format.

Outputs:
- val_predictions: Predicted class labels for the validation set.
"""

# Calculate accuracy on the validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)
"""
Purpose:
- Computes the accuracy of the SVM classifier on the validation set.

Parameters:
- y_val: True class labels of the validation set.
- val_predictions: Predicted class labels obtained from the SVM classifier.

Outputs:
- Prints the validation accuracy score.
"""

Validation Accuracy: 0.7659090909090909


'\nPurpose:\n- Computes the accuracy of the SVM classifier on the validation set.\n\nParameters:\n- y_val: True class labels of the validation set.\n- val_predictions: Predicted class labels obtained from the SVM classifier.\n\nOutputs:\n- Prints the validation accuracy score.\n'

In [55]:
# Location of the model checkpoint on Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_with_probability.joblib'

# Load the checkpoint only as a fallback, i.e. when no classifier exists in this
# session (training cell skipped). When the notebook is run top to bottom the model
# trained above is kept, so the metrics below describe that model and not whatever
# checkpoint happens to be stored on Drive.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code; only load
# checkpoints that you created yourself.
if 'svm_classifier' not in globals():
    svm_classifier = joblib.load(model_filename)
    print("Loaded SVM classifier from", model_filename)

'\nPurpose:\n- Loads a pre-trained SVM classifier model from a specified file.\n\nParameters:\n- model_filename: File path where the SVM classifier model is stored.\n\nOutputs:\n- svm_classifier: Loaded SVM classifier object ready for inference.\n'

In [155]:
# Predict probabilities on the validation set
val_probabilities = svm_classifier.predict_proba(X_val_tfidf)
"""
Purpose:
- Predicts class probabilities for each sample in the validation set using the trained SVM classifier.

Parameters:
- svm_classifier: SVM classifier model previously trained and loaded.
- X_val_tfidf: TF-IDF transformed validation set features.

Outputs:
- val_probabilities: Predicted probabilities of each class for each sample in the validation set.
"""

'\nPurpose:\n- Predicts class probabilities for each sample in the validation set using the trained SVM classifier.\n\nParameters:\n- svm_classifier: SVM classifier model previously trained and loaded.\n- X_val_tfidf: TF-IDF transformed validation set features.\n\nOutputs:\n- val_probabilities: Predicted probabilities of each class for each sample in the validation set.\n'

In [156]:
# Calculate top-3 accuracy
top_k = 3
val_top_k_accuracy = top_k_accuracy_score(y_val, val_probabilities, k=top_k)
"""
Purpose:dataset
- Calculates the top-K accuracy for the validation set predictions based on predicted probabilities.

Parameters:
- y_val: True labels of the validation set.
- val_probabilities: Predicted probabilities of each class for each sample in the validation set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- val_top_k_accuracy: Top-K accuracy score for the validation set predictions.
"""

print(f"Validation Top-{top_k} Accuracy:", val_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the validation set predictions.
"""

Validation Top-3 Accuracy: 0.925


'\nPurpose:\n- Prints the computed top-K accuracy score for the validation set predictions.\n'

In [157]:
# Calculate top-5 accuracy
top_k = 5
val_top_k_accuracy = top_k_accuracy_score(y_val, val_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the validation set predictions based on predicted probabilities.

Parameters:
- y_val: True labels of the validation set.
- val_probabilities: Predicted probabilities of each class for each sample in the validation set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- val_top_k_accuracy: Top-K accuracy score for the validation set predictions.
"""

print(f"Validation Top-{top_k} Accuracy:", val_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the validation set predictions.
"""

Validation Top-5 Accuracy: 0.9659090909090909


'\nPurpose:\n- Prints the computed top-K accuracy score for the validation set predictions.\n'

In [158]:
# Calculate top-10 accuracy
top_k = 10
val_top_k_accuracy = top_k_accuracy_score(y_val, val_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the validation set predictions based on predicted probabilities.

Parameters:
- y_val: True labels of the validation set.
- val_probabilities: Predicted probabilities of each class for each sample in the validation set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- val_top_k_accuracy: Top-K accuracy score for the validation set predictions.
"""

print(f"Validation Top-{top_k} Accuracy:", val_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the validation set predictions.
"""

Validation Top-10 Accuracy: 0.9863636363636363


'\nPurpose:\n- Prints the computed top-K accuracy score for the validation set predictions.\n'