**Import Libraries**

In [44]:
# Import necessary libraries
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.preprocessing import LabelEncoder  # LabelEncoder for encoding categorical target labels
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfVectorizer for converting text data to TF-IDF features
from sklearn.svm import SVC  # SVC (Support Vector Classifier) for SVM classification
from sklearn.metrics import accuracy_score, top_k_accuracy_score  # Metrics for evaluating model performance
from sklearn.model_selection import StratifiedShuffleSplit  # StratifiedShuffleSplit for train-test splitting
import joblib  # Joblib for saving and loading models
from sklearn.model_selection import GridSearchCV

**Load The Dataset**

In [None]:
# Mount Google Drive to access files and save outputs
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Specify the file path in Google Drive and load the dataset after preprocessing
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

In [None]:
# Display the DataFrame 'dataset'
# This will print the first and last 5 rows of the DataFrame along with the column names and index
# Useful for a quick overview of the data after processing
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [None]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the LabelEncoder to the 'Assignee' column and transform it to numeric labels
# This step converts categorical labels in the 'Assignee' column to numeric labels,
# which is necessary for training machine learning models.
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

# The LabelEncoder in scikit-learn assigns numeric labels to the unique categories in alphabetical order
# (or lexicographical order for strings).
# This means that the first unique category in alphabetical order is labeled as 0, the second as 1, and so on.

In [None]:
# Display the DataFrame 'dataset'
# This will print the first and last 5 rows of the DataFrame along with the column names and index
# Useful for a quick overview of the data after labeling
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org,124
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org,52
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org,2118
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org,1972
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org,2282
...,...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa,278
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa,278
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa,278
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis,1685


**Train only on 20 classes of the data**

In [None]:
dataset= dataset[dataset['Assignee_Class']<20]

In [None]:
# Display the number of unique values in each column of the dataset
dataset.nunique()

Summary_Stemmed      2747
processed_summary    2747
Assignee               20
Assignee_Class         20
dtype: int64

**Split the dataset**

In [None]:
# Print the shape of the dataset
print(dataset.shape)
# This line prints the shape of the dataset, which includes the number of rows and columns.
# It helps to understand the dimensions of the dataset.

# Print the class distribution of 'Assignee_Class'
print(dataset['Assignee_Class'].value_counts())
# This line prints the count of each unique value in the 'Assignee_Class' column.
# It provides insight into the distribution of classes, which is useful for understanding class imbalance.

(2748, 4)
Assignee_Class
11    779
0     641
19    560
1     251
16    115
13    113
8      80
5      42
4      40
7      31
17     24
18     15
12     13
15      9
6       7
3       7
10      6
14      5
2       5
9       5
Name: count, dtype: int64


In [None]:
# Create the StratifiedShuffleSplit object
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# This line creates a StratifiedShuffleSplit object with the following parameters:
# - n_splits=1: Specifies that there will be only one split.
# - test_size=0.2: Indicates that 20% of the dataset will be used as the test set.
# - random_state=42: Ensures reproducibility by using a fixed seed for the random number generator.

# Split the dataset into train and test sets
train_idx, test_idx = next(sss.split(dataset, dataset['Assignee_Class']))

# This line performs the split based on the 'Assignee_Class' column to ensure that the train and test sets
# have a similar class distribution. 'sss.split' returns the indices of the train and test samples.

# Create the training DataFrame
train_df = dataset.iloc[train_idx].reset_index(drop=True)

# This line creates a training DataFrame using the indices obtained from the split.
# 'iloc' is used to select the rows corresponding to the train indices.
# 'reset_index(drop=True)' resets the index of the training DataFrame.

# Create the test DataFrame
test_df = dataset.iloc[test_idx].reset_index(drop=True)

# This line creates a test DataFrame using the indices obtained from the split.
# 'iloc' is used to select the rows corresponding to the test indices.
# 'reset_index(drop=True)' resets the index of the test DataFrame.

In [None]:
# Output the shapes of the resulting DataFrames
print(train_df.shape)
print(test_df.shape)

(2198, 4)
(550, 4)


In [None]:
# Separate features (X) and labels (y) for train and test sets
X_train, y_train = train_df['processed_summary'], train_df['Assignee_Class']
X_test, y_test = test_df['processed_summary'], test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [None]:
# Initialize the TF-IDF vectorizer with n-gram range (1, 2)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
"""
Purpose:
- Initializes a TF-IDF vectorizer object with a specified n-gram range (1, 2).
- Fits the vectorizer on the training data and transforms it into a TF-IDF matrix.

Parameters:
- ngram_range=(1, 2): Specifies to extract unigrams and bigrams.

Outputs:
- X_train_tfidf: TF-IDF matrix for training data where rows are documents and columns are TF-IDF features.
"""

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)
"""
Purpose:
- Transforms the test data using the same TF-IDF vectorizer fitted on the training data.

Outputs:
- X_test_tfidf: TF-IDF matrix for test data using the fitted TF-IDF vectorizer.
"""

'\nPurpose:\n- Transforms the test data using the same TF-IDF vectorizer fitted on the training data.\n\nOutputs:\n- X_test_tfidf: TF-IDF matrix for test data using the fitted TF-IDF vectorizer.\n'

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(2198, 15887)
(550, 15887)


In [47]:
# Save the tvectorizer to Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/tf_idf_vectorizer_final.joblib'
joblib.dump(tfidf_vectorizer, model_filename)

['/content/drive/MyDrive/checkpoints/tf_idf_vectorizer_final.joblib']

**Train an SVM Classifier**

In [None]:
# Define the SVM classifier
svm_classifier = SVC(probability=True)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'kernel': ['linear', 'rbf'],
    'class_weight': ['balanced', None]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Optionally, print the best score achieved
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))



Best parameters found:  {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation accuracy: 0.71


In [None]:
# Initialize the SVM classifier
svm_classifier = SVC(C=10, kernel='linear',gamma = 'scale', class_weight = 'balanced' ,probability=True)
"""
Purpose:
- Initializes a Support Vector Machine (SVM) classifier with the specified parameters.

Parameters:
- C=10: Penalty parameter C of the error term.
- kernel='linear': Specifies the linear kernel for the SVM.
- probability=True: Enables probability estimation, necessary for Platt scaling.

Outputs:
- svm_classifier: Initialized SVM classifier object configured with specified parameters.
"""

# Train the classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train)
"""
Purpose:
- Trains the SVM classifier on the TF-IDF transformed training data.

Parameters:
- X_train_tfidf: Training data features transformed into TF-IDF format.
- y_train: Training data labels.

Outputs:
- Trained svm_classifier: SVM classifier fitted to the training data.
"""

# Save the trained model to Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_final.joblib'
joblib.dump(svm_classifier, model_filename)
"""
Purpose:
- Saves the trained SVM classifier model to a specified file location on Google Drive.

Parameters:
- svm_classifier: Trained SVM classifier object.
- model_filename: File path where the trained model will be saved.

Outputs:
- Saved model file: Persists the trained SVM classifier to the specified file location.
"""

'\nPurpose:\n- Saves the trained SVM classifier model to a specified file location on Google Drive.\n\nParameters:\n- svm_classifier: Trained SVM classifier object.\n- model_filename: File path where the trained model will be saved.\n\nOutputs:\n- Saved model file: Persists the trained SVM classifier to the specified file location.\n'

In [45]:
# Load the model from Google Drive
model_filename = '/content/drive/MyDrive/checkpoints/svm_classifier_model_final.joblib'
svm_classifier = joblib.load(model_filename)
"""
Purpose: Loads a pre-trained SVM classifier model from a specified file path in Google Drive.
"""

'\nPurpose: Loads a pre-trained SVM classifier model from a specified file path in Google Drive.\n'

**Test the Final Model on the Test Set**

In [46]:
# Predict labels on the test set
test_predictions = svm_classifier.predict(X_test_tfidf)
"""
Purpose:
- Predicts the class labels for the test set using the trained SVM classifier.

Parameters:
- svm_classifier: Trained SVM classifier object.
- X_test_tfidf: Test set features transformed into TF-IDF format.

Outputs:
- test_predictions: Predicted class labels for the test set.
"""

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)
"""
Purpose:
- Computes the accuracy of the SVM classifier on the test set.

Parameters:
- y_test: True class labels of the test set.
- test_predictions: Predicted class labels obtained from the SVM classifier.

Outputs:
- Prints the Test accuracy score.
"""

Test Accuracy: 0.7763636363636364


'\nPurpose:\n- Computes the accuracy of the SVM classifier on the test set.\n\nParameters:\n- y_test: True class labels of the test set.\n- test_predictions: Predicted class labels obtained from the SVM classifier.\n\nOutputs:\n- Prints the Test accuracy score.\n'

In [None]:
# Predict probabilities on the test set
test_probabilities = svm_classifier.predict_proba(X_test_tfidf)
"""
Purpose:
- Predicts class probabilities for each sample in the test set using the trained SVM classifier.

Parameters:
- svm_classifier: SVM classifier model previously trained and loaded.
- X_test_tfidf: TF-IDF transformed test set features.

Outputs:
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
"""

'\nPurpose:\n- Predicts class probabilities for each sample in the test set using the trained SVM classifier.\n\nParameters:\n- svm_classifier: SVM classifier model previously trained and loaded.\n- X_test_tfidf: TF-IDF transformed test set features.\n\nOutputs:\n- test_probabilities: Predicted probabilities of each class for each sample in the test set.\n'

In [None]:
# Calculate top-3 accuracy
top_k = 3
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-3 Accuracy: 0.9145454545454546


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'

In [None]:
# Calculate top-5 accuracy
top_k = 5
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-5 Accuracy: 0.9636363636363636


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'

In [None]:
# Calculate top-10 accuracy
top_k = 10
test_top_k_accuracy = top_k_accuracy_score(y_test, test_probabilities, k=top_k)
"""
Purpose:
- Calculates the top-K accuracy for the test set predictions based on predicted probabilities.

Parameters:
- y_test: True labels of the test set.
- test_probabilities: Predicted probabilities of each class for each sample in the test set.
- k: Number of top predictions to consider for accuracy calculation.

Outputs:
- test_top_k_accuracy: Top-K accuracy score for the test set predictions.
"""
print(f"Test Top-{top_k} Accuracy:", test_top_k_accuracy)
"""
Purpose:
- Prints the computed top-K accuracy score for the test set predictions.
"""

Test Top-10 Accuracy: 0.990909090909091


'\nPurpose:\n- Prints the computed top-K accuracy score for the test set predictions.\n'