**Import Libraries**

In [1]:
# Import necessary libraries
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.preprocessing import LabelEncoder  # LabelEncoder for encoding categorical target labels
from sklearn.svm import SVC  # SVC (Support Vector Classifier) for SVM classification
from sklearn.metrics import accuracy_score  # Metrics for evaluating model performance
from sklearn.model_selection import StratifiedShuffleSplit  # StratifiedShuffleSplit for train-test splitting
import joblib  # Joblib for saving and loading models
from gensim.models import Word2Vec  # Import Word2Vec from gensim for word embeddings
import numpy as np  # for numerical operations.

**Load The Dataset**

In [2]:
# Mount Google Drive to access files and save outputs
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Specify the file path in Google Drive and load the dataset after preprocessing
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

**Show the dataset**

In [4]:
# Display the DataFrame 'dataset'
# This will print the first and last 5 rows of the DataFrame along with the column names and index
# Useful for a quick overview of the data after processing
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
117376,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",updat gleanj dashboard ignor glean sdk data vpn,brosa
117377,"['autocomplet', 'type', 'valid', 'valu', 'pass...",autocomplet type valid valu pass record,brosa
117378,"['intermitt', 'slow', 'see', 'ping', 'show', '...",intermitt slow see ping show debug ping viewer,brosa
117379,"['investig', 'string', 'metric', 'type', 'adeq...",investig string metric type adequ captur gfxad...,pmcmanis


**Encode the labels**



In [5]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the LabelEncoder to the 'Assignee' column and transform it to numeric labels
# This step converts categorical labels in the 'Assignee' column to numeric labels,
# which is necessary for training machine learning models.
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

# The LabelEncoder in scikit-learn assigns numeric labels to the unique categories in alphabetical order
# (or lexicographical order for strings).
# This means that the first unique category in alphabetical order is labeled as 0, the second as 1, and so on.

y = dataset['Assignee_Class']

**Train only on 20 classes of the data**

In [6]:
dataset= dataset[dataset['Assignee_Class']<20]

In [7]:
y = dataset['Assignee_Class']

**Display the number of unique values in each column of the dataset**

In [8]:
# Display the number of unique values in each column of the dataset
dataset.nunique()

Summary_Stemmed      2747
processed_summary    2747
Assignee               20
Assignee_Class         20
dtype: int64

**Split the dataset**

In [9]:
# Print the shape of the dataset
print(dataset.shape)
# This line prints the shape of the dataset, which includes the number of rows and columns.
# It helps to understand the dimensions of the dataset.

# Print the class distribution of 'Assignee_Class'
print(dataset['Assignee_Class'].value_counts())
# This line prints the count of each unique value in the 'Assignee_Class' column.
# It provides insight into the distribution of classes, which is useful for understanding class imbalance.

(2748, 4)
Assignee_Class
11    779
0     641
19    560
1     251
16    115
13    113
8      80
5      42
4      40
7      31
17     24
18     15
12     13
15      9
6       7
3       7
10      6
14      5
2       5
9       5
Name: count, dtype: int64


**Convert input text into a list of words.**

In [10]:
def ensure_list_of_words(text):
    """
    Converts input text into a list of words.

    Parameters:
    - text (str or list): Input text to be converted. If a string, it will be split into a list of words.

    Returns:
    - list: List of words extracted from the input text.

    Notes:
    - If the input is already a list, it will be returned unchanged.
    - Uses split() method to separate words based on whitespace.

    Example:
    >>> ensure_list_of_words("Hello world!")
    ['Hello', 'world!']
    >>> ensure_list_of_words(["Hello", "world!"])
    ['Hello', 'world!']
    """
    if isinstance(text, str):
        return text.split()
    return text

In [11]:
dataset["Summary_Stemmed"] = dataset["Summary_Stemmed"].apply(ensure_list_of_words)
"""
Purpose:
This line of code applies the ensure_list_of_words function to each element in the 'Summary_Stemmed' column of the dataset.

Operation:
dataset["Summary_Stemmed"].apply(ensure_list_of_words): Uses the apply method to iterate over each element in the 'Summary_Stemmed' column.
ensure_list_of_words: The function ensure_list_of_words is called for each element. It ensures that each element (which is presumably a string or list) is converted into a list of words.

Effect:
Modifies the 'Summary_Stemmed' column in dataset so that each entry is now represented as a list of words.

Example:
If dataset["Summary_Stemmed"] originally contained strings like "run running runner", after applying ensure_list_of_words, it would be transformed into a list like ["run", "running", "runner"].
"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["Summary_Stemmed"] = dataset["Summary_Stemmed"].apply(ensure_list_of_words)


'\nPurpose:\nThis line of code applies the ensure_list_of_words function to each element in the \'Summary_Stemmed\' column of the dataset.\n\nOperation:\ndataset["Summary_Stemmed"].apply(ensure_list_of_words): Uses the apply method to iterate over each element in the \'Summary_Stemmed\' column.\nensure_list_of_words: The function ensure_list_of_words is called for each element. It ensures that each element (which is presumably a string or list) is converted into a list of words.\n\nEffect:\nModifies the \'Summary_Stemmed\' column in dataset so that each entry is now represented as a list of words.\n\nExample:\nIf dataset["Summary_Stemmed"] originally contained strings like "run running runner", after applying ensure_list_of_words, it would be transformed into a list like ["run", "running", "runner"].\n'

**Initialize and train a Word2Vec model using the data from the 'Summary_Stemmed' column of the dataset**

In [12]:
# Train a Word2Vec model on the 'Summary_Stemmed' column of the dataset
w2v_model = Word2Vec(
    sentences=dataset["Summary_Stemmed"],  # Input data: list of tokenized sentences or documents
    vector_size=500,                       # Dimensionality of the word vectors
    window=20,                             # Maximum distance between the current and predicted word within a sentence
    min_count=2,                           # Ignore all words with total frequency lower than this
    workers=4,                             # Number of threads to use for training
    epochs=250                             # Number of iterations (epochs) over the corpus
)

"""
Purpose:
This code initializes and trains a Word2Vec model using the data from the 'Summary_Stemmed' column of the dataset.

Parameters:
sentences: Input data for training, expected as a list where each element is a list of words (tokens) representing a sentence or document.
vector_size: Dimensionality of the word vectors produced by the model.
window: Maximum distance between the current and predicted word within a sentence. A larger window size considers more context words.
min_count: Ignores all words with a total frequency lower than this value across the corpus. Helps in filtering out infrequent words.
workers: Number of threads used for training the model, to speed up the training process.
epochs: Number of iterations (epochs) over the corpus during training, where each epoch processes the entire dataset once.

Training Process:
The Word2Vec model (w2v_model) is trained on the tokenized sentences provided in dataset["Summary_Stemmed"].
During training, the model learns to map words into a high-dimensional vector space such that words with similar contexts are closer in this space.

Output:
After training, w2v_model contains the learned word vectors that can be used to infer vectors for new words or sentences and perform similarity queries.
"""

'\nPurpose:\nThis code initializes and trains a Word2Vec model using the data from the \'Summary_Stemmed\' column of the dataset.\n\nParameters:\nsentences: Input data for training, expected as a list where each element is a list of words (tokens) representing a sentence or document.\nvector_size: Dimensionality of the word vectors produced by the model.\nwindow: Maximum distance between the current and predicted word within a sentence. A larger window size considers more context words.\nmin_count: Ignores all words with a total frequency lower than this value across the corpus. Helps in filtering out infrequent words.\nworkers: Number of threads used for training the model, to speed up the training process.\nepochs: Number of iterations (epochs) over the corpus during training, where each epoch processes the entire dataset once.\n\nTraining Process:\nThe Word2Vec model (w2v_model) is trained on the tokenized sentences provided in dataset["Summary_Stemmed"].\nDuring training, the model

**Save the trained word2vec model on the drive**

In [None]:
# Save the Word2Vec model to a specified path on Google Drive
model_path = '/content/drive/MyDrive/word2vec_model'
w2v_model.save(model_path)

"""
Purpose:
This code snippet saves the trained Word2Vec model (w2v_model) to a specified path on Google Drive.

Parameters:
model_path: Specifies the path where the Word2Vec model will be saved. It should include the desired file name or directory structure.

Functionality:
w2v_model.save(model_path): This method saves the trained Word2Vec model to the location specified by model_path.
The model is typically saved in a format that includes both the trained word vectors and any additional metadata necessary for reloading the model.
"""

**Load the trained word2vec model from the drive**

In [None]:
# Load the Word2Vec model from a specified path on Google Drive
model_path = '/content/drive/MyDrive/word2vec_model'
w2v_model = Word2Vec.load(model_path)
"""
Purpose:
This code snippet loads a pre-trained Word2Vec model (w2v_model) from a specified path on Google Drive.

Parameters:
model_path: Specifies the path from which the Word2Vec model will be loaded. It should point to the location where the model was previously saved.

Functionality:
Word2Vec.load(model_path): This function loads the Word2Vec model stored at model_path into memory.
The loaded model can be used to perform tasks such as word vectorization, semantic similarity calculation, or any other operations supported by Word2Vec embeddings.
"""

**Convert each document to a fixed-size vector by averaging word vectors**

In [13]:
# Convert each document to a fixed-size vector by averaging word vectors
def document_vector(w2v_model, doc):
    """
    Generate a fixed-size vector representation for a document by averaging its word vectors.

    Parameters:
    - w2v_model (Word2Vec): The pre-trained Word2Vec model.
    - doc (list of str): A list of words representing a document.

    Returns:
    - np.ndarray: A numpy array representing the document vector.
    """
    # Filter out words not in the model's vocabulary
    doc = [word for word in doc if word in w2v_model.wv]

    # If the document has no words in the vocabulary, return a zero vector
    if len(doc) == 0:
        return np.zeros(w2v_model.vector_size)

    # Calculate the mean of word vectors for the document
    return np.mean(w2v_model.wv[doc], axis=0)

# Apply document_vector function to each document in the dataset
X = np.array([document_vector(w2v_model, doc) for doc in dataset["Summary_Stemmed"]])

**Split the dataset into training, validation and test sets**

In [14]:
# Create the StratifiedShuffleSplit object
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# This line creates a StratifiedShuffleSplit object with the following parameters:
# - n_splits=1: Specifies that there will be only one split.
# - test_size=0.2: Indicates that 20% of the dataset will be used as the test set.
# - random_state=42: Ensures reproducibility by using a fixed seed for the random number generator.


# Split the dataset into train and test sets
for train_val_index, test_index in sss1.split(X, y):
    X_train_val, X_test = X[train_val_index], X[test_index]      # Features split
    y_train_val, y_test = y.iloc[train_val_index], y.iloc[test_index]        # Labels split

#for train_val_index, test_index in sss1.split(X, y):: Iterates over the indices generated by sss1.
#X[train_val_index], X[test_index]: Splits the feature data X into X_train_val (train+validation) and X_test (test) sets based on the indices.
#y.iloc[train_val_index], y.iloc[test_index]: Splits the label data y into y_train_val (train+validation) and y_test (test) sets using the corresponding indices.

In [15]:
# StratifiedShuffleSplit setup for train and validation split
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
# This line creates a new StratifiedShuffleSplit object specifically for splitting the initial training set
# into train and validation sets with the following parameters:
# - n_splits=1: Specifies that there will be only one split.
# - test_size=0.2: Indicates that 20% of the training set will be used as the validation set.
# - random_state=42: Ensures reproducibility by using a fixed seed for the random number generator.

# Split the initial train set into train and validation sets
for train_index, val_index in sss2.split(X_train_val, y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]        # Features split
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]          # Labels split

# for train_index, val_index in sss2.split(X_train_val, y_train_val):: Iterates over the indices generated by sss2.
# X_train_val[train_index], X_train_val[val_index]: Splits the feature data X_train_val into X_train (train) and X_val (validation) sets based on the indices.
# y_train_val.iloc[train_index], y_train_val.iloc[val_index]: Splits the label data y_train_val into y_train (train) and y_val (validation) sets using the corresponding indices.

**Initialize and train an SVM Classifier**

In [16]:
# Initialize the SVM classifier
svm_classifier = SVC(C = 1, kernel = 'linear')
"""
Purpose:
- Initializes a Support Vector Machine (SVM) classifier with the specified parameters.

Parameters:
- C=10: Penalty parameter C of the error term.
- kernel='linear': Specifies the linear kernel for the SVM.
- probability=True: Enables probability estimation, necessary for Platt scaling.

Outputs:
- svm_classifier: Initialized SVM classifier object configured with specified parameters.
"""

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)
"""
Purpose:
- Trains the SVM classifier on the training data.

Parameters:
- X_train: Training data vectors after applying word2vec.
- y_train: Training data labels.

Outputs:
- Trained svm_classifier: SVM classifier fitted to the training data.
"""

'\nPurpose:\n- Trains the SVM classifier on the training data.\n\nParameters:\n- X_train: Training data vectors after applying word2vec.\n- y_train: Training data labels.\n\nOutputs:\n- Trained svm_classifier: SVM classifier fitted to the training data.\n'

**Save the trained SVM model on the drive**

In [None]:
# Save the model to Google Drive
model_filename = '/content/drive/MyDrive/svm_classifier_model_word2vec.joblib'
joblib.dump(svm_classifier, model_filename)
"""
Purpose:
- Saves the trained SVM classifier model to a specified file location on Google Drive.

Parameters:
- svm_classifier: Trained SVM classifier object.
- model_filename: File path where the trained model will be saved.

Outputs:
- Saved model file: Persists the trained SVM classifier to the specified file location.
"""

['/content/drive/MyDrive/svm_classifier_model_word2vec.joblib']

**Evaluate the Classifier on the Validation Set**

In [None]:
# Load the model from Google Drive
model_filename = '/content/drive/MyDrive/svm_classifier_model_word2vec.joblib'
svm_classifier = joblib.load(model_filename)

In [17]:
# Predict labels on the validation set
val_predictions = svm_classifier.predict(X_val)
"""
Purpose:
- Predicts the class labels for the validation set using the trained SVM classifier.

Parameters:
- svm_classifier: Trained SVM classifier object.
- X_val: Validation set features.

Outputs:
- val_predictions: Predicted class labels for the validation set.
"""

# Calculate accuracy on the validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)
"""
Purpose:
- Computes the accuracy of the SVM classifier on the validation set.

Parameters:
- y_val: True class labels of the validation set.
- val_predictions: Predicted class labels obtained from the SVM classifier.

Outputs:
- Prints the validation accuracy score.
"""

Validation Accuracy: 0.6709090909090909


'\nPurpose:\n- Computes the accuracy of the SVM classifier on the validation set.\n\nParameters:\n- y_val: True class labels of the validation set.\n- val_predictions: Predicted class labels obtained from the SVM classifier.\n\nOutputs:\n- Prints the validation accuracy score.\n'