**Import Libraries**

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
import numpy as np

**Load The Dataset**

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
file_path = '/content/drive/My Drive/checkpoints/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

In [17]:
dataset = dataset.head(50000)

**Filter the dataset do the minimum occurance of each owner is 10**


In [18]:
# Calculate the occurrences of each value in the owner column
value_counts = dataset['Assignee'].value_counts()

# Filter the dataset to include only rows where the value in 'Assignee' has at least 5 occurrences
dataset = dataset[dataset['Assignee'].isin(value_counts[value_counts >= 10].index)]

In [19]:
dataset

Unnamed: 0,Summary_Stemmed,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org
...,...,...
49995,"['tap', 'right', 'suggest', 'add', 'garbag', '...",fengyuan@chromium.org
49996,"['get', 'npapitestplugin', 'test', 'work', 'li...",cevans@chromium.org
49997,"['rac', 'updat', 'combobox', 'invalid', 'appea...",scr@chromium.org
49998,"['flaki', 'gcf', 'test', 'due', 'window', 'tit...",grt@chromium.org


**Encode the labels**



In [20]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Assignee' column to numeric labels
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

y = dataset['Assignee_Class']

In [21]:
dataset

Unnamed: 0,Summary_Stemmed,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org,40
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org,360
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org,538
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org,360
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org,544
...,...,...,...
49995,"['tap', 'right', 'suggest', 'add', 'garbag', '...",fengyuan@chromium.org,238
49996,"['get', 'npapitestplugin', 'test', 'work', 'li...",cevans@chromium.org,113
49997,"['rac', 'updat', 'combobox', 'invalid', 'appea...",scr@chromium.org,630
49998,"['flaki', 'gcf', 'test', 'due', 'window', 'tit...",grt@chromium.org,271


In [22]:
# compute the number of unique values for each column in the DataFrame training_data.
dataset.nunique()

Summary_Stemmed    47206
Assignee             813
Assignee_Class       813
dtype: int64

**Split the dataset**

In [23]:
print(dataset.shape)  # Check the shape of the dataset
print(dataset['Assignee_Class'].value_counts())  # Check class distribution

(47206, 3)
Assignee_Class
227    663
663    579
147    484
671    388
360    363
      ... 
228     10
369     10
44      10
136     10
789     10
Name: count, Length: 813, dtype: int64


**Train Word2Vec model**

In [24]:
# Convert strings to lists of words
def ensure_list_of_words(text):
    if isinstance(text, str):
        return text.split()  # or use word_tokenize(text) if tokenization is needed
    return text

In [25]:
dataset["Summary_Stemmed"] = dataset["Summary_Stemmed"].apply(ensure_list_of_words)

In [26]:
w2v_model = Word2Vec(sentences=dataset["Summary_Stemmed"], vector_size=500, window=5, min_count=1, workers=4, )

In [27]:
# Convert each document to a fixed-size vector by averaging word vectors
def document_vector(w2v_model, doc):
    # Filter out words not in the model's vocabulary
    doc = [word for word in doc if word in w2v_model.wv]
    # If the document has no words in the vocabulary, return a zero vector
    if len(doc) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(w2v_model.wv[doc], axis=0)

# Apply document_vector function to each document
X = np.array([document_vector(w2v_model, doc) for doc in dataset["Summary_Stemmed"]])

In [28]:
# StratifiedShuffleSplit setup for initial train+val and test split
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_val_index, test_index in sss1.split(X, y):
    X_train_val, X_test = X[train_val_index], X[test_index]
    y_train_val, y_test = y.iloc[train_val_index], y.iloc[test_index]

In [29]:
# StratifiedShuffleSplit setup for train and validation split
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, val_index in sss2.split(X_train_val, y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

**Train an SVM Classifier**

In [81]:
svm_classifier = SVC(C = 1, kernel = 'linear')
svm_classifier.fit(X_train, y_train)

**Evaluate the Classifier on the Validation Set**

In [82]:
# Predict labels on the validation set
val_predictions = svm_classifier.predict(X_val)

# Calculate accuracy on the validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.10507361508314797
