**Import Libraries**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

**Load The Dataset**

In [None]:
# Attach Google Drive to the Colab runtime so files under "My Drive"
# can be read and written through the local filesystem.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Read the preprocessed dataset CSV from the mounted Drive into a DataFrame.
file_path = '/content/drive/My Drive/checkpoints/dataset_after_preprocessing.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Minimum number of rows an assignee must have to be kept in the dataset.
MIN_ASSIGNEE_OCCURRENCES = 10

# Count how many rows each value of the 'Assignee' column has.
value_counts = dataset['Assignee'].value_counts()

# Keep only rows whose assignee appears at least MIN_ASSIGNEE_OCCURRENCES times.
# The explicit .copy() makes the result an independent frame, so columns added
# to it later are never treated by pandas as writes into a slice of the
# unfiltered data.
frequent_assignees = value_counts[value_counts >= MIN_ASSIGNEE_OCCURRENCES].index
dataset = dataset[dataset['Assignee'].isin(frequent_assignees)].copy()

In [None]:
# Display the DataFrame; pandas abbreviates the rendering to the first and last rows.
dataset

Unnamed: 0,Summary_Stemmed,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org
...,...,...
197863,"['total', 'incorrect', 'statement', 'openpgp',...",kaie
197864,"['copi', 'messag', 'sent', 'mail', 'folder', '...",mkmelin+mozilla
197865,"['port', 'bug', '1857516', 'build', 'fail', 'p...",rob
197907,"['consum', 'ac', 'via', 'gradl', 'build', 'rel...",gl


**Encode the labels**



In [None]:
# Learn a mapping from each distinct assignee string to an integer class id
# and encode the whole column in one pass.
label_encoder = LabelEncoder()
assignee_class_ids = label_encoder.fit_transform(dataset['Assignee'])

# Store the integer ids alongside the original assignee names.
dataset['Assignee_Class'] = assignee_class_ids

In [None]:
# Display the DataFrame again to inspect the newly added 'Assignee_Class' column.
dataset

Unnamed: 0,Summary_Stemmed,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org,114
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org,1031
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org,1586
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org,1031
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org,1603
...,...,...,...
197863,"['total', 'incorrect', 'statement', 'openpgp',...",kaie,1105
197864,"['copi', 'messag', 'sent', 'mail', 'folder', '...",mkmelin+mozilla,1391
197865,"['port', 'bug', '1857516', 'build', 'fail', 'p...",rob,1711
197907,"['consum', 'ac', 'via', 'gradl', 'build', 'rel...",gl,765


In [None]:
# Count the number of distinct values in each column of `dataset`.
dataset.nunique()

Summary_Stemmed    193207
Assignee             2268
Assignee_Class       2268
dtype: int64

**Split the dataset**

In [None]:
# Report the dataset dimensions, then the number of rows per encoded class
# (value_counts orders the classes from most to least frequent).
rows_per_class = dataset['Assignee_Class'].value_counts()
print(dataset.shape)
print(rows_per_class)

(193464, 3)
Assignee_Class
957     2478
1339    2412
952     1467
1570    1377
668     1197
        ... 
49        10
126       10
1825      10
521       10
2005      10
Name: count, Length: 2268, dtype: int64


In [None]:
# Stratified 80/20 hold-out: one shuffled split, stratified on
# 'Assignee_Class', with a fixed seed for repeatability.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split_iter = sss.split(dataset, dataset['Assignee_Class'])
train_idx, test_idx = next(split_iter)

# Materialise both partitions by row position and give each a fresh 0..n-1 index.
train_df, test_df = (
    dataset.iloc[row_positions].reset_index(drop=True)
    for row_positions in (train_idx, test_idx)
)

In [None]:
# Carve a validation set out of the training portion with a second stratified
# 80/20 shuffle split (same seed). n_splits=1 yields exactly one split, so it
# is taken directly with next() instead of a loop.
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# NOTE: train_idx is rebound here; from this point on it holds row positions
# within train_df, not within dataset.
train_idx, val_idx = next(sss_val.split(train_df, train_df['Assignee_Class']))

final_train_df = train_df.iloc[train_idx].reset_index(drop=True)
val_df = train_df.iloc[val_idx].reset_index(drop=True)


In [None]:
# Print the (rows, columns) shape of the train, validation and test frames, in that order.
for split_frame in (final_train_df, val_df, test_df):
    print(split_frame.shape)

(123816, 3)
(30955, 3)
(38693, 3)


In [None]:
# Pull the text feature column (X) and the encoded label column (y) out of each split.
feature_col, label_col = 'Summary_Stemmed', 'Assignee_Class'

X_train = final_train_df[feature_col]
y_train = final_train_df[label_col]

X_val = val_df[feature_col]
y_val = val_df[label_col]

X_test = test_df[feature_col]
y_test = test_df[label_col]

**Apply TF-IDF Transformation**

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, and
# vectorise those texts in the same step.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Reuse the already-fitted vectorizer (transform only, no refit) for the
# validation and test texts.
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_val, X_test)
)

**Train an SVM Classifier**

In [None]:
# Linear-kernel SVM with C=10. `gamma` is intentionally not passed: it is the
# kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels only, so it
# had no effect on this linear model.
svm_classifier = SVC(C=10, kernel='linear')

# Train the classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train)

# Persist the fitted preprocessing objects next to the model. The classifier
# alone cannot be reused: new text must go through the same fitted vectorizer,
# and predicted class ids must be mapped back to assignee names with the same
# label encoder.
checkpoint_dir = '/content/drive/My Drive/checkpoints'
joblib.dump(tfidf_vectorizer, checkpoint_dir + '/svm_classifier_tfidf_vectorizer.joblib')
joblib.dump(label_encoder, checkpoint_dir + '/svm_classifier_label_encoder.joblib')

# Save the model to Google Drive. Kept as the last expression so the cell
# output is still the list containing the model path.
model_filename = '/content/drive/My Drive/checkpoints/svm_classifier_model.joblib'
joblib.dump(svm_classifier, model_filename)

['/content/drive/My Drive/checkpoints/svm_classifier_model.joblib']

**Evaluate the Classifier on the Validation Set**

In [None]:
# Run the trained SVM over the validation features.
val_predictions = svm_classifier.predict(X_val_tfidf)

# Accuracy = fraction of validation samples whose predicted class equals the true class.
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.26418995315780974


**Test the Final Model on the Test Set**

In [None]:
# Run the trained SVM over the held-out test features.
test_predictions = svm_classifier.predict(X_test_tfidf)

# Accuracy = fraction of test samples whose predicted class equals the true class.
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)