**Import Libraries**

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

**Load The Dataset**

In [41]:
# Read the preprocessed dataset; the relative path is resolved against the
# kernel's current working directory.
dataset = pd.read_csv("dataset_after_preprocessing.csv")

**Encode the labels**



In [42]:
# Build the encoder that maps each distinct assignee to an integer class id
label_encoder = LabelEncoder()

# Learn the mapping from the 'Assignee' column and encode it in one pass,
# then store the integer ids next to the original strings
assignee_codes = label_encoder.fit_transform(dataset['Assignee'])
dataset['Assignee_Class'] = assignee_codes

In [26]:
# Preview the frame, including the new Assignee_Class column
# (pandas truncates the rendering to the first and last rows).
dataset

Unnamed: 0,Summary_Stemmed,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org,151
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org,1326
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org,2039
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org,1326
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org,2061
...,...,...,...
197914,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",brosa,350
197915,"['autocomplet', 'type', 'valid', 'valu', 'pass...",brosa,350
197916,"['intermitt', 'slow', 'see', 'ping', 'show', '...",brosa,350
197917,"['investig', 'string', 'metric', 'type', 'adeq...",pmcmanis,2070


In [36]:
# Count the distinct values in each column of `dataset`.
dataset.nunique()

Summary_Stemmed    197647
Assignee             2932
Assignee_Class       2932
dtype: int64

**Split the dataset**

In [43]:
# Sanity checks ahead of the stratified split: overall (rows, columns) ...
print(dataset.shape)  # Check the shape of the dataset
# ... and how many samples each encoded assignee class has.
print(dataset['Assignee_Class'].value_counts())  # Check class distribution

(197919, 3)
Assignee_Class
1233    2478
1721    2412
1228    1467
2022    1377
859     1197
        ... 
1527       5
42         5
1724       5
830        5
1884       5
Name: count, Length: 2932, dtype: int64


In [44]:
# Single stratified 80/20 shuffle split, seeded for reproducibility
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Positional row indices for each side, stratified on the encoded assignee
train_idx, test_idx = next(sss.split(dataset, dataset['Assignee_Class']))

# Materialise both subsets with a fresh 0..n-1 index
train_df, test_df = (
    dataset.iloc[idx].reset_index(drop=True) for idx in (train_idx, test_idx)
)

In [45]:
# Second stratified 80/20 shuffle split, applied to the training portion only
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, validation) index pair
train_idx, val_idx = next(sss_val.split(train_df, train_df['Assignee_Class']))

# Carve the final training set and the validation set out of train_df
final_train_df = train_df.iloc[train_idx].reset_index(drop=True)
val_df = train_df.iloc[val_idx].reset_index(drop=True)


In [46]:
# Report (rows, columns) of the train / validation / test frames, one per line
print(final_train_df.shape, val_df.shape, test_df.shape, sep="\n")

(126668, 3)
(31667, 3)
(39584, 3)


In [47]:
# Separate features (X) and labels (y) for train, validation, and test sets
X_train, y_train = final_train_df['Summary_Stemmed'], final_train_df['Assignee_Class']
X_val, y_val = val_df['Summary_Stemmed'], val_df['Assignee_Class']
X_test, y_test = test_df['Summary_Stemmed'], test_df['Assignee_Class']

**Apply TF-IDF Transformation**

In [48]:
# Initialize the TF-IDF vectorizer (all default settings)
# NOTE(review): the rows displayed earlier suggest Summary_Stemmed holds
# stringified token lists; they are passed to the default analyzer as plain
# text here - confirm that is intended.
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
# The vectorizer is fitted on the training split only.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation and test data
# transform() (no refit) reuses the vectorizer state fitted on X_train.
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

**Train an SVM Classifier**

In [None]:
# Initialize the SVM classifier.
# A kernel SVC() is impractical at this scale (~126k training rows and 2,932
# classes per the outputs above): its fit time grows at least quadratically
# with the number of samples and its multiclass scheme is one-vs-one, i.e. one
# binary model per pair of classes. LinearSVC trains one-vs-rest linear models
# and scales to large sparse TF-IDF matrices. The seed makes the solver's
# shuffling reproducible, matching the seeded splits.
svm_classifier = LinearSVC(random_state=42)

# Train the classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train)

**Evaluate the Classifier on the Validation Set**

In [None]:
# Run the trained classifier over the validation features
val_predictions = svm_classifier.predict(X_val_tfidf)

# Compare the predictions with the true validation labels and report the score
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print("Validation Accuracy:", val_accuracy)

**Test the Final Model on the Test Set**

In [None]:
# Run the trained classifier over the held-out test features
test_predictions = svm_classifier.predict(X_test_tfidf)

# Compare the predictions with the true test labels and report the score
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)