**Import Libraries**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC

**Load The Dataset**

In [2]:
# Read the preprocessed CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works from.
dataset = pd.read_csv("dataset_after_preprocessing.csv")

**Encode the labels**



In [3]:
# Encoder that maps each distinct assignee string to an integer class id;
# kept in its own variable so the ids can be mapped back to names later.
label_encoder = LabelEncoder()

# Learn the mapping from the 'Assignee' column and attach the resulting ids
# as a new 'Assignee_Class' column.
dataset = dataset.assign(
    Assignee_Class=label_encoder.fit_transform(dataset['Assignee'])
)

In [4]:
# Display the DataFrame to check the 'Assignee_Class' column added in the previous cell.
# NOTE(review): the rendered output contains assignee e-mail addresses — consider
# clearing this output before sharing the notebook.
dataset

Unnamed: 0,Summary_Stemmed,Assignee,Assignee_Class
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org,192
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org,1606
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org,2460
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org,1606
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org,2487
...,...,...,...
199297,"['remov', 'manual', 'page', 'load', 'event', '...",brosa,429
199298,"['redirect', 'main', 'amo', 'homepag', 'instea...",wezhou,3324
199299,"['add', 'git', 'shortref', 'deploy', 'messag',...",sven,3027
199300,"['sign', 'testpilot', 'system', 'addon', 'fx',...",u581815,3199


In [None]:
# Count the distinct values in each column of `dataset`.
# 'Assignee' and 'Assignee_Class' should report the same count, since the
# class ids are produced by label-encoding the 'Assignee' column.
dataset.nunique()

Summary_Stemmed    199023
Assignee             3515
Assignee_Class       3515
dtype: int64

**Split the dataset**

In [5]:
# First cut: hold out 20% of all rows as the test set (seed fixed for reproducibility)
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Second cut: take 20% of the remaining rows as the validation set, which gives
# roughly a 64% / 16% / 20% train / validation / test partition of the full data
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Pull the text feature column (X) and the encoded label column (y) out of each partition
X_train, X_val, X_test = (
    part['Summary_Stemmed'] for part in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    part['Assignee_Class'] for part in (train_df, val_df, test_df)
)

**Apply TF-IDF Transformation**

In [6]:
# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only, and
# vectorize that split in the same pass
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Re-use the fitted vectorizer (no refitting) for the validation and test splits
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_val, X_test)
)

**Train an SVM Classifier**

In [None]:
# Initialize a linear SVM classifier.
# LinearSVC is used instead of the kernel-based SVC: SVC's fit time grows at
# least quadratically with the number of samples and its multiclass scheme
# trains one binary model per pair of classes, which is impractical for a
# training set of this size (on the order of 10^5 rows, 3515 assignee classes).
# LinearSVC trains one-vs-rest linear models and handles sparse TF-IDF input.
# NOTE(review): this is a different model from the default RBF-kernel SVC, so
# scores are not comparable with any earlier SVC results.
svm_classifier = LinearSVC(random_state=42)  # seed matches the data split for reproducibility

# Train the classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train)

**Evaluate the Classifier on the Validation Set**

In [None]:
# Run the fitted classifier over the validation features
val_predictions = svm_classifier.predict(X_val_tfidf)

# Fraction of validation rows whose predicted class equals the true class
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print("Validation Accuracy:", val_accuracy)

**Test the Final Model on the Test Set**

In [None]:
# Run the fitted classifier over the held-out test features
test_predictions = svm_classifier.predict(X_test_tfidf)

# Fraction of test rows whose predicted class equals the true class
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)