**Import Libraries**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
import joblib
from gensim.models import Word2Vec
import numpy as np 

**Load The Dataset**

In [None]:
# Mount Google Drive at /content/drive so the dataset and saved models
# referenced by later cells can be read and written.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the preprocessed dataset from the mounted Drive into a DataFrame.
file_path = '/content/drive/My Drive/checkpoints/dataset_after_preprocessing.csv'
dataset = pd.read_csv(file_path)

**Encode the labels**



In [None]:
# Map every distinct 'Assignee' value to an integer class id.
label_encoder = LabelEncoder()
assignee_codes = label_encoder.fit_transform(dataset['Assignee'])

# Keep the encoded ids next to the original names in the DataFrame.
dataset['Assignee_Class'] = assignee_codes

# Target vector used by the splits and the classifier below.
y = dataset['Assignee_Class']

In [None]:
# Count the distinct values in each column of `dataset`.
dataset.nunique()

Summary_Stemmed    193207
Assignee             2268
Assignee_Class       2268
dtype: int64

**Inspect the dataset shape and class distribution**

In [None]:
# Dataset dimensions as (rows, columns)
dataset_shape = dataset.shape
print(dataset_shape)

# Number of samples per encoded assignee class, most frequent first
class_counts = dataset['Assignee_Class'].value_counts()
print(class_counts)

(193464, 3)
Assignee_Class
957     2478
1339    2412
952     1467
1570    1377
668     1197
        ... 
49        10
126       10
1825      10
521       10
2005      10
Name: count, Length: 2268, dtype: int64


**Train Word2Vec model**

In [None]:
# Convert strings to lists of words
def ensure_list_of_words(text):
    """Return `text` as a list of word tokens.

    Parameters
    ----------
    text : str, list, None or float
        A raw summary string, an already tokenized summary, or a missing
        value.

    Returns
    -------
    list
        * For a string: its whitespace-separated tokens.
        * For a missing value (``None`` or a float NaN, which is what
          ``pd.read_csv`` produces for an empty cell): an empty list, so that
          every row stays iterable for the downstream Word2Vec / averaging
          steps.
        * Anything else (e.g. a list that is already tokenized) is returned
          unchanged.
    """
    if isinstance(text, str):
        return text.split()  # or use word_tokenize(text) if tokenization is needed
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return text

In [None]:
# Replace each summary string with its list of tokens (the column is overwritten).
tokenized_summaries = dataset["Summary_Stemmed"].apply(ensure_list_of_words)
dataset["Summary_Stemmed"] = tokenized_summaries

In [None]:
# Fit Word2Vec embeddings on the tokenized summaries.
# NOTE(review): the embeddings are fitted on every row of `dataset`, i.e.
# before the train/validation/test split further down, so validation and test
# text contributes to the embedding space - confirm this is intended.
w2v_model = Word2Vec(
    sentences=dataset["Summary_Stemmed"],
    vector_size=500,  # dimensionality of each word vector
    window=20,        # context window size
    min_count=2,      # ignore words seen fewer than 2 times
    workers=4,        # training threads
    epochs=250,       # passes over the corpus
)

In [None]:
# Persist the trained Word2Vec model to Google Drive so it can be reloaded
# later without retraining.
model_path = '/content/drive/MyDrive/word2vec_model'
w2v_model.save(model_path)

In [None]:
# Reload the Word2Vec model from Google Drive; this replaces the in-memory
# `w2v_model` with the saved copy.
model_path = '/content/drive/MyDrive/word2vec_model'
w2v_model = Word2Vec.load(model_path)

In [None]:
# Convert each document to a fixed-size vector by averaging word vectors
def document_vector(w2v_model, doc):
    # Filter out words not in the model's vocabulary
    doc = [word for word in doc if word in w2v_model.wv]
    # If the document has no words in the vocabulary, return a zero vector
    if len(doc) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(w2v_model.wv[doc], axis=0)

# Build the feature matrix: one averaged embedding per summary, in row order.
X = np.array(
    [document_vector(w2v_model, tokens) for tokens in dataset["Summary_Stemmed"]]
)

In [None]:
# StratifiedShuffleSplit setup for initial train+val and test split
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_val_index, test_index in sss1.split(X, y):
    X_train_val, X_test = X[train_val_index], X[test_index]
    y_train_val, y_test = y.iloc[train_val_index], y.iloc[test_index]

In [None]:
# StratifiedShuffleSplit setup for train and validation split
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, val_index in sss2.split(X_train_val, y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

**Train an SVM Classifier**

In [None]:
# Linear-kernel SVM with regularization strength C=1, fitted on the training
# embeddings and their encoded assignee labels.
svm_classifier = SVC(kernel='linear', C=1)
svm_classifier.fit(X_train, y_train)

In [None]:
# Persist the fitted SVM to Google Drive so it can be reloaded without
# refitting; joblib.dump returns the list of file names it wrote.
model_filename = '/content/drive/MyDrive/svm_classifier_model_word2vec.joblib'
joblib.dump(svm_classifier, model_filename)

['/content/drive/MyDrive/svm_classifier_model_word2vec.joblib']

**Evaluate the Classifier on the Validation Set**

In [None]:
# Reload the fitted SVM from Google Drive; this replaces the in-memory
# `svm_classifier` with the saved copy.
# NOTE: joblib files are pickle-based - only load files you created or trust.
model_filename = '/content/drive/MyDrive/svm_classifier_model_word2vec.joblib'
svm_classifier = joblib.load(model_filename)

In [None]:
# Score the classifier on the held-out validation split: predict a class for
# every validation embedding, then report the fraction predicted correctly.
val_predictions = svm_classifier.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.02336339906443026
