# 3nd Step Apply Word2Vec and SVM

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import pickle

In [3]:
# Load preprocessed and clean CSV file long text
# df_ newlt= dataframe_newlongtext
cleaned_path = 'C:/Users/irfanizam/workspace/FYP-Example/FinalFinalFYP/Data2/ArrangedLt.csv'
df_newlt = pd.read_csv(cleaned_path)

In [4]:
# Tokenization
df_newlt['tokens'] = df_newlt['summary'].apply(word_tokenize)

In [5]:
# Train Word2Vec model
word2vec_model = Word2Vec(df_newlt['tokens'], vector_size=100, window=5, min_count=1, workers=4)

In [6]:
# Function to average word vectors for a sentence
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    
    return feature_vector

In [7]:
# Apply the average_word_vectors function to each row
df_newlt['word2vec_features'] = df_newlt['tokens'].apply(
    lambda x: average_word_vectors(x, word2vec_model, word2vec_model.wv.index_to_key, 100)
)

In [8]:
# Prepare data for SVM
X = np.array(list(df_newlt['word2vec_features']), copy=True)  # Explicitly create a new array

In [10]:
# Convert multi-labels to binary form
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(df_newlt[['1', '2', '3']].astype(int))

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# SVM Classifier
svm_classifier = SVC(kernel='linear', C=1, probability=True)

In [13]:
multi_output_classifier = MultiOutputClassifier(svm_classifier, n_jobs=-1)  # n_jobs=-1 uses all available CPU cores

In [14]:
multi_output_classifier.fit(X_train, y_train)

In [15]:
y_pred = multi_output_classifier.predict(X_test)

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       327
           1       0.82      0.64      0.72       188
           2       0.74      0.71      0.72       270

   micro avg       0.83      0.74      0.78       785
   macro avg       0.83      0.72      0.77       785
weighted avg       0.83      0.74      0.78       785
 samples avg       0.64      0.65      0.64       785



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score

# ...

# After making predictions, calculate precision, recall, and F1-score for each label
label_names = ['1', '2', '3']  

precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Print the metrics in percentage form
for i in range(len(label_names)):
    print(f"Label: {label_names[i]}")
    print(f"Precision: {precision[i] * 100:.2f}%")
    print(f"Recall: {recall[i] * 100:.2f}%")
    print(f"F1-Score: {f1[i] * 100:.2f}%")

Label: 4
Precision: 91.13%
Recall: 81.65%
F1-Score: 86.13%
Label: 5
Precision: 82.19%
Recall: 63.83%
F1-Score: 71.86%
Label: 6
Precision: 74.32%
Recall: 70.74%
F1-Score: 72.49%


In [19]:
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy * 100:.2f}%")

SVM Accuracy: 69.91%
