In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [2]:
# Read the dataset CSV into a DataFrame. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
dataset_path = '../Dataset/balanced_data_with_value_counts.csv'
balanced_data = pd.read_csv(dataset_path)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features are the 'processed_review_content' column; the target is 'final_label'.
X, y = balanced_data['processed_review_content'], balanced_data['final_label']

# Both splits use the same fraction and seed, so the options are shared.
split_options = {'test_size': 0.2, 'random_state': 42}

# First split: hold out 20% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_options)

# Second split: hold out 20% of the remaining rows as the validation set,
# leaving roughly 64% / 16% / 20% of the data for train / validation / test.
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, **split_options)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _to_document(entry):
    """Return one review as a single string suitable for TF-IDF vectorization.

    Parameters
    ----------
    entry : object
        One element of a feature split. A list is treated as a sequence of
        tokens and joined with single spaces; any other value is passed
        through ``str``.

    Returns
    -------
    str
        The review text. Missing values (``None`` or a float NaN) become an
        empty string instead of the literal tokens ``"None"`` / ``"nan"``,
        which would otherwise be counted as real words by the vectorizer.
    """
    if isinstance(entry, list):
        return ' '.join(map(str, entry))
    # NaN is the only float that is not equal to itself.
    if entry is None or (isinstance(entry, float) and entry != entry):
        return ''
    return str(entry)


# Convert every split to plain strings with the same rule.
X_train_str = [_to_document(entry) for entry in X_train]
X_val_str = [_to_document(entry) for entry in X_val]
X_test_str = [_to_document(entry) for entry in X_test]

# Vectorizing the text data: the vectorizer is fitted on the training split
# only and then reused, so validation/test text does not influence the
# vocabulary or IDF weights.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_str)
X_val_tfidf = tfidf_vectorizer.transform(X_val_str)
X_test_tfidf = tfidf_vectorizer.transform(X_test_str)

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

def _evaluate_svm(model, features, labels, split_name):
    """Predict on one split, print its accuracy and classification report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : feature matrix passed to ``model.predict``.
    labels : true labels for ``features``.
    split_name : str
        Label inserted into the printed headings (e.g. "Validation", "Test").

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the split.
    """
    preds = model.predict(features)
    accuracy = accuracy_score(labels, preds)
    print(f"{split_name} Accuracy (SVM):", accuracy)
    print(f"Classification Report (SVM - {split_name}):\n", classification_report(labels, preds))
    return preds, accuracy


# Define SVM model
svm = SVC(random_state=42)

# Define hyperparameters to tune. Each list currently holds a single value,
# so the search below amounts to a 5-fold cross-validation of one setting.
param_grid = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': ['scale']
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so reuse that estimator instead of training a second,
# identical SVC from scratch.
best_svm = grid_search.best_estimator_

# Evaluate the model on the TF-IDF transformed validation set
val_preds_svm, val_accuracy_svm = _evaluate_svm(best_svm, X_val_tfidf, y_val, "Validation")

# Evaluate the model on the TF-IDF transformed test set
test_preds_svm, test_accuracy_svm = _evaluate_svm(best_svm, X_test_tfidf, y_test, "Test")


Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Validation Accuracy (SVM): 0.733955659276546
Classification Report (SVM - Validation):
               precision    recall  f1-score   support

           1       0.93      0.93      0.93       445
           2       0.50      0.42      0.46       398
           3       0.97      0.90      0.93       389
           4       0.57      0.68      0.62       482

    accuracy                           0.73      1714
   macro avg       0.74      0.73      0.73      1714
weighted avg       0.74      0.73      0.73      1714

Test Accuracy (SVM): 0.7484834344377042
Classification Report (SVM - Test):
               precision    recall  f1-score   support

           1       0.93      0.90      0.91       538
           2       0.56      0.50      0.53       504
           3       0.96      0.92      0.94       540
           4       0.57      0.66      0.61       561

    accuracy                           0.75      2143
   macr