In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
BALANCED_DATA_CSV = '../Dataset/balanced_data_with_value_counts.csv'

# Load the review dataset; later cells read the 'processed_review_content'
# and 'final_label' columns from this frame.
balanced_data = pd.read_csv(BALANCED_DATA_CSV)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features are the preprocessed review text; the target is the label column.
X, y = balanced_data['processed_review_content'], balanced_data['final_label']

# Both splits hold out the same fraction with the same seed.
split_kwargs = dict(test_size=0.2, random_state=42)

# First split: 80% train+validation / 20% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Second split carves validation out of the remaining 80%, giving roughly
# 64% train / 16% validation / 20% test of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_kwargs)


In [4]:
def _document_to_text(doc):
    """Return one review as a single whitespace-joined string.

    Parameters
    ----------
    doc : list, str, or scalar
        A token list (joined with single spaces), an already-joined string
        (returned via ``str``), or a missing value (NaN / None).

    Returns
    -------
    str
        The text to feed to the vectorizer. Missing values become ``''`` so
        that they do not turn into the literal tokens ``'nan'`` / ``'None'``,
        which a text vectorizer would otherwise count as real words.
    """
    if isinstance(doc, list):
        return ' '.join(map(str, doc))
    # The is_scalar guard keeps pd.isna from returning an array (and the
    # `if` from raising) should a non-list container ever show up here.
    if pd.api.types.is_scalar(doc) and pd.isna(doc):
        return ''
    return str(doc)


# Convert each split's documents to plain strings for the vectorizer.
X_train_str = [_document_to_text(doc) for doc in X_train]
X_val_str = [_document_to_text(doc) for doc in X_val]
X_test_str = [_document_to_text(doc) for doc in X_test]


# Vectorizing the text data
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training documents only, capped at 5000 features,
# then apply that same fitted vectorizer to the validation and test documents
# so no information from the held-out splits enters the fit.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_str)
X_val_tfidf, X_test_tfidf = [
    tfidf_vectorizer.transform(docs) for docs in (X_val_str, X_test_str)
]

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed for the forest's internal randomness so that a rerun reproduces the
# same scores; same value as the random_state used for the data splits.
RF_SEED = 42


def evaluate_rf(model, features, labels, split_name):
    """Score a fitted classifier on one data split and print the results.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    features : feature matrix for the split (here, TF-IDF vectors)
    labels : true labels aligned with ``features``
    split_name : str
        Human-readable split name used in the printed headers
        (e.g. ``"Validation"`` or ``"Test"``).

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the split.
    """
    preds = model.predict(features)
    accuracy = accuracy_score(labels, preds)
    print(f"{split_name} Accuracy (Random Forest):", accuracy)
    print(f"Classification Report (Random Forest - {split_name}):\n", classification_report(labels, preds))
    return preds, accuracy


# Define Random Forest model (seeded for reproducibility)
rf = RandomForestClassifier(random_state=RF_SEED)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [300],
    'max_depth': [None]
}

# Perform grid search (5-fold CV on the training set) to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters (Random Forest):", best_params)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so reuse that estimator instead of training a second,
# independently randomised forest with the same parameters.
best_rf = grid_search.best_estimator_

# Evaluate the model on the TF-IDF transformed validation set
val_preds_rf, val_accuracy_rf = evaluate_rf(best_rf, X_val_tfidf, y_val, "Validation")

# Evaluate the model on the TF-IDF transformed test set
test_preds_rf, test_accuracy_rf = evaluate_rf(best_rf, X_test_tfidf, y_test, "Test")


Best Hyperparameters (Random Forest): {'max_depth': None, 'n_estimators': 300}
Validation Accuracy (Random Forest): 0.7602100350058343
Classification Report (Random Forest - Validation):
               precision    recall  f1-score   support

           1       0.94      0.92      0.93       445
           2       0.62      0.35      0.45       398
           3       0.98      0.93      0.95       389
           4       0.58      0.81      0.67       482

    accuracy                           0.76      1714
   macro avg       0.78      0.75      0.75      1714
weighted avg       0.77      0.76      0.75      1714

Test Accuracy (Random Forest): 0.7657489500699953
Classification Report (Random Forest - Test):
               precision    recall  f1-score   support

           1       0.94      0.93      0.94       538
           2       0.61      0.39      0.48       504
           3       0.98      0.94      0.96       540
           4       0.56      0.77      0.65       561

    accu