In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [13]:
balanced_data = pd.read_csv('../Dataset/balanced_data_with_value_counts.csv')


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Splitting data into features and target variable
X = balanced_data['processed_review_content']
y = balanced_data['final_label']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further splitting the train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)







In [15]:
# Convert tokenized words to strings
# Convert tokenized words to strings
# Convert tokenized words to strings
X_train_str = [' '.join(map(str, words)) if isinstance(words, list) else str(words) for words in X_train]
X_val_str = [' '.join(map(str, words)) if isinstance(words, list) else str(words) for words in X_val]
X_test_str = [' '.join(map(str, words)) if isinstance(words, list) else str(words) for words in X_test]


# Vectorizing the text data
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_str)
X_val_tfidf = tfidf_vectorizer.transform(X_val_str)
X_test_tfidf = tfidf_vectorizer.transform(X_test_str)

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define logistic regression model
logreg = LogisticRegression(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'penalty': ['l2'],
    'C': [10],
    'solver': ['liblinear']
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model on the TF-IDF transformed training set with the best hyperparameters
best_logreg = LogisticRegression(**best_params, random_state=42)
best_logreg.fit(X_train_tfidf, y_train)

# Evaluate the model on the TF-IDF transformed validation set
val_preds_logreg = best_logreg.predict(X_val_tfidf)
val_accuracy_logreg = accuracy_score(y_val, val_preds_logreg)
print("Validation Accuracy (Logistic Regression):", val_accuracy_logreg)
print("Classification Report (Logistic Regression - Validation):\n", classification_report(y_val, val_preds_logreg))

# Evaluate the model on the TF-IDF transformed test set
test_preds_logreg = best_logreg.predict(X_test_tfidf)
test_accuracy_logreg = accuracy_score(y_test, test_preds_logreg)
print("Test Accuracy (Logistic Regression):", test_accuracy_logreg)
print("Classification Report (Logistic Regression - Test):\n", classification_report(y_test, test_preds_logreg))


Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Validation Accuracy (Logistic Regression): 0.6668611435239207
Classification Report (Logistic Regression - Validation):
               precision    recall  f1-score   support

           1       0.82      0.80      0.81       445
           2       0.47      0.42      0.45       398
           3       0.84      0.83      0.83       389
           4       0.55      0.61      0.58       482

    accuracy                           0.67      1714
   macro avg       0.67      0.67      0.67      1714
weighted avg       0.67      0.67      0.67      1714

Test Accuracy (Logistic Regression): 0.6724218385440971
Classification Report (Logistic Regression - Test):
               precision    recall  f1-score   support

           1       0.80      0.80      0.80       538
           2       0.51      0.45      0.48       504
           3       0.84      0.84      0.84       540
           4       0.53      0.58      0.55   