In [1]:
#2nd version

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load the dataset
data = pd.read_csv("sampled_book_df.csv")

# Step 1: Preprocessing
# Map review/score to sentiment labels
def map_sentiment(score):
    if score >= 4:  # Positive sentiment
        return "Positive"
    elif score == 3:  # Neutral sentiment
        return "Neutral"
    else:  # Negative sentiment
        return "Negative"

data['sentiment'] = data['review/score'].apply(map_sentiment)

# Remove rows with missing or invalid review text
data = data.dropna(subset=['review/text'])

# Check class distribution
print("Initial Class Distribution:")
print(data['sentiment'].value_counts())

# Step 2: Feature Engineering
X = data['review/text']
y = data['sentiment']

# Encode target labels
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {label: weight for label, weight in zip(np.unique(y), class_weights)}

print("Class Weights:", class_weights_dict)

# Convert text data to numeric (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', sublinear_tf=True)

# Transform features
X_tfidf = vectorizer.fit_transform(X)

# Step 3: Address Imbalance
# Use SMOTE to oversample minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

print("After SMOTE Resampling:", Counter(y_resampled))

# Encode labels into numerical format for training
y_resampled_encoded = pd.factorize(y_resampled)[0]

# Step 4: Model Training
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled_encoded, test_size=0.2, random_state=42, stratify=y_resampled_encoded)

# Train a Random Forest Classifier with class weights
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

# Step 5: Evaluation
# Make predictions
y_pred = clf.predict(X_test)

# Decode target labels back to original categories
label_map = {i: label for i, label in enumerate(pd.factorize(y_resampled)[1])}
y_test_labels = [label_map[label] for label in y_test]
y_pred_labels = [label_map[label] for label in y_pred]

# Print classification metrics
print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_labels))



Initial Class Distribution:
Positive    31215
Negative     4472
Neutral      3199
Name: sentiment, dtype: int64
Class Weights: {'Negative': 2.898479427549195, 'Neutral': 4.051891216005002, 'Positive': 0.41524907896844465}
After SMOTE Resampling: Counter({'Positive': 31215, 'Negative': 31215, 'Neutral': 31215})
Classification Report:
              precision    recall  f1-score   support

    Negative       0.93      0.97      0.95      6243
     Neutral       0.95      0.94      0.95      6243
    Positive       0.96      0.93      0.94      6243

    accuracy                           0.95     18729
   macro avg       0.95      0.95      0.95     18729
weighted avg       0.95      0.95      0.95     18729

Confusion Matrix:
[[6030   24  189]
 [ 292 5866   85]
 [ 143  269 5831]]


In [2]:
example_review = ["The book is okay and not so memorable."] #Positive

example_tfidf = vectorizer.transform(example_review)
predicted_sentiment = clf.predict(example_tfidf)
print("Predicted Sentiment:", label_map[predicted_sentiment[0]])

Predicted Sentiment: Neutral


In [3]:
example_review_1 = ["The book is memorable."] #Positive

example_tfidf = vectorizer.transform(example_review_1)
predicted_sentiment = clf.predict(example_tfidf)
print("Predicted Sentiment:", label_map[predicted_sentiment[0]])

Predicted Sentiment: Positive


In [4]:
example_review_2 = ["The book is very worst"] #Negative

example_tfidf = vectorizer.transform(example_review_2)
predicted_sentiment = clf.predict(example_tfidf)
print("Predicted Sentiment:", label_map[predicted_sentiment[0]])

Predicted Sentiment: Negative
