''' Sentiment Classification on Amazon Fine Food Reviews
    Muhammad Aidil Bin Ahmad Husin-IS01082943
    Ahmad Aizat Bin Ahmad Zainal-IS01082871'''

## 1. Data Preprocessing

In [1]:

import pandas as pd

df = pd.read_csv("Reviews.csv")
df = df.drop_duplicates(subset=["UserId", "ProfileName", "Text"])
df = df.dropna(subset=["Text", "Score"])

def label_sentiment(score):
    if score <= 2:
        return "negative"
    elif score == 3:
        return "neutral"
    else:
        return "positive"

df["Sentiment"] = df["Score"].apply(label_sentiment)
df_final = df[["Text", "Sentiment"]]
df_final["Sentiment"].value_counts()


Sentiment
positive    306809
negative     57081
neutral      29757
Name: count, dtype: int64

## 2. Feature Extraction

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample to speed up processing (e.g., 20k samples)
df_sampled = df_final.sample(n=20000, random_state=42)

# Extract features and labels
X = df_sampled["Text"]
y = df_sampled["Sentiment"]

# Convert text to TF-IDF vectors
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape

((16000, 5000), (4000, 5000))

In [None]:
TF-IDF feature extraction is done. The dataset has been split into:

Training set: 16,000 samples
Test set: 4,000 samples
Features: 5,000 most informative words

## 3. Model Selection and Training

In [3]:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


## 4. Model Evaluation
### Lexicon-Based Model: VADER

In [7]:
# Lexicon-Based Sentiment Classification using VADER
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Since the train_test_split was done after vectorization, 
# we need to extract the test texts differently
# First get the original indices used for the test set
_, X_test_indices = train_test_split(range(len(X)), test_size=0.2, random_state=42, stratify=y)

# Then get the original text from those indices
X_test_text = df_sampled["Text"].iloc[X_test_indices].reset_index(drop=True)
y_test_list = y_test.reset_index(drop=True)

# Predict using VADER
vader_preds = []
for text in X_test_text:
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05:
        vader_preds.append("positive")
    elif score <= -0.05:
        vader_preds.append("negative")
    else:
        vader_preds.append("neutral")

# Evaluation
from sklearn.metrics import classification_report
print("VADER Lexicon-Based Report:")
print(classification_report(y_test_list, vader_preds))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER Lexicon-Based Report:
              precision    recall  f1-score   support

    negative       0.61      0.41      0.49       598
     neutral       0.14      0.05      0.07       300
    positive       0.84      0.94      0.89      3102

    accuracy                           0.80      4000
   macro avg       0.53      0.47      0.49      4000
weighted avg       0.75      0.80      0.77      4000



## Logistic Regression and Naive Bayes Machine Languange

In [8]:

from sklearn.metrics import classification_report

y_pred_lr = lr_model.predict(X_test)
y_pred_nb = nb_model.predict(X_test)

print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_lr))

print("Naive Bayes Report:")
print(classification_report(y_test, y_pred_nb))


Logistic Regression Report:
              precision    recall  f1-score   support

    negative       0.75      0.47      0.58       598
     neutral       0.48      0.08      0.14       300
    positive       0.85      0.98      0.91      3102

    accuracy                           0.84      4000
   macro avg       0.69      0.51      0.54      4000
weighted avg       0.81      0.84      0.80      4000

Naive Bayes Report:
              precision    recall  f1-score   support

    negative       0.78      0.12      0.21       598
     neutral       0.00      0.00      0.00       300
    positive       0.79      1.00      0.88      3102

    accuracy                           0.79      4000
   macro avg       0.52      0.37      0.36      4000
weighted avg       0.73      0.79      0.72      4000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Save the sampled and preprocessed data used in the analysis to a CSV file
df_sampled.to_csv("C:/Users/HP/Downloads/lab assingment 2/result_reviews.csv", index=False)


## 5. Discussion


### Logistic Regression
- Strengths: Handles high-dimensional data well, interpretable and balanced performance.
- Weaknesses: Linear boundaries and needs tuning.

### Naive Bayes
- Strengths: Very fast, good for text and effective on sparse data.
- Weaknesses: Simplified assumptions and weak for nuanced text.

### VADER(Lexicon-based)
- Strengths: Fast, no training required and good for short texts.
- Weaknesses: Not adaptive, struggles with sarcasm or long reviews.
