In [1]:
# CISB5123 Text Analytics - Lab Assignment 2
# Name: MUHAMMAD IDLAN HAKIMI BIN MOHD AZIZI | ID: SN01082897
# Name: Thanes Selvam | ID: SN01082944

import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load and preprocess data
df = pd.read_csv("Reviews.csv")
df = df[['Score', 'Text']].dropna()
df = df[df['Score'] != 3]  # Remove neutral reviews
df['Label'] = df['Score'].apply(lambda x: 'positive' if x > 3 else 'negative')

# Sample only a small subset for quick demo
df = df.sample(n=300, random_state=42).reset_index(drop=True)

# Lexicon-based sentiment analysis
def get_textblob_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    return 'positive' if polarity > 0 else 'negative'

def get_vader_sentiment(text):
    score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    return 'positive' if score > 0.05 else 'negative' if score < -0.05 else 'neutral'

df['TextBlob_Pred'] = df['Text'].apply(get_textblob_sentiment)
df['VADER_Pred'] = df['Text'].apply(get_vader_sentiment)

# Classification report for lexicon-based
print("TextBlob Classification Report:")
print(classification_report(df['Label'], df['TextBlob_Pred'], target_names=['negative', 'positive']))
print("VADER Classification Report:")
print(classification_report(df['Label'], df['VADER_Pred'], target_names=['negative', 'positive', 'neutral']))

# Machine learning-based sentiment analysis
X = df['Text']
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

nb = MultinomialNB()
svm = SVC(kernel='linear')

nb.fit(X_train_vec, y_train)
svm.fit(X_train_vec, y_train)

# Classification report for ML models
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb.predict(X_test_vec), target_names=['negative', 'positive']))
print("SVM Classification Report:")
print(classification_report(y_test, svm.predict(X_test_vec), target_names=['negative', 'positive']))

# Discussion
print("""
Discussion:
Lexicon-based methods like TextBlob and VADER are easy to use and don’t need training. 
TextBlob uses a dictionary of words with polarity, while VADER works better with casual or social media text.

Machine learning models (Naive Bayes and SVM) learn from the data and usually give better results.
SVM performed the best overall.

The evaluation shows that machine learning models are more accurate than lexicon-based ones for this dataset.
""")


TextBlob Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.29      0.38        48
    positive       0.88      0.95      0.91       252

    accuracy                           0.85       300
   macro avg       0.71      0.62      0.65       300
weighted avg       0.82      0.85      0.83       300

VADER Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.38      0.51        48
    positive       0.00      0.00      0.00         0
     neutral       0.90      0.97      0.93       252

    accuracy                           0.87       300
   macro avg       0.56      0.45      0.48       300
weighted avg       0.88      0.87      0.86       300

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.08      0.14        13
    positive       0.87      1.00      0.93        77

    accuracy                    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
