# Lab Assignment 2

## Mohammed Samir Ali (SW01080809)
## Muhammad Farish Naufal Bin Norzali (SW01081139)

In [6]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re  

# Data Preprocessing
# Load the dataset
data = pd.read_csv("Reviews.csv")

# Drop rows with missing values
data.dropna(inplace=True)

# Drop duplicate rows
data.drop_duplicates(inplace=True)

# Perform text data cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\d+', '', text)      # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()                  # Convert to lowercase
    tokens = word_tokenize(text)         # Tokenize text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stop words
    cleaned_text = ' '.join(tokens)
    return cleaned_text

data['CleanedText'] = data['Text'].apply(clean_text)

# Save cleaned data to a CSV file
data.to_csv("cleaned_reviews.csv", index=False)

# Feature Extraction
# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Define a function to get sentiment scores using VADER
def get_vader_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply VADER sentiment analysis to the 'Text' column
data['VADER_Sentiment'] = data['Text'].apply(get_vader_sentiment)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the text data
tfidf_features = tfidf_vectorizer.fit_transform(data['Text'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, data['Score'], test_size=0.2, random_state=42)

# Model Selection
# Initialize and train logistic regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Initialize and train multinomial naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions
vader_preds = data['VADER_Sentiment']
logreg_preds = logreg_model.predict(X_test)
nb_preds = nb_model.predict(X_test)

# Model Evaluation
# Convert VADER sentiment labels to numerical ratings
vader_numerical_preds = vader_preds.map({'positive': 5, 'negative': 1, 'neutral': 3})

# Evaluate VADER Sentiment Analysis performance
print("VADER Sentiment Analysis:")
print("Accuracy:", accuracy_score(data['Score'], vader_numerical_preds))
print("Classification Report:")
print(classification_report(data['Score'], vader_numerical_preds))

# Evaluate Logistic Regression model performance
print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_score(y_test, logreg_preds))
print("Classification Report:")
print(classification_report(y_test, logreg_preds))

# Evaluate Multinomial Naive Bayes model performance
print("\nMultinomial Naive Bayes Model:")
print("Accuracy:", accuracy_score(y_test, nb_preds))
print("Classification Report:")
print(classification_report(y_test, nb_preds))


VADER Sentiment Analysis:
Accuracy: 0.650765920538493
Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.40      0.45      0.43     52264
           2       0.00      0.00      0.00     29743
           3       0.13      0.04      0.06     42638
           4       0.00      0.00      0.00     80654
           5       0.69      0.95      0.80    363102

    accuracy                           0.65    568401
   macro avg       0.24      0.29      0.26    568401
weighted avg       0.49      0.65      0.56    568401


Logistic Regression Model:
Accuracy: 0.7426834739314397
Classification Report:
              precision    recall  f1-score   support

           1       0.66      0.69      0.68     10515
           2       0.45      0.22      0.30      5937
           3       0.47      0.32      0.38      8460
           4       0.51      0.26      0.34     16026
           5       0.80      0.95      0.87     72743

    accuracy                           0.74    113681
   macro avg       0.58      0.49      0.51    1136

# lexicon-based
## VADER Sentiment Analysis:

### Strengths:
Specifically designed for sentiment analysis, making it easy to use out of the box.
Accounts for sentiment intensity and context, providing fine-grained sentiment analysis.
### Weaknesses:
Relies heavily on pre-defined sentiment lexicons, which may not cover all nuances of language or domain-specific sentiment.
May struggle with sarcasm, irony, or context-dependent sentiment expressions, leading to misclassifications.

# Machine-learning based

## Logistic Regression:

### Strengths:
Can capture complex relationships between features and target variable.
Provides probabilities for each class, allowing for a nuanced understanding of predictions.
### Weaknesses:
Prone to overfitting if the number of features is much larger than the number of observations.
Assumes a linear relationship between features and target variable, which might not hold true in all cases.
-----------------------------
## Multinomial Naive Bayes:

### Strengths:
Efficient and fast to train, making it suitable for large datasets.
Performs well with text data and is less sensitive to irrelevant features.
### Weaknesses:
Relies on the strong independence assumption between features, which might not hold true in real-world data.
May perform poorly if the dataset contains numerical features or if there is a mismatch between the training and test data distributions.

---------------------
In summary, logistic regression and multinomial naive Bayes are powerful machine learning-based approaches that can capture complex relationships in the data, while VADER provides a straightforward lexicon-based approach tailored for sentiment analysis. Each approach has its strengths and weaknesses, 