## Libraries Used

In [1]:
# %pip install pandas matplotlib seaborn  nltk textblob vaderSentiment wordcloud

# Data Handling
import pandas as pd
import numpy as np

# Text Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
# NOTE(review): this import rebinds the name SentimentIntensityAnalyzer, so the
# NLTK class imported a few lines above is shadowed and every later use of the
# name refers to the vaderSentiment implementation. Keep only one of the two
# imports (or alias one of them) once it is decided which analyzer is intended.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Display Formatting
from IPython.display import display

## Dataset extraction and organization
### Load and combine all datasets

In [None]:
import csv # for the QUOTE_NONE constant
import pandas as pd # for dataframes
from IPython.display import display # for display


def load_labelled_reviews(path):
    """Read one tab-separated, header-less review file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of a text file whose lines hold two tab-separated fields,
        read here as the review text followed by its sentiment label.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'Review' and 'Sentiment', dtypes as inferred
        by pandas.
    """
    return pd.read_csv(
        path,
        delimiter='\t',
        header=None,
        names=['Review', 'Sentiment'],
        # Review text is free-form and may contain unbalanced double quotes.
        # With the default quoting mode the parser treats a leading '"' as the
        # start of a quoted field and can swallow the following lines into one
        # review, silently dropping rows. QUOTE_NONE keeps quotes as literal text.
        quoting=csv.QUOTE_NONE,
    )


# Load datasets
dataset1 = load_labelled_reviews('data/amazon_cells_labelled.txt')
dataset2 = load_labelled_reviews('data/imdb_labelled.txt')
dataset3 = load_labelled_reviews('data/yelp_labelled.txt')

# Combine datasets
combined_dataset = pd.concat([dataset1, dataset2, dataset3], ignore_index=True)

# Ensure correct column ordering
combined_dataset = combined_dataset[['Review', 'Sentiment']]

# Display first 10 rows.
# The centered string format of Sentiment is purely cosmetic, so it is applied
# to a preview copy only. combined_dataset keeps the labels exactly as parsed,
# which keeps them usable for analysis and keeps missing labels detectable
# (converting the real column to str would turn NaN into the text 'nan').
preview = combined_dataset.head(10).assign(
    Sentiment=lambda frame: frame["Sentiment"].astype(str).map(lambda label: f"{label:^5}")
)
display(preview)

# Check for missing values
print(combined_dataset.isnull().sum().to_string())

## Data Cleaning and Preparation
### Steps: 
#### 1. Remove duplicates.
#### 2. Handle missing values.
#### 3. Lowercase text for consistency.
#### 4. Remove punctuation and stopwords.

## Exploratory Data Analysis
### Visualizing and analyzing sentiment distribution.

### Bar charts
>Bar charts showing positive vs. negative sentiments.

### Distribution of review lengths
>Histogram showing the text length distribution.

### Word Cloud for Most Frequent Words
>Generating word clouds for both positive and negative sentiments

### Term Frequency Analysis
>Identifying the most common words in positive and negative reviews.

## Model Selection


### VADER
>Leveraging a rule-based model to assess sentiment.

### Logistic Regression
>Training a supervised learning model for sentiment classification

## Model Analysis 


### VADER Evaluation 

#### Threshold-Based Accuracy
>Measures how often VADER’s sentiment matches labels.

#### Mean Squared Error (MSE) 
>Measures how close VADER's scores are to actual sentiment.

#### Spearman/Pearson Correlation ✅
>Measures how well VADER scores align with actual sentiment.


### Logistic Regression

#### Confusion Matrix
>Essential for analyzing false positives and false negatives.

#### ROC Curve & AUC
>Measures classification performance across thresholds.

### VADER vs. Logistic Regression Results

## Conclusion