<a href="https://colab.research.google.com/github/JeeAu/NLP_Project/blob/main/Jee_Au_Wah_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install jupyterlab

In [None]:
import torch

# Report the PyTorch build and the CUDA device (if any) visible to this kernel.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
print("PyTorch version:", torch.__version__)
print("CUDA version (compiled):", torch.version.cuda)
if has_cuda:
    # Only query the driver when a device is actually present.
    gpu_index = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_index = "No GPU detected"
    gpu_name = "N/A"
print("Current GPU device:", gpu_index)
print("GPU Name:", gpu_name)

In [None]:
# First uninstall the problematic packages
!pip uninstall -y huggingface_hub transformers accelerate bertopic

# Install specific compatible versions in the right order
!pip install huggingface_hub==0.24.0
!pip install tokenizers==0.15.2
!pip install transformers==4.35.2
!pip install accelerate==0.25.0
!pip install datasets==2.16.0  # Older version compatible with huggingface_hub 0.24.0
!pip install bertopic==0.16.0

In [None]:
pip show huggingface_hub transformers accelerate bertopic

In [None]:
# NOTE(review): an earlier cell pins accelerate==0.25.0 as part of a "compatible versions"
# set; the unpinned --upgrade below presumably replaces that pin with the latest release —
# TODO confirm this is intended. The installs below are also unpinned.
!pip install --upgrade accelerate
# Install remaining dependencies (excluding already installed packages)
!pip install swifter umap-learn hdbscan sentence-transformers
!pip install pandas numpy matplotlib seaborn nltk scikit-learn

import pandas as pd
import numpy as np
import re
import gc
import warnings
warnings.filterwarnings('ignore')

# NLP and Text Processing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Parallel processing
import swifter

# Dimensionality reduction and clustering
from umap import UMAP
import hdbscan

# Topic Modeling
from bertopic import BERTopic

# Machine Learning
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Text embeddings
from sentence_transformers import SentenceTransformer

# PyTorch for custom model if needed
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Fetch the NLTK corpora used below: stop-word list, WordNet (lemmatizer) and the VADER lexicon
for _nltk_resource in ('stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(_nltk_resource)

# One seed shared by PyTorch and NumPy so stochastic steps are repeatable
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
print("-----------------------------------")
print("Office Products Customer Feedback Analysis")
print("-----------------------------------")

# ----------------------
# DOWNLOAD AND LOAD DATA
# ----------------------
!wget --no-check-certificate https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Office_Products_5.json.gz
!gzip -d Office_Products_5.json.gz

chunk_size = 100000
chunks = pd.read_json('Office_Products_5.json', lines=True, chunksize=chunk_size)
df = pd.concat(chunks)
df = df[['reviewText', 'overall', 'summary', 'reviewTime']].dropna()
print(f"Loaded {len(df)} reviews. Columns: {df.columns}")

# Convert reviewTime to datetime
df['reviewTime'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y', errors='coerce')

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean and normalize one review into a space-joined string of tokens.

    Steps: replace HTML-like tags and every character that is not an ASCII
    letter or whitespace with a space, lower-case, split on whitespace, drop
    tokens found in the module-level ``stop_words`` set or shorter than three
    characters, and lemmatize the rest with the module-level ``lemmatizer``.

    Args:
        text: The raw review; any value is accepted and passed through ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Substitute a space rather than nothing, so removing a tag or punctuation mark
    # cannot glue neighbouring words together ("good.<br>Cheap" must not
    # become "goodcheap").
    text = re.sub(r'<.*?>|[^a-zA-Z\s]', ' ', str(text))
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(tokens)

# Clean every review with preprocess_text (swifter-accelerated apply), then drop the raw text
print("Preprocessing review text...")
df['cleaned_review'] = df['reviewText'].swifter.apply(preprocess_text)
df = df.drop(columns=['reviewText'])  # Free memory

# Exploratory Data Analysis (EDA)
print("Generating EDA visualizations...")

# --- Rating distribution: number of reviews per star value ---
rating_fig, rating_ax = plt.subplots(figsize=(10, 6))
rating_counts = df['overall'].value_counts().sort_index()
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='viridis', ax=rating_ax)
rating_ax.set_title('Rating Distribution (1-5 Stars)', fontsize=15)
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Count')
rating_ax.grid(True, alpha=0.3)
rating_fig.savefig('rating_distribution.png')
plt.show()

# --- Review length: whitespace-separated token count of the cleaned text ---
df['review_length'] = df['cleaned_review'].apply(lambda cleaned: len(cleaned.split()))
length_fig, length_ax = plt.subplots(figsize=(10, 6))
length_ax.hist(df['review_length'], bins=50, color='teal', alpha=0.7)
length_ax.set_title('Distribution of Review Lengths', fontsize=15)
length_ax.set_xlabel('Word Count')
length_ax.set_ylabel('Frequency')
length_ax.grid(True, alpha=0.3)
length_fig.savefig('review_length_distribution.png')
plt.show()

# --- Reviews per calendar year ---
df['review_year'] = df['reviewTime'].dt.year
year_counts = df['review_year'].value_counts().sort_index()
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=year_counts.index, y=year_counts.values, marker='o', linewidth=2, ax=count_ax)
count_ax.set_title('Reviews by Year', fontsize=15)
count_ax.set_xlabel('Year')
count_ax.set_ylabel('Number of Reviews')
count_ax.grid(True, alpha=0.3)
count_fig.savefig('yearly_review_count.png')
plt.show()

# --- Mean star rating per calendar year ---
yearly_ratings = df.groupby('review_year')['overall'].mean().reset_index()
trend_fig, trend_ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='review_year', y='overall', data=yearly_ratings, marker='o', linewidth=2, ax=trend_ax)
trend_ax.set_title('Average Rating by Year', fontsize=15)
trend_ax.set_xlabel('Year')
trend_ax.set_ylabel('Average Rating')
trend_ax.set_ylim(1, 5)  # pin the axis to the full star range
trend_ax.grid(True, alpha=0.3)
trend_fig.savefig('yearly_rating_trend.png')
plt.show()

# Take a sample for further analysis to avoid memory issues.
# The request is capped at the number of available rows, because DataFrame.sample
# raises when asked for more rows than exist (sampling is without replacement).
SAMPLE_SIZE = 10000
n_sampled = min(SAMPLE_SIZE, len(df))
print(f"Taking a sample of {n_sampled} reviews for detailed analysis...")
sample_df = df.sample(n_sampled, random_state=RANDOM_SEED).copy()
# The full frame is no longer needed; release it before the heavier modelling steps
del df
gc.collect()

# Topic Modeling with BERTopic
print("Running BERTopic model for topic discovery...")
# Dimensionality reduction and clustering are configured explicitly and seeded
# (UMAP) so the discovered topics are repeatable.
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, metric='cosine', low_memory=True, random_state=RANDOM_SEED)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                       language="english",
                       calculate_probabilities=False,
                       min_topic_size=20,
                       nr_topics="auto",
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       verbose=True)

# BERTopic documents its input as a list of strings, so pass a plain list rather
# than a Series that still carries sample_df's non-contiguous sampled index.
topics, probs = topic_model.fit_transform(sample_df['cleaned_review'].tolist())
sample_df['topic'] = topics

# Visualize topics
print("Visualizing discovered topics...")
fig = topic_model.visualize_topics()
fig.write_html("topics_visualization.html")
# The bar chart is displayed through its own .show(); no matplotlib figure is
# opened here (one was previously created and left empty).
topic_model.visualize_barchart(top_n_topics=10).show()

# Get topic information
topic_info = topic_model.get_topic_info()
print("\nTop 10 Topics by Size:")
print(topic_info.head(10))

# Display top terms for each topic
print("\nTop 5 Terms for Each Topic:")
for topic_id in topic_info['Topic'][:10]:
    if topic_id != -1:  # Skip outlier topic
        topic_terms = topic_model.get_topic(topic_id)
        terms = ", ".join([term for term, _ in topic_terms[:5]])
        print(f"Topic {topic_id}: {terms}")

# Sentiment Analysis
print("\nPerforming sentiment analysis...")
vader = SentimentIntensityAnalyzer()
# Store VADER's 'compound' polarity score for every sampled review.
# NOTE(review): the input is cleaned_review, i.e. text that preprocess_text has already
# lower-cased and stripped of punctuation and NLTK stop words. Presumably negations and
# emphasis cues are lost that way — consider scoring the raw review text; TODO confirm.
sample_df['vader_score'] = sample_df['cleaned_review'].apply(lambda x: vader.polarity_scores(x)['compound'])

def label_sentiment(rating):
    """Map a star rating to a sentiment class id.

    Returns 0 (negative) for ratings of 2 or less, 1 (neutral) for a rating
    equal to 3, and 2 (positive) for every other value.
    """
    if rating <= 2:
        return 0  # negative
    # Everything that is not exactly 3 at this point counts as positive
    return 1 if rating == 3 else 2

# Derive the 3-class sentiment label from the star rating
sample_df['sentiment_label'] = sample_df['overall'].apply(label_sentiment)

# Display sentiment distribution
sentiment_names = {0:'Negative', 1:'Neutral', 2:'Positive'}
sentiment_counts = sample_df['sentiment_label'].map(sentiment_names).value_counts()
sentiment_fig, sentiment_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='RdYlGn', ax=sentiment_ax)
sentiment_ax.set_title('Sentiment Distribution', fontsize=15)
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Count')
sentiment_ax.grid(True, alpha=0.3)
sentiment_fig.savefig('sentiment_distribution.png')
plt.show()

# Balance classes for training
print("Balancing dataset for sentiment classification...")
df_majority = sample_df[sample_df['sentiment_label'] == 2]
df_minority_neg = sample_df[sample_df['sentiment_label'] == 0]
df_minority_neu = sample_df[sample_df['sentiment_label'] == 1]

# Balance by undersampling every class to the size of the smallest one (capped at 1000)
# WITHOUT replacement. Upsampling the minority classes with replacement here would put
# identical copies of a review into balanced_df; every later train/test split would then
# place copies of the same review on both sides and report inflated scores.
n_samples = min(len(df_majority), len(df_minority_neg), len(df_minority_neu), 1000)
if n_samples == 0:
    raise ValueError("At least one sentiment class is empty; cannot build a balanced dataset.")

# The *_upsampled names are kept for compatibility; the frames are now plain subsamples.
df_minority_neg_upsampled = resample(df_minority_neg, replace=False, n_samples=n_samples, random_state=RANDOM_SEED)
df_minority_neu_upsampled = resample(df_minority_neu, replace=False, n_samples=n_samples, random_state=RANDOM_SEED)
balanced_df = pd.concat([df_majority.sample(n=n_samples, random_state=RANDOM_SEED),
                         df_minority_neg_upsampled,
                         df_minority_neu_upsampled])
# Shuffle so the classes are interleaved, and give the frame a clean 0..n-1 index
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print(f"Balanced dataset sizes - Positive: {n_samples}, Neutral: {n_samples}, Negative: {n_samples}")

# Create embeddings using sentence-transformers
print("\nGenerating embeddings for sentiment analysis...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings(texts, batch_size=32):
    """Encode ``texts`` with the module-level ``embedding_model``.

    Args:
        texts: The texts to embed, forwarded unchanged to ``encode``.
        batch_size: Batch size forwarded to ``encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for these texts.
    """
    encode_options = {'batch_size': batch_size, 'show_progress_bar': True}
    return embedding_model.encode(texts, **encode_options)

# Generate embeddings for the balanced dataset
review_texts = balanced_df['cleaned_review'].tolist()
embeddings = get_embeddings(review_texts)

# Split data for sentiment classification (stratified so each class keeps its share)
sentiment_targets = balanced_df['sentiment_label']
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    sentiment_targets.values,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=sentiment_targets,
)

# Train a simple classifier on embeddings
print("\nTraining logistic regression for sentiment classification...")
clf = LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nLogistic Regression Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

class_names = ['Negative', 'Neutral', 'Positive']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
cm_ax.set_title('Confusion Matrix', fontsize=15)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_fig.savefig('confusion_matrix.png')
plt.show()

# Analyze topics related to sentiment
print("\nAnalyzing topics by sentiment...")
# Mean VADER compound score per topic, most positive first
topic_sentiment = sample_df.groupby('topic')['vader_score'].mean().sort_values(ascending=False)
print("\nTopics by average sentiment score (highest to lowest):")
for topic_id, score in topic_sentiment.items():
    if topic_id != -1:  # Skip outlier topic
        topic_words = topic_model.get_topic(topic_id)
        topic_words_str = ", ".join([word for word, _ in topic_words[:5]])
        print(f"Topic {topic_id} ({topic_words_str}): {score:.3f}")

# Visualize topic-sentiment relationship
topic_sentiment = topic_sentiment[topic_sentiment.index != -1]  # Remove outlier topic
topic_sentiment_df = pd.DataFrame({'Topic': topic_sentiment.index, 'Sentiment': topic_sentiment.values})
topic_sentiment_df = topic_sentiment_df.sort_values('Sentiment', ascending=False)

# Bars and tick labels must follow the same order: sentiment, highest first
topic_order = topic_sentiment_df['Topic'].tolist()

# Get topic words for the x labels
topic_labels = []
for topic_id in topic_order:
    words = [word for word, _ in topic_model.get_topic(topic_id)[:2]]
    topic_labels.append(f"{topic_id}: {', '.join(words)}")

# Plot with readable topic labels.
# `order` is passed explicitly: with a numeric x column seaborn otherwise sorts the
# categories ascending by topic id, so the sentiment-ordered tick labels set below
# would be attached to the wrong bars.
plt.figure(figsize=(14, 8))
bar_plot = sns.barplot(x='Topic', y='Sentiment', data=topic_sentiment_df,
                       order=topic_order, palette='RdYlGn_r')
plt.title('Average Sentiment Score by Topic', fontsize=15)
plt.xlabel('Topic ID and Key Terms')
plt.ylabel('Average Sentiment Score')
plt.xticks(range(len(topic_labels)), topic_labels, rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('topic_sentiment.png')
plt.show()

# Save key insights to CSV files
print("\nSaving analysis results to files...")

# Topic table and per-topic sentiment
topic_info.to_csv('topic_info.csv')
topic_sentiment_df.to_csv('topic_sentiment.csv')

# Collect up to 3 example reviews (highest VADER score first) for each of the
# first 15 topics listed in topic_info, skipping the outlier topic -1
top_topic_ids = [tid for tid in topic_info['Topic'][:15] if tid != -1]
topic_examples = []
for topic_id in top_topic_ids:
    topic_rows = sample_df[sample_df['topic'] == topic_id]
    examples = topic_rows.sort_values('vader_score', ascending=False).head(3)
    topic_examples.extend(
        {
            'Topic': topic_id,
            'Sentiment': row['vader_score'],
            'Rating': row['overall'],
            'Review': row['cleaned_review'][:200] + '...',  # First 200 chars
        }
        for _, row in examples.iterrows()
    )

# Save examples to CSV
pd.DataFrame(topic_examples).to_csv('topic_examples.csv')

print("\nAnalysis complete! Results saved to CSV files and visualizations saved as PNG files.")

-----------------------------------
Office Products Customer Feedback Analysis
-----------------------------------
--2025-09-22 14:47:07--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Office_Products_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 111685374 (107M) [application/x-gzip]
Saving to: ‘Office_Products_5.json.gz’


2025-09-22 14:47:12 (24.0 MB/s) - ‘Office_Products_5.json.gz’ saved [111685374/111685374]

gzip: Office_Products_5.json already exists; do you wish to overwrite (y or n)? 

In [None]:
# Install required PyTorch-related libraries
# NOTE(review): earlier cells of this notebook already import torch, so this unpinned
# install runs against a live session — presumably a no-op when torch is preinstalled;
# TODO confirm it is still needed.
!pip install torch torchvision torchaudio

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder

# 1. Encode labels (negative=0, neutral=1, positive=2)
# sentiment_label already holds the integers 0/1/2 returned by label_sentiment, so the
# encoder is expected to act as an identity mapping here — TODO confirm all three
# classes are present in balanced_df.
label_encoder = LabelEncoder()
balanced_df['label'] = label_encoder.fit_transform(balanced_df['sentiment_label'])

# 2. PyTorch Dataset
class ReviewDataset(Dataset):
    """Dataset pairing each embedding vector with its class label.

    Args:
        embeddings: Array-like of feature vectors; converted to a float32 tensor.
        labels: Array-like of class ids; converted to a long (int64) tensor.

    Raises:
        ValueError: If ``embeddings`` and ``labels`` differ in length.
    """

    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        # __len__ reports the label count, so a longer embeddings array would be
        # silently truncated and a shorter one would fail only at index time.
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"embeddings and labels must have the same length, "
                f"got {len(self.embeddings)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of (embedding, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` tensors at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

dataset = ReviewDataset(embeddings, balanced_df['label'].values)

# Split dataset 80/20 into train and validation parts.
# A dedicated seeded generator makes the split repeatable no matter how much of the
# global torch RNG stream earlier cells have already consumed.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Only the training loader shuffles; validation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# 3. Define the Variational Autoencoder
class VAE(nn.Module):
    """Variational autoencoder with one hidden layer on each side.

    The encoder maps an input vector to the mean and log-variance of a diagonal
    Gaussian over the latent space; the decoder maps a latent sample back to the
    input dimension.

    Args:
        input_dim: Size of each input (and reconstructed) vector.
        latent_dim: Size of the latent code.
    """

    def __init__(self, input_dim=384, latent_dim=64):
        super().__init__()
        hidden_dim = 256

        # Encoder: shared hidden layer followed by two heads (mean, log-variance)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder: mirror of the encoder
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc_mu(hidden), self.fc_logvar(hidden)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` standard normal noise."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map latent code ``z`` back to the input space (no output activation)."""
        hidden = torch.relu(self.fc3(z))
        return self.fc4(hidden)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for input ``x``."""
        mu, logvar = self.encode(x)
        latent_sample = self.reparameterize(mu, logvar)
        return self.decode(latent_sample), mu, logvar

# 4. VAE Loss
def vae_loss(recon_x, x, mu, logvar):
    """Return the summed VAE objective for one batch.

    The value is the sum-reduced squared error between ``recon_x`` and ``x``
    plus the KL term ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    reconstruction_term = F.mse_loss(recon_x, x, reduction='sum')
    kl_integrand = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_integrand)
    return reconstruction_term + kl_term

# 5. Train the VAE
vae = VAE()
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)

vae.train()
n_train_examples = len(train_loader.dataset)
for epoch_number in range(1, 11):
    total_loss = 0
    # Labels are ignored: the autoencoder is trained on the embeddings alone
    for inputs, _ in train_loader:
        optimizer.zero_grad()
        recon_x, mu, logvar = vae(inputs)
        loss = vae_loss(recon_x, inputs, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # vae_loss is sum-reduced, so dividing by the example count gives a per-example loss
    print(f"Epoch {epoch_number}: Loss = {total_loss/n_train_examples:.4f}")

# 6. Extract latent vectors for classification
vae.eval()
def extract_latents(dataloader):
    """Encode every batch of ``dataloader`` with the module-level ``vae``.

    Returns:
        A pair ``(latents, labels)``: the encoder means and the matching labels,
        each concatenated over all batches in iteration order.
    """
    latent_chunks, label_chunks = [], []
    with torch.no_grad():
        for features, targets in dataloader:
            # Use the posterior mean as the latent representation; log-variance is dropped
            latent_mean = vae.encode(features)[0]
            latent_chunks.append(latent_mean)
            label_chunks.append(targets)
    return torch.cat(latent_chunks), torch.cat(label_chunks)

# Latent features and labels for both splits
X_train_latent, y_train_latent = extract_latents(train_loader)
X_val_latent, y_val_latent = extract_latents(val_loader)

# 7. Train a simple classifier on latent vectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_train_latent.numpy(), y_train_latent.numpy())

# 8. Evaluate classifier on the validation latents
y_val_true = y_val_latent.numpy()
y_pred = clf.predict(X_val_latent.numpy())
accuracy = accuracy_score(y_val_true, y_pred)
f1 = f1_score(y_val_true, y_pred, average='weighted')

print("\nVAE + Logistic Regression Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

class_names = ['Negative', 'Neutral', 'Positive']
print("\nClassification Report:")
print(classification_report(y_val_true, y_pred, target_names=class_names))

# Confusion Matrix (rows: actual class, columns: predicted class)
vae_cm_fig, vae_cm_ax = plt.subplots(figsize=(8, 6))
conf_matrix = confusion_matrix(y_val_true, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=class_names, yticklabels=class_names, ax=vae_cm_ax)
vae_cm_ax.set_title('Confusion Matrix (VAE)', fontsize=15)
vae_cm_ax.set_xlabel('Predicted')
vae_cm_ax.set_ylabel('Actual')
vae_cm_fig.tight_layout()
vae_cm_fig.savefig('vae_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print("\nGenerating TF-IDF vectors and applying LSA...")

# Step 1: Pick the train/test rows BEFORE fitting any feature extractor, so the TF-IDF
# vocabulary/IDF weights and the SVD basis are learned from training reviews only.
# Splitting row positions yields the same partition as splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(balanced_df)),
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=balanced_df['sentiment_label']
)

# Step 2: TF-IDF vectorization (fit on training rows, then transform every row)
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=5000,
    stop_words='english'
)
tfidf_vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X_tfidf = tfidf_vectorizer.transform(balanced_df['cleaned_review'])

# Step 3: Apply LSA using TruncatedSVD (fit on training rows, then project every row)
n_components = 100  # Latent dimensions
svd = TruncatedSVD(n_components=n_components, random_state=RANDOM_SEED)
svd.fit(X_tfidf[train_idx])
X_lsa = svd.transform(X_tfidf)

print(f"LSA reduced TF-IDF shape: {X_lsa.shape}")

# Step 4: Materialize the train/test matrices from the row positions chosen above
lsa_labels = balanced_df['sentiment_label'].values
X_train, X_test = X_lsa[train_idx], X_lsa[test_idx]
y_train, y_test = lsa_labels[train_idx], lsa_labels[test_idx]

# Step 5: Train classifier on LSA-reduced features
print("\nTraining Logistic Regression on LSA features...")
clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

# Step 6: Evaluate model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nLSA + Logistic Regression Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Step 7: Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('Confusion Matrix (LSA)', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('lsa_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import seaborn as sns

print("\nRunning LDA for topic-based sentiment classification...")

# Pick the train/test rows BEFORE fitting the vectorizer or the topic model, so neither
# learns from test reviews. Splitting row positions yields the same partition as
# splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(balanced_df)),
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=balanced_df['sentiment_label']
)

# Vectorize using CountVectorizer (LDA works on raw counts, not TF-IDF)
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=5,
    max_features=3000,
    stop_words='english'
)
vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X_counts = vectorizer.transform(balanced_df['cleaned_review'])

# Apply LDA (topics learned from training rows, then inferred for every row)
n_topics = 30
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    learning_method='online',
    random_state=RANDOM_SEED,
    evaluate_every=-1,
    n_jobs=-1
)
lda_model.fit(X_counts[train_idx])
X_topics = lda_model.transform(X_counts)
print(f"LDA topic distribution shape: {X_topics.shape}")

# Materialize the train/test matrices from the row positions chosen above
lda_labels = balanced_df['sentiment_label'].values
X_train, X_test = X_topics[train_idx], X_topics[test_idx]
y_train, y_test = lda_labels[train_idx], lda_labels[test_idx]

# Train a classifier on LDA features
print("\nTraining classifier on LDA topic distributions...")
clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nLDA + Logistic Regression Results:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Purples',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('LDA + Logistic Regression Confusion Matrix', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('lda_confusion_matrix.png')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from nltk.corpus import stopwords
import nltk

# Download stopwords if not already done
nltk.download('stopwords')

# Seed for this tuned LDA experiment only. It deliberately does NOT rebind the
# notebook-wide RANDOM_SEED: later cells read that name for their own splits and
# models, and their results must not depend on whether this cell ran first.
LDA_TUNED_SEED = 100

# ============================
# Load your dataset
# ============================

# Ensure your dataset is loaded properly
# Must contain 'cleaned_review' and 'sentiment_label'
# Example:
# balanced_df = pd.read_csv('your_dataset.csv')
# assert 'cleaned_review' in balanced_df.columns
# assert 'sentiment_label' in balanced_df.columns

# ============================
# LDA + Logistic Regression
# ============================

print("\nRunning LDA for topic-based sentiment classification...")

# Pick the train/test rows BEFORE fitting the vectorizer or the topic model, so neither
# learns from test reviews. Splitting row positions yields the same partition as
# splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(balanced_df)),
    test_size=0.5,
    random_state=LDA_TUNED_SEED,
    stratify=balanced_df['sentiment_label']
)

# Vectorize using CountVectorizer (raw counts)
vectorizer = CountVectorizer(
    max_df=0.90,
    min_df=3,
    max_features=500,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2)  # include bigrams
)
vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X_counts = vectorizer.transform(balanced_df['cleaned_review'])

# Fit LDA (topics learned from training rows, then inferred for every row)
n_topics = 25  # Lower topic count for better generalization
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,  # Increased iterations
    learning_method='online',
    learning_decay=0.7,
    evaluate_every=1,
    random_state=LDA_TUNED_SEED,
    n_jobs=-1
)
lda_model.fit(X_counts[train_idx])
X_topics = lda_model.transform(X_counts)
print(f"LDA topic distribution shape: {X_topics.shape}")

# Materialize the train/test matrices from the row positions chosen above
lda_labels = balanced_df['sentiment_label'].values
X_train, X_test = X_topics[train_idx], X_topics[test_idx]
y_train, y_test = lda_labels[train_idx], lda_labels[test_idx]

# Train classifier
print("\nTraining Logistic Regression on LDA topic distributions...")
clf = LogisticRegression(
    max_iter=100,
    class_weight='balanced',
    solver='lbfgs',
    random_state=LDA_TUNED_SEED,
    C=2.0  # C is the INVERSE regularization strength: 2.0 regularizes less than the default 1.0
)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nLDA + Logistic Regression Results:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Neutral', 'Positive']
))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Purples',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('LDA + Logistic Regression Confusion Matrix', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Distinct file name: the earlier LDA cell already writes 'lda_confusion_matrix.png'
plt.savefig('lda_tuned_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("\nVectorizing text with TF-IDF for Naive Bayes...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Pick the train/test rows BEFORE fitting the vectorizer, so its vocabulary and IDF
# weights are learned from training reviews only. Splitting row positions yields the
# same partition as splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(y)),
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED
)

vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X = vectorizer.transform(balanced_df['cleaned_review'])

# Train/test matrices from the row positions chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Train Naive Bayes model
print("\nTraining Multinomial Naive Bayes model...")
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nMultinomial Naive Bayes Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('Confusion Matrix - Naive Bayes', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('naive_bayes_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

print("\nVectorizing text with TF-IDF for SVM...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=7000,
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Pick the train/test rows BEFORE fitting the vectorizer, so its vocabulary and IDF
# weights are learned from training reviews only. Splitting row positions yields the
# same partition as splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(y)),
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED
)

vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X = vectorizer.transform(balanced_df['cleaned_review'])

# Train/test matrices from the row positions chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Train the SVM
print("\nTraining LinearSVC for sentiment classification...")
# random_state is set like every other model in this notebook, so repeated runs
# give the same fitted model.
svm_model = LinearSVC(C=1.0, class_weight='balanced', max_iter=5000, random_state=RANDOM_SEED)
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nLinear SVM Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('Confusion Matrix - Linear SVM', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('svm_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

print("\nVectorizing text using TF-IDF for Random Forest...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    min_df=5,
    max_df=0.85,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Pick the train/test rows BEFORE fitting the vectorizer, so its vocabulary and IDF
# weights are learned from training reviews only. Splitting row positions yields the
# same partition as splitting the feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(y)),
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED
)

vectorizer.fit(balanced_df['cleaned_review'].iloc[train_idx])
X = vectorizer.transform(balanced_df['cleaned_review'])

# Train/test matrices from the row positions chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("\nTraining Random Forest Classifier...")
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=50,
    class_weight='balanced',
    random_state=RANDOM_SEED,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nRandom Forest Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='BuGn',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('Confusion Matrix - Random Forest', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('rf_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Random Forest on TF-IDF features (larger vocabulary, deeper trees, 10% hold-out).
# Reads `balanced_df` and `RANDOM_SEED` from earlier cells.
print("\nVectorizing text using TF-IDF for Random Forest...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=100000,
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let held-out reviews influence the vocabulary and IDF weights
# (data leakage), which inflates the reported test metrics.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['cleaned_review'], y,
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED
)

# Learn vocabulary / IDF on the training text only, then reuse it for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` used to be a notebook-level name; keep it available (full corpus in the
# train-fitted feature space) in case a later cell still refers to it.
X = vectorizer.transform(balanced_df['cleaned_review'])

print("\nTraining Random Forest Classifier...")
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=500,
    class_weight='balanced',
    random_state=RANDOM_SEED,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): names are applied to the sorted label values, so this assumes
# sentiment_label encodes Negative < Neutral < Positive -- confirm against the labeling cell.
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='BuGn',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - Random Forest', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig('rf_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Stacked ensemble (NB + RF + LinearSVC -> LogisticRegression) on TF-IDF features.
# Reads `balanced_df` and `RANDOM_SEED` from earlier cells.

# Vectorizing with TF-IDF
print("\nVectorizing text with TF-IDF for Stacked Ensemble...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=7000,
    min_df=5,
    max_df=0.85,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let held-out reviews influence the vocabulary and IDF weights
# (data leakage), which inflates the reported test metrics.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['cleaned_review'],
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_SEED
)

# Learn vocabulary / IDF on the training text only, then reuse it for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` used to be a notebook-level name; keep it available (full corpus in the
# train-fitted feature space) in case a later cell still refers to it.
X = vectorizer.transform(balanced_df['cleaned_review'])

# Define base learners
base_learners = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)),
    ('svm', LinearSVC(C=1.0, class_weight='balanced', max_iter=5000))
]

# Define meta learner (trained on the base learners' cross-validated predictions)
meta_learner = LogisticRegression(max_iter=1000)

# Build stacked ensemble
stacked_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1
)

# Train model
print("\nTraining Stacked Ensemble for sentiment classification...")
stacked_model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nStacked Ensemble Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): names are applied to the sorted label values, so this assumes
# sentiment_label encodes Negative < Neutral < Positive -- confirm against the labeling cell.
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - Stacked Ensemble', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig('stacked_confusion_matrix.png')
plt.show()



In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Stacked ensemble (NB + RF + LinearSVC -> LogisticRegression) on TF-IDF features,
# variant with a 9000-term vocabulary and a 10% hold-out.
# Reads `balanced_df` and `RANDOM_SEED` from earlier cells.

# Vectorizing with TF-IDF
print("\nVectorizing text with TF-IDF for Stacked Ensemble...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=9000,
    min_df=5,
    max_df=0.95,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let held-out reviews influence the vocabulary and IDF weights
# (data leakage), which inflates the reported test metrics.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['cleaned_review'],
    y,
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED
)

# Learn vocabulary / IDF on the training text only, then reuse it for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` used to be a notebook-level name; keep it available (full corpus in the
# train-fitted feature space) in case a later cell still refers to it.
X = vectorizer.transform(balanced_df['cleaned_review'])

# Define base learners
base_learners = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)),
    ('svm', LinearSVC(C=1.0, class_weight='balanced', max_iter=5000))
]

# Define meta learner (trained on the base learners' cross-validated predictions)
meta_learner = LogisticRegression(max_iter=1000)

# Build stacked ensemble
stacked_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1
)

# Train model
print("\nTraining Stacked Ensemble for sentiment classification...")
stacked_model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nStacked Ensemble Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): names are applied to the sorted label values, so this assumes
# sentiment_label encodes Negative < Neutral < Positive -- confirm against the labeling cell.
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - Stacked Ensemble', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig('stacked_confusion_matrix.png')
plt.show()



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt



# XGBoost on TF-IDF features (baseline variant).
# Reads `balanced_df` and `RANDOM_SEED` from earlier cells.
print("\nVectorizing text with TF-IDF for XGBoost...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=8000,
    min_df=5,
    max_df=0.85,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let held-out reviews influence the vocabulary and IDF weights
# (data leakage), which inflates the reported test metrics.
# NOTE(review): test_size=0.5 holds out half of the data -- confirm this is intended.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['cleaned_review'],
    y,
    test_size=0.5,
    stratify=y,
    random_state=RANDOM_SEED
)

# Learn vocabulary / IDF on the training text only, then reuse it for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` used to be a notebook-level name; keep it available (full corpus in the
# train-fitted feature space) in case a later cell still refers to it.
X = vectorizer.transform(balanced_df['cleaned_review'])

# Train XGBoost model
print("\nTraining XGBoost for sentiment classification...")
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases and may
# only trigger a warning there -- confirm against the installed version before removing.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,  # Change if you have a different number of classes
    eval_metric='mlogloss',
    use_label_encoder=False,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nXGBoost Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): names are applied to the sorted label values, so this assumes
# sentiment_label encodes Negative < Neutral < Positive -- confirm against the labeling cell.
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - XGBoost', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig('xgboost_confusion_matrix.png')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt



# XGBoost on TF-IDF features (larger vocabulary, deeper trees, more boosting rounds).
# Reads `balanced_df` and `RANDOM_SEED` from earlier cells.
print("\nVectorizing text with TF-IDF for XGBoost...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=80000,
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

y = balanced_df['sentiment_label'].values

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let held-out reviews influence the vocabulary and IDF weights
# (data leakage), which inflates the reported test metrics.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['cleaned_review'],
    y,
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED
)

# Learn vocabulary / IDF on the training text only, then reuse it for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` used to be a notebook-level name; keep it available (full corpus in the
# train-fitted feature space) in case a later cell still refers to it.
X = vectorizer.transform(balanced_df['cleaned_review'])

# Train XGBoost model
print("\nTraining XGBoost for sentiment classification...")
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases and may
# only trigger a warning there -- confirm against the installed version before removing.
# NOTE(review): 1000 rounds at learning_rate=0.3 with no early stopping -- confirm
# this is deliberate rather than a candidate for an eval_set + early stopping.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,  # Change if you have a different number of classes
    eval_metric='mlogloss',
    use_label_encoder=False,
    max_depth=9,
    learning_rate=0.3,
    n_estimators=1000,
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nXGBoost Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): names are applied to the sorted label values, so this assumes
# sentiment_label encodes Negative < Neutral < Positive -- confirm against the labeling cell.
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - XGBoost', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig('xgboost_confusion_matrix.png')
plt.show()
