In [1]:
# Sentiment and Topic Analysis for Product Reviews (No gensim)

# ------------------------------
# 1. INSTALLATION (Only needed in Colab or first-time setup)
# Uncomment below lines if running in Colab or a new environment

# 
# 
# %pip install pandas numpy matplotlib seaborn wordcloud nltk spacy scikit-learn textblob beautifulsoup4 contractions emoji pyLDAvis plotly

# ------------------------------
# 2. IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
import spacy
import re
import string
import contractions
import emoji
import os

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD


In [7]:
import nltk

# Run these once to download the required data
nltk.download('punkt')         # for word_tokenize
nltk.download('stopwords')     # for stop words
nltk.download('wordnet')       # for lemmatization
nltk.download('omw-1.4')       # required for WordNet in newer versions


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...


True

In [3]:
# 4. SAMPLE DATA (Can be replaced with your own CSV)
data = {
    'review': [
        "I love this phone, battery lasts forever!",
        "Terrible service. Not happy at all.",
        "Okay product. Does the job, nothing fancy.",
        "The quality is amazing and the camera is great!",
        "Stopped working after a week. Waste of money."
    ]
}
df = pd.read_csv("D:/internship/sentiment_analysis/Data/product_reviews_mock_data.csv")

In [4]:
df.head()

Unnamed: 0,ReviewID,ProductID,UserID,Rating,ReviewText,ReviewDate
0,REV2000,Product_E,User_114,4,fantastic. wonderful experience.,2023-04-17
1,REV2001,Product_C,User_186,2,broke easily. awful.,2023-11-27
2,REV2002,Product_E,User_101,3,met expectations. five stars.,2023-12-10
3,REV2003,Product_A,User_175,5,very satisfied. wonderful experience.,2023-11-10
4,REV2004,Product_C,User_158,1,worst purchase. one star.,2024-05-25


In [5]:
df.columns

Index(['ReviewID', 'ProductID', 'UserID', 'Rating', 'ReviewText',
       'ReviewDate'],
      dtype='object')

In [8]:
# 5. TEXT CLEANING FUNCTION
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# This line works now after NLTK downloads
df['clean_review'] = df['ReviewText'].apply(clean_text)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\OceanComputers/nltk_data'
    - 'c:\\Users\\OceanComputers\\miniconda3\\envs\\sentiment_env\\nltk_data'
    - 'c:\\Users\\OceanComputers\\miniconda3\\envs\\sentiment_env\\share\\nltk_data'
    - 'c:\\Users\\OceanComputers\\miniconda3\\envs\\sentiment_env\\lib\\nltk_data'
    - 'C:\\Users\\OceanComputers\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# 6. SENTIMENT ANALYSIS (VADER & TextBlob)
vader = SentimentIntensityAnalyzer()
df['vader_score'] = df['review'].apply(lambda x: vader.polarity_scores(x)['compound'])
df['vader_sentiment'] = df['vader_score'].apply(lambda x: 'positive' if x > 0.05 else 'negative' if x < -0.05 else 'neutral')

df['textblob_polarity'] = df['review'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['textblob_sentiment'] = df['textblob_polarity'].apply(lambda x: 'positive' if x > 0.05 else 'negative' if x < -0.05 else 'neutral')


In [None]:
# 7. TOPIC MODELING ON NEGATIVE REVIEWS
negative_reviews = df[df['vader_sentiment'] == 'negative']['clean_review']

# --- TF-IDF Vectorization ---
tfidf = TfidfVectorizer(max_df=0.9, min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(negative_reviews)

# --- NMF for Topic Modeling ---
nmf = NMF(n_components=2, random_state=42)
nmf_topics = nmf.fit_transform(tfidf_matrix)

words = tfidf.get_feature_names_out()
nmf_topic_words_list = []
print("\n--- NMF Topics ---")
for topic_idx, topic in enumerate(nmf.components_):
    top_words = [words[i] for i in topic.argsort()[:-6:-1]]
    print(f"NMF Topic #{topic_idx + 1}: {', '.join(top_words)}")
    nmf_topic_words_list.append({"topic": f"NMF Topic {topic_idx + 1}", **{words[i]: topic[i] for i in topic.argsort()[:-6:-1]}})

    # WordCloud
    topic_words = {words[i]: topic[i] for i in topic.argsort()[:-11:-1]}
    wc = WordCloud(width=600, height=300, background_color='white').generate_from_frequencies(topic_words)
    plt.figure(figsize=(6,3))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'NMF Topic #{topic_idx + 1}')
    plt.savefig(f'nmf_topic_{topic_idx + 1}.png')
    plt.show()

# --- LSA using TruncatedSVD ---
lsa = TruncatedSVD(n_components=2, random_state=42)
lsa.fit(tfidf_matrix)

lsa_topic_words_list = []
print("\n--- LSA Topics ---")
for topic_idx, topic in enumerate(lsa.components_):
    top_words = [words[i] for i in topic.argsort()[:-6:-1]]
    print(f"LSA Topic #{topic_idx + 1}: {', '.join(top_words)}")
    lsa_topic_words_list.append({"topic": f"LSA Topic {topic_idx + 1}", **{words[i]: topic[i] for i in topic.argsort()[:-6:-1]}})

    # Bar Plot
    topic_words = {words[i]: topic[i] for i in topic.argsort()[:-6:-1]}
    plt.figure(figsize=(6,3))
    sns.barplot(x=list(topic_words.values()), y=list(topic_words.keys()), palette='viridis')
    plt.title(f'LSA Topic #{topic_idx + 1}')
    plt.xlabel('Weight')
    plt.ylabel('Word')
    plt.tight_layout()
    plt.savefig(f'lsa_topic_{topic_idx + 1}.png')
    plt.show()


In [None]:
# 8. EXPORT TOPICS TO CSV
pd.DataFrame(nmf_topic_words_list).to_csv("nmf_topics.csv", index=False)
pd.DataFrame(lsa_topic_words_list).to_csv("lsa_topics.csv", index=False)

In [None]:
# 9. OUTPUT DATA
print("\n\n--- Final DataFrame with Sentiment ---")
print(df[['review', 'vader_sentiment', 'textblob_sentiment']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\OceanComputers\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


ValueError: DataFrame constructor not properly called!