<a href="https://colab.research.google.com/github/GurionRamapoguSajeevan/GenAI-customer-review-sentiment-engine/blob/main/GenAI_sentiment_review_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# STEP 1: LIBRARIES AND INITIAL SET UP

## Installing required Libraries

In [None]:
!pip install pandas nltk spacy scikit-learn transformers matplotlib seaborn streamlit
!python -m spacy download en_core_web_sm  # For English NLP preprocessing

## Importing necessary libraries

In [None]:
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import spacy
import streamlit as st
import torch
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stopword lists).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Load the small English spaCy pipeline installed by the setup cell.
nlp = spacy.load('en_core_web_sm')

# STEP 2: Loading and Understanding the Data

In [None]:
# Location of the raw review export inside the Colab session storage.
raw_reviews_path = '/content/amazon_review.csv'
df = pd.read_csv(raw_reviews_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summarize columns, dtypes, and non-null counts to spot missing values.
df.info()

### Relevant columns: We'll focus on reviewText (for text analysis), overall (rating, as a proxy for sentiment validation), and asin (product ID, for filtering in the dashboard).

In [None]:
# Columns kept for the rest of the notebook: review text, star rating, product ID.
selected_columns = ['reviewText', 'overall', 'asin']
df = df[selected_columns]
df.head()

# Step 3: Cleaning and Preprocessing the Text

In [None]:
# The preprocessing function defined later in this step handles lowercasing, tokenization, stopword removal, and lemmatization: standard NLP steps that prepare the text for topic modeling.

### One extra download is needed: recent NLTK versions require the 'punkt_tab' tokenizer data for `word_tokenize`.

In [None]:
# Fetch the 'punkt_tab' resource through NLTK's downloader.
nltk.download('punkt_tab')

#### 1. First, drop any rows where the actual reviewText is missing


In [None]:
# Keep only rows that have a value in 'reviewText'; df is rebound to the filtered frame.
df = df.dropna(subset=['reviewText'])

#### 2. Define Preprocessing for LDA (Topic Modeling) ONLY

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stopwords held in a set so membership checks during preprocessing are fast.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def preprocess_text_for_topics(text):
    """Normalize one review into a lemmatized, stopword-free string for topic modeling.

    The text is lowercased, tokenized with NLTK, stripped of stopwords and
    non-alphanumeric tokens, then lemmatized with the spaCy pipeline `nlp`.
    This aggressive cleaning is meant for topic modeling only; sentiment
    analysis should run on the raw text.

    Args:
        text: A single review value. Missing values (NaN/None) yield ''.
            Non-string values are converted with str() before processing.

    Returns:
        A space-joined string of lemmas, or '' when nothing survives cleaning.
    """
    if pd.isna(text):
        return ''
    # Coerce non-string cells the same way the sentiment step does with
    # .astype(str), instead of failing on .lower().
    text = str(text).lower()
    tokens = word_tokenize(text)
    # Remove stopwords and non-alphanumeric (CRITICAL: Only do this for Topic Modeling, not Sentiment!)
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    if not tokens:
        # Nothing left to lemmatize; skip the spaCy call entirely.
        return ''
    doc = nlp(' '.join(tokens))
    return ' '.join(token.lemma_ for token in doc)


#### 3. Applying the Preprocessing Function to the reviewText column:

In [None]:
# Build the topic-modeling text column by cleaning every raw review element-wise.
df['cleaned_review'] = df['reviewText'].map(preprocess_text_for_topics)

#### 4. Filtering out rows that became empty after cleaning (so LDA doesn't crash), but keeping the index aligned for the raw text.


In [None]:
# Rows whose cleaned text is empty carry no words for LDA, so drop them.
# .copy() gives an independent frame for the column assignments that follow.
has_cleaned_text = df['cleaned_review'].str.len() > 0
df = df[has_cleaned_text].copy()

print("Data ready. 'reviewText' will be used for AI, 'cleaned_review' for LDA.")
df.head()

# Step 4: Extract Insights Using NLP and AI Models

####  1. Sentiment Analysis (using a multilingual BERT model that predicts 1–5 star ratings)

In [None]:

# NEW MODEL: Uses a 5-star rating system which we map to 3 classes (Neg/Neu/Pos).

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# (device=-1) so this cell also runs on CPU-only runtimes.
sentiment_device = 0 if torch.cuda.is_available() else -1

sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    device=sentiment_device
)

# a. Prepare raw text list (the uncleaned 'reviewText', coerced to str)
raw_reviews = df['reviewText'].astype(str).tolist()

# b. Run inference in batches; truncation=True lets the tokenizer cut reviews
#    that are longer than the model accepts.
sentiment_results = sentiment_pipeline(raw_reviews, truncation=True, batch_size=16)

# c. Define the mapping function to convert 5-star to 3-class label
def map_star_to_sentiment(label):
    """Convert a star-rating label such as '4 stars' into a 3-class sentiment.

    Args:
        label: Label string whose first whitespace-separated token is the
            star count, e.g. '1 star' or '5 stars'. Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        'NEGATIVE' for 2 stars or fewer, 'NEUTRAL' for exactly 3 stars and
        'POSITIVE' for 4 stars or more. Anything that cannot be read as a
        star rating (non-string input, no 'star' word, non-numeric count)
        falls back to 'NEUTRAL' instead of raising.
    """
    if not isinstance(label, str):
        return 'NEUTRAL'  # Default fallback

    normalized = label.strip().lower()
    tokens = normalized.split()
    if 'star' not in normalized or not tokens:
        return 'NEUTRAL'  # Default fallback

    try:
        star_num = int(tokens[0])
    except ValueError:
        # e.g. 'stars' with no leading number: treat as unknown, not an error.
        return 'NEUTRAL'

    if star_num <= 2:
        return 'NEGATIVE'
    if star_num == 3:
        return 'NEUTRAL'
    return 'POSITIVE'

# d. Pull the predicted label out of each result, then map it to a 3-class sentiment
predicted_labels = (result['label'] for result in sentiment_results)
df['sentiment'] = list(map(map_star_to_sentiment, predicted_labels))


In [None]:
# e. Visualization check: number of reviews per mapped sentiment class
fig, ax = plt.subplots()
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title("Sentiment Distribution (3-Class)")
plt.show()

In [None]:
# Spot-check the first ten reviews to confirm that texts with negative wording
# are not being labelled POSITIVE.
sentiment_sample = df[['reviewText', 'sentiment']].head(10)
print(sentiment_sample)

#### 2. Themes/Topics (using traditional LDA for simplicity and to show hybrid skills—AI + classic ML):

In [None]:
N_THEMES = 5              # Number of LDA topics (main themes) to extract
TOP_WORDS_PER_THEME = 10  # Number of words printed to describe each theme

# Bag-of-words counts over the cleaned reviews; terms that appear in more than
# 95% of documents or in fewer than 2 documents are ignored.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['cleaned_review'])

lda = LatentDirichletAllocation(n_components=N_THEMES, random_state=42)
lda.fit(dtm)

# Display top words per theme (print in notebook).
# The vocabulary is looked up once instead of once per printed word.
feature_names = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # argsort is ascending, so the last entries are the highest-weighted words
    # (printed from lower to higher weight).
    top_indices = topic.argsort()[-TOP_WORDS_PER_THEME:]
    print(f"Theme {i}: {' '.join([feature_names[index] for index in top_indices])}")

# Assign dominant theme to each review
df['theme'] = lda.transform(dtm).argmax(axis=1)

#### 3. Pain Points and Suggestions (using zero-shot classification with BART, another GPU-accelerated Hugging Face model):

In [None]:
# Load Model
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# (device=-1) so this cell also runs on CPU-only runtimes.
zero_shot_device = 0 if torch.cuda.is_available() else -1

zero_shot_pipeline = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=zero_shot_device
)

pain_labels = ['quality issue', 'delivery problem', 'price complaint', 'usability issue', 'no pain']
suggestion_labels = ['improve durability', 'better packaging', 'add features', 'lower price', 'no suggestion']

# BART is larger, so we use a smaller batch size to avoid Out of Memory
ZERO_SHOT_BATCH_SIZE = 8

# Prepare raw text list (the uncleaned 'reviewText', coerced to str)
raw_reviews = df['reviewText'].astype(str).tolist()

print("Processing Pain Points... (This may take a moment)")

# Batch process Pain Points
pain_results = zero_shot_pipeline(
    raw_reviews,
    candidate_labels=pain_labels,
    batch_size=ZERO_SHOT_BATCH_SIZE,
    truncation=True
)
# Keep the first entry of each result's 'labels' list as the assigned pain point.
df['pain_point'] = [result['labels'][0] for result in pain_results]

print("Processing Suggestions...")
# Batch process Suggestions
suggestion_results = zero_shot_pipeline(
    raw_reviews,
    candidate_labels=suggestion_labels,
    batch_size=ZERO_SHOT_BATCH_SIZE,
    truncation=True
)
# Keep the first entry of each result's 'labels' list as the assigned suggestion.
df['suggestion'] = [result['labels'][0] for result in suggestion_results]

# Step 5: Saving the Final Processed Reviews Dataset

In [None]:
# Write the enriched frame to CSV without the index column.
processed_reviews_path = '/content/processed_reviews.csv'
df.to_csv(processed_reviews_path, index=False)
print("Processing complete. File saved.")