<a href="https://colab.research.google.com/github/GurionRamapoguSajeevan/GenAI-customer-review-sentiment-engine/blob/main/GenAI_sentiment_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Libraries and Initial Setup

## Installing required Libraries

In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install pandas nltk spacy scikit-learn transformers matplotlib seaborn streamlit
!python -m spacy download en_core_web_sm  # For English NLP preprocessing

## Importing necessary libraries

In [None]:
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import spacy
import streamlit as st
import torch
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
# Fetch the NLTK resources used later for tokenization and stopword filtering.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Small English spaCy pipeline; used for lemmatization in preprocess_text.
nlp = spacy.load('en_core_web_sm')

# Step 2: Loading the Dataset

In [None]:
# Location of the raw Amazon review export.
# NOTE(review): this is an absolute path at the filesystem root — confirm it
# matches where the file is uploaded in your runtime.
REVIEWS_CSV = '/amazon_review.csv'
df = pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Print column names, dtypes and non-null counts to spot missing values.
df.info()

### Relevant columns: we focus on `reviewText` (the text to analyze), `overall` (the star rating, used as a proxy to validate sentiment), and `asin` (the product ID, used for filtering in the dashboard).

In [None]:
# Keep only the review text, the star rating and the product ID.
RELEVANT_COLUMNS = ['reviewText', 'overall', 'asin']
df = df[RELEVANT_COLUMNS]
df.head()

# Step 3: Cleaning and Preprocessing the Text

In [None]:
# Define a preprocessing function that lowercases, tokenizes, removes stopwords, and lemmatizes the text (standard NLP steps that prepare text for modeling):

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Build the English stopword list once, as a set, so the per-token membership
# test in preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps: lowercase, tokenize with NLTK, keep only alphanumeric tokens that
    are not in ``stop_words``, then lemmatize the survivors with spaCy.

    Args:
        text: Raw review text. Missing values (``None``/NaN) are accepted, and
            non-string values (e.g. a review that pandas parsed as a number)
            are converted with ``str()`` instead of raising.

    Returns:
        The cleaned review as a single string; ``''`` when the input is
        missing or when no token survives filtering.
    """
    if pd.isna(text):
        return ''
    lowered = str(text).lower()  # str() guards against numeric cells
    tokens = [
        word for word in word_tokenize(lowered)
        if word.isalnum() and word not in stop_words
    ]
    if not tokens:
        # Nothing left to lemmatize; skip the spaCy call entirely.
        return ''
    doc = nlp(' '.join(tokens))  # Lemmatize with spaCy
    return ' '.join(token.lemma_ for token in doc)

### The `punkt_tab` resource is also required: recent NLTK versions load the Punkt tokenizer models used by `word_tokenize` from `punkt_tab` rather than from the older `punkt` package.

In [None]:
# Fetch the 'punkt_tab' tokenizer resource; it has to be available before the
# cell that applies preprocess_text (and therefore word_tokenize) is run.
nltk.download('punkt_tab')

In [None]:
# Applying the Preprocessing Function to the reviewText column:

In [None]:
# Clean every review and store the result next to the original text.
df['cleaned_review'] = [preprocess_text(review) for review in df['reviewText']]
df.head()

In [None]:
# Dropping empty reviews if any

In [None]:
# preprocess_text returns '' (never NaN) for missing or fully filtered reviews,
# so dropna() alone would keep them; filter on blank strings instead.
# .copy() yields an independent frame for the column assignments that follow.
has_content = df['cleaned_review'].fillna('').str.strip() != ''
df = df[has_content].copy()

# Step 4: Extract Insights Using NLP and AI Models

###  1. Sentiment Analysis (using a lightweight DistilBERT model)

In [None]:
# Use the first CUDA GPU when one is visible; otherwise fall back to CPU
# (device=-1) so the notebook still runs on CPU-only runtimes.
SENTIMENT_DEVICE = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=SENTIMENT_DEVICE,
)

In [None]:
# Score the raw review text rather than `cleaned_review`: preprocess_text drops
# stopwords (NLTK's English list includes negations such as "not" and "no") and
# non-alphanumeric tokens such as "n't", so "not good" would reach the model as
# "good". The cleaned column remains the input for the bag-of-words topic model.
raw_reviews = df['reviewText'].fillna('').astype(str)
has_text = raw_reviews.str.strip() != ''

# Rows without any text keep the 'NEUTRAL' placeholder.
sentiment_labels = pd.Series('NEUTRAL', index=df.index, dtype=object)
if has_text.any():
    # One batched call instead of one pipeline call per row keeps the GPU busy;
    # truncation caps every review at the model's 512-token limit.
    predictions = sentiment_pipeline(
        raw_reviews[has_text].tolist(),
        truncation=True,
        max_length=512,
        batch_size=32,
    )
    sentiment_labels.loc[has_text] = [prediction['label'] for prediction in predictions]
df['sentiment'] = sentiment_labels

In [None]:
# Inspect the first 20 rows, now including the new 'sentiment' column.
df.head(20)

In [None]:
# Draw on an explicit Axes and label it so the figure is readable on its own.
fig, ax = plt.subplots()
sns.countplot(x='sentiment', data=df, ax=ax)
ax.set(
    title='Review sentiment distribution',
    xlabel='Predicted sentiment',
    ylabel='Number of reviews',
)
plt.show()  # Quick viz in notebook

### 2. Themes/Topics (using traditional LDA for simplicity and to show hybrid skills—AI + classic ML):

In [None]:
N_THEMES = 5       # number of LDA topics to extract
N_TOP_WORDS = 10   # words printed per theme

# Bag-of-words counts; terms in more than 95% of reviews or in fewer than
# 2 reviews are ignored, as are scikit-learn's built-in English stopwords.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['cleaned_review'])

# Fixed random_state keeps the extracted themes reproducible between runs.
lda = LatentDirichletAllocation(n_components=N_THEMES, random_state=42)
lda.fit(dtm)

# Display top words per theme (print in notebook).
# The vocabulary is looked up once instead of once per printed word, and the
# words are listed from highest to lowest topic weight.
feature_names = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    top_indices = topic.argsort()[::-1][:N_TOP_WORDS]
    print(f"Theme {i}: {' '.join(feature_names[top_indices])}")

# Assign dominant theme to each review
df['theme'] = lda.transform(dtm).argmax(axis=1)

### 3. Pain Points and Suggestions (using zero-shot classification with BART—another HuggingFace model, GPU-accelerated):

In [None]:
# Use the first CUDA GPU when one is visible; otherwise fall back to CPU
# (device=-1) so the notebook still runs on CPU-only runtimes.
ZERO_SHOT_DEVICE = 0 if torch.cuda.is_available() else -1
zero_shot_pipeline = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=ZERO_SHOT_DEVICE,
)

# Candidate labels offered to the zero-shot classifier. Each list ends with an
# explicit "nothing to report" option.
pain_labels = ['quality issue', 'delivery problem', 'price complaint', 'usability issue', 'no pain']
suggestion_labels = ['improve durability', 'better packaging', 'add features', 'lower price', 'no suggestion']

def _top_zero_shot_label(text, candidate_labels):
    """Return the first label the zero-shot pipeline reports for ``text``.

    Returns ``'none'`` without calling the model when ``text`` is not a string
    (e.g. ``None`` or NaN) or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return 'none'
    result = zero_shot_pipeline(text, candidate_labels=candidate_labels)
    # The pipeline returns its labels ranked by score, best match first.
    return result['labels'][0]


def extract_pain_point(text):
    """Classify ``text`` against ``pain_labels``.

    Args:
        text: Review text to classify.

    Returns:
        The top-ranked entry of ``pain_labels``, or ``'none'`` for empty,
        whitespace-only or non-string input.
    """
    return _top_zero_shot_label(text, pain_labels)


def extract_suggestion(text):
    """Classify ``text`` against ``suggestion_labels``.

    Args:
        text: Review text to classify.

    Returns:
        The top-ranked entry of ``suggestion_labels``, or ``'none'`` for empty,
        whitespace-only or non-string input.
    """
    return _top_zero_shot_label(text, suggestion_labels)

# Classify the raw review text rather than `cleaned_review`: the zero-shot model
# works on natural-language sentences, and the cleaned column has had stopwords
# (including negations) and punctuation stripped by preprocess_text.
# Missing reviews become '' so the extractors return 'none' for them.
raw_reviews = df['reviewText'].fillna('').astype(str).str.strip()
df['pain_point'] = raw_reviews.apply(extract_pain_point)
df['suggestion'] = raw_reviews.apply(extract_suggestion)

In [None]:
# Persist the enriched reviews, without the index column.
# NOTE(review): absolute path at the filesystem root — confirm the runtime
# has write access there.
PROCESSED_CSV = '/processed_reviews.csv'
df.to_csv(PROCESSED_CSV, index=False)