## 1. Install & Import Required Libraries

In [1]:
# Core Libraries
import os
import re
import time
import numpy as np
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Web Scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Language Detection and Text Cleaning
from langdetect import detect, DetectorFactory
from unidecode import unidecode
import emoji

# NLP and Text Processing
import nltk
from nltk.sentiment.util import mark_negation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
from gensim.models import Word2Vec

# Machine Learning and Topic Modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

# Download NLTK resources (run only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

print("All required libraries imported and NLP models loaded successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


All required libraries imported and NLP models loaded successfully.



## 2. Web Scraping: Extract Reviews and Ratings from BeMinimalist


In [3]:



options = Options()
options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


url = "https://beminimalist.co /collections/hair/products/hair-growth-actives-18"
driver.get(url)

print("BeMinimalist product page...")
time.sleep(10)


driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
time.sleep(5)


page = 1
collected_html = ""

while page <= 95:
    print(f"Scraping page {page}...")
    time.sleep(4)
    collected_html += driver.page_source

    try:
  
        next_link = driver.find_element(By.CSS_SELECTOR, "a[aria-label='Navigate to next page']")
        if next_link.get_attribute("aria-disabled") == "true":
            print("Reached last available page of reviews.")
            break

    
        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        time.sleep(2)
        ActionChains(driver).move_to_element(next_link).click().perform()

        page += 1
        time.sleep(5)

    except Exception:
        print("No further 'Next' pagination link found — finished.")
        break

print(f"\nStopped at page {page} (limit reached or end of pages).")


soup = BeautifulSoup(collected_html, "html.parser")
review_blocks = soup.select("div.yotpo-review")

reviews = []
for i, r in enumerate(review_blocks, 1):
    name = r.select_one(".yotpo-reviewer-name")
    date = r.select_one(".yotpo-review-date")
    rating_div = r.select_one(".yotpo-star-rating.yotpo-review-star-rating")
    title = r.select_one(".yotpo-review-title strong, .yotpo-review-title")
    text = r.select_one(".yotpo-read-more-text, .content-review")

    
    rating_text = rating_div.get("aria-label") if rating_div and rating_div.has_attr("aria-label") else ""
    rating = rating_text.split()[0] if rating_text else ""

    reviews.append({
        "S.No": i,
        "Name": name.get_text(strip=True) if name else "Anonymous",
        "Date": date.get_text(strip=True) if date else "",
        "Rating": rating,
        "Title": title.get_text(strip=True) if title else "",
        "Review": text.get_text(strip=True) if text else ""
    })

driver.quit()

df = pd.DataFrame(reviews)
df.to_csv("minimalist_reviews_with_ratings.csv", index=False, encoding="utf-8-sig")

print(f"\nExtracted {len(df)} total reviews from {page} pages.")
print("Saved as 'minimalist_reviews_with_ratings.csv'")



BeMinimalist product page...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...


## 3. Data Cleaning: Create Simplified Review Dataset (S.No, Review, Rating)


In [4]:

RAW_CSV = "minimalist_reviews_with_ratings.csv"
df_raw = pd.read_csv(RAW_CSV)

df_raw = df_raw.fillna("")
df_raw["Rating"] = pd.to_numeric(df_raw["Rating"], errors="coerce")

df_raw["Review"] = (df_raw["Title"].astype(str) + " " + df_raw["Review"].astype(str)).str.strip()
df_raw = df_raw[df_raw["Review"].str.len() > 0].copy().reset_index(drop=True)

print(f"Loaded {len(df_raw)} valid reviews from raw file.")

df_simple = pd.DataFrame({
    "S.No": range(1, len(df_raw) + 1),
    "Review": df_raw["Review"],
    "Rating": df_raw["Rating"].astype(int)
})

df_simple = df_simple[df_simple["Rating"] > 0].reset_index(drop=True)

df_simple.to_csv("minimalist_reviews_clean.csv", index=False, encoding="utf-8-sig")

print("Simplified dataset created successfully.")
print("Saved as: minimalist_reviews_clean.csv")
print("\nPreview:")
display(df_simple.head(10))


Loaded 474 valid reviews from raw file.
Simplified dataset created successfully.
Saved as: minimalist_reviews_clean.csv

Preview:


Unnamed: 0,S.No,Review,Rating
0,1,Hair Fall Reduced My hair fall has reduced in ...,5
1,2,Haircare I really love the product,5
2,3,"Improvement shown Nice product, I got the result",5
3,4,Nice product Nice product,5
4,5,Oil leaked out competely I opened the delivery...,1
5,6,Love the product I have been using this serum ...,5
6,7,Medium effect Medium effect I used this produc...,3
7,8,Does the job! I started a routine with this as...,5
8,9,Good It is a nice product. My Hair fall got re...,4
9,10,Good This product is good,4


## 4. Language Detection and Hindi-to-English Translation


In [5]:


DetectorFactory.seed = 0  

HI_EN_DICT = {
    "accha": "good",
    "achha": "good",
    "bahut accha": "very good",
    "bahut acha": "very good",
    "bahut badhiya": "very good",
    "badhiya": "good",
    "bura": "bad",
    "bekar": "useless",
    "bekaar": "useless",
    "ghatiya": "awful",
    "sasta": "cheap",
    "mehnga": "expensive",
    "mehanga": "expensive",
    "paisa vasool": "value for money",
    "pasand": "like",
    "pasand aaya": "liked it",
    "nahi": "not",
    "nahin": "not",
    "thik": "okay",
    "theek": "okay",
    "bahut": "very",
    "kam": "less",
    "zyada": "more",
    "jaldi": "fast",
    "dheere": "slow",
    "sugandh": "fragrance",
    "khushboo": "fragrance",
    "mehnge": "expensive",
    "white cast": "white cast",
    "jaldi absorb": "absorbs quickly",
    "chipchipa": "sticky",
    "non sticky": "non-sticky"
}

def normalize_hindi_ascii(text):
    text = unidecode(str(text))
    return text.lower().strip()

def rule_translate_hi_to_en(text):
    text = normalize_hindi_ascii(text)
    for hi in sorted(HI_EN_DICT.keys(), key=len, reverse=True):
        text = re.sub(rf"\b{re.escape(hi)}\b", HI_EN_DICT[hi], text)
    return text

def detect_lang_safe(text):
    try:
        return detect(str(text))
    except:
        return "unknown"

df = pd.read_csv("minimalist_reviews_clean.csv")

df["lang"] = df["Review"].apply(detect_lang_safe)
df["text_translated"] = df.apply(
    lambda r: rule_translate_hi_to_en(r["Review"]) if r["lang"] == "hi" else r["Review"],
    axis=1
)

print("Translation complete. Example output:")
print(df.head(10))

df.to_csv("minimalist_reviews_translated.csv", index=False, encoding="utf-8-sig")
print("Saved as minimalist_reviews_translated.csv")


Translation complete. Example output:
   S.No                                             Review  Rating lang  \
0     1  Hair Fall Reduced My hair fall has reduced in ...       5   en   
1     2                 Haircare I really love the product       5   en   
2     3   Improvement shown Nice product, I got the result       5   en   
3     4                          Nice product Nice product       5   ro   
4     5  Oil leaked out competely I opened the delivery...       1   en   
5     6  Love the product I have been using this serum ...       5   en   
6     7  Medium effect Medium effect I used this produc...       3   en   
7     8  Does the job! I started a routine with this as...       5   en   
8     9  Good It is a nice product. My Hair fall got re...       4   en   
9    10                          Good This product is good       4   en   

                                     text_translated  
0  Hair Fall Reduced My hair fall has reduced in ...  
1                 Haircare

 ## 5. Text Cleaning, Normalization, and Lemmatization


In [6]:




STOPWORDS = set(stopwords.words("english"))
IMPORTANT_WORDS = {"skin", "cream", "oil", "serum", "spf", "fragrance", "sunscreen", "hair", "growth", "fall"}
STOPWORDS = STOPWORDS - IMPORTANT_WORDS

URL_RE = re.compile(r"https?://\S+|www\.\S+")
HTML_RE = re.compile(r"<.*?>")

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("\n", " ")
    text = re.sub(HTML_RE, " ", text)
    text = re.sub(URL_RE, " ", text)

    phrase_map = [
        ("hair fall", "hair_fall"),
        ("white cast", "white_cast"),
        ("dark spots", "dark_spots"),
        ("no dandruff", "no_dandruff"),
        ("less hair fall", "less_hair_fall")
    ]
    for old, new in phrase_map:
        text = re.sub(old, new, text, flags=re.IGNORECASE)

    text = re.sub(r"[^a-zA-Z0-9\s\.\,\!\?\'\u263a-\U0001f999]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def normalize_tokenize_lemmatize(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = mark_negation(tokens)
    tokens = [w for w in tokens if w not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

df["clean"] = df["text_translated"].apply(clean_text)
df["norm"] = df["clean"].apply(normalize_tokenize_lemmatize)

df = df.drop_duplicates(subset=["norm"]).reset_index(drop=True)

print("Cleaned reviews:", len(df))
print(df[["Rating", "text_translated", "norm"]].head(10))


Cleaned reviews: 468
   Rating                                    text_translated  \
0       5  Hair Fall Reduced My hair fall has reduced in ...   
1       5                 Haircare I really love the product   
2       5   Improvement shown Nice product, I got the result   
3       5                          Nice product Nice product   
4       1  Oil leaked out competely I opened the delivery...   
5       5  Love the product I have been using this serum ...   
6       3  Medium effect Medium effect I used this produc...   
7       5  Does the job! I started a routine with this as...   
8       4  Good It is a nice product. My Hair fall got re...   
9       4                          Good This product is good   

                                                norm  
0  hair fall reduced hair fall reduced 1 month us...  
1                       haircare really love product  
2        improvement shown nice product , got result  
3                          nice product nice product  

## 6. Part-of-Speech (POS) Tag Analysis


In [7]:


nlp = spacy.load("en_core_web_sm")

def pos_counts(text):
    doc = nlp(text)
    tags = [token.pos_ for token in doc]
    return pd.Series(tags).value_counts()

sample_size = min(200, len(df))
pos_df = df["clean"].sample(sample_size).apply(pos_counts).fillna(0).astype(int)
pos_summary = pos_df.sum().sort_values(ascending=False)

print(pos_summary.head(10))


NOUN     1109
VERB      741
PRON      585
PUNCT     450
ADJ       446
AUX       393
ADP       376
ADV       371
DET       332
CCONJ     192
dtype: int64


 ## 7. Extraction of Adjectives and Verbs for Product Descriptions




def extract_pos_words(text, pos_types={"ADJ"}):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if token.pos_ in pos_types and token.is_alpha]

def adjectives_near_terms(text, keywords=("product", "serum")):
    doc = nlp(text)
    nearby = []
    for i, token in enumerate(doc):
        if token.text.lower() in keywords:
            for j in range(max(0, i - 3), min(len(doc), i + 4)):
                if doc[j].pos_ == "ADJ" and doc[j].is_alpha:
                    nearby.append(doc[j].lemma_.lower())
    return nearby

adjectives = Counter()
verbs = Counter()
context_adjectives = Counter()

for review in df["clean"]:
    adjectives.update(extract_pos_words(review, {"ADJ"}))
    verbs.update(extract_pos_words(review, {"VERB"}))
    context_adjectives.update(adjectives_near_terms(review))

print("\nTop adjectives overall:")
print(adjectives.most_common(20))

print("\nAdjectives describing the product or serum:")
print(context_adjectives.most_common(20))

print("\nTop verbs:")
print(verbs.most_common(20))


## 8. Named Entity Recognition (NER) Analysis


In [10]:
def extract_ents(text):
    doc = nlp(text)
    return [(e.text, e.label_) for e in doc.ents]

df["ents"] = df["clean"].apply(extract_ents)

# Frequency of entity strings (optional filtering)
from collections import Counter
ent_counter = Counter()
for ents in df["ents"]:
    for e,_ in ents:
        ent_counter[e.lower()] += 1

ent_counter.most_common(20)


[('5', 20),
 ('a month', 14),
 ('first', 12),
 ('3 weeks', 11),
 ('3 months', 10),
 ('one', 10),
 ('second', 10),
 ('2', 9),
 ('18', 9),
 ('1', 9),
 ('minimalist', 8),
 ('3', 7),
 ('a week', 7),
 ('daily', 6),
 ('2 months', 6),
 ('half', 6),
 ('4', 6),
 ('one month', 5),
 ('3rd', 5),
 ('2nd', 4)]

## 9. Bag-of-Words and TF-IDF Vectorization


In [11]:


bow_vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 2))
X_bow = bow_vectorizer.fit_transform(df["norm"])

tfidf_vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df["norm"])

print(X_bow.shape, X_tfidf.shape)


(468, 1305) (468, 1305)


## 10. Word2Vec Training and Semantic Similarity Exploration


In [12]:


tokenized_reviews = [text.split() for text in df["norm"]]
w2v_model = Word2Vec(
    tokenized_reviews,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=20
)

def similar_words(term, topn=5):
    try:
        return w2v_model.wv.most_similar(term, topn=topn)
    except KeyError:
        return []

keywords = ["sunscreen", "sticky", "fragrance", "white", "cast", "oily"]
for word in keywords:
    print(word, "→", similar_words(word))


sunscreen → []
sticky → [('non', 0.9947893619537354), ('oil', 0.9921171069145203), ('recommend', 0.991783857345581), ('free', 0.9912040829658508), ('must', 0.9907038807868958)]
fragrance → []
white → []
cast → []
oily → [('scalp', 0.9666950702667236), ('make', 0.9565368890762329), ('also', 0.9180022478103638), ('greasy', 0.9050824046134949), ('applying', 0.9038339257240295)]


## 11. Sentiment Analysis using VADER


In [13]:




analyzer = SentimentIntensityAnalyzer()
df["vader"] = df["clean"].apply(lambda text: analyzer.polarity_scores(text)["compound"])

def get_sentiment_label(score):
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    return "neutral"

df["vader_label"] = df["vader"].apply(get_sentiment_label)
df["vader_label"].value_counts(normalize=True).round(3)


vader_label
positive    0.654
negative    0.231
neutral     0.115
Name: proportion, dtype: float64

## 12. Topic Modeling using LSA and LDA


In [14]:


corpus = df["norm"].values

tfidf = TfidfVectorizer(
    max_df=0.8,
    min_df=3,
    stop_words='english'
)
X_tfidf = tfidf.fit_transform(corpus)
print("TF-IDF shape:", X_tfidf.shape)

k = 5
svd = TruncatedSVD(n_components=k, random_state=42)
svd_X = svd.fit_transform(X_tfidf)

terms = np.array(tfidf.get_feature_names_out())

print("\nTopics from LSA (TruncatedSVD):\n")
for i, comp in enumerate(svd.components_):
    top_idx = np.argsort(comp)[::-1][:10]
    print(f"Topic {i+1}:", ", ".join(terms[top_idx]))

count_vectorizer = CountVectorizer(
    max_df=0.8,
    min_df=3,
    stop_words='english'
)
X_count = count_vectorizer.fit_transform(corpus)
print("\nCount Vector shape:", X_count.shape)

for n_topics in [5, 8, 10]:
    print(f"\nLDA Results (n_topics={n_topics})")
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        learning_method="online",
        max_iter=20,
        evaluate_every=2,
        n_jobs=-1
    )
    lda.fit(X_count)
    
    terms = np.array(count_vectorizer.get_feature_names_out())
    for i, topic in enumerate(lda.components_):
        top_idx = topic.argsort()[-10:][::-1]
        print(f"Topic {i+1}:", ", ".join(terms[top_idx]))


TF-IDF shape: (468, 430)

Topics from LSA (TruncatedSVD):

Topic 1: hair, good, product, growth, result, fall, using, serum, month, hairfall
Topic 2: good, result, price, product, time, light, care, quantity, buy, slowly
Topic 3: i_neg, hair_neg, it_neg, _neg, any_neg, in_neg, using_neg, the_neg, my_neg, and_neg
Topic 4: product, great, star, nice, work, amazing, love, really, loved, got
Topic 5: result, month, work, waiting, using, serum, better, hairfall, working, awesome

Count Vector shape: (468, 430)

LDA Results (n_topics=5)
Topic 1: best, waiting, in_neg, change, awesome, hairfall_neg, change_neg, result, no_neg, normal
Topic 2: hair, growth, fall, using, product, serum, month, result, bottle, reduced
Topic 3: i_neg, for_neg, using_neg, _neg, it_neg, this_neg, product_neg, after_neg, the_neg, any_neg
Topic 4: product, good, hairfall, result, month, got, great, nice, using, star
Topic 5: hair_neg, _neg, it_neg, any_neg, growth_neg, i_neg, and_neg, in_neg, my_neg, of_neg

LDA Resu

## 13. Word Similarity Analysis using TF-IDF and SVD (LSA)


In [15]:


corpus = df["norm"].values.tolist()

phrase_map = {
    "white cast": "white_cast",
    "hair fall": "hair_fall",
    "non sticky": "nonsticky",
    "non sticky ": "nonsticky "
}

for i in range(len(corpus)):
    for old, new in phrase_map.items():
        corpus[i] = corpus[i].replace(old, new)

domain_terms = ["sticky", "nonsticky", "oily", "fragrance",
                "smell", "white_cast", "hair_fall", "texture", "greasy"]

for w in domain_terms:
    corpus.append(f"This product has {w}")

tfidf = TfidfVectorizer(
    max_df=0.9,
    min_df=1,
    stop_words='english'
)
X_tfidf = tfidf.fit_transform(corpus)

vocab = np.array(tfidf.get_feature_names_out())
word_to_idx = {w: i for i, w in enumerate(vocab)}
W = X_tfidf.T

def clean_word(w):
    w = w.lower().strip()
    w = re.sub(r"[^a-z_]", "", w)
    return w

def tfidf_similar_words(word, topn=8, min_score=0.2):
    word = clean_word(word)
    if word not in word_to_idx:
        return f"Word '{word}' not found in vocabulary."
    idx = word_to_idx[word]
    sims = cosine_similarity(W[idx], W)[0]
    best_idx = np.argsort(sims)[::-1][1:]
    best = [(vocab[i], float(sims[i])) for i in best_idx if sims[i] > min_score][:topn]
    return best

def show_similar_words(words):
    for w in words:
        sims = tfidf_similar_words(w)
        if isinstance(sims, str):
            print(sims)
            continue
        print(f"\n{w.upper()} — Top Similar Words (TF-IDF):")
        for s, score in sims:
            print(f"  {s:<15} → {score:.3f}")
    print("\nTF-IDF similarity completed.\n")

print("Building SVD-smoothed vectors (LSA)...")

svd = TruncatedSVD(n_components=150, random_state=42)
W_reduced = svd.fit_transform(W)
W_reduced = Normalizer(copy=False).fit_transform(W_reduced)

def svd_similar_words(word, topn=8):
    word = clean_word(word)
    if word not in word_to_idx:
        return f"Word '{word}' not found in vocabulary."
    idx = word_to_idx[word]
    sims = cosine_similarity([W_reduced[idx]], W_reduced)[0]
    best_idx = np.argsort(sims)[::-1][1:topn+1]
    return [(vocab[i], float(sims[i])) for i in best_idx]

def show_svd_similar(words):
    for w in words:
        sims = svd_similar_words(w)
        if isinstance(sims, str):
            print(sims)
            continue
        print(f"\n{w.upper()} — Smoothed Semantic Neighbors (SVD):")
        for s, score in sims:
            print(f"  {s:<15} → {score:.3f}")
    print("\nSVD-smoothed similarity completed.\n")

probes = ["sticky", "oily", "fragrance", "white_cast", "hair_fall"]

print("\nTF-IDF RAW SIMILARITY")
show_similar_words(probes)

print("\nSVD-SMOOTHED SIMILARITY")
show_svd_similar(probes)


Building SVD-smoothed vectors (LSA)...

TF-IDF RAW SIMILARITY

STICKY — Top Similar Words (TF-IDF):
  extend          → 0.315
  going           → 0.229

OILY — Top Similar Words (TF-IDF):
  make            → 0.440
  kindly          → 0.350
  everyday        → 0.332
  scalp           → 0.304
  leaf            → 0.277
  applying        → 0.277
  water           → 0.247
  sure            → 0.241

FRAGRANCE — Top Similar Words (TF-IDF):
  clearly         → 0.270
  highlight       → 0.270

WHITE_CAST — Top Similar Words (TF-IDF):

HAIR_FALL — Top Similar Words (TF-IDF):
  reduced         → 0.353
  significantly   → 0.277
  increased       → 0.260
  product         → 0.256
  reduces         → 0.205

TF-IDF similarity completed.


SVD-SMOOTHED SIMILARITY

STICKY — Smoothed Semantic Neighbors (SVD):
  extend          → 0.712
  going           → 0.460
  effect          → 0.326
  little          → 0.203
  bit             → 0.188
  white_cast      → 0.144
  product         → 0.141
  feel         

## 14. Review Clustering using TF-IDF and K-Means


In [16]:


tfidf = TfidfVectorizer(min_df=2, stop_words='english')
X_tfidf = tfidf.fit_transform(df["norm"])

print("TF-IDF matrix shape:", X_tfidf.shape)
print("DataFrame rows:", len(df))

n_clusters = 5 if X_tfidf.shape[0] >= 50 else 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
labels = kmeans.fit_predict(X_tfidf)

df["cluster"] = labels
print("Cluster labels assigned.")

representatives = []
for c in range(n_clusters):
    idx = np.where(labels == c)[0]
    if len(idx) == 0:
        continue
    subset = X_tfidf[idx]
    centroid = kmeans.cluster_centers_[c].reshape(1, -1)
    sims = cosine_similarity(subset, centroid).ravel()
    rep_idx = idx[np.argmax(sims)]
    representatives.append((c, int(rep_idx), df.loc[rep_idx, "clean"]))

print("\nRepresentative reviews per cluster:")
for c, i, text in representatives:
    print(f"\nCluster {c} | Row {i}\n{text[:400]}...")


TF-IDF matrix shape: (468, 638)
DataFrame rows: 468
Cluster labels assigned.

Representative reviews per cluster:

Cluster 0 | Row 49
effective product I had severe hair loss for 3 months and then I started using hair growth serum. it gives results in 3 weeks. . very nice product...

Cluster 1 | Row 111
Not much of difference I've Used 4 bottles of this serum, every alternate day at night. I can't see any difference in hair growth or volume, I think it's same as before. But I'm rating it 3 because it didn't increase hair loss. 5th one I'm already Using and then 1 more in my stock. If after 6th also I won't see any visib......

Cluster 2 | Row 199
Helped With My Hairfall Issue very fast My Dermat suggested this hair growth serum for my persistent hair fall problem that had been bothering me for the last few years. This product works wonders, I had lesser hair fall within a month. On my second bottle already. Consistent use also has led to the growth of baby hairs....

Cluster 3 | Row 28

## 15. Review Insights and Keyword Mentions Analysis


In [17]:
def percent_positive():
    return (df["vader_label"] == "positive").mean()

def mentions(keyword):
    pattern = rf"\b{re.escape(keyword)}\b"
    return df["clean"].str.contains(pattern, case=False).mean()

pos_rate = round(percent_positive() * 100, 1)
white_cast_rate = round(mentions("white cast") * 100, 1)
sticky_rate = round(mentions("sticky") * 100, 1)

print(f"Positivity: {pos_rate}%")
print(f"Mentions 'white cast': {white_cast_rate}%")
print(f"Mentions 'sticky': {sticky_rate}%")


Positivity: 65.4%
Mentions 'white cast': 0.0%
Mentions 'sticky': 2.4%


## 16. Sentiment Classification using Naive Bayes and TF-IDF


In [18]:


def rating_to_label(rating):
    if rating >= 4:
        return "positive"
    elif rating <= 2:
        return "negative"
    return "neutral"

df["label"] = df["Rating"].apply(rating_to_label)

X_train, X_test, y_train, y_test = train_test_split(
    df["norm"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

print("Data split completed.")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
print("Label distribution:\n", df["label"].value_counts())

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    stop_words="english"
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF vectorization complete.")
print("Training data shape:", X_train_tfidf.shape)

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

print("Naive Bayes model trained.")

y_pred = nb_model.predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {round(acc * 100, 2)}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))


Data split completed.
Train size: 374, Test size: 94
Label distribution:
 label
positive    300
negative     98
neutral      70
Name: count, dtype: int64
TF-IDF vectorization complete.
Training data shape: (374, 5000)
Naive Bayes model trained.

Accuracy: 64.89%

Classification Report:
               precision    recall  f1-score   support

    negative       0.50      0.05      0.09        20
     neutral       0.00      0.00      0.00        14
    positive       0.65      1.00      0.79        60

    accuracy                           0.65        94
   macro avg       0.38      0.35      0.29        94
weighted avg       0.52      0.65      0.52        94



## 17. Sentiment Prediction using Trained Naive Bayes Model


In [19]:
def predict_sentiment(text):
    vec = tfidf.transform([text])
    pred = nb_model.predict(vec)[0]
    print(f"\nReview: {text}")
    print(f"Predicted Sentiment: {pred.upper()}")

predict_sentiment("This serum really reduced my hair fall and feels lightweight!")
predict_sentiment("hair are bad")
predict_sentiment("It’s okay, but I didn’t notice much change.")



Review: This serum really reduced my hair fall and feels lightweight!
Predicted Sentiment: POSITIVE

Review: hair are bad
Predicted Sentiment: POSITIVE

Review: It’s okay, but I didn’t notice much change.
Predicted Sentiment: POSITIVE


## 18. Sentiment Prediction using VADER Lexicon-Based Model


In [20]:


sia = SentimentIntensityAnalyzer()

def predict_vader_sentiment(text):
    score = sia.polarity_scores(text)
    compound = score["compound"]

    if compound >= 0.05:
        sentiment = "positive"
    elif compound <= -0.05:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    print(f"\nReview: {text}")
    print(f"Compound Score: {compound}")
    print(f"Predicted Sentiment (VADER): {sentiment.upper()}")

predict_vader_sentiment("This serum really reduced my hair fall and feels lightweight!")
predict_vader_sentiment("hair are bad")
predict_vader_sentiment("It’s okay, but I didn’t notice much change.")



Review: This serum really reduced my hair fall and feels lightweight!
Compound Score: 0.0
Predicted Sentiment (VADER): NEUTRAL

Review: hair are bad
Compound Score: -0.5423
Predicted Sentiment (VADER): NEGATIVE

Review: It’s okay, but I didn’t notice much change.
Compound Score: 0.1154
Predicted Sentiment (VADER): POSITIVE


## 19. Model Performance Comparison: VADER vs Naive Bayes


In [21]:
df_test = pd.DataFrame({
    "actual": y_test,
    "predicted_nb": y_pred
})

df_test["predicted_vader"] = df.loc[y_test.index, "vader_label"].values

print("VADER vs Naive Bayes Comparison:")
print(df_test.head(10))

vader_acc = (df_test["actual"] == df_test["predicted_vader"]).mean()
nb_acc = (df_test["actual"] == df_test["predicted_nb"]).mean()

print(f"\nVADER Accuracy: {round(vader_acc * 100, 2)}%")
print(f"Naive Bayes Accuracy: {round(nb_acc * 100, 2)}%")


VADER vs Naive Bayes Comparison:
       actual predicted_nb predicted_vader
459  negative     positive         neutral
317  positive     positive        positive
324  positive     positive        positive
430  negative     positive        negative
176  positive     positive         neutral
233  positive     positive        positive
462  negative     positive        negative
156  positive     positive        positive
422  negative     positive        negative
247  positive     positive        positive

VADER Accuracy: 71.28%
Naive Bayes Accuracy: 64.89%


## 20. Save Processed Datasets (Full and Rating-Wise)


In [22]:
df_out = df[[
    "S.No",
    "Rating",
    "lang",
    "Review",
    "text_translated",
    "clean",
    "norm",
    "vader",
    "vader_label",
    "cluster"
]]

df_out.to_csv("minimalist_processed_all.csv", index=False, encoding="utf-8-sig")
print("Processed dataset saved as 'minimalist_processed_all.csv'.")

for r in [1, 2, 3, 4, 5]:
    subset = df_out[df_out["Rating"] == r]
    if not subset.empty:
        filename = f"minimalist_reviews_rating_{r}.csv"
        subset.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Saved {len(subset)} reviews to {filename}.")


Processed dataset saved as 'minimalist_processed_all.csv'.
Saved 66 reviews to minimalist_reviews_rating_1.csv.
Saved 32 reviews to minimalist_reviews_rating_2.csv.
Saved 70 reviews to minimalist_reviews_rating_3.csv.
Saved 94 reviews to minimalist_reviews_rating_4.csv.
Saved 206 reviews to minimalist_reviews_rating_5.csv.


## 21. Review Summarization 

In [27]:


summary = df.groupby("cluster").apply(
    lambda g: g.sample(1)["clean"].values[0]
).reset_index(drop=True)

print("Representative customer feedback summary:\n")
for i, text in enumerate(summary, 1):
    print(f"{i}. {text}\n")


Representative customer feedback summary:

1. Shows result, but will have to be consistent Unlike skincare for my face, I admit I can get quite lazy when it comes to doing anything for my hair. Despite this, the serum has become the only thing I do now besides washing, that is. Been using for some time now, and I can see the result, in terms of less hairfall and visible new growth. But yo...

2. No change is observed There is no change in the existing condition

3. It's too early to say anything about it I have straight and fine hair, the serum feels too heavy on scalp and oily, but the oiliness reduces overtime. I feel minimalist should come up with a lighter formula, as i have fine hair the serum made my scalp more visible.

4. 5 Stars Still I have hair fall

5. Good Really good lightweight hair serum. Using it for 3 months straight shows new hair growth.



## 22. Simulated Question–Answer Based on Review Insights


In [29]:
questions_answers = [
    (
        "Does this serum actually help reduce hair fall?",
        "Yes. Over 65% of overall reviews are positive, and topic modeling shows frequent mention of 'hair', 'fall', and 'growth' together. "
        "Representative reviews describe visible reduction in hair fall within 3–4 weeks of consistent use."
    ),
    (
        "Is the product sticky or greasy?",
        "Only about 2–3% of reviews mention 'sticky' or 'oily'. Most of those are mixed to negative, while the majority describe it as lightweight and smooth, "
        "so users generally find the texture non-sticky."
    ),
    (
        "Does it have a strong fragrance?",
        "Mentions of fragrance are rare and sentiment is neutral to positive. Users mostly describe the smell as mild or pleasant, "
        "suggesting fragrance is not a major concern."
    ),
    (
        "How soon do people see results?",
        "Clustering and keyword patterns highlight '1 month', '3 weeks', and '3 months' as frequent time references. "
        "Many users reported visible reduction in hair fall and some new growth within the first month of regular use."
    ),
    (
        "Are customers generally satisfied overall?",
        "Yes. About 65% of VADER sentiment scores and 65% Naive Bayes predictions are positive. "
        "VADER achieved ≈ 71% accuracy compared to 65% for Naive Bayes, showing that sentiment detection aligns well with overall user satisfaction."
    ),
    (
        "What issues or negatives do users mention?",
        "Roughly 23% of reviews are negative. The most common complaints include oil leakage during delivery, lack of visible results after multiple bottles, "
        "and a heavy texture for fine hair. These appear in clusters 1 and 3 from the K-Means grouping."
    ),
    (
        "Can the serum be recommended for consistent users?",
        "Yes. Representative reviews from the largest clusters highlight that consistent use over 2–3 months leads to reduced hair fall, "
        "thicker strands, and visible baby-hair growth."
    )
]

for q, a in questions_answers:
    print(f"Q: {q}\nA: {a}\n")


Q: Does this serum actually help reduce hair fall?
A: Yes. Over 65% of overall reviews are positive, and topic modeling shows frequent mention of 'hair', 'fall', and 'growth' together. Representative reviews describe visible reduction in hair fall within 3–4 weeks of consistent use.

Q: Is the product sticky or greasy?
A: Only about 2–3% of reviews mention 'sticky' or 'oily'. Most of those are mixed to negative, while the majority describe it as lightweight and smooth, so users generally find the texture non-sticky.

Q: Does it have a strong fragrance?
A: Mentions of fragrance are rare and sentiment is neutral to positive. Users mostly describe the smell as mild or pleasant, suggesting fragrance is not a major concern.

Q: How soon do people see results?
A: Clustering and keyword patterns highlight '1 month', '3 weeks', and '3 months' as frequent time references. Many users reported visible reduction in hair fall and some new growth within the first month of regular use.

Q: Are custom