# Task 2: Sentiment and Thematic Analysis
## 1 – Imports & Setup


In [None]:
# review_pipeline_prod.py
import os
import sys
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import spacy

# ------------------------
# Logging configuration
# ------------------------
# Root logger: INFO and above, emitted through a single stream handler using a
# "timestamp [LEVEL] message" layout.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# ------------------------
# Paths
# ------------------------
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a folder one level
# below the project root — confirm against how it is actually run.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(ROOT_DIR, "data", "preprocessed")
# Created up front so that the output files written later have a destination.
os.makedirs(DATA_DIR, exist_ok=True)

INPUT_PATH = os.path.join(DATA_DIR, "google_play_processed_reviews.csv")
OUTPUT_PATH = os.path.join(DATA_DIR, "sentiment_preprocessed.csv")

# Make the project root importable so the `src.*` imports resolve.
# Guarded so that re-running this cell does not append a duplicate entry to
# sys.path on every execution.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# ------------------------
# Import Pipeline Modules
# ------------------------
from src.sentiment_analysis import SentimentAnalyzer
from src.thematic_analysis import ThematicAnalyzer
from src.pipeline import ReviewPipeline

# ------------------------
# NLTK resources
# ------------------------
# VADER: fetch the lexicon, then build the scorer that depends on it.
nltk.download("vader_lexicon", quiet=True)
sia = SentimentIntensityAnalyzer()

# English stop words, held in a set for fast membership tests.
nltk.download("stopwords", quiet=True)
stop_words = {*nltk.corpus.stopwords.words("english")}


## Load Preprocessed Reviews

In [None]:

# ------------------------
# Load the preprocessed reviews
# ------------------------
try:
    df = pd.read_csv(INPUT_PATH)
except FileNotFoundError:
    # Log where we looked, then let the original error propagate.
    logging.error(f"Input file not found: {INPUT_PATH}")
    raise
else:
    logging.info(f"Loaded {len(df)} reviews from {INPUT_PATH}")
# Last expression of the cell: preview of the first rows.
df.head()


## Sentiment Analysis

In [None]:
# ------------------------
# Run Pipeline
# ------------------------
# Runs the project's ReviewPipeline on the input CSV. The frame it returns
# (`df_final`) is the one the later cells add columns to and finally save.
# NOTE(review): OUTPUT_PATH is passed here and is also the target of the final
# `to_csv` call — presumably the pipeline writes there too; confirm.
pipeline = ReviewPipeline(input_path=INPUT_PATH, output_path=OUTPUT_PATH)
df_final = pipeline.run_pipeline()

# ------------------------
# Lexicon-Based Sentiment (TextBlob & VADER)
# ------------------------
# Project sentiment analyzer, run on `df` (the frame loaded from INPUT_PATH),
# not on `df_final`.
# NOTE(review): these three statements are repeated verbatim in the next cell.
sentiment_analyzer = SentimentAnalyzer(df, text_column="clean_text")
df_sentiment = sentiment_analyzer.analyze_sentiment()
# Not the last expression of the cell, so the preview is discarded; the line
# only has an effect if one of the selected columns is missing (KeyError).
df_sentiment[["clean_text", "rating", "sentiment_score", "sentiment_label"]].head()

logging.info("Running TextBlob and VADER sentiment scoring...")
# Analyse each review once and read both scores from the same result; building
# a separate TextBlob per output column would run the analysis twice per row.
# str() guards against non-string cells such as NaN.
_tb_sentiments = [TextBlob(str(text)).sentiment for text in df_final["clean_text"]]
df_final["tb_polarity"] = pd.Series(
    [s.polarity for s in _tb_sentiments], index=df_final.index, dtype="float64"
)
df_final["tb_subjectivity"] = pd.Series(
    [s.subjectivity for s in _tb_sentiments], index=df_final.index, dtype="float64"
)

def polarity_to_label(p):
    """Convert a polarity score into a coarse sentiment label.

    Scores strictly greater than 0.1 are "positive" and scores strictly less
    than -0.1 are "negative". Anything else, including exactly 0.1 and -0.1,
    is "neutral".
    """
    if p > 0.1:
        return "positive"
    if p < -0.1:
        return "negative"
    return "neutral"

# Bucket the TextBlob polarity into positive / neutral / negative.
df_final["tb_sentiment"] = df_final["tb_polarity"].map(polarity_to_label)

# VADER compound score per review; str() guards against non-string cells.
df_final["vader_compound"] = df_final["clean_text"].map(
    lambda text: sia.polarity_scores(str(text))["compound"]
)

def vader_label(c):
    """Convert a compound score into a coarse sentiment label.

    The bounds are inclusive: 0.05 and above is "positive", -0.05 and below is
    "negative", and everything in between is "neutral".
    """
    if c >= 0.05:
        return "positive"
    return "negative" if c <= -0.05 else "neutral"

# Bucket the VADER compound score, then print a preview of the enriched frame.
df_final["vader_sentiment"] = df_final["vader_compound"].map(vader_label)
print(df_final.head())


In [None]:
# Sentiment analysis with the project's SentimentAnalyzer, run on `df` (the
# frame loaded from INPUT_PATH). `df_sentiment` feeds the thematic analysis.
# NOTE(review): the same three statements also appear in the previous cell, so
# when the notebook is run top to bottom this work is done twice.
sentiment_analyzer = SentimentAnalyzer(df, text_column="clean_text")
df_sentiment = sentiment_analyzer.analyze_sentiment()
# Last expression of the cell: preview of the text, rating and sentiment columns.
df_sentiment[["clean_text", "rating", "sentiment_score", "sentiment_label"]].head()

## Thematic Analysis

In [None]:
# Thematic analysis with the project's ThematicAnalyzer, run on the output of
# the sentiment step.
thematic_analyzer = ThematicAnalyzer(df_sentiment, text_column="clean_text")
# The result of extract_keywords() is overwritten on the next line.
# NOTE(review): presumably extract_keywords() updates state inside the
# analyzer that assign_themes() relies on — confirm before removing the call.
df_themes = thematic_analyzer.extract_keywords(top_n=10)
df_themes = thematic_analyzer.assign_themes()
# Preview of the text alongside the extracted keywords and assigned themes.
df_themes[["clean_text", "keywords", "identified_themes"]].head()
# Disabled: copying the TextBlob/VADER columns onto df_themes.
# Ensure all sentiment columns are carried over
#for col in ["tb_polarity", "tb_subjectivity", "tb_sentiment", "vader_compound", "vader_sentiment"]:
    #if col in df_sentiment.columns:
        #df_themes[col] = df_sentiment[col]


In [None]:
# ------------------------
# Frequency-Based & TF-IDF
# ------------------------
logging.info("Computing Bag-of-Words and TF-IDF...")

df_final["clean_text"] = df_final["clean_text"].str.lower()

# The vectorizers raise on NaN documents, so missing reviews (e.g. empty cells
# read back from a CSV) are fed in as empty strings. Only this local view is
# filled; the column keeps its original missing values.
documents = df_final["clean_text"].fillna("")

# Bag of Words: total occurrences of every term across all reviews.
count_vec = CountVectorizer(stop_words="english")
X_counts = count_vec.fit_transform(documents)
word_counts = np.asarray(X_counts.sum(axis=0)).flatten()
vocab = np.array(count_vec.get_feature_names_out())
freq_df = pd.DataFrame({"word": vocab, "count": word_counts}).sort_values("count", ascending=False)
freq_df.to_csv(os.path.join(DATA_DIR, "freq_words.csv"), index=False)

# TF-IDF: mean weight of every term across all reviews.
tfidf_vec = TfidfVectorizer(stop_words="english")
X_tfidf = tfidf_vec.fit_transform(documents)
tfidf_means = np.asarray(X_tfidf.mean(axis=0)).flatten()
vocab_tfidf = np.array(tfidf_vec.get_feature_names_out())
tfidf_df = pd.DataFrame({"word": vocab_tfidf, "tfidf": tfidf_means}).sort_values("tfidf", ascending=False)
tfidf_df.to_csv(os.path.join(DATA_DIR, "tfidf_words.csv"), index=False)

# Show top 10 TF-IDF words.
# `display` is not imported in this file; it is expected to come from the
# IPython/Jupyter session the notebook runs in.
print("\nTop 10 TF-IDF Words:")
display(tfidf_df.head(10))

# Top 10 frequent words plot (pyplot is already imported at the top of the file).
top_n = 10
top_freq = freq_df.head(top_n)
plt.bar(top_freq["word"], top_freq["count"])
plt.xticks(rotation=45)
plt.title("Top Frequent Words (Bag-of-Words)")
plt.ylabel("Count")
plt.show()


### Topic Modelling

In [None]:

# ------------------------
# Topic Modeling (LDA)
# ------------------------
logging.info("Performing LDA topic modeling...")

# Single source of truth for the number of topics trained and reported.
num_topics = 2

# Whitespace tokenisation. Missing reviews become empty token lists rather than
# NaN, which the stop-word filter below could not iterate. Text is lower-cased
# here as well, so matching against the lower-case stop-word set does not rely
# on another cell having lower-cased the column first.
df_final["tokens"] = df_final["clean_text"].fillna("").str.lower().str.split()
df_final["tokens_nostop"] = df_final["tokens"].apply(
    lambda words: [w for w in words if w not in stop_words]
)

# Bag-of-words corpus in the form gensim expects.
dictionary = Dictionary(df_final["tokens_nostop"])
corpus = [dictionary.doc2bow(tokens) for tokens in df_final["tokens_nostop"]]

# Fixed random_state keeps the topics reproducible between runs.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)
topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)

# Each entry is (topic id, [(word, weight), ...]); ids are logged 1-based.
for topic_id, terms in topics:
    logging.info(f"--- Topic {topic_id+1} ---")
    for word, weight in terms:
        logging.info(f"{word:15s} weight={weight:.4f}")



### PoS Tagging

In [None]:

# ------------------------
# Part-of-Speech Tagging (Nouns)
# ------------------------
logging.info("Extracting nouns using spaCy...")
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model package is not installed: download it once, then load it.
    logging.warning("spaCy model not found. Downloading...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Stream the reviews through nlp.pipe so spaCy can process them in batches
# instead of one call per row. str() guards against non-string cells such as
# NaN, as in the other cells.
_docs = nlp.pipe(str(text) for text in df_final["clean_text"])
df_final["nouns"] = pd.Series(
    [[token.text for token in doc if token.pos_ == "NOUN"] for doc in _docs],
    index=df_final.index,
)
# Last expression of the cell: preview of the text next to its extracted nouns.
df_final[["clean_text", "nouns"]].head()



## Save Processed Data

In [None]:

# ------------------------
# Save Final Processed Data
# ------------------------
# Writes every column currently on df_final, without the index, to OUTPUT_PATH.
# List-valued columns added by earlier cells ("tokens", "tokens_nostop",
# "nouns") are stored as their string representation in the CSV.
# NOTE(review): OUTPUT_PATH is the same path handed to ReviewPipeline, so this
# presumably overwrites whatever the pipeline wrote there — confirm.
df_final.to_csv(OUTPUT_PATH, index=False)
logging.info(f"Final processed data saved to {OUTPUT_PATH}")


## Visualizations

In [None]:

# ------------------------
# Optional Visualizations
# ------------------------
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 4)

# Sentiment Distribution: number of reviews per pipeline sentiment label.
plt.figure(figsize=(12, 4))
sns.countplot(x="sentiment_label", data=df_final, palette="viridis")
plt.title("Pipeline Sentiment Distribution")
plt.show()

# Top Themes: the ten most frequent values of "identified_themes".
# NOTE(review): assumes df_final has an "identified_themes" column holding
# hashable values; verify against what the pipeline returns.
top_themes = df_final["identified_themes"].value_counts().nlargest(10)
sns.barplot(x=top_themes.values, y=top_themes.index, palette="Set2")
plt.title("Top Themes Across Reviews")
plt.xlabel("Number of Reviews")
plt.ylabel("Theme")
plt.show()

# Sentiment Score vs Rating.
# Drawn on a single axes: only one scatter is plotted, so a 1x2 grid would
# leave an empty panel next to it.
fig, ax = plt.subplots()
ax.scatter(df_themes["rating"], df_themes["sentiment_score"])
ax.set_title("Pipeline Sentiment vs Rating")
ax.set_xlabel("Star Rating")
ax.set_ylabel("Sentiment Score")
plt.show()

# NOTE: a companion "TextBlob Polarity vs Rating" scatter is intentionally not
# drawn. As far as this notebook shows, df_themes is never given a
# "tb_polarity" column; the TextBlob scores are only added to df_final.

