# Task 2: Sentiment and Thematic Analysis
## 1 – Imports & Setup


In [None]:

# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

import spacy
import os
import sys

# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the kernel's working directory is one level below
# the project root (e.g. a notebooks/ folder) — confirm for other launch modes.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make the project root importable so the `src.*` imports below resolve.
# The membership check keeps this cell idempotent: re-running it no longer
# appends another copy of the same path to sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

print("Project Root Added:", ROOT_DIR)


# Import modular classes
from src.sentiment_analysis import SentimentAnalyzer
from src.thematic_analysis import ThematicAnalyzer
from src.pipeline import ReviewPipeline

# Global plot appearance: seaborn "whitegrid" style first, then the default
# figure size (width, height in inches) for every later figure.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 4)})



## 2 – Download NLTK Lexicons / Stopwords


In [None]:
# Fetch the NLTK resources used below: the VADER lexicon and the stopword corpus
for nltk_resource in ("vader_lexicon", "stopwords"):
    nltk.download(nltk_resource)

# Shared VADER scorer and English stopword set, reused by later cells
sia = SentimentIntensityAnalyzer()
stop_words = set(nltk.corpus.stopwords.words("english"))



## 3 – Load Preprocessed Reviews


In [None]:

# Read the preprocessed Google Play reviews (path is relative to the current
# working directory) and preview the first rows.
reviews_csv_path = "../data/preprocessed/google_play_processed_reviews.csv"
df = pd.read_csv(reviews_csv_path)
df.head()



## 4 – Sentiment Analysis Using `SentimentAnalyzer` Class


In [None]:

# Run the project's SentimentAnalyzer over the "clean_text" column of the
# loaded reviews and keep what analyze_sentiment() returns as `df_sentiment`.
# NOTE(review): SentimentAnalyzer is defined in src/sentiment_analysis.py, which
# is not visible here — presumably analyze_sentiment() adds the
# "sentiment_score" and "sentiment_label" columns previewed below; confirm.
sentiment_analyzer = SentimentAnalyzer(df, text_column="clean_text")
df_sentiment = sentiment_analyzer.analyze_sentiment()
# Preview the review text and rating next to the pipeline's score and label
df_sentiment[["clean_text", "rating", "sentiment_score", "sentiment_label"]].head()



## 5 – Lexicon-Based Sentiment (TextBlob & VADER) for Comparison


In [None]:

# TextBlob
# Parse every review once and read both sentiment fields from the same result;
# previously each row was parsed twice (once for polarity, once for
# subjectivity). str() coerces non-string cells such as NaN, as before.
tb_results = [TextBlob(str(text)).sentiment for text in df_sentiment["clean_text"]]
df_sentiment["tb_polarity"] = [result.polarity for result in tb_results]
df_sentiment["tb_subjectivity"] = [result.subjectivity for result in tb_results]

def polarity_to_label(p):
    """Map a polarity score to a coarse sentiment label.

    Scores strictly above 0.1 are "positive", scores strictly below -0.1 are
    "negative", and everything else (including both boundaries) is "neutral".
    """
    if p > 0.1:
        return "positive"
    if p < -0.1:
        return "negative"
    return "neutral"

df_sentiment["tb_sentiment"] = df_sentiment["tb_polarity"].apply(polarity_to_label)

# VADER
def vader_compound(text):
    """Return the "compound" entry of VADER's polarity scores for ``text``.

    The input is coerced with str() first, so non-string cells are scored as
    their string form.
    """
    scores = sia.polarity_scores(str(text))
    return scores["compound"]

df_sentiment["vader_compound"] = df_sentiment["clean_text"].apply(vader_compound)

def vader_label(c):
    """Map a VADER compound score to a sentiment label.

    Scores of 0.05 or more are "positive", scores of -0.05 or less are
    "negative", and anything strictly in between is "neutral".
    """
    if c >= 0.05:
        return "positive"
    if c <= -0.05:
        return "negative"
    return "neutral"

df_sentiment["vader_sentiment"] = df_sentiment["vader_compound"].apply(vader_label)
# Use display() rather than print() so the preview renders as a rich table
# instead of a wrapped plain-text dump of the DataFrame.
display(df_sentiment.head())





## 6 – Thematic Analysis Using `ThematicAnalyzer` Class


In [None]:

# Thematic Analyzer
# Build the project's ThematicAnalyzer on the sentiment-annotated reviews,
# again reading text from the "clean_text" column.
thematic_analyzer = ThematicAnalyzer(df_sentiment, text_column="clean_text")
# Keyword extraction runs first; its return value is overwritten by the next
# line, so only the result of assign_themes() is kept in `df_themes`.
# NOTE(review): presumably extract_keywords() records a "keywords" column that
# assign_themes() relies on, which is why the order matters — confirm in
# src/thematic_analysis.py.
df_themes = thematic_analyzer.extract_keywords(top_n=10)
df_themes = thematic_analyzer.assign_themes()
# Preview each review next to its keywords and assigned themes
df_themes[["clean_text", "keywords", "identified_themes"]].head()


## 7 – Frequency-Based (Bag of Words) & TF-IDF


In [None]:

# Application of Frequency-Based (Bag of Words) & TF-IDF
# Lower-case the text column in place; later cells read the lower-cased column.
df_themes["clean_text"] = df_themes["clean_text"].str.lower()

# The vectorizers raise on missing documents (NaN) — e.g. empty CSV fields are
# read back as NaN. Feed them a NaN-free copy so the column itself is left
# exactly as the line above produced it.
vectorizer_docs = df_themes["clean_text"].fillna("")

# Bag of Words: total occurrences of each vocabulary term across all reviews
count_vec = CountVectorizer(stop_words="english")
X_counts = count_vec.fit_transform(vectorizer_docs)
word_counts = np.asarray(X_counts.sum(axis=0)).flatten()
vocab = np.array(count_vec.get_feature_names_out())
freq_df = pd.DataFrame({"word": vocab, "count": word_counts}).sort_values("count", ascending=False)

# TF-IDF: mean TF-IDF weight of each term over all reviews
tfidf_vec = TfidfVectorizer(stop_words="english")
X_tfidf = tfidf_vec.fit_transform(vectorizer_docs)
tfidf_means = np.asarray(X_tfidf.mean(axis=0)).flatten()
vocab_tfidf = np.array(tfidf_vec.get_feature_names_out())
tfidf_df = pd.DataFrame({"word": vocab_tfidf, "tfidf": tfidf_means}).sort_values("tfidf", ascending=False)
# Show the ten highest-ranked rows of each table: raw counts first, then TF-IDF
for heading, ranked_words in (
    ("Top 10 Frequent Words:", freq_df),
    ("\nTop 10 TF-IDF Words:", tfidf_df),
):
    print(heading)
    display(ranked_words.head(10))
# Top 10 frequent words plot
# matplotlib.pyplot is already imported as `plt` in the setup cell, so the
# redundant mid-cell import has been dropped. The chart is drawn on an explicit
# Axes instead of relying on pyplot's implicit "current figure" state.
top_n = 10
top_freq = freq_df.head(top_n)
freq_fig, freq_ax = plt.subplots()
freq_ax.bar(top_freq["word"], top_freq["count"])
freq_ax.tick_params(axis="x", labelrotation=45)  # slant the word labels for readability
freq_ax.set_title("Top Frequent Words (Bag-of-Words)")
freq_ax.set_ylabel("Count")
plt.show()



## 8 – Topic Modeling (LDA)


In [None]:

# Tokenize & remove stopwords
# Missing reviews (NaN) are treated as empty text so they tokenize to [] rather
# than staying NaN; a NaN in "tokens" would make the stopword filter below fail
# when it tries to iterate over a float.
df_themes["tokens"] = df_themes["clean_text"].fillna("").str.split()
df_themes["tokens_nostop"] = df_themes["tokens"].apply(
    lambda words: [w for w in words if w not in stop_words]
)

# Build the token dictionary and convert every review to a bag-of-words
# representation via doc2bow for the LDA model.
dictionary = Dictionary(df_themes["tokens_nostop"])
corpus = [dictionary.doc2bow(tokens) for tokens in df_themes["tokens_nostop"]]

# Fit a two-topic LDA model (10 passes, fixed random_state) and pull the ten
# highest-weighted words of each topic as (word, weight) pairs.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    random_state=42,
)
topics = lda_model.show_topics(num_topics=2, num_words=10, formatted=False)

# Print the topics with 1-based numbering, one word per line
for topic_id, word_weights in topics:
    print(f"\n--- Topic {topic_id+1} ---")
    for term, score in word_weights:
        print(f"{term:15s}  weight={score:.4f}")


## 9 – Part-of-Speech Tagging (Noun Extraction)


In [None]:

# PoS Tagging
nlp = spacy.load("en_core_web_sm")

# Stream all reviews through the pipeline with nlp.pipe, which processes texts
# in batches instead of invoking the full pipeline once per row. Each cell is
# still coerced with str(), exactly as before, so non-string values are tagged
# as their string form.
review_docs = nlp.pipe(str(text) for text in df_themes["clean_text"])

# Keep only the tokens tagged as NOUN; the explicit index keeps the new column
# aligned with the DataFrame's rows.
df_themes["nouns"] = pd.Series(
    [[token.text for token in doc if token.pos_ == "NOUN"] for doc in review_docs],
    index=df_themes.index,
)
df_themes[["clean_text", "nouns"]].head()



## 10 – Visualizations
### Sentiment Distribution (Pipeline vs TextBlob vs VADER)


In [None]:

# Visualizing Sentiment Distribution (Pipeline vs TextBlob vs VADER)
# One count plot per labelling method: (label column, palette, figure title)
sentiment_views = [
    ("sentiment_label", "viridis", "Pipeline Sentiment Distribution"),
    ("tb_sentiment", "plasma", "TextBlob Sentiment Distribution"),
    ("vader_sentiment", "coolwarm", "VADER Sentiment Distribution"),
]

for label_column, palette_name, figure_title in sentiment_views:
    plt.figure(figsize=(12, 4))
    sns.countplot(x=label_column, data=df_themes, palette=palette_name)
    plt.title(figure_title)
    plt.show()



### Themes Distribution


In [None]:

# Theme distribution visualization
# Count how often each "identified_themes" value occurs and plot the ten most
# frequent ones, with counts on the x-axis and themes on the y-axis.
top_themes = df_themes["identified_themes"].value_counts().nlargest(10)
theme_ax = sns.barplot(x=top_themes.values, y=top_themes.index, palette="Set2")
theme_ax.set_title("Top Themes Across Reviews")
theme_ax.set_xlabel("Number of Reviews")
theme_ax.set_ylabel("Theme")
plt.show()


### Sentiment Score vs Rating Visualization


In [None]:

# Sentiment Score vs Rating Visualization
# Two side-by-side scatter panels, each plotting star rating against one
# sentiment measure: (score column, panel title, y-axis label).
fig, axes = plt.subplots(1, 2, figsize=(12,4))
rating_panels = [
    ("sentiment_score", "Pipeline Sentiment vs Rating", "Sentiment Score"),
    ("tb_polarity", "TextBlob Polarity vs Rating", "Polarity (-1 to 1)"),
]

for panel_ax, (score_column, panel_title, y_label) in zip(axes, rating_panels):
    panel_ax.scatter(df_themes["rating"], df_themes[score_column])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Star Rating")
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()



## 11 – Save Processed Data (Optional)


In [None]:

# Save Processed Data (Optional)

# `os` is already imported in the setup cell, so it is not re-imported here.

# Save to project_root/data/preprocessed/
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # one level up from notebooks/
output_dir = os.path.join(project_root, "data", "preprocessed")
os.makedirs(output_dir, exist_ok=True)  # create the folder if it does not exist yet

# Write the fully annotated reviews without the DataFrame index column
output_path = os.path.join(output_dir, "sentiment_preprocessed.csv")
df_themes.to_csv(output_path, index=False)

print(f"Saved processed data to {output_path}")

