In [2]:
# Task 2 – Sentiment & Thematic Analysis
# Omega Consultancy Fintech Mobile App Review Analytics

# This notebook performs the following steps:

# ✔ Load processed review data
# ✔ Sentiment visualization
# ✔ TF-IDF keyword extraction
# ✔ Theme visualization
# ✔ Sample reviews per theme
# ✔ Business insights summary


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from config import DATA_PATHS

# Let rendered tables show up to 250 characters per cell so review text stays readable.
pd.set_option("display.max_colwidth", 250)

# Columns that the plotting, keyword-extraction and sampling cells of this notebook read.
REQUIRED_COLUMNS = {
    "bank",
    "rating",
    "review",
    "sentiment_label",
    "sentiment_score",
    "themes",
}

df = pd.read_csv(DATA_PATHS["sentiment_results"])

# Fail fast with a readable message instead of a KeyError deep inside a later cell.
missing_columns = REQUIRED_COLUMNS - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATHS['sentiment_results']} is missing required columns: "
        f"{sorted(missing_columns)}"
    )

print(f"Loaded {len(df)} processed reviews")
df.head()


In [None]:
# Overall count of reviews per sentiment label.
fig, ax = plt.subplots(figsize=(6, 4))
# seaborn 0.13 deprecated passing `palette` without `hue`. Colouring by the x
# variable itself keeps one viridis colour per label, and `dodge=False` keeps each
# bar centred on its tick on releases that would otherwise offset bars per hue level.
sns.countplot(
    data=df,
    x="sentiment_label",
    hue="sentiment_label",
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue only repeats the x-axis labels, so drop the legend if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Overall Sentiment Distribution")
plt.show()


In [None]:
# Sentiment label counts, one group of bars per bank.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df,
    x="bank",
    hue="sentiment_label",
    palette="viridis",
    ax=ax,
)
ax.set_title("Sentiment Distribution per Bank")
# Tilt the bank names so they do not overlap.
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [None]:
# Mean sentiment score of the reviews belonging to each bank, drawn as a bar chart.
fig, ax = plt.subplots(figsize=(6, 4))
mean_score_by_bank = df.groupby("bank")["sentiment_score"].mean()
mean_score_by_bank.plot.bar(ax=ax, color="teal")
ax.set_title("Average Sentiment Score per Bank")
ax.set_ylabel("Average Score")
plt.show()


In [None]:
## 2. TF-IDF Keyword Extraction
# Highlights the most frequently used terms in each bank's reviews.


In [None]:
# Maps each bank to at most 10 terms taken from that bank's reviews.
keywords_per_bank = {}

# A single groupby replaces re-filtering the whole frame once per bank.
# sort=False visits banks in order of first appearance; rows with a missing
# bank are not grouped and therefore get no entry.
for bank, bank_reviews in df.groupby("bank", sort=False)["review"]:
    # Drop missing reviews before casting: str(NaN) is the literal text "nan",
    # which would otherwise be tokenised and could surface as a keyword.
    reviews = bank_reviews.dropna().astype(str).tolist()

    # max_features keeps the terms with the highest term frequency across the
    # fitted reviews; the TF-IDF weights themselves are not read below.
    vectorizer = TfidfVectorizer(stop_words="english", max_features=10)

    try:
        vectorizer.fit(reviews)
        keywords = vectorizer.get_feature_names_out().tolist()
    except ValueError:
        # scikit-learn raises ValueError when no vocabulary can be built
        # (e.g. no reviews left, or only stop words); record no keywords.
        keywords = []

    keywords_per_bank[bank] = keywords

keywords_per_bank


In [None]:
## 3. Theme Distribution
# Themes summarize recurring feedback patterns for each bank.


In [None]:
# Number of reviews carrying each distinct value of the "themes" column, all banks combined.
fig, ax = plt.subplots(figsize=(8, 5))
theme_counts = df["themes"].value_counts()
theme_counts.plot.bar(ax=ax, color="purple")
ax.set_title("Theme Distribution Across All Banks")
# Tilt the theme names so they do not overlap.
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [None]:
# Theme counts, one group of bars per bank.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="bank", hue="themes", palette="tab20", ax=ax)
ax.set_title("Theme Distribution per Bank")
ax.tick_params(axis="x", labelrotation=45)
# Reposition the legend seaborn already drew rather than calling plt.legend(),
# which builds a fresh legend and drops the legend title. The anchor is kept at
# (1.3, 1) so the legend still sits to the right of the plotting area.
sns.move_legend(ax, "upper right", bbox_to_anchor=(1.3, 1))
plt.show()


In [None]:
## 4. Sample Reviews per Theme
# Sample reviews illustrate real user complaints and feedback for each theme.


In [None]:
# For every distinct theme value, print a header and show the first five matching reviews.
sample_columns = ["review", "rating", "bank"]

for theme in df["themes"].unique():
    header = f"\n----- Theme: {theme} -----\n"
    print(header)
    theme_rows = df.loc[df["themes"] == theme, sample_columns]
    display(theme_rows.head(5))


In [None]:
## 5. Business Insights Summary
# These insights support product and engineering decisions for the banks.


In [None]:
# Static, hand-written summary text: nothing in it is interpolated from `df`,
# so it has to be revised manually whenever the underlying data changes.
BUSINESS_INSIGHT_SUMMARY = """
=====================================================
TASK 2 – BUSINESS INSIGHT SUMMARY
=====================================================

1. Key Sentiment Insights:
- CBE and Dashen have more negative sentiment linked to speed & login issues.
- Positive reviews often praise UI simplicity and reliability.

2. Major Complaint Themes:
- Transaction Performance (slow loading, failed transfers)
- Account Access Issues (login errors, OTP failures)
- Customer Support delays

3. Positive Drivers:
- Clean UI design
- Ease of navigation
- Reliable balance checking

4. Recommendations:
- Improve app load speed & transfer stability
- Strengthen login & OTP systems
- Add biometric login, alerts & better support integration

=====================================================
Insights ready for Task 4 reporting.
=====================================================
"""

print(BUSINESS_INSIGHT_SUMMARY)
