In [1]:
# add src to path if needed
import sys, os
sys.path.append('../src')

import pandas as pd
from sentiment import SentimentAnalyzer
from themes import ThemeExtractor

# 1) Read the cleaned review dataset produced in Task 1.
#    The path is relative to the notebook's working directory; adjust it
#    (e.g. "../notebook/clean_reviews.csv") if the CSV lives elsewhere.
df = pd.read_csv("clean_reviews.csv")

# Sanity check: how many reviews were loaded for each bank
bank_counts = df['bank'].value_counts()
print(bank_counts)

# 2) Sentiment analysis
# NOTE(review): per the original author's note, method="auto" tries a
# transformer model first and falls back to VADER when transformers is not
# installed, and device=None means CPU -- confirm against src/sentiment.py.
sa = SentimentAnalyzer(method="auto", device=None)
sent_df = sa.analyze_series(df['review'], batch_size=64)

# Put the sentiment columns next to the reviews. Both frames get a fresh
# 0..n-1 index first so the column-wise concat pairs rows by position.
aligned_frames = [frame.reset_index(drop=True) for frame in (df, sent_df)]
df = pd.concat(aligned_frames, axis=1)

# Persist the intermediate result so the sentiment step need not be re-run
df.to_csv("task2_with_sentiment.csv", index=False)
print("Saved task2_with_sentiment.csv")

# Basic aggregation: review count and mean sentiment_score for every
# (bank, rating) pair, showing the 20 most populated groups first
scores_by_group = df.groupby(['bank', 'rating'])['sentiment_score']
agg = scores_by_group.agg(['count', 'mean'])
largest_groups = agg.sort_values('count', ascending=False).head(20)
print(largest_groups)

# 3) Thematic analysis: ask the ThemeExtractor for scored keywords per bank.
#    The result is consumed below as {bank: [(keyword, score), ...]}.
te = ThemeExtractor(ngram_range=(1, 2), top_k_keywords=30, min_df=2)
bank_keywords = te.extract_bank_keywords(df, bank_col='bank', text_col='review')

# Show the first 15 (keyword, score) pairs returned for each bank
for bank_name, scored_terms in bank_keywords.items():
    print(f"\nTop keywords for {bank_name}:")
    for term, term_score in scored_terms[:15]:
        print(f"  {term} ({term_score:.2f})")

# 4) Rule-based mapping from theme name to trigger keywords (edit to taste)
theme_mapping = {
    "Account Access Issues": [
        "login", "password", "fingerprint", "pin", "authenticate", "access",
    ],
    "Transaction Performance": [
        "slow", "delay", "transfer", "processing", "timeout", "speed",
    ],
    "UI & UX": [
        "ui", "interface", "layout", "design", "buttons", "navigation",
    ],
    "Crashes & Stability": [
        "crash", "freeze", "bug", "error", "exception",
    ],
    "Customer Support": [
        "support", "customer service", "help", "agent", "response", "call",
    ],
}

# Derive suggested themes for each bank from its extracted keywords
bank_themes = {}
for bank_name, scored_terms in bank_keywords.items():
    top_terms = [term for term, _ in scored_terms]
    suggestions = te.rule_based_theme_mapping(top_terms, theme_mapping)
    # Fallback: when the rules yield fewer than 2 themes, cluster the first
    # 30 keywords into 3 groups and use generic "cluster_<key>" labels.
    if len(suggestions) < 2:
        keyword_clusters = te.cluster_keywords(top_terms[:30], n_clusters=3)
        suggestions = [
            f"cluster_{cluster_key}" for cluster_key in keyword_clusters.keys()
        ]
    bank_themes[bank_name] = suggestions

print("\nSuggested themes per bank:")
for bank_name, suggestions in bank_themes.items():
    print(bank_name, "=>", suggestions)

# 5) Assign themes to individual reviews via keyword matching (simple)
def assign_themes_to_review(text, mapping):
    """Return the themes whose keywords occur in a review.

    Matching is case-insensitive and anchored at the start of a word, so a
    keyword matches itself and its inflections ("crash" -> "crashes") but not
    an unrelated word that merely contains it ("ui" in "quite", "pin" in
    "opinion"). Multi-word keywords such as "customer service" are matched as
    written.

    Args:
        text: The review text. ``None`` is treated as an empty review; any
            other non-string value is converted with ``str()``.
        mapping: Dict mapping a theme name to an iterable of keywords.

    Returns:
        The matched theme names sorted alphabetically and joined with ";",
        or "Other" when no keyword matches.
    """
    import re  # local import: the file's import cell does not bring in `re`

    if text is None:
        return "Other"
    text_l = str(text).lower()

    def _mentions(keyword):
        # (?<!\w) rejects matches that start in the middle of a word; unlike
        # \b it also behaves for keywords beginning with a non-word character.
        pattern = r"(?<!\w)" + re.escape(str(keyword).lower())
        return re.search(pattern, text_l) is not None

    themes = sorted(
        theme
        for theme, kws in mapping.items()
        if any(_mentions(kw) for kw in kws)
    )
    return ";".join(themes) if themes else "Other"

# Tag every review with its matching themes. The column is cast to str
# first so the helper always receives a string, even for missing reviews.
review_texts = df['review'].astype(str)
df['identified_themes'] = review_texts.apply(
    assign_themes_to_review, args=(theme_mapping,)
)

# Write the final Task 2 CSV, restricted to the deliverable columns
out_cols = [
    'review', 'rating', 'date', 'bank', 'source',
    'sentiment_label', 'sentiment_score', 'identified_themes',
]
task2_results = df[out_cols]
task2_results.to_csv("task2_results.csv", index=False)
print("Saved task2_results.csv")


bank
BOA       408
CBE       403
Dashen    375
Name: count, dtype: int64


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Saved task2_with_sentiment.csv
               count      mean
bank   rating                 
CBE    5         252  0.480916
Dashen 5         229  0.510104
BOA    1         175 -0.869354
       5         172  0.366170
CBE    1          70 -0.681755
Dashen 1          59 -0.774107
CBE    4          37  0.022754
Dashen 4          35  0.179971
       3          34 -0.003815
BOA    3          29 -0.444899
CBE    3          23 -0.474668
       2          21 -0.612910
BOA    4          19 -0.362784
Dashen 2          18 -0.628444
BOA    2          13 -0.945256

Top keywords for BOA:
  app (36.25)
  good (19.40)
  bank (12.43)
  boa (10.83)
  working (9.71)
  work (9.60)
  best (9.13)
  mobile (8.32)
  worst (8.31)
  use (8.04)
  great (7.88)
  banking (7.55)
  doesn (7.49)
  like (6.85)
  bad (6.61)

Top keywords for CBE:
  app (33.92)
  good (29.72)
  best (17.02)
  cbe (12.45)
  bank (11.00)
  nice (10.12)
  like (8.38)
  application (7.29)
  good app (7.17)
  update (6.70)
  excellent (6.19)