### Sentiment and Thematic Analysis
#### Quantify review sentiment and identify themes to uncover satisfaction drivers and pain points.
* Sentiment Analysis with distilbert-base-uncased-finetuned-sst-2-english
* Thematic Analysis

#### Load The SCraped and Cleaned Data

In [1]:
# import package 
import pandas as pd
df=pd.read_csv("../data/processed/bank_reviews_clean.csv")
df.head(5)


Unnamed: 0,review_id,review,rating,date,bank,source
0,5112423d-e618-44ba-ba49-62677cb76cd6,This application is very important and advanta...,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store
1,bcb34681-1dd4-4781-b400-4393bb10b1d9,why didn't work this app?,1,2025-11-28,Commercial Bank of Ethiopia,Google Play Store
2,c69f051a-00f8-4144-8423-b7ebcd328d2d,The app makes our life easier. Thank you CBE!,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store
3,f8002d06-b5c5-4ed1-9d51-a9a379304cf8,the most advanced app. but how to stay safe?,5,2025-11-27,Commercial Bank of Ethiopia,Google Play Store
4,81000db5-aa51-467e-826c-fc96160e96a8,Good application,4,2025-11-27,Commercial Bank of Ethiopia,Google Play Store


#### Sentiment Analysis Using Huggingface DistilBert

In [2]:
from transformers import pipeline


sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

def get_sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return None
    result = sentiment_model(text[:500])[0]  # truncate very long reviews
    return result['label'], result['score']

df['sentiment_label'], df['sentiment_score'] = zip(*df['review'].apply(get_sentiment))


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [3]:
df.head(5)

Unnamed: 0,review_id,review,rating,date,bank,source,sentiment_label,sentiment_score
0,5112423d-e618-44ba-ba49-62677cb76cd6,This application is very important and advanta...,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.998468
1,bcb34681-1dd4-4781-b400-4393bb10b1d9,why didn't work this app?,1,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,NEGATIVE,0.999132
2,c69f051a-00f8-4144-8423-b7ebcd328d2d,The app makes our life easier. Thank you CBE!,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.999696
3,f8002d06-b5c5-4ed1-9d51-a9a379304cf8,the most advanced app. but how to stay safe?,5,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,NEGATIVE,0.95651
4,81000db5-aa51-467e-826c-fc96160e96a8,Good application,4,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.999855


#### Aggregate Sentiment by Bank and Rating

In [4]:
bank_rating_sentiment=df.groupby(['bank','rating']).agg({'sentiment_score':'mean'}).reset_index()
print(bank_rating_sentiment)


                           bank  rating  sentiment_score
0             Bank of Abyssinia       1         0.992800
1             Bank of Abyssinia       2         0.966670
2             Bank of Abyssinia       3         0.998618
3             Bank of Abyssinia       4         0.969993
4             Bank of Abyssinia       5         0.975590
5   Commercial Bank of Ethiopia       1         0.992261
6   Commercial Bank of Ethiopia       2         0.992607
7   Commercial Bank of Ethiopia       3         0.978476
8   Commercial Bank of Ethiopia       4         0.977407
9   Commercial Bank of Ethiopia       5         0.981973
10                  Dashen Bank       1         0.988763
11                  Dashen Bank       2         0.996925
12                  Dashen Bank       3         0.970207
13                  Dashen Bank       4         0.944521
14                  Dashen Bank       5         0.988043


#### Thematic Analysis

In [5]:

import sys
sys.path.append('..')
# import models necessary for thematic analysis
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scripts.config import DATA_PATHS, OUTPUT_FILE

#### Preprocess Text

In [6]:
def preprocess_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation, numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df['clean_review'] = df['review'].apply(preprocess_text)


#### TF-IDF Keyword Extraction

In [7]:
vectorizer = TfidfVectorizer(
    max_features=500,  # smaller feature set for small dataset
    stop_words='english',
    ngram_range=(1,2)
)
X = vectorizer.fit_transform(df['clean_review'].dropna())
keywords = vectorizer.get_feature_names_out()



#### KMeans Clustering for Topics

In [8]:
kmeans = KMeans(n_clusters=5, random_state=42)  # 5 themes
df['topic'] = kmeans.fit_predict(X)
df['topic'].head(5)


0    1
1    1
2    1
3    1
4    1
Name: topic, dtype: int32

#### Manual Theme Mapping

In [9]:
theme_mapping = {
     0: "Ease of Use / Performance",
    1: "General Praise / Mixed Feedback",
    2: "User Experience / App Features",
    3: "Bugs / Issues / Reliability",
    4: "Positive Feedback / UI Appreciation"
}

df['theme'] = df['topic'].map(theme_mapping)
df.head(5)


Unnamed: 0,review_id,review,rating,date,bank,source,sentiment_label,sentiment_score,clean_review,topic,theme
0,5112423d-e618-44ba-ba49-62677cb76cd6,This application is very important and advanta...,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.998468,this application is very important and advanta...,1,General Praise / Mixed Feedback
1,bcb34681-1dd4-4781-b400-4393bb10b1d9,why didn't work this app?,1,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,NEGATIVE,0.999132,why didnt work this app,1,General Praise / Mixed Feedback
2,c69f051a-00f8-4144-8423-b7ebcd328d2d,The app makes our life easier. Thank you CBE!,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.999696,the app makes our life easier thank you cbe,1,General Praise / Mixed Feedback
3,f8002d06-b5c5-4ed1-9d51-a9a379304cf8,the most advanced app. but how to stay safe?,5,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,NEGATIVE,0.95651,the most advanced app but how to stay safe,1,General Praise / Mixed Feedback
4,81000db5-aa51-467e-826c-fc96160e96a8,Good application,4,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,POSITIVE,0.999855,good application,1,General Praise / Mixed Feedback


#### Inspect Top Keywords per Theme (Optional)

In [10]:
for topic_num in df['topic'].unique():
    indices = df[df['topic'] == topic_num].index
    subset_X = X[indices]
    summed_tfidf = subset_X.sum(axis=0)
    words_scores = [(word, summed_tfidf[0, idx]) for idx, word in enumerate(keywords)]
    sorted_words = sorted(words_scores, key=lambda x: x[1], reverse=True)[:10]
    print(f"Top keywords for topic {topic_num}: {[w[0] for w in sorted_words]}")


Top keywords for topic 1: ['app', 'best', 'excellent', 'like', 'working', 'ok', 'best app', 'cbe', 'wow', 'bad']
Top keywords for topic 4: ['nice', 'nice app', 'app', 'use', 'best', 'app seen', 'seen', 'abdrug', 'abdrug bentahuuuu', 'abdulakim']
Top keywords for topic 2: ['good', 'good app', 'app', 'bad', 'experience', 'service', 'work', 'love new', 'love', 'new']
Top keywords for topic 0: ['easy', 'fast', 'easy use', 'app', 'use', 'fast app', 'simple', 'bank', 'service', 'fast service']
Top keywords for topic 3: ['bank', 'worst', 'app', 'worst app', 'dashen', 'dashen bank', 'mobile', 'fix', 'banking', 'dont']


#### Save Results to CSV

In [11]:
import os
output_file = os.path.join(DATA_PATHS['processed'], "bank_reviews_with_themes.csv")
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"[SAVED] Themed reviews → {output_file}")

[SAVED] Themed reviews → d:\week-2_Project\Customer_Experience_Analytics\data\processed\bank_reviews_with_themes.csv


In [12]:
df['review_id'].duplicated().sum()


np.int64(1034)

In [13]:
df.review_id.value_counts().head(10)


review_id
5112423d-e618-44ba-ba49-62677cb76cd6    3
bcb34681-1dd4-4781-b400-4393bb10b1d9    3
c69f051a-00f8-4144-8423-b7ebcd328d2d    3
f8002d06-b5c5-4ed1-9d51-a9a379304cf8    3
81000db5-aa51-467e-826c-fc96160e96a8    3
3d88a334-958c-4717-9f97-c5d46359e054    3
99d376ea-4824-4af9-a093-27360acc3a5c    3
f1861daf-a1ed-407a-9e7c-295edbb3877d    3
fd178fb7-7026-4d02-98a0-5c86c3bd56f5    3
571c66c6-fd18-437b-b8e5-8c443e9db2df    3
Name: count, dtype: int64