# import dependencies

In [7]:
import pandas as pd
!pip install transformers torch tqdm
!pip install transformers torch tqdm
from transformers import pipeline
from tqdm import tqdm
!pip install nltk scikit-learn spacy
!python -m nltk.downloader stopwords
!python -m spacy download en_core_web_sm
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from collections import defaultdict




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\addis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load Cleaned Data

In [8]:
df = pd.read_csv('../data/cleaned_reviews.csv')
df.head()


Unnamed: 0,review,rating,date,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,Commercial Bank of Ethiopia,Google Play
1,what is this app problem???,1,2025-06-05,Commercial Bank of Ethiopia,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,Commercial Bank of Ethiopia,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,good,4,2025-06-05,Commercial Bank of Ethiopia,Google Play


#  Sentiment Analysis with DistilBERT

In [9]:
# Load sentiment pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Apply to each review (batched)
tqdm.pandas()
df['sentiment'] = df['review'].progress_apply(lambda x: sentiment_pipeline(x[:512])[0]['label'])
df['sentiment_score'] = df['review'].progress_apply(lambda x: sentiment_pipeline(x[:512])[0]['score'])

df[['review', 'sentiment', 'sentiment_score']].head()


Device set to use cpu
100%|██████████| 1177/1177 [00:28<00:00, 41.39it/s]
100%|██████████| 1177/1177 [00:27<00:00, 42.92it/s]


Unnamed: 0,review,sentiment,sentiment_score
0,"""Why don’t your ATMs support account-to-accoun...",NEGATIVE,0.996465
1,what is this app problem???,NEGATIVE,0.999623
2,the app is proactive and a good connections.,POSITIVE,0.999868
3,I cannot send to cbebirr app. through this app.,NEGATIVE,0.995335
4,good,POSITIVE,0.999816


# Aggregate by Rating/Bank

In [3]:
# Grouping by rating to see sentiment distributions
sentiment_by_rating = df.groupby(['bank', 'rating'])['sentiment'].value_counts(normalize=True).unstack().fillna(0)
sentiment_by_rating

df.to_csv('../outputs/sentiment_results.csv', index=False)

NameError: name 'df' is not defined

# Preprocess Text for Thematic Analysis

In [11]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words and token.is_alpha])

# Preprocess
df['cleaned'] = df['review'].apply(clean_text)
df['lemmatized'] = df['cleaned'].apply(lemmatize)
df[['review', 'lemmatized']].head()


Unnamed: 0,review,lemmatized
0,"""Why don’t your ATMs support account-to-accoun...",atms support accounttoaccount transfer like co...
1,what is this app problem???,app problem
2,the app is proactive and a good connections.,app proactive good connection
3,I cannot send to cbebirr app. through this app.,I send cbebirr app app
4,good,good


# Extract Keywords with TF-IDF

In [12]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=100)
X = vectorizer.fit_transform(df['lemmatized'])
keywords = vectorizer.get_feature_names_out()
print(keywords[:20])


bank_keywords = defaultdict(list)
for bank in df['bank'].unique():
    sub_df = df[df['bank'] == bank]
    X_bank = vectorizer.fit_transform(sub_df['lemmatized'])
    top_idx = X_bank.sum(axis=0).A1.argsort()[-10:][::-1]
    bank_keywords[bank] = [vectorizer.get_feature_names_out()[i] for i in top_idx]

bank_keywords



['access' 'account' 'add' 'ahead' 'also' 'always' 'amazing' 'app'
 'app ever' 'app work' 'application' 'bad' 'bad app' 'bank' 'bank super'
 'banking' 'banking app' 'boa' 'cbe' 'convenient']


defaultdict(list,
            {'Commercial Bank of Ethiopia': ['app',
              'good',
              'good app',
              'nice',
              'cbe',
              'bank',
              'like',
              'use',
              'work',
              'easy'],
             'Bank of Abyssinia': ['app',
              'work',
              'good',
              'bank',
              'bad',
              'use',
              'boa',
              'please',
              'time',
              'banking'],
             'Dashen Bank': ['app',
              'good',
              'dashen',
              'bank',
              'super',
              'use',
              'banking',
              'fast',
              'one',
              'feature']})

# Manually Group Keywords into Themes

In [13]:
# Manually assign themes (rule-based)
def assign_theme(text):
    text = text.lower()
    if any(kw in text for kw in ['login', 'signin', 'password']):
        return 'Account Access'
    elif any(kw in text for kw in ['slow', 'delay', 'loading']):
        return 'Transaction Speed'
    elif any(kw in text for kw in ['ui', 'interface', 'design']):
        return 'UI/UX'
    elif any(kw in text for kw in ['support', 'help', 'customer']):
        return 'Customer Support'
    elif any(kw in text for kw in ['crash', 'bug', 'error']):
        return 'App Stability'
    else:
        return 'Other'

df['theme'] = df['lemmatized'].apply(assign_theme)

df[['review', 'sentiment', 'theme']].to_csv('../outputs/themes_by_bank.csv', index=False)



## Task 2 Summary

- Reviews analyzed: 1200+
- Sentiment labels assigned: 100%
- Number of themes: 5
