In [2]:
#import and Load models
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

# Load spaCy English model for text preprocessing
nlp = spacy.load("en_core_web_sm")

# Load DistilBERT SST-2 sentiment analysis pipeline from HuggingFace
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
#

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [3]:
"""
Load all banks’ cleaned review data and combine 
into one dataframe with a bank_name column for later grouping.
"""
# Load cleaned reviews CSVs for each bank
dashen_df = pd.read_csv("../Data/Dashen_Bank_cleaned.csv")
cbe_df = pd.read_csv("../Data/CBE_cleaned.csv")
boa_df = pd.read_csv("../data/BOA_cleaned.csv")

# Add bank_name column if not present
dashen_df['bank_name'] = 'Dashen Bank'
cbe_df['bank_name'] = 'CBE'
boa_df['bank_name'] = 'BOA'

# Combine all data into a single DataFrame for ease of processing
df = pd.concat([dashen_df, cbe_df, boa_df], ignore_index=True)

print(f"Total reviews loaded: {len(df)}")

Total reviews loaded: 1452


In [4]:
df

Unnamed: 0,date,rating,cleaned_review,bank_name
0,2025-06-09,5,this app is good for you guys,Dashen Bank
1,2025-06-09,5,wow,Dashen Bank
2,2025-06-08,5,kalid,Dashen Bank
3,2025-06-07,2,I like this mobile banking app very much Overa...,Dashen Bank
4,2025-06-06,3,love,Dashen Bank
...,...,...,...,...
1447,2024-05-30,1,I m sorry but what kind of stupid developer th...,BOA
1448,2024-05-29,1,What is the purpose or point of not allowing t...,BOA
1449,2024-05-28,1,Ayseram,BOA
1450,2024-05-28,1,Worst banking app ever,BOA


In [5]:
"""
Preprocessing includes lemmatization and stopword removal, 
which improves keyword extraction quality.
"""
def preprocess_text(text):
    """
    Preprocess the input text:
    - Lowercase
    - Remove stopwords
    - Lemmatize
    - Keep only alphabetic tokens
    """
    # Check if the input is a string
    if not isinstance(text, str):
        return ""
    
    # Use spaCy to process the text
    doc = nlp(text.lower())
    
    # Token filtering: remove stopwords and non-alphabetic tokens, apply lemmatization
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    
    # Join tokens back to a string
    return " ".join(tokens)



In [6]:
# Create a new column with cleaned text
df['cleaned_text'] = df['cleaned_review'].apply(preprocess_text)

In [14]:
# Ensure all inputs are strings and not empty
texts = df['cleaned_review'].astype(str).fillna('').tolist()

# Remove very short or empty reviews
texts = [t for t in texts if len(t.strip()) > 1]

# Run sentiment analysis on the cleaned, filtered list
results = sentiment_analyzer(texts)

# Update dataframe only for the rows you analyzed
filtered_df = df[df['cleaned_review'].astype(str).str.strip().apply(len) > 1].copy()

# Assign results
filtered_df['sentiment_label'] = [res['label'] for res in results]
filtered_df['sentiment_score'] = [res['score'] for res in results]

# Optional: Define NEUTRAL for low-confidence scores (<0.6)
def refine_sentiment(label, score):
    return "NEUTRAL" if score < 0.6 else label.upper()

filtered_df['sentiment_label'] = filtered_df.apply(
    lambda row: refine_sentiment(row['sentiment_label'], row['sentiment_score']), axis=1
)

# Merge results back into the main dataframe (optional but useful if you want full alignment)
df.update(filtered_df)

# Print results
print(filtered_df[['cleaned_review', 'sentiment_label', 'sentiment_score']].head())


                                      cleaned_review sentiment_label  \
0                      this app is good for you guys        POSITIVE   
1                                                wow        POSITIVE   
2                                              kalid        POSITIVE   
3  I like this mobile banking app very much Overa...        NEGATIVE   
4                                               love        POSITIVE   

   sentiment_score  
0         0.999817  
1         0.999592  
2         0.899888  
3         0.998102  
4         0.999874  


In [18]:
"""
agg_sentiment = df.groupby(['bank_name', 'rating'])['sentiment_score  '].mean().reset_index()
print("Average sentiment scores by bank and rating:\n", agg_sentiment)
"""
agg_sentiment = filtered_df.groupby(['bank_name', 'rating'])['sentiment_score'].mean().reset_index()
print("Average sentiment scores by bank and rating:\n", agg_sentiment)


Average sentiment scores by bank and rating:
       bank_name  rating  sentiment_score
0           BOA       1         0.982221
1           BOA       2         0.960065
2           BOA       3         0.973588
3           BOA       4         0.954884
4           BOA       5         0.970574
5           CBE       1         0.976689
6           CBE       2         0.977339
7           CBE       3         0.980845
8           CBE       4         0.958350
9           CBE       5         0.983203
10  Dashen Bank       1         0.994544
11  Dashen Bank       2         0.990873
12  Dashen Bank       3         0.988391
13  Dashen Bank       4         0.967959
14  Dashen Bank       5         0.986196


Unnamed: 0,date,rating,cleaned_review,bank_name,cleaned_text
0,2025-06-09,5,this app is good for you guys,Dashen Bank,app good guy
1,2025-06-09,5,wow,Dashen Bank,wow
2,2025-06-08,5,kalid,Dashen Bank,kalid
3,2025-06-07,2,I like this mobile banking app very much Overa...,Dashen Bank,like mobile banking app overall user interface...
4,2025-06-06,3,love,Dashen Bank,love
...,...,...,...,...,...
1447,2024-05-30,1,I m sorry but what kind of stupid developer th...,BOA,m sorry kind stupid developer think app fast d...
1448,2024-05-29,1,What is the purpose or point of not allowing t...,BOA,purpose point allow screenshot recipe transfer
1449,2024-05-28,1,Ayseram,BOA,ayseram
1450,2024-05-28,1,Worst banking app ever,BOA,bad bank app
