In [228]:
from google_play_scraper import Sort, reviews
import pandas as pd
import time
import os
import sys
import spacy
from transformers import pipeline
from collections import defaultdict
from script.scrape_bank_review import *
from script.clean_review import *
#from script.analysis_utils import process_reviews, aggregate_sentiment, aggregate_themes, save_results
# Add the parent directory () to the Python path
# Add the root directory (parent of benin_eda_project/ and script/) to the Python path
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(root_dir)
print(f"Added {root_dir} to sys.path")

try:
    from script.scrape_bank_review import scrape_all_banks, scrape_bank_reviews
    print("Import successful.")
except ImportError as e:
    print(f"Import error: {e}")
    raise

print("Modules imported successfully.")

Added e:\KAIM to sys.path
Import successful.
Modules imported successfully.


##### bank apps

In [229]:
# Define the banks with their app IDs
banks = [
    {"name": "BOA", "app_id": "com.boa.boaMobileBanking"},
    {"name": "CBE", "app_id": "com.combanketh.mobilebanking"},
    {"name": "Dashen", "app_id": "com.dashen.dashensuperapp"}
]

In [230]:
# Scrape reviews and create DataFrames for each bank
df_CBE = None
df_BOA = None
df_Dashen = None


In [231]:
for bank in banks:
    bank_playstore_id = bank["app_id"]
    print(f"\nFetching reviews for {bank['name']} (ID: {bank_playstore_id})...\n")
    try:
        result, continuation_token = reviews(
            bank_playstore_id,
            lang="en",
            country="et",  # Ethiopia
            sort=Sort.NEWEST,
            count=400,  # Reduced to avoid rate limits
            filter_score_with=None
        )
        if not result:
            print(f"No reviews found for {bank['name']}.")
            continue

        # Format reviews for DataFrame
        reviews_data = [
            {
                "user_name": review.get('userName', ''),
                "rating": review.get('score', ''),
                "date": review.get('at').strftime('%Y-%m-%d %H:%M:%S') if review.get('at') else 'N/A',
                "review": review.get('content', ''),
                "review_id": review.get('reviewId', ''),
                "app_version": review.get('appVersion', ''),
                "replied_at": str(review.get('repliedAt', '—')),
                "reply_content": review.get('replyContent', '—'),
                "thumbs_up_count": review.get('thumbsUpCount', ''),
                "user_image_url": review.get('userImage', ''),
                "bank": bank["name"],
                "source": "Google Play"
            }
            for review in result
        ]

        # Create DataFrame for this bank's reviews
        df = pd.DataFrame(reviews_data)

        # Assign to the appropriate DataFrame variable
        if bank["name"] == "BOA":
            df_BOA = df
        elif bank["name"] == "CBE":
            df_CBE = df
        elif bank["name"] == "Dashen":
            df_Dashen = df

        # Print reviews for verification
        for i, review in enumerate(result, 1):
            print(f":small_blue_diamond: Review {i}")
            print(f"  :adult: User Name        : {review.get('userName', '')}")
            print(f"  :star: Rating           : {review.get('score', '')}")
            print(
                f"  :date: Date            : {review.get('at').strftime('%Y-%m-%d %H:%M:%S') if review.get('at') else 'N/A'}"
            )
            print(f"  :memo: Review Content   : {review.get('content', '')}")
            print(f"  :id: Review ID        : {review.get('reviewId', '')}")
            print(f"  :iphone: App Version      : {review.get('appVersion', '')}")
            print(f"  :repeat: Replied At       : {review.get('repliedAt', '—')}")
            print(f"  :speech_balloon: Reply Content    : {review.get('replyContent', '—')}")
            print(f"  :+1: Thumbs Up Count  : {review.get('thumbsUpCount', '')}")
            print(f"  :globe_with_meridians: User Image URL   : {review.get('userImage', '')}")
            print("-" * 70)

        # Save to CSV
        save_dir = os.path.join('..', 'data')
        os.makedirs(save_dir, exist_ok=True)
        file_path = os.path.join(save_dir, f"{bank['name']}_reviews.csv")
        df.to_csv(file_path, index=False)
        print(f"Created DataFrame df_{bank['name']} with {len(df)} reviews and saved to {file_path}")

        # Add delay to avoid rate limiting
        time.sleep(5)  # Increased delay to ensure scraper stability

    except Exception as e:
        print(f"Error fetching reviews for {bank['name']}: {e}")
        continue  # Ensure loop continues even if one bank fails

# Verify all DataFrames were created
if df_BOA is not None:
    print(f"\ndf_BOA contains {len(df_BOA)} reviews")
    print(df_BOA.head())
else:
    print("\ndf_BOA was not created")
if df_CBE is not None:
    print(f"\ndf_CBE contains {len(df_CBE)} reviews")
    print(df_CBE.head())
else:
    print("\ndf_CBE was not created")
if df_Dashen is not None:
    print(f"\ndf_Dashen contains {len(df_Dashen)} reviews")
    print(df_Dashen.head())
else:
    print("\ndf_Dashen was not created")


Fetching reviews for BOA (ID: com.boa.boaMobileBanking)...

:small_blue_diamond: Review 1
  :adult: User Name        : Phillmon Haftom
  :star: Rating           : 3
  :date: Date            : 2025-06-05 11:57:36
  :memo: Review Content   : it's not working
  :id: Review ID        : 937102c6-ae88-419c-994b-80e520faacb8
  :iphone: App Version      : 25.05.03
  :repeat: Replied At       : None
  :speech_balloon: Reply Content    : None
  :+1: Thumbs Up Count  : 0
  :globe_with_meridians: User Image URL   : https://play-lh.googleusercontent.com/a/ACg8ocJUHD1VlowMkpu5Ud6IlrkId8h-uGAlDtJtFS8bLkxHA1gDeA=mo
----------------------------------------------------------------------
:small_blue_diamond: Review 2
  :adult: User Name        : Robel Alebachew
  :star: Rating           : 1
  :date: Date            : 2025-06-03 16:21:34
  :memo: Review Content   : Hello, I’m facing a problem with the BOA Mobile app. Every time I enter my phone number and password, the app crashes and shows an error that

##### Create index for pre processing

In [None]:
df["review_id"] = df.index
df.tail()

Unnamed: 0,user_name,rating,date,review,review_id,app_version,replied_at,reply_content,thumbs_up_count,user_image_url,bank,source
395,Mahilet Eshetu,5,2025-01-17,Dashen yichalal. Ewnetem one step a head,395,1.0.1,,,3,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play
396,Leul Tube,4,2025-01-17,It has a Good performance but need more upgrad...,396,1.0.4,,,75,https://play-lh.googleusercontent.com/a-/ALV-U...,Dashen,Google Play
397,Fantabil Deresse,5,2025-01-17,It is a very wonderful work that has saved its...,397,1.0.4,,,5,https://play-lh.googleusercontent.com/a-/ALV-U...,Dashen,Google Play
398,Dawit Alemayehu,5,2025-01-17,“Life-changing!” I can’t imagine going back to...,398,,,,4,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play
399,Meba Abiye,5,2025-01-17,Pro max,399,,,,7,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play


### Preprocessing

In [None]:
clean_and_process_data(df)

In [216]:
combined_df = df
combined_df.shape
combined_df.tail()

Unnamed: 0,user_name,rating,date,review,review_id,app_version,replied_at,reply_content,thumbs_up_count,user_image_url,bank,source
395,Mahilet Eshetu,5,2025-01-17,Dashen yichalal. Ewnetem one step a head,395,1.0.1,,,3,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play
396,Leul Tube,4,2025-01-17,It has a Good performance but need more upgrad...,396,1.0.4,,,75,https://play-lh.googleusercontent.com/a-/ALV-U...,Dashen,Google Play
397,Fantabil Deresse,5,2025-01-17,It is a very wonderful work that has saved its...,397,1.0.4,,,5,https://play-lh.googleusercontent.com/a-/ALV-U...,Dashen,Google Play
398,Dawit Alemayehu,5,2025-01-17,“Life-changing!” I can’t imagine going back to...,398,,,,4,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play
399,Meba Abiye,5,2025-01-17,Pro max,399,,,,7,https://play-lh.googleusercontent.com/a/ACg8oc...,Dashen,Google Play


#### Clean Data for the sentiment analysis

In [224]:
df_banks = combined_df.drop(columns=["user_image_url", "thumbs_up_count", "app_version", "replied_at", "reply_content"])
df_banks = df_banks.rename(columns={"review_content": "review", "rating": "rating", "date": "date"})
df_banks.head()

Unnamed: 0,user_name,rating,date,review,review_id,bank,source
0,Adonijah,2,2025-06-07,I like this mobile banking app very much. Over...,0,Dashen,Google Play
1,OBSA KA,3,2025-06-06,love,1,Dashen,Google Play
2,Masersha,5,2025-06-03,መቸሸጠ,2,Dashen,Google Play
3,Koki Mulugeta,5,2025-06-03,wow,3,Dashen,Google Play
4,Badhasa Dassaalany,5,2025-06-01,gadaa,4,Dashen,Google Play


In [225]:
clean_file = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'clean_reviews.csv'))
sys.path.append(root_dir)
df_banks.to_csv(clean_file, index=False, encoding="utf-8")
print(f"Saved clean dataset to {clean_file}")

Saved clean dataset to e:\KAIM\Customer-Experience-Analytics-for-Fintech-Apps\data\clean_reviews.csv


Clean the data to the best format for further sentiment analysis