# App Setup

In [2]:
apps = {
    "CBE": ("com.combanketh.mobilebanking", "Commercial Bank of Ethiopia"),
    "BOA": ("com.boa.boaMobileBanking", "Bank of Abyssinia"),
    "Dashen": ("com.dashen.dashensuperapp", "Dashen Bank")
}

# Scraping Code

In [3]:
from google_play_scraper import Sort, reviews
import pandas as pd
from datetime import datetime

all_reviews = []

for short_name, (app_id, bank_name) in apps.items():
    print(f"🔄 Fetching reviews for {bank_name}")
    result, _ = reviews(
        app_id,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=500  # Can go up to 5000
    )
    for entry in result:
        all_reviews.append({
            'review': entry['content'],
            'rating': entry['score'],
            'date': entry['at'].strftime('%Y-%m-%d'),
            'bank': bank_name,
            'source': 'Google Play'
        })

# Convert to DataFrame
df_raw = pd.DataFrame(all_reviews)
df_raw.to_csv("../data/raw_reviews.csv", index=False)
df_raw.head()


🔄 Fetching reviews for Commercial Bank of Ethiopia
🔄 Fetching reviews for Bank of Abyssinia
🔄 Fetching reviews for Dashen Bank


Unnamed: 0,review,rating,date,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,Commercial Bank of Ethiopia,Google Play
1,what is this app problem???,1,2025-06-05,Commercial Bank of Ethiopia,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,Commercial Bank of Ethiopia,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,good,4,2025-06-05,Commercial Bank of Ethiopia,Google Play


#  Data Preprocessing

I will:
- Remove duplicate reviews
- Drop missing entries
- Normalize the date format to `YYYY-MM-DD`


In [4]:
# Drop duplicates and nulls
df_cleaned = df_raw.drop_duplicates(subset='review')
df_cleaned.dropna(subset=['review', 'rating', 'date', 'bank'], inplace=True)

# Normalize date
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], errors='coerce')
df_cleaned.dropna(subset=['date'], inplace=True)
df_cleaned['date'] = df_cleaned['date'].dt.strftime('%Y-%m-%d')

# Save cleaned dataset
df_cleaned.to_csv("../data/cleaned_reviews.csv", index=False)
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=['review', 'rating', 'date', 'bank'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=['date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

Unnamed: 0,review,rating,date,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,Commercial Bank of Ethiopia,Google Play
1,what is this app problem???,1,2025-06-05,Commercial Bank of Ethiopia,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,Commercial Bank of Ethiopia,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,good,4,2025-06-05,Commercial Bank of Ethiopia,Google Play


# Result Summary

In [8]:
print(f"- Total reviews scraped: {len(df_raw)}")
print(f"- Reviews after cleaning: {len(df_cleaned)}")
print("- Clean dataset saved to: data/cleaned_reviews.csv")

- Total reviews scraped: 1448
- Reviews after cleaning: 1177
- Clean dataset saved to: data/cleaned_reviews.csv
