In [4]:
# Add src folder to path
import sys
sys.path.append('../src')

from scraper import ReviewScraper
from preprocessor import ReviewPreprocessor

# ------------------ SCRAPER ------------------
# Google Play app IDs keyed by the short bank label used downstream.
# Swap in different IDs here to scrape other apps.
banks = dict(
    CBE="com.combanketh.mobilebanking",
    BOA="com.boa.boaMobileBanking",
    Dashen="com.cr2.amolelight",
)

# Request 500 reviews per bank, then persist the unmodified scrape so the
# cleaning stage can be re-run without hitting the store again.
scraper = ReviewScraper(banks)
raw_df = scraper.scrape_reviews(count_per_bank=500)
scraper.save_reviews(raw_df, "raw_reviews.csv")

# ------------------ PREPROCESSING ------------------
preprocessor = ReviewPreprocessor(raw_df)

# Cleaning steps run in this fixed order: drop duplicate/missing rows first,
# then normalize the date values on what remains.
for step in (
    preprocessor.remove_duplicates_and_missing,
    preprocessor.normalize_dates,
):
    step()

# Write the cleaned reviews out for the analysis stages.
preprocessor.save_clean("clean_reviews.csv")

# ------------------ QUICK CHECK ------------------
import pandas as pd

# Reload from disk so the check reflects what was actually written,
# not the in-memory frame held by the preprocessor.
df = pd.read_csv("clean_reviews.csv")

# Review counts per bank.
per_bank_counts = df['bank'].value_counts()
print("Reviews per bank:\n", per_bank_counts)
print(f"Total reviews: {len(df)}")

# Fraction of missing cells per column; report the worst column as a percent.
missing_fraction_by_column = df.isna().mean()
missing_pct = missing_fraction_by_column.max() * 100
print(f"Maximum missing data percentage: {missing_pct:.2f}%")


Scraping reviews for CBE...
Scraping reviews for BOA...
Scraping reviews for Dashen...
Total reviews scraped: 1500
Saved raw_reviews.csv
Saved clean_reviews.csv
Reviews per bank:
 bank
BOA       408
CBE       403
Dashen    375
Name: count, dtype: int64
Total reviews: 1186
Maximum missing data percentage: 0.00%
