<h1> Customer Experience Analytics for Fintech Apps </h1>

A Real-World Data Engineering Challenge: Scraping, Analyzing, and Visualizing Google Play Store Reviews.

Task-1: Data Collection and Preprocessing

In [7]:
# import necessary libraries
import pandas as pd
from google_play_scraper import Sort, reviews
import csv
from datetime import datetime
import schedule
import logging
import time
import os

In [4]:
pip install pandas schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Set up logging.
# The log messages emitted by the scraper contain non-ASCII characters (emoji),
# so the log file is opened explicitly as UTF-8. Relying on the platform default
# encoding (a legacy code page on Windows) makes those records fail to encode.
# A FileHandler is passed via `handlers=` instead of `filename=` because the
# `encoding=` keyword of basicConfig is not available on older Python 3 versions.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('scraper.log', encoding='utf-8')],
)

In [9]:
# Mapping of Google Play package ID -> human-readable bank name.
# Insertion order determines the order in which the banks are processed.
# NOTE(review): confirm each package ID against the `id=` parameter of the
# app's Play Store URL — a wrong ID would leave that bank without reviews.
APP_DETAILS = dict([
    ('com.commercialbankofethiopia.mobilebanking', 'Commercial Bank of Ethiopia'),
    ('com.bankofabyssinia.mobilebanking', 'Bank of Abyssinia'),
    ('com.dashen.dashensuperapp', 'Dashen Bank'),
])


In [None]:
def _fetch_bank_reviews(app_id, bank_name):
    """Fetch the newest Google Play reviews for one app as flat records.

    Args:
        app_id: Google Play package ID passed to ``reviews``.
        bank_name: Label stored in the ``bank`` field of every record.

    Returns:
        A list of dicts with the keys ``review_text``, ``rating``, ``date``
        (formatted as ``YYYY-MM-DD``), ``bank`` and ``source``.
    """
    results, _ = reviews(
        app_id,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=4000,  # Over-fetch so that 400+ unique reviews per bank remain after cleaning
        filter_score_with=None
    )
    return [
        {
            'review_text': entry['content'],
            'rating': entry['score'],
            'date': entry['at'].strftime('%Y-%m-%d'),
            'bank': bank_name,
            'source': 'Google Play'
        }
        for entry in results
    ]


def _preprocess_reviews(records):
    """Deduplicate and clean scraped review records.

    Args:
        records: List of review dicts as produced by ``_fetch_bank_reviews``.

    Returns:
        A new DataFrame without duplicate (review_text, bank, date) rows,
        without rows missing ``review_text`` or ``rating``, and with ``date``
        normalized to ``YYYY-MM-DD`` strings.
    """
    raw = pd.DataFrame(records)

    # Remove duplicates (same text, same bank, same day).
    deduped = raw.drop_duplicates(subset=['review_text', 'bank', 'date'])
    logging.info(f"Removed {len(raw) - len(deduped)} duplicate reviews.")

    # Drop rows that lack the fields the downstream analysis depends on.
    cleaned = deduped.dropna(subset=['review_text', 'rating'])
    logging.info(f"Remaining reviews after dropping NaNs: {len(cleaned)}")

    # Dates are already formatted while scraping; the round trip through
    # datetime validates them and guarantees a uniform string format in the CSV.
    return cleaned.assign(
        date=pd.to_datetime(cleaned['date']).dt.strftime('%Y-%m-%d')
    )


def scrape_play_store_reviews():
    """Scrape, clean and save Google Play reviews for every app in APP_DETAILS.

    For each app the newest reviews are fetched; a failure for one app is
    logged and does not stop the remaining apps. The combined records are
    deduplicated, cleaned and written to a timestamped CSV file under
    ``data/`` (the directory is created when missing).

    Returns:
        None. When no reviews could be scraped at all, a warning is logged and
        no file is written.
    """
    all_reviews_data = []

    for app_id, bank_name in APP_DETAILS.items():
        logging.info(f"🔄 Fetching reviews for {bank_name} (App ID: {app_id})...")
        try:
            bank_records = _fetch_bank_reviews(app_id, bank_name)
        except Exception as e:
            # Best effort: keep going with the other banks, but record the
            # traceback so the cause can be diagnosed from the log file.
            logging.error(
                f"Error occurred while scraping {bank_name} (App ID: {app_id}): {e}",
                exc_info=True,
            )
            continue

        all_reviews_data.extend(bank_records)
        logging.info(f"✅ Fetched {len(bank_records)} reviews for {bank_name}.")
        if not bank_records:
            # An empty result without an error deserves attention — check the app ID.
            logging.warning(f"No reviews returned for {bank_name} (App ID: {app_id}).")

    if not all_reviews_data:
        logging.warning("No reviews were scraped. Exiting.")
        return

    df = _preprocess_reviews(all_reviews_data)

    # Save as CSV. The output directory is created first so that a fresh
    # checkout (without a data/ folder) does not fail at the very last step.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{output_dir}/all_banks_reviews_{timestamp}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    logging.info(f"✅ Saved {len(df)} unique and preprocessed reviews to {filename}")

# Tag used to identify the job registered by this cell on schedule's default
# (module-level) scheduler.
SCRAPER_JOB_TAG = 'play-store-scraper'

# The default scheduler keeps its jobs for the lifetime of the kernel, so
# re-running this cell would otherwise add a second, third, ... copy of the job.
# Clearing by tag makes the cell idempotent without touching unrelated jobs.
schedule.clear(SCRAPER_JOB_TAG)

# Different scheduling options (uncomment the one you want to use):
# schedule.every().day.at("01:00").do(scrape_play_store_reviews).tag(SCRAPER_JOB_TAG)  # Daily at 1 AM
# schedule.every(6).hours.do(scrape_play_store_reviews).tag(SCRAPER_JOB_TAG)           # Every 6 hours
# schedule.every().monday.do(scrape_play_store_reviews).tag(SCRAPER_JOB_TAG)           # Every Monday
schedule.every(1).minute.do(scrape_play_store_reviews).tag(SCRAPER_JOB_TAG)             # Every minute for testing

# To run immediately for testing without waiting for schedule
# scrape_play_store_reviews()

# This loop blocks the kernel until it is interrupted (Kernel -> Interrupt).
# The interrupt is handled so that the job is unregistered and the cell ends
# cleanly instead of leaving a traceback and a still-registered job behind.
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    schedule.clear(SCRAPER_JOB_TAG)
    logging.info("Scheduler stopped by user.")
