<h1> Customer Experience Analytics for Fintech Apps </h1>

A Real-World Data Engineering Challenge: Scraping, Analyzing, and Visualizing Google Play Store Reviews.

**Task-1: Data Collection and Preprocessing**

<li>Scrapes reviews from Google Play Store for 3 Ethiopian banks
<li>Cleans and preprocesses the data
<li>Saves as CSV

In [1]:
from google_play_scraper import Sort, reviews
import csv
from datetime import datetime
import logging
import time
import os

In [2]:
# Set up logging
logging.basicConfig(filename='scraper.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
def scrape_play_store_reviews(app_id, bank_name):
    logging.info(f"🔄 Fetching reviews for {bank_name}...")

    try:
        results, _ = reviews(
            app_id,
            lang='en',
            country='et', # Ethiopia
            sort=Sort.NEWEST,
            count=500,  # Aim for more than needed (400+)
            filter_score_with=None
        )

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'{bank_name.replace(" ", "_")}_reviews_{timestamp}.csv'

        # Write to CSV
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['review_id', 'review_text', 'rating', 'date', 'bank_name', 'source'])
            writer.writeheader()

            for i, entry in enumerate(results):
                writer.writerow({
                    'review_id': f"{bank_name.lower().replace(' ', '_')}_{i}",
                    'review_text': entry['content'],
                    'rating': entry['score'],
                    'date': entry['at'].strftime('%Y-%m-%d'),
                    'bank_name': bank_name,
                    'source': 'Google Play'
                })

        logging.info(f"✅ Saved {len(results)} reviews to {filename}")
        return filename
    except Exception as e:
        logging.error(f"Error occurred: {e}")
        return None

In [5]:
def preprocess_data():
    """Combine all CSV files and preprocess the data"""
    logging.info("Starting data preprocessing...")
    
    # Find all CSV files with reviews
    csv_files = [f for f in os.listdir() if f.endswith('.csv') and 'reviews' in f]
    
    if not csv_files:
        logging.error("No review CSV files found for preprocessing")
        return
        
    # Combine all CSVs into one dataset
    all_reviews = []
    for file in csv_files:
        with open(file, mode='r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                all_reviews.append(row)
    
    # Remove duplicates (based on review text and bank name)
    unique_reviews = {}
    for review in all_reviews:
        key = (review['review_text'], review['bank_name'])
        if key not in unique_reviews:
            unique_reviews[key] = review
    
    # Save combined, deduplicated dataset
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f'all_banks_reviews_clean_{timestamp}.csv'
    
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['review_id', 'review_text', 'rating', 'date', 'bank_name', 'source'])
        writer.writeheader()
        
        for review in unique_reviews.values():
            writer.writerow(review)
    
    logging.info(f"✅ Preprocessing complete. Saved {len(unique_reviews)} unique reviews to {output_file}")
    return output_file

In [6]:
def run_scraping():
    """Run scraping for all three banks"""
    banks = [
        {'app_id': 'com.dashen.dashensuperapp', 'name': 'Dashen Bank'},
        {'app_id': 'com.cbe.dfc.asc.android', 'name': 'Commercial Bank of Ethiopia'},
        {'app_id': 'com.boa.android', 'name': 'Bank of Abyssinia'}
    ]
    
    for bank in banks:
        scrape_play_store_reviews(bank['app_id'], bank['name'])
    
    # After scraping all banks, preprocess the data
    clean_file = preprocess_data()
    logging.info(f"Data collection and preprocessing complete: {clean_file}")

if __name__ == "__main__":
    run_scraping()

KeyError: 'bank_name'