Task 4: Insights and Recommendations

<ul>
<li>Generates visualizations (sentiment distribution, rating distribution, themes)</li>
<li>Identifies key insights, drivers, and pain points</li>
<li>Creates a summary report with recommendations</li>
</ul>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
from datetime import datetime
import logging
import os

In [None]:
# Send INFO-and-above log records to insights.log, prefixed with time and level
logging.basicConfig(
    filename='insights.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [None]:
# Make sure the 'visualizations' output folder exists before any figure is
# saved; exist_ok=True keeps re-runs from failing when it is already there.
os.makedirs('visualizations', exist_ok=True)

In [None]:

def load_data(csv_path):
    """Read the analyzed-review CSV at ``csv_path`` into a DataFrame.

    Returns the DataFrame on success. If anything goes wrong while reading,
    the error is logged and ``None`` is returned instead of raising.
    """
    try:
        reviews = pd.read_csv(csv_path)
        logging.info(f"Loaded {len(reviews)} analyzed reviews from {csv_path}")
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
        reviews = None
    return reviews

In [None]:

def generate_sentiment_distribution(df):
    """Chart each bank's share of reviews per sentiment label.

    Reads the ``bank_name`` and ``sentiment_label`` columns of ``df`` and
    saves a grouped bar chart to
    ``visualizations/sentiment_distribution.png``.

    Returns the per-(bank, sentiment) table with ``count``, ``count_total``
    and ``percentage`` columns.
    """
    logging.info("Generating sentiment distribution visualization...")

    # Number of reviews for every (bank, sentiment) pair
    per_label = (
        df.groupby(['bank_name', 'sentiment_label'])
        .size()
        .reset_index(name='count')
    )

    # Attach each bank's overall total so counts can be turned into shares
    per_bank = per_label.groupby('bank_name')['count'].sum().reset_index()
    sentiment_counts = pd.merge(
        per_label,
        per_bank,
        on='bank_name',
        suffixes=('', '_total'),
    )
    share = sentiment_counts['count'] / sentiment_counts['count_total']
    sentiment_counts['percentage'] = share * 100

    # Banks along the x-axis, one bar per sentiment label
    plt.figure(figsize=(12, 8))
    sns.barplot(
        x='bank_name',
        y='percentage',
        hue='sentiment_label',
        data=sentiment_counts,
    )
    plt.title('Sentiment Distribution by Bank', fontsize=16)
    plt.xlabel('Bank', fontsize=14)
    plt.ylabel('Percentage of Reviews', fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(title='Sentiment')
    plt.tight_layout()

    # Write the figure to disk and release it
    output_path = 'visualizations/sentiment_distribution.png'
    plt.savefig(output_path, dpi=300)
    plt.close()
    logging.info(f"Sentiment distribution visualization saved to {output_path}")

    return sentiment_counts

In [None]:

def generate_rating_distribution(df):
    """Chart the share of each star rating per bank and log mean ratings.

    Reads the ``bank_name`` and ``rating`` columns of ``df`` and saves a
    grouped bar chart to ``visualizations/rating_distribution.png``.

    Returns a tuple ``(rating_counts, avg_ratings)``: the per-(bank, rating)
    table with ``count``, ``count_total`` and ``percentage`` columns, and a
    table holding the mean rating of each bank.
    """
    logging.info("Generating rating distribution visualization...")

    # Number of reviews for every (bank, rating) pair
    per_rating = (
        df.groupby(['bank_name', 'rating'])
        .size()
        .reset_index(name='count')
    )

    # Attach each bank's overall total so counts can be turned into shares
    per_bank = per_rating.groupby('bank_name')['count'].sum().reset_index()
    rating_counts = pd.merge(
        per_rating,
        per_bank,
        on='bank_name',
        suffixes=('', '_total'),
    )
    share = rating_counts['count'] / rating_counts['count_total']
    rating_counts['percentage'] = share * 100

    # Ratings along the x-axis, one bar per bank
    plt.figure(figsize=(12, 8))
    sns.barplot(
        x='rating',
        y='percentage',
        hue='bank_name',
        data=rating_counts,
    )
    plt.title('Rating Distribution by Bank', fontsize=16)
    plt.xlabel('Rating (1-5 stars)', fontsize=14)
    plt.ylabel('Percentage of Reviews', fontsize=14)
    plt.legend(title='Bank')
    plt.tight_layout()

    # Write the figure to disk and release it
    output_path = 'visualizations/rating_distribution.png'
    plt.savefig(output_path, dpi=300)
    plt.close()
    logging.info(f"Rating distribution visualization saved to {output_path}")

    # Mean star rating of each bank, logged as a list of records
    avg_ratings = df.groupby('bank_name')['rating'].mean().reset_index()
    logging.info(f"Average ratings: {avg_ratings.to_dict('records')}")

    return rating_counts, avg_ratings

In [None]:
def generate_theme_distribution(df):
    """Chart how often the five most common themes occur for each bank.

    Reads the ``bank_name`` and ``identified_themes`` columns of ``df``.
    ``identified_themes`` holds semicolon-separated theme names; a missing
    value is counted as the single theme ``'Unclassified'`` and empty
    fragments are ignored. The chart is saved to
    ``visualizations/theme_distribution.png``.

    Returns the per-(bank, theme) count table for all themes (not only the
    top five). When no theme is available at all, a warning is logged, no
    figure is written and the empty table is returned.
    """
    logging.info("Generating theme distribution visualization...")

    # Split the theme strings and give every theme its own row. This is done
    # column-wise instead of iterating over rows, and it keeps the expected
    # columns even when ``df`` has no rows.
    theme_df = (
        df[['bank_name', 'identified_themes']]
        .rename(columns={'identified_themes': 'theme'})
        .reset_index(drop=True)
        .assign(
            theme=lambda frame: frame['theme']
            .astype(object)
            .fillna('Unclassified')
            .str.split(';')
        )
        .explode('theme')
    )
    # Drop empty fragments (e.g. from a trailing ';') and cells that could not
    # be split because they were not strings
    theme_df = theme_df[theme_df['theme'].notna() & (theme_df['theme'] != '')]

    # Count themes by bank
    theme_counts = theme_df.groupby(['bank_name', 'theme']).size().reset_index(name='count')

    if theme_counts.empty:
        logging.warning("No themes available for theme distribution visualization")
        return theme_counts

    # Keep only the five themes with the highest total count across all banks
    top_themes = theme_counts.groupby('theme')['count'].sum().nlargest(5).index.tolist()
    filtered_themes = theme_counts[theme_counts['theme'].isin(top_themes)]

    # Themes along the x-axis, one bar per bank
    plt.figure(figsize=(12, 8))
    sns.barplot(x='theme', y='count', hue='bank_name', data=filtered_themes)
    plt.title('Top 5 Themes by Bank', fontsize=16)
    plt.xlabel('Theme', fontsize=14)
    plt.ylabel('Number of Reviews', fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(title='Bank')
    plt.tight_layout()

    # Save visualization
    output_path = 'visualizations/theme_distribution.png'
    plt.savefig(output_path, dpi=300)
    plt.close()
    logging.info(f"Theme distribution visualization saved to {output_path}")

    return theme_counts

In [None]:

def generate_sentiment_by_theme(df):
    """Chart the sentiment breakdown of the five most common themes.

    Reads the ``bank_name``, ``sentiment_label`` and ``identified_themes``
    columns of ``df``. ``identified_themes`` holds semicolon-separated theme
    names; a missing value is counted as ``'Unclassified'``, empty fragments
    are ignored, and a missing sentiment label is treated as ``'NEUTRAL'``.
    The chart is saved to ``visualizations/sentiment_by_theme.png``.

    Returns the per-(theme, sentiment) count table for all themes (not only
    the top five). When no theme is available at all, a warning is logged, no
    figure is written and the empty table is returned.
    """
    logging.info("Generating sentiment by theme visualization...")

    # Split the theme strings and give every theme its own row. This is done
    # column-wise instead of iterating over rows, and it keeps the expected
    # columns even when ``df`` has no rows.
    theme_sentiment_df = (
        df[['bank_name', 'identified_themes', 'sentiment_label']]
        .rename(columns={'identified_themes': 'theme'})
        .reset_index(drop=True)
        .assign(
            theme=lambda frame: frame['theme']
            .astype(object)
            .fillna('Unclassified')
            .str.split(';'),
            sentiment_label=lambda frame: frame['sentiment_label']
            .astype(object)
            .fillna('NEUTRAL'),
        )
        .explode('theme')
    )
    # Drop empty fragments (e.g. from a trailing ';') and cells that could not
    # be split because they were not strings
    keep = theme_sentiment_df['theme'].notna() & (theme_sentiment_df['theme'] != '')
    theme_sentiment_df = theme_sentiment_df[keep]

    # Count by theme and sentiment
    theme_sentiment_counts = (
        theme_sentiment_df.groupby(['theme', 'sentiment_label'])
        .size()
        .reset_index(name='count')
    )

    if theme_sentiment_counts.empty:
        logging.warning("No themes available for sentiment by theme visualization")
        return theme_sentiment_counts

    # Keep only the five themes mentioned most often overall
    top_themes = theme_sentiment_df.groupby('theme').size().nlargest(5).index.tolist()
    filtered_data = theme_sentiment_counts[theme_sentiment_counts['theme'].isin(top_themes)]

    # Themes along the x-axis, one bar per sentiment label
    plt.figure(figsize=(12, 8))
    sns.barplot(x='theme', y='count', hue='sentiment_label', data=filtered_data)
    plt.title('Sentiment Distribution by Theme', fontsize=16)
    plt.xlabel('Theme', fontsize=14)
    plt.ylabel('Number of Reviews', fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(title='Sentiment')
    plt.tight_layout()

    # Save visualization
    output_path = 'visualizations/sentiment_by_theme.png'
    plt.savefig(output_path, dpi=300)
    plt.close()
    logging.info(f"Sentiment by theme visualization saved to {output_path}")

    return theme_sentiment_counts

In [None]:

def generate_word_clouds(df):
    """Save one word-cloud image per bank built from its review texts.

    Reads the ``bank_name`` and ``review_text`` columns of ``df``. For every
    distinct bank, the non-missing reviews are joined into one text and
    rendered to ``visualizations/wordcloud_<bank>.png`` (spaces in the bank
    name become underscores). Banks without usable text are skipped with a
    warning. Returns ``None``.
    """
    logging.info("Generating word clouds...")

    for bank in df['bank_name'].unique():
        reviews = df.loc[df['bank_name'] == bank, 'review_text'].dropna()

        # Cast to str so a review parsed as a number cannot break the join
        text = ' '.join(reviews.astype(str))

        # strip() also catches text made up of whitespace-only reviews
        if not text.strip():
            logging.warning(f"No text available for {bank} word cloud")
            continue

        try:
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                max_words=100,
            ).generate(text)
        except ValueError as exc:
            # WordCloud raises ValueError when the text yields no words to
            # draw; skip this bank instead of aborting the remaining ones.
            logging.warning(f"Could not build word cloud for {bank}: {exc}")
            continue

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for {bank}', fontsize=16)
        plt.tight_layout()

        # str() keeps non-string bank identifiers from failing on .replace
        safe_name = str(bank).replace(" ", "_")
        output_path = f'visualizations/wordcloud_{safe_name}.png'
        plt.savefig(output_path, dpi=300)
        plt.close()
        logging.info(f"Word cloud for {bank} saved to {output_path}")

In [None]:
def _top_themes(bank_df, sentiment, limit=3):
    """Return the ``limit`` most frequent themes among reviews with ``sentiment``.

    The result maps theme name to its number of occurrences. Missing theme
    cells and empty fragments (e.g. from a trailing ';') are ignored.
    """
    theme_cells = bank_df.loc[
        bank_df['sentiment_label'] == sentiment, 'identified_themes'
    ].dropna()
    themes = [
        theme
        for cell in theme_cells
        for theme in cell.split(';')
        if theme  # skip empty fragments, as the visualizations do
    ]
    # dtype=object keeps an empty list from producing a dtype-less Series
    return pd.Series(themes, dtype=object).value_counts().nlargest(limit).to_dict()


def identify_key_insights(df):
    """Identify key insights, drivers, and pain points.

    Reads the ``bank_name``, ``rating``, ``sentiment_label`` and
    ``identified_themes`` columns of ``df``.

    Returns a dict with four entries:

    - ``'average_ratings'``: mean rating per bank.
    - ``'sentiment_distribution'``: review counts keyed by sentiment label,
      then by bank (0 where a bank has no review with that label).
    - ``'drivers'``: per bank, the three most frequent themes among reviews
      labelled ``'POSITIVE'``, mapped to their counts.
    - ``'pain_points'``: the same for reviews labelled ``'NEGATIVE'``.
    """
    logging.info("Identifying key insights...")

    insights = {}

    # Average ratings
    insights['average_ratings'] = df.groupby('bank_name')['rating'].mean().to_dict()

    # Sentiment distribution
    sentiment_dist = df.groupby(['bank_name', 'sentiment_label']).size().unstack().fillna(0)
    insights['sentiment_distribution'] = sentiment_dist.to_dict()

    # Drivers come from positive reviews, pain points from negative ones
    drivers = {}
    pain_points = {}
    for bank in df['bank_name'].unique():
        bank_df = df[df['bank_name'] == bank]
        drivers[bank] = _top_themes(bank_df, 'POSITIVE')
        pain_points[bank] = _top_themes(bank_df, 'NEGATIVE')

    insights['drivers'] = drivers
    insights['pain_points'] = pain_points

    # Log insights
    logging.info(f"Key insights: {insights}")

    return insights

In [None]:

def generate_recommendations(insights):
    """Turn each bank's pain-point themes into recommendation sentences.

    ``insights['pain_points']`` maps a bank to a mapping keyed by theme name.
    For every theme that has a canned recommendation, that sentence is added
    to the bank's list in the order the themes appear; other themes add
    nothing.

    Returns a dict mapping each bank to its list of recommendations.
    """
    logging.info("Generating recommendations...")

    # Canned advice for the themes this function knows how to address
    advice_by_theme = {
        'Performance': "Improve app performance and loading times to reduce user frustration",
        'UI/UX': "Redesign user interface for better navigation and ease of use",
        'Features': "Add more requested features and improve existing functionality",
        'Security': "Enhance security measures while maintaining ease of access",
        'Support': "Improve customer support response times and resolution processes",
    }

    recommendations = {
        bank: [
            advice_by_theme[theme]
            for theme in pain_points
            if theme in advice_by_theme
        ]
        for bank, pain_points in insights['pain_points'].items()
    }

    logging.info(f"Recommendations: {recommendations}")

    return recommendations

In [None]:

def create_summary_report(df, insights, recommendations):
    """Write the Markdown summary report and return its path.

    Args:
        df: The analyzed reviews; only ``len(df)`` is used, for the overview.
        insights: Dict with ``'average_ratings'`` (bank -> mean rating),
            ``'drivers'`` and ``'pain_points'`` (bank -> {theme: count}).
        recommendations: Dict mapping each bank to a list of recommendation
            sentences.

    Returns:
        The path of the written file,
        ``'Customer_Experience_Analytics_Report.md'`` in the working
        directory.
    """
    logging.info("Creating summary report...")

    # Collect the pieces in a list and join once instead of growing a string
    parts = [
        "# Customer Experience Analytics Report\n\n",
        f"**Date:** {datetime.now().strftime('%Y-%m-%d')}\n\n",
        "## Overview\n\n",
        # NOTE(review): the number of banks is hard-coded in this sentence
        f"This report analyzes {len(df)} user reviews from the Google Play Store for three Ethiopian banks' mobile apps.\n\n",
        "## Summary of Findings\n\n",
    ]

    # Average ratings
    parts.append("### Average Ratings\n\n")
    parts.extend(
        f"- **{bank}:** {rating:.2f}/5.0\n"
        for bank, rating in insights['average_ratings'].items()
    )
    parts.append("\n")

    # Drivers and pain points share the same per-bank "theme: count" layout
    theme_sections = (
        ("### Key Satisfaction Drivers\n\n", insights['drivers']),
        ("### Key Pain Points\n\n", insights['pain_points']),
    )
    for heading, themes_by_bank in theme_sections:
        parts.append(heading)
        for bank, themes in themes_by_bank.items():
            parts.append(f"**{bank}:**\n")
            parts.extend(
                f"- {theme}: {count} mentions\n"
                for theme, count in themes.items()
            )
            parts.append("\n")

    # Recommendations
    parts.append("## Recommendations\n\n")
    for bank, recs in recommendations.items():
        parts.append(f"**{bank}:**\n")
        parts.extend(f"- {rec}\n" for rec in recs)
        parts.append("\n")

    # Note on ethics and limitations
    parts.extend([
        "## Limitations and Ethical Considerations\n\n",
        "- App store reviews may have selection bias, with users more likely to leave reviews when they have extremely positive or negative experiences.\n",
        "- The sentiment analysis model may not capture nuances in Ethiopian context or banking-specific terminology.\n",
        "- Theme identification is based on predefined categories and may miss emerging concerns.\n",
    ])

    # Explicit UTF-8 so non-ASCII bank or theme names are written the same
    # way on every platform instead of depending on the locale default.
    output_path = 'Customer_Experience_Analytics_Report.md'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    logging.info(f"Summary report saved to {output_path}")

    return output_path

In [None]:

def run_insights_generation(csv_path):
    """Run the whole pipeline for the analyzed-review CSV at ``csv_path``.

    Loads the data, produces every visualization, derives insights and
    recommendations, and writes the summary report. If the data cannot be
    loaded, nothing else is done. Returns ``None``.
    """
    logging.info(f"Starting insights generation from {csv_path}")

    df = load_data(csv_path)
    if df is not None:
        # Visualizations, in the order they are produced
        generate_sentiment_distribution(df)
        generate_rating_distribution(df)
        generate_theme_distribution(df)
        generate_sentiment_by_theme(df)
        generate_word_clouds(df)

        # Insights feed the recommendations; both feed the report
        insights = identify_key_insights(df)
        recommendations = generate_recommendations(insights)
        create_summary_report(df, insights, recommendations)

        logging.info("Insights generation complete")

if __name__ == "__main__":
    # Path of the analyzed reviews CSV to process; change it to your own file
    input_file = "all_banks_reviews_clean_20250611_000000_analyzed.csv"  # Update with your actual filename
    run_insights_generation(csv_path=input_file)