# Task 2: Sentiment & Thematic Analysis - Bank Reviews EDA
- Exploratory Data Analysis for Bank Reviews with Sentiment and Theme Insights


In [None]:
"""
## 1. Setup and Configuration
"""

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import warnings
from wordcloud import WordCloud
from collections import Counter
import ast
import os
from datetime import datetime
import textwrap

# Notebook-wide settings: silence warnings and widen pandas' console output
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = 1000
pd.options.display.float_format = '{:.2f}'.format

# Plot appearance: matplotlib style sheet first, then the seaborn palette,
# then the default figure size and font size
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 2. Load Data
- Load all analysis files from the data/preprocessed directory.



In [None]:

# Directory that holds the outputs of the preprocessing / analysis steps
DATA_DIR = "data/preprocessed"

# Logical dataset name -> file expected inside DATA_DIR
expected_files = {
    'processed': 'google_play_processed_reviews.csv',
    'sentiment': 'sentiment_analyzed.csv',  # Changed from sentiment_preprocessed.csv
    'thematic': 'thematic_analysis.csv',
    'thematic_summary': 'thematic_summary.csv',
    'thematic_metrics': 'thematic_metrics.csv',
    'sentiment_report': 'sentiment_analyzed_report.json',  # Changed from sentiment_report.json
    'thematic_report': 'thematic_analysis_report.json'
}


def load_analysis_file(filepath):
    """Load one analysis artifact from disk.

    ``.csv`` files are read into a pandas DataFrame and ``.json`` files are
    decoded into the corresponding Python object.

    Args:
        filepath: Path to a ``.csv`` or ``.json`` file.

    Returns:
        The loaded object, or ``None`` when the file is missing, cannot be
        parsed, or has an unsupported extension. A message describing the
        problem is printed in each of those cases.
    """
    filename = os.path.basename(filepath)
    try:
        if filename.endswith('.csv'):
            return pd.read_csv(filepath)
        if filename.endswith('.json'):
            with open(filepath, 'r', encoding='utf-8') as f:
                return json.load(f)
    except FileNotFoundError:
        print(f"‚ùå File not found: {filename}")
        return None
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A corrupt file must not abort the loop below: later cells index
        # data_files[...] directly and expect every key to exist.
        print(f"‚ùå Could not parse {filename}: {exc}")
        return None
    print(f"‚ùå Unsupported file type: {filename}")
    return None


# Load data: every expected key ends up in data_files (None when unavailable)
data_files = {}

for file_type, filename in expected_files.items():
    filepath = os.path.join(DATA_DIR, filename)
    loaded = load_analysis_file(filepath)
    data_files[file_type] = loaded
    if loaded is None:
        continue
    if isinstance(loaded, pd.DataFrame):
        print(f"‚úÖ Loaded {file_type}: {filename} ({len(loaded)} rows)")
    else:
        print(f"‚úÖ Loaded {file_type}: {filename}")




## 3. Data Overview
- Understand the structure and basic statistics of our data.



In [None]:
# Print a short profile of each main dataset that was loaded:
# its shape, the columns of interest and a three-row preview
print("=" * 80, "DATA OVERVIEW", "=" * 80, sep="\n")

if data_files['processed'] is not None:
    print(
        "\nüìä PROCESSED REVIEWS:",
        f"Shape: {data_files['processed'].shape}",
        f"Columns: {data_files['processed'].columns.tolist()}",
        "\nFirst 3 rows:",
        data_files['processed'].head(3),
        sep="\n",
    )

if data_files['sentiment'] is not None:
    # Columns whose name mentions "sentiment" or "score" (case-insensitive)
    sentiment_cols = [
        col for col in data_files['sentiment'].columns
        if any(keyword in col.lower() for keyword in ('sentiment', 'score'))
    ]
    print(
        "\n\nüìä SENTIMENT ANALYSIS RESULTS:",
        f"Shape: {data_files['sentiment'].shape}",
        f"Sentiment columns: {sentiment_cols}",
        "\nSentiment column sample:",
        data_files['sentiment'].loc[:, sentiment_cols].head(3),
        sep="\n",
    )

if data_files['thematic'] is not None:
    # Columns whose name mentions "theme" or "keyword" (case-insensitive)
    theme_cols = [
        col for col in data_files['thematic'].columns
        if any(keyword in col.lower() for keyword in ('theme', 'keyword'))
    ]
    print(
        "\n\nüìä THEMATIC ANALYSIS RESULTS:",
        f"Shape: {data_files['thematic'].shape}",
        f"Thematic columns: {theme_cols}",
        sep="\n",
    )
    # The sample is only shown when at least one matching column exists
    if len(theme_cols) > 0:
        print("\nThematic column sample:")
        print(data_files['thematic'].loc[:, theme_cols].head(3))

# %% [markdown]
"""
## 4. Sentiment Analysis Visualization
"""

# %%
def visualize_sentiment_analysis():
    """Render the sentiment dashboard and print a text summary.

    Reads the module-level ``data_files['sentiment']`` DataFrame and draws a
    2x3 Plotly figure. Each panel is drawn only when the columns it needs are
    present (``sentiment_label``, ``sentiment_score``, ``bank_name``,
    ``rating``); missing columns leave the corresponding panel empty.

    Returns:
        None. If the sentiment data was not loaded, a message is printed and
        nothing is drawn.
    """
    if data_files['sentiment'] is None:
        print("‚ùå Sentiment data not available")
        return

    df = data_files['sentiment'].copy()

    has_label = 'sentiment_label' in df.columns
    has_score = 'sentiment_score' in df.columns
    has_bank = 'bank_name' in df.columns
    has_rating = 'rating' in df.columns

    # Shared by the pie chart and the stacked bars, so it is defined up front
    # instead of inside one panel's branch.
    colors = {'positive': '#2ecc71', 'neutral': '#3498db', 'negative': '#e74c3c'}

    # Create subplots
    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=('Sentiment Distribution',
                       'Average Sentiment Score by Bank',
                       'Sentiment vs Rating Correlation',
                       'Sentiment Score Distribution',
                       'Positive/Negative Ratio by Bank',
                       'Sentiment Heatmap by Rating'),
        specs=[[{'type': 'pie'}, {'type': 'bar'}, {'type': 'scatter'}],
               [{'type': 'histogram'}, {'type': 'bar'}, {'type': 'heatmap'}]],
        vertical_spacing=0.1,
        horizontal_spacing=0.1
    )

    # 1. Sentiment Distribution (Pie Chart)
    if has_label:
        sentiment_counts = df['sentiment_label'].value_counts()

        fig.add_trace(
            go.Pie(
                labels=sentiment_counts.index,
                values=sentiment_counts.values,
                marker=dict(colors=[colors.get(label, '#95a5a6') for label in sentiment_counts.index]),
                hole=0.3,
                textinfo='percent+label'
            ),
            row=1, col=1
        )

    # 2. Average Sentiment Score by Bank (Bar Chart)
    if has_bank and has_score:
        avg_sentiment = df.groupby('bank_name')['sentiment_score'].mean().sort_values()

        fig.add_trace(
            go.Bar(
                x=avg_sentiment.values,
                y=avg_sentiment.index,
                orientation='h',
                marker_color='rgba(52, 152, 219, 0.7)',
                text=[f"{val:.3f}" for val in avg_sentiment.values],
                textposition='auto'
            ),
            row=1, col=2
        )
        fig.update_xaxes(title_text="Average Sentiment Score", row=1, col=2)

    # 3. Sentiment vs Rating Correlation (Scatter Plot)
    if has_rating and has_score:
        # Plot at most 1000 points (fixed seed) to keep the figure responsive
        sample_df = df.sample(1000, random_state=42) if len(df) > 1000 else df

        fig.add_trace(
            go.Scatter(
                x=sample_df['rating'],
                y=sample_df['sentiment_score'],
                mode='markers',
                marker=dict(
                    size=8,
                    color=sample_df['sentiment_score'],
                    colorscale='RdYlGn',
                    showscale=True,
                    # Upper half of the figure so it does not sit on top of
                    # the heatmap's colorbar.
                    colorbar=dict(title="Sentiment", len=0.4, y=0.8)
                ),
                text=sample_df['bank_name'] if has_bank else None,
                opacity=0.6
            ),
            row=1, col=3
        )
        fig.update_xaxes(title_text="Rating", row=1, col=3)
        fig.update_yaxes(title_text="Sentiment Score", row=1, col=3)

    # 4. Sentiment Score Distribution (Histogram)
    if has_score:
        fig.add_trace(
            go.Histogram(
                x=df['sentiment_score'],
                nbinsx=30,
                marker_color='#9b59b6',
                opacity=0.7
            ),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Sentiment Score", row=2, col=1)
        fig.update_yaxes(title_text="Count", row=2, col=1)

    # 5. Positive/Negative Ratio by Bank (Stacked Bar Chart)
    if has_bank and has_label:
        # normalize='index' turns each bank's row into shares that sum to 1
        sentiment_by_bank = pd.crosstab(df['bank_name'], df['sentiment_label'], normalize='index')

        for sentiment in ['positive', 'neutral', 'negative']:
            if sentiment in sentiment_by_bank.columns:
                fig.add_trace(
                    go.Bar(
                        x=sentiment_by_bank.index,
                        y=sentiment_by_bank[sentiment],
                        name=sentiment.capitalize(),
                        marker_color=colors.get(sentiment, '#95a5a6')
                    ),
                    row=2, col=2
                )

        fig.update_xaxes(title_text="Bank", row=2, col=2, tickangle=45)
        fig.update_yaxes(title_text="Percentage", row=2, col=2)
        fig.update_layout(barmode='stack', showlegend=True)

    # 6. Sentiment Heatmap by Rating
    if has_rating and has_label:
        heatmap_data = pd.crosstab(df['rating'], df['sentiment_label'], normalize='index')

        fig.add_trace(
            go.Heatmap(
                z=heatmap_data.values,
                x=heatmap_data.columns,
                y=heatmap_data.index,
                colorscale='RdYlGn',
                text=heatmap_data.values,
                texttemplate='%{text:.1%}',
                showscale=True,
                # Lower half of the figure (see the scatter colorbar above)
                colorbar=dict(title="Percentage", len=0.4, y=0.2)
            ),
            row=2, col=3
        )
        fig.update_xaxes(title_text="Sentiment", row=2, col=3)
        fig.update_yaxes(title_text="Rating", row=2, col=3)

    # Update layout
    fig.update_layout(
        height=900,
        showlegend=True,
        title_text="Comprehensive Sentiment Analysis Dashboard",
        title_font_size=20
    )

    fig.show()

    _print_sentiment_summary(df)


def _print_sentiment_summary(df):
    """Print overall, score-level and per-bank sentiment statistics for ``df``."""
    print("\n" + "="*80)
    print("üìà SENTIMENT ANALYSIS SUMMARY")
    print("="*80)

    has_label = 'sentiment_label' in df.columns
    has_score = 'sentiment_score' in df.columns

    if has_label:
        total_reviews = len(df)

        print(f"\nüìä Overall Sentiment Distribution:")
        print(f"   Total Reviews: {total_reviews:,}")
        for label in ('positive', 'neutral', 'negative'):
            count = int((df['sentiment_label'] == label).sum())
            # An empty frame would otherwise divide by zero
            share = count / total_reviews * 100 if total_reviews else 0.0
            print(f"   {label.capitalize()}: {count:,} ({share:.1f}%)")

    if has_score:
        print(f"\nüìà Sentiment Score Statistics:")
        print(f"   Mean: {df['sentiment_score'].mean():.3f}")
        print(f"   Std: {df['sentiment_score'].std():.3f}")
        print(f"   Min: {df['sentiment_score'].min():.3f}")
        print(f"   Max: {df['sentiment_score'].max():.3f}")

    # Bank-level sentiment analysis. The aggregation reads sentiment_score as
    # well as sentiment_label, so all three columns must be present.
    if 'bank_name' in df.columns and has_label and has_score:
        print(f"\nüè¶ Bank-Level Sentiment Analysis:")
        bank_stats = df.groupby('bank_name').agg({
            'sentiment_score': ['mean', 'std', 'count'],
            'sentiment_label': lambda x: (x == 'positive').sum() / len(x) * 100
        }).round(3)

        bank_stats.columns = ['avg_score', 'std_score', 'review_count', 'positive_percentage']
        bank_stats = bank_stats.sort_values('avg_score', ascending=False)

        print("\nTop 5 Banks by Average Sentiment Score:")
        print(bank_stats.head(5))

        print("\nBottom 5 Banks by Average Sentiment Score:")
        print(bank_stats.tail(5))

# Execute sentiment visualization (shows the dashboard figure and prints the summary)
visualize_sentiment_analysis()

# %% [markdown]
"""
## 5. Thematic Analysis Visualization
"""

# %%
def _split_theme_string(value):
    """Split a comma-separated theme cell into stripped, non-empty theme names.

    Non-string cells (for example NaN for a review without themes) yield an
    empty list.
    """
    if not isinstance(value, str):
        return []
    return [part.strip() for part in value.split(',') if part.strip()]


def visualize_thematic_analysis():
    """Render the thematic dashboard and print a text summary.

    Reads the module-level ``data_files['thematic']`` DataFrame. The
    ``identified_themes`` column is treated as a comma-separated list of theme
    names; it is parsed once and the parsed lists feed every panel and the
    printed summary, so all of them count themes by exact name.

    Panels (each drawn only when its columns are present): overall theme
    frequency, top themes per bank, theme co-occurrence, and rating
    distribution per theme.

    Returns:
        None. If the thematic data was not loaded, a message is printed and
        nothing is drawn.
    """
    if data_files['thematic'] is None:
        print("‚ùå Thematic analysis data not available")
        return

    df = data_files['thematic'].copy()

    has_themes = 'identified_themes' in df.columns
    has_banks = has_themes and 'bank_name' in df.columns

    # Parse each cell exactly once; theme_lists is aligned with df's index
    theme_lists = df['identified_themes'].map(_split_theme_string) if has_themes else None
    theme_totals = (
        Counter(theme for themes in theme_lists for theme in themes)
        if has_themes else Counter()
    )

    # Create subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Theme Distribution Across All Reviews',
                       'Top Themes by Bank',
                       'Theme Co-occurrence Matrix',
                       'Theme-Rating Relationship'),
        specs=[[{'type': 'bar'}, {'type': 'heatmap'}],
               [{'type': 'heatmap'}, {'type': 'box'}]],
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )

    # 1. Theme Distribution (Bar Chart)
    if has_themes:
        top_15 = theme_totals.most_common(15)

        if top_15:
            names, counts = zip(*top_15)

            fig.add_trace(
                go.Bar(
                    x=counts,
                    y=names,
                    orientation='h',
                    marker_color='rgba(46, 204, 113, 0.7)',
                    text=[f"{count:,}" for count in counts],
                    textposition='auto'
                ),
                row=1, col=1
            )
            fig.update_xaxes(title_text="Number of Reviews", row=1, col=1)

    # 2. Top Themes by Bank (Heatmap)
    if has_banks:
        # Five most frequent themes per bank; rows with a missing bank name
        # are skipped so they do not add an all-zero row to the heatmap.
        bank_themes = {}
        for bank in df['bank_name'].dropna().unique():
            bank_lists = theme_lists[df['bank_name'] == bank]
            bank_counter = Counter(theme for themes in bank_lists for theme in themes)
            bank_themes[bank] = dict(bank_counter.most_common(5))

        # Columns are the union of every bank's top themes
        all_unique_themes = sorted({theme for top in bank_themes.values() for theme in top})

        heatmap_data = [
            [top.get(theme, 0) for theme in all_unique_themes]
            for top in bank_themes.values()
        ]

        fig.add_trace(
            go.Heatmap(
                z=heatmap_data,
                x=all_unique_themes,
                y=list(bank_themes.keys()),
                colorscale='Viridis',
                text=heatmap_data,
                texttemplate='%{text}',
                showscale=True,
                colorbar=dict(title="Count")
            ),
            row=1, col=2
        )
        fig.update_xaxes(title_text="Theme", row=1, col=2, tickangle=45)
        fig.update_yaxes(title_text="Bank", row=1, col=2)

    # 3. Theme Co-occurrence Matrix
    if has_themes:
        # Only reviews that mention at least two themes can contribute pairs
        multi_theme_lists = [themes for themes in theme_lists if len(themes) >= 2]

        # Top themes are ranked among those multi-theme reviews only
        multi_counter = Counter(theme for themes in multi_theme_lists for theme in themes)
        top_themes = [theme for theme, _ in multi_counter.most_common(10)]
        position = {theme: i for i, theme in enumerate(top_themes)}

        co_matrix = np.zeros((len(top_themes), len(top_themes)))
        for themes in multi_theme_lists:
            # A set, so a theme repeated within one review counts once
            present = {position[theme] for theme in themes if theme in position}
            for i in present:
                for j in present:
                    if i != j:
                        co_matrix[i, j] += 1

        fig.add_trace(
            go.Heatmap(
                z=co_matrix,
                x=top_themes,
                y=top_themes,
                colorscale='Blues',
                text=co_matrix.astype(int),
                texttemplate='%{text}',
                showscale=True,
                colorbar=dict(title="Co-occurrence")
            ),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Theme", row=2, col=1, tickangle=45)
        fig.update_yaxes(title_text="Theme", row=2, col=1)

    # 4. Theme-Rating Relationship (Box Plot)
    if has_themes and 'rating' in df.columns:
        top_5_themes = [theme for theme, _ in theme_totals.most_common(5)]
        palette = px.colors.qualitative.Set2

        for i, theme in enumerate(top_5_themes):
            # Exact membership in the parsed list: a substring test on the raw
            # cell would also match longer theme names containing this one.
            mentions_theme = theme_lists.map(
                lambda themes, theme=theme: theme in themes
            ).astype(bool)
            theme_ratings = df.loc[mentions_theme, 'rating'].tolist()

            if theme_ratings:
                fig.add_trace(
                    go.Box(
                        y=theme_ratings,
                        name=theme,
                        boxpoints='outliers',
                        marker_color=palette[i % len(palette)]
                    ),
                    row=2, col=2
                )

        fig.update_yaxes(title_text="Rating", row=2, col=2)
        fig.update_xaxes(title_text="Theme", row=2, col=2, tickangle=45)

    # Update layout
    fig.update_layout(
        height=900,
        showlegend=False,
        title_text="Thematic Analysis Dashboard",
        title_font_size=20
    )

    fig.show()

    # Additional analysis
    print("\n" + "="*80)
    print("üìä THEMATIC ANALYSIS SUMMARY")
    print("="*80)

    if has_themes:
        # Coverage and the per-review average use the same parsed lists as the
        # charts, so blank cells and stray commas are not counted as themes.
        themes_per_review = theme_lists.map(len)
        total_reviews = len(df)
        reviews_with_themes = int((themes_per_review > 0).sum())
        coverage = reviews_with_themes / total_reviews * 100 if total_reviews else 0.0
        avg_themes = themes_per_review.mean() if total_reviews else 0.0

        print(f"\nüìà Theme Coverage:")
        print(f"   Total Reviews: {total_reviews:,}")
        print(f"   Reviews with Themes: {reviews_with_themes:,} ({coverage:.1f}%)")
        print(f"   Average Themes per Review: {avg_themes:.2f}")

    # Bank-level theme analysis
    if has_banks:
        print(f"\nüè¶ Bank-Level Theme Analysis:")

        # First five banks in order of appearance in the data
        for bank in df['bank_name'].dropna().unique()[:5]:
            in_bank = df['bank_name'] == bank
            bank_review_count = int(in_bank.sum())

            bank_counter = Counter(
                theme for themes in theme_lists[in_bank] for theme in themes
            )
            theme_counts = bank_counter.most_common(3)

            if theme_counts:
                print(f"\n  {bank}:")
                print(f"    Reviews: {bank_review_count:,}")
                print(f"    Top 3 Themes:")
                for theme, count in theme_counts:
                    percentage = (count / bank_review_count) * 100
                    print(f"      ‚Ä¢ {theme}: {count:,} ({percentage:.1f}%)")

# Execute thematic visualization (shows the dashboard figure and prints the summary)
visualize_thematic_analysis()

# %% [markdown]
"""
## 6. Combined Sentiment & Thematic Analysis
"""

# %%
def _parse_review_themes(value):
    """Return the stripped, non-empty theme names in a comma-separated cell.

    Non-string cells (NaN / None) yield an empty list.
    """
    if not isinstance(value, str):
        return []
    return [part.strip() for part in value.split(',') if part.strip()]


def visualize_combined_analysis():
    """Combine sentiment and thematic results in one dashboard and summary.

    Reads the module-level ``data_files['sentiment']`` and
    ``data_files['thematic']`` DataFrames. When both have ``review_id`` they
    are inner-joined on it; otherwise the thematic columns are attached by row
    position, which is only done when both frames have the same length.
    Columns present in both frames are taken from the sentiment frame.

    Panels (each drawn only when its columns are present): average sentiment
    per theme, theme counts per sentiment label, top themes of positive versus
    negative reviews, and theme frequency against average sentiment.

    Returns:
        None. If either dataset was not loaded, a message is printed and
        nothing is drawn.
    """
    if data_files['sentiment'] is None or data_files['thematic'] is None:
        print("‚ùå Both sentiment and thematic data required for combined analysis")
        return

    sentiment_df = data_files['sentiment'].copy()
    thematic_df = data_files['thematic'].copy()

    # Only bring over thematic columns the sentiment frame lacks. Merging the
    # full frames would rename every shared column to *_x / *_y, which hides
    # 'sentiment_score' / 'sentiment_label' from the checks below.
    extra_cols = [col for col in thematic_df.columns if col not in sentiment_df.columns]

    if 'review_id' in sentiment_df.columns and 'review_id' in thematic_df.columns:
        combined_df = pd.merge(
            sentiment_df,
            thematic_df[['review_id'] + extra_cols],
            on='review_id',
            how='inner'
        )
    else:
        # No shared key: align by position (assumes both files share row order)
        combined_df = sentiment_df.copy()
        same_length = len(thematic_df) == len(combined_df)
        if extra_cols and not same_length:
            print("Warning: sentiment and thematic data differ in length; "
                  "thematic columns could not be aligned")
        for col in extra_cols:
            combined_df[col] = thematic_df[col].values if same_length else None

    has_themes = 'identified_themes' in combined_df.columns
    has_score = has_themes and 'sentiment_score' in combined_df.columns
    has_label = has_themes and 'sentiment_label' in combined_df.columns

    # Parse the theme cells once; aligned with combined_df's index
    theme_lists = (
        combined_df['identified_themes'].map(_parse_review_themes)
        if has_themes else None
    )

    # One row per (review, theme) pair with that review's sentiment score.
    # Built from plain lists so the frame gets a fresh, unique index.
    theme_scores = None
    if has_score:
        theme_scores = pd.DataFrame({
            'theme': theme_lists.tolist(),
            'sentiment_score': combined_df['sentiment_score'].tolist()
        }).explode('theme').dropna(subset=['theme'])

    # Theme frequencies within positive and negative reviews, shared by the
    # comparison panel and the printed insights
    positive_counts = Counter()
    negative_counts = Counter()
    if has_label:
        positive_counts = Counter(
            theme
            for themes in theme_lists[combined_df['sentiment_label'] == 'positive']
            for theme in themes
        )
        negative_counts = Counter(
            theme
            for themes in theme_lists[combined_df['sentiment_label'] == 'negative']
            for theme in themes
        )

    # Create combined visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Average Sentiment by Theme',
                       'Theme Distribution by Sentiment',
                       'Top Themes for Positive vs Negative Reviews',
                       'Theme-Sentiment Correlation'),
        specs=[[{'type': 'bar'}, {'type': 'bar'}],
               [{'type': 'bar'}, {'type': 'scatter'}]],
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )

    # 1. Average Sentiment by Theme
    if has_score and not theme_scores.empty:
        avg_sentiment_by_theme = (
            theme_scores.groupby('theme')['sentiment_score'].mean().sort_values()
        )

        top_10 = avg_sentiment_by_theme.tail(10)  # Top 10 by sentiment

        fig.add_trace(
            go.Bar(
                x=top_10.values,
                y=top_10.index,
                orientation='h',
                marker_color='rgba(52, 152, 219, 0.7)',
                text=[f"{val:.3f}" for val in top_10.values],
                textposition='auto'
            ),
            row=1, col=1
        )
        fig.update_xaxes(title_text="Average Sentiment Score", row=1, col=1)

    # 2. Theme Distribution by Sentiment
    if has_label:
        theme_totals = Counter(theme for themes in theme_lists for theme in themes)
        top_5_themes = [theme for theme, _ in theme_totals.most_common(5)]

        label_colors = {
            'positive': '#2ecc71',
            'neutral': '#3498db',
            'negative': '#e74c3c'
        }

        for label, color in label_colors.items():
            label_lists = theme_lists[combined_df['sentiment_label'] == label]
            # Number of reviews with this label that list the theme (exact
            # name match, each review counted once)
            counts = [
                sum(1 for themes in label_lists if theme in themes)
                for theme in top_5_themes
            ]
            fig.add_trace(
                go.Bar(
                    x=top_5_themes,
                    y=counts,
                    name=label.capitalize(),
                    marker_color=color
                ),
                row=1, col=2
            )

        fig.update_xaxes(title_text="Theme", row=1, col=2, tickangle=45)
        fig.update_yaxes(title_text="Count", row=1, col=2)

    # 3. Top Themes for Positive vs Negative Reviews
    if has_label:
        positive_top = [theme for theme, _ in positive_counts.most_common(5)]
        negative_top = [theme for theme, _ in negative_counts.most_common(5)]

        # Union of both top-5 lists, sorted so the bar order is reproducible
        themes = sorted(set(positive_top) | set(negative_top))

        # Look counts up in the full counters: a theme outside the other
        # side's top five still shows its real count instead of zero.
        pos_counts = [positive_counts[theme] for theme in themes]
        neg_counts = [negative_counts[theme] for theme in themes]

        fig.add_trace(
            go.Bar(
                x=themes,
                y=pos_counts,
                name='Positive Reviews',
                marker_color='#2ecc71'
            ),
            row=2, col=1
        )

        fig.add_trace(
            go.Bar(
                x=themes,
                y=neg_counts,
                name='Negative Reviews',
                marker_color='#e74c3c'
            ),
            row=2, col=1
        )

        fig.update_xaxes(title_text="Theme", row=2, col=1, tickangle=45)
        fig.update_yaxes(title_text="Count", row=2, col=1)

    # 4. Theme-Sentiment Correlation
    if has_score and not theme_scores.empty:
        theme_stats = (
            theme_scores.groupby('theme')['sentiment_score']
            .agg(['mean', 'count'])
            .round(3)
        )
        theme_stats.columns = ['avg_sentiment', 'count']
        theme_stats = theme_stats[theme_stats['count'] >= 10]  # Filter for themes with enough data

        fig.add_trace(
            go.Scatter(
                x=theme_stats['count'],
                y=theme_stats['avg_sentiment'],
                mode='markers+text',
                marker=dict(
                    size=np.sqrt(theme_stats['count']) * 2,
                    color=theme_stats['avg_sentiment'],
                    colorscale='RdYlGn',
                    showscale=True,
                    colorbar=dict(title="Avg Sentiment")
                ),
                text=theme_stats.index,
                textposition='top center',
                opacity=0.7
            ),
            row=2, col=2
        )

        fig.update_xaxes(title_text="Number of Occurrences", row=2, col=2, type='log')
        fig.update_yaxes(title_text="Average Sentiment Score", row=2, col=2)

    # Update layout
    fig.update_layout(
        height=900,
        showlegend=True,
        title_text="Combined Sentiment & Thematic Analysis",
        title_font_size=20,
        barmode='group'
    )

    fig.show()

    # Key insights
    print("\n" + "="*80)
    print("üîç KEY INSIGHTS - COMBINED ANALYSIS")
    print("="*80)

    if has_label:
        # Insight 1: Most positive themes
        if positive_counts:
            print(f"\nüéØ Most Common Themes in POSITIVE Reviews:")
            for theme, count in positive_counts.most_common(3):
                print(f"   ‚Ä¢ {theme}: {count:,} occurrences")

        # Insight 2: Most negative themes
        if negative_counts:
            print(f"\n‚ö†Ô∏è  Most Common Themes in NEGATIVE Reviews:")
            for theme, count in negative_counts.most_common(3):
                print(f"   ‚Ä¢ {theme}: {count:,} occurrences")

# Execute combined visualization (shows the dashboard figure and prints the key insights)
visualize_combined_analysis()

# %% [markdown]
"""
## 7. Word Cloud Visualization
"""

# %%
def create_word_clouds():
    """Draw one word cloud per sentiment label and print the top words.

    Uses ``clean_text`` from the module-level ``data_files['processed']``
    DataFrame. If that frame has no ``sentiment_label`` column, labels are
    taken from ``data_files['sentiment']``: joined on ``review_id`` when both
    frames have it, otherwise copied by row position when both frames have the
    same length.

    Returns:
        None. A message is printed and nothing is drawn when the processed
        data, ``clean_text`` or sentiment labels are unavailable.
    """
    if data_files['processed'] is None:
        print("‚ùå Processed data not available for word clouds")
        return

    df = data_files['processed'].copy()
    sentiment_df = data_files['sentiment']

    # Attach labels only when they are missing here and available there:
    # merging a second 'sentiment_label' column would rename both to
    # *_x / *_y, and selecting an absent column would raise KeyError.
    needs_labels = (
        sentiment_df is not None
        and 'clean_text' in df.columns
        and 'sentiment_label' not in df.columns
        and 'sentiment_label' in sentiment_df.columns
    )
    if needs_labels:
        if 'review_id' in df.columns and 'review_id' in sentiment_df.columns:
            # One label per review_id so the left join cannot duplicate reviews
            labels = sentiment_df[['review_id', 'sentiment_label']].drop_duplicates('review_id')
            df = pd.merge(df, labels, on='review_id', how='left')
        elif len(df) == len(sentiment_df):
            df['sentiment_label'] = sentiment_df['sentiment_label'].values

    if 'sentiment_label' not in df.columns or 'clean_text' not in df.columns:
        print("‚ùå Required columns not available for word clouds")
        return

    # Prepare text for each sentiment. astype(str) keeps the join from
    # failing on non-string cells.
    sentiment_texts = {
        sentiment: ' '.join(
            df.loc[df['sentiment_label'] == sentiment, 'clean_text'].fillna('').astype(str)
        )
        for sentiment in ('positive', 'neutral', 'negative')
    }

    # Create word clouds
    fig, axes = plt.subplots(1, 3, figsize=(20, 10))

    colors = {
        'positive': 'Greens',
        'neutral': 'Blues',
        'negative': 'Reds'
    }

    for ax, (sentiment, text) in zip(axes, sentiment_texts.items()):
        wordcloud = None
        if text.strip():
            try:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    background_color='white',
                    colormap=colors[sentiment],
                    max_words=100,
                    contour_width=1,
                    contour_color='steelblue'
                ).generate(text)
            except ValueError:
                # WordCloud raises when no word is left after its own
                # stopword filtering; fall back to the placeholder below.
                wordcloud = None

        if wordcloud is not None:
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'{sentiment.capitalize()} Reviews', fontsize=16, fontweight='bold')
        else:
            ax.text(0.5, 0.5, f'No {sentiment} reviews available',
                    ha='center', va='center', fontsize=12)
        ax.axis('off')

    plt.suptitle('Word Clouds by Sentiment Category', fontsize=20, fontweight='bold', y=1.05)
    plt.tight_layout()
    plt.show()

    # Show most common words by sentiment
    print("\n" + "="*80)
    print("üìù MOST COMMON WORDS BY SENTIMENT")
    print("="*80)

    # Small hand-picked stopword list, built once for all three sentiments
    stopwords = {'the', 'and', 'to', 'of', 'a', 'in', 'is', 'it', 'that', 'for',
                 'on', 'with', 'as', 'was', 'this', 'are', 'be', 'by', 'not',
                 'have', 'has', 'had', 'but', 'they', 'i', 'you', 'he', 'she'}

    for sentiment, text in sentiment_texts.items():
        # Keep words longer than two characters that are not stopwords
        filtered_words = [
            word for word in text.lower().split()
            if word not in stopwords and len(word) > 2
        ]

        # Get top 10 words
        top_words = Counter(filtered_words).most_common(10)

        if top_words:
            print(f"\n{sentiment.upper()} Reviews - Top 10 Words:")
            for word, count in top_words:
                print(f"   {word:15} : {count:,}")

# Create word clouds (shows the figure and prints the most common words per sentiment)
create_word_clouds()

# %% [markdown]
"""
## 8. Bank Performance Dashboard
"""

# %%
def create_bank_performance_dashboard():
    """Create a comprehensive bank performance dashboard.

    Reads the module-level ``data_files['sentiment']`` DataFrame, aggregates it
    per ``bank_name`` (mean / std / count of ``sentiment_score`` plus the share
    of rows whose ``sentiment_label`` equals ``'positive'``), renders a 3x3
    Plotly figure and prints a text summary of the best and worst banks.

    Panel layout (row, col):
        (1,1) sentiment ranking bar     (1,2) review volume bar   (1,3) std-dev bar
        (2,1) positive % scatter        (2,2) sentiment vs count  (2,3) top-5 table
        (3,1) bottom-5 table            (3,2) score box plots     (3,3) normalised heatmap

    Returns:
        None. When the sentiment data, one of the required columns, or every
        bank name is missing, a notice is printed and nothing is drawn.
    """
    sentiment_df = data_files.get('sentiment')
    required_columns = {'bank_name', 'sentiment_score', 'sentiment_label'}

    # All three columns are consumed by the aggregation below, so check them
    # together instead of failing later with a KeyError.
    if sentiment_df is None or not required_columns.issubset(sentiment_df.columns):
        print("‚ùå Bank data not available for performance dashboard")
        return

    # groupby drops rows with a missing key; without any usable bank name there
    # is nothing to rank or plot.
    if not sentiment_df['bank_name'].notna().any():
        print("‚ùå Bank data not available for performance dashboard")
        return

    df = sentiment_df.copy()

    # Calculate bank metrics
    bank_metrics = df.groupby('bank_name').agg({
        'sentiment_score': ['mean', 'std', 'count'],
        'sentiment_label': lambda x: (x == 'positive').sum() / len(x) * 100
    }).round(3)

    bank_metrics.columns = ['avg_sentiment', 'sentiment_std', 'review_count', 'positive_percentage']
    # Best bank first; every panel and the text summary rely on this ordering.
    bank_metrics = bank_metrics.sort_values('avg_sentiment', ascending=False)

    # Create dashboard.
    # NOTE: each spec must match the trace type added to that cell below.
    # Tables live in "domain" subplots and cannot share a cell with cartesian
    # traces (and vice versa) - Plotly raises ValueError on a mismatch.
    fig = make_subplots(
        rows=3, cols=3,
        subplot_titles=('Bank Sentiment Ranking',
                       'Review Volume by Bank',
                       'Sentiment Consistency (Std Dev)',
                       'Positive Review Percentage',
                       'Sentiment vs Review Count',
                       'Top 5 Banks - Detailed View',
                       'Bottom 5 Banks - Detailed View',
                       'Sentiment Distribution by Bank',
                       'Performance Matrix'),
        specs=[[{'type': 'bar'}, {'type': 'bar'}, {'type': 'bar'}],
               [{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'table'}],
               [{'type': 'table'}, {'type': 'box'}, {'type': 'heatmap'}]],
        vertical_spacing=0.1,
        horizontal_spacing=0.15,
        row_heights=[0.3, 0.3, 0.4]
    )

    # 1. Bank Sentiment Ranking
    fig.add_trace(
        go.Bar(
            x=bank_metrics['avg_sentiment'],
            y=bank_metrics.index,
            orientation='h',
            marker=dict(
                color=bank_metrics['avg_sentiment'],
                colorscale='RdYlGn',
                showscale=False
            ),
            text=[f"{val:.3f}" for val in bank_metrics['avg_sentiment']],
            textposition='auto'
        ),
        row=1, col=1
    )
    fig.update_xaxes(title_text="Average Sentiment", row=1, col=1)

    # 2. Review Volume by Bank
    fig.add_trace(
        go.Bar(
            x=bank_metrics.index,
            y=bank_metrics['review_count'],
            marker_color='rgba(52, 152, 219, 0.7)',
            text=[f"{count:,}" for count in bank_metrics['review_count']],
            textposition='auto'
        ),
        row=1, col=2
    )
    fig.update_xaxes(title_text="Bank", row=1, col=2, tickangle=45)
    fig.update_yaxes(title_text="Review Count", row=1, col=2)

    # 3. Sentiment Consistency
    fig.add_trace(
        go.Bar(
            x=bank_metrics.index,
            y=bank_metrics['sentiment_std'],
            marker_color='rgba(155, 89, 182, 0.7)',
            text=[f"{std:.3f}" for std in bank_metrics['sentiment_std']],
            textposition='auto'
        ),
        row=1, col=3
    )
    fig.update_xaxes(title_text="Bank", row=1, col=3, tickangle=45)
    fig.update_yaxes(title_text="Sentiment Std Dev", row=1, col=3)

    # 4. Positive Review Percentage (marker size scales with review volume)
    fig.add_trace(
        go.Scatter(
            x=bank_metrics.index,
            y=bank_metrics['positive_percentage'],
            mode='markers+lines',
            marker=dict(
                size=bank_metrics['review_count'] / bank_metrics['review_count'].max() * 50,
                color=bank_metrics['positive_percentage'],
                colorscale='RdYlGn',
                showscale=False
            ),
            line=dict(color='rgba(149, 165, 166, 0.3)')
        ),
        row=2, col=1
    )
    fig.update_xaxes(title_text="Bank", row=2, col=1, tickangle=45)
    fig.update_yaxes(title_text="Positive %", row=2, col=1)

    # 5. Sentiment vs Review Count (log x-axis; marker size scales with positive %)
    fig.add_trace(
        go.Scatter(
            x=bank_metrics['review_count'],
            y=bank_metrics['avg_sentiment'],
            mode='markers+text',
            marker=dict(
                size=bank_metrics['positive_percentage'] / 5,
                color=bank_metrics['avg_sentiment'],
                colorscale='RdYlGn',
                showscale=False
            ),
            text=bank_metrics.index,
            textposition='top center'
        ),
        row=2, col=2
    )
    fig.update_xaxes(title_text="Review Count", row=2, col=2, type='log')
    fig.update_yaxes(title_text="Average Sentiment", row=2, col=2)

    # 6. Top 5 Banks Table
    top_5 = bank_metrics.head(5)
    fig.add_trace(
        go.Table(
            header=dict(values=['Bank', 'Avg Sentiment', 'Reviews', 'Positive %'],
                       fill_color='paleturquoise',
                       align='left'),
            cells=dict(values=[top_5.index,
                             top_5['avg_sentiment'].round(3),
                             top_5['review_count'],
                             top_5['positive_percentage'].round(1)],
                      fill_color='lavender',
                      align='left')
        ),
        row=2, col=3
    )

    # 7. Bottom 5 Banks Table
    bottom_5 = bank_metrics.tail(5)
    fig.add_trace(
        go.Table(
            header=dict(values=['Bank', 'Avg Sentiment', 'Reviews', 'Positive %'],
                       fill_color='lightcoral',
                       align='left'),
            cells=dict(values=[bottom_5.index,
                             bottom_5['avg_sentiment'].round(3),
                             bottom_5['review_count'],
                             bottom_5['positive_percentage'].round(1)],
                      fill_color='mistyrose',
                      align='left')
        ),
        row=3, col=1
    )

    # 8. Sentiment Distribution by Bank (Box Plot)
    bank_names = bank_metrics.index.tolist()
    for bank in bank_names[:10]:  # Limit to first 10 banks for readability
        bank_scores = df[df['bank_name'] == bank]['sentiment_score']
        fig.add_trace(
            go.Box(
                y=bank_scores,
                # Truncate long names so the category labels stay legible.
                name=(bank[:15] + '...') if len(bank) > 15 else bank,
                boxpoints='outliers',
                marker_color='rgba(93, 164, 214, 0.5)',
                line_color='rgba(93, 164, 214, 0.8)'
            ),
            row=3, col=2
        )
    fig.update_yaxes(title_text="Sentiment Score", row=3, col=2)

    # 9. Performance Matrix (Heatmap)
    performance_matrix = bank_metrics.copy()
    # Min-max normalise each metric to [0, 1] so they share one colour scale.
    for col in ['avg_sentiment', 'review_count', 'positive_percentage']:
        col_min = performance_matrix[col].min()
        col_range = performance_matrix[col].max() - col_min
        if col_range == 0:
            # Every bank has the same value (e.g. a single bank): dividing would
            # give 0/0 = NaN, so show the neutral midpoint instead.
            performance_matrix[col + '_norm'] = 0.5
        else:
            performance_matrix[col + '_norm'] = (performance_matrix[col] - col_min) / col_range

    heatmap_data = performance_matrix[['avg_sentiment_norm', 'review_count_norm', 'positive_percentage_norm']].T

    fig.add_trace(
        go.Heatmap(
            z=heatmap_data.values,
            x=heatmap_data.columns,
            y=['Sentiment', 'Volume', 'Positive %'],
            colorscale='RdYlGn',
            showscale=True,
            colorbar=dict(title="Normalized Score")
        ),
        row=3, col=3
    )
    fig.update_xaxes(title_text="Bank", row=3, col=3, tickangle=45)

    # Update layout
    fig.update_layout(
        height=1200,
        showlegend=False,
        title_text="Bank Performance Dashboard",
        title_font_size=20
    )

    fig.show()

    def _print_ranked(heading, ranked):
        """Print a numbered block of per-bank metrics under ``heading``."""
        print(heading)
        for i, (bank, row) in enumerate(ranked.iterrows(), 1):
            print(f"   {i}. {bank}:")
            print(f"      Sentiment: {row['avg_sentiment']:.3f}")
            # iterrows() upcasts the mixed-dtype row to float; cast back so the
            # count prints as "1,234" rather than "1,234.0".
            print(f"      Reviews: {int(row['review_count']):,}")
            print(f"      Positive: {row['positive_percentage']:.1f}%")

    # Print key findings
    print("\n" + "="*80)
    print("üèÜ BANK PERFORMANCE SUMMARY")
    print("="*80)

    print(f"\nüìä Overall Statistics:")
    print(f"   Total Banks: {len(bank_metrics)}")
    print(f"   Total Reviews: {bank_metrics['review_count'].sum():,}")
    print(f"   Average Reviews per Bank: {bank_metrics['review_count'].mean():,.0f}")
    print(f"   Overall Average Sentiment: {bank_metrics['avg_sentiment'].mean():.3f}")

    _print_ranked(f"\nüéñÔ∏è  Top Performing Banks:", bank_metrics.head(3))
    _print_ranked(f"\nüìâ Lowest Performing Banks:", bank_metrics.tail(3))

# Run the dashboard cell: build the bank performance figure from
# data_files['sentiment'] and print the accompanying text summary.
create_bank_performance_dashboard()

# %% [markdown]
"""
## 9. Export Key Insights
"""

# %%
def export_key_insights():
    """Export key insights to a text file.

    Builds a plain-text report from the module-level ``data_files`` dict:
    sentiment label/score statistics, the most frequent themes, and the best
    and worst bank by mean sentiment score. Each section is emitted only when
    its DataFrame and the columns it needs are present. The report is written
    to ``data/preprocessed/analysis_insights.txt`` (overwriting any previous
    file) and its first 20 lines are echoed to stdout.

    Returns:
        None.
    """

    def _percent(part, whole):
        """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
        return float(part) / whole * 100 if whole else 0.0

    def _split_themes(cell):
        """Return the theme names stored in one ``identified_themes`` cell.

        Accepts either a comma-separated string ("A, B") or a serialised list
        literal ("['A', 'B']"); anything that is not a string yields no themes.
        """
        if not isinstance(cell, str):
            return []
        text = cell.strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError):
                parsed = None  # not a valid literal; fall back to comma split
            if isinstance(parsed, (list, tuple)):
                names = (str(item).strip() for item in parsed)
                return [name for name in names if name]
        return [part.strip() for part in text.split(',') if part.strip()]

    insights = [
        "="*80,
        "BANK REVIEWS ANALYSIS - KEY INSIGHTS",
        "="*80,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    # .get() so a dataset that was never registered is treated like one that
    # failed to load, instead of raising KeyError.
    sentiment_df = data_files.get('sentiment')
    thematic_df = data_files.get('thematic')

    # Sentiment insights
    if sentiment_df is not None:
        df = sentiment_df
        insights.append("üìà SENTIMENT ANALYSIS INSIGHTS")
        insights.append("-"*40)

        if 'sentiment_label' in df.columns:
            total = len(df)
            insights.append(f"Total Reviews Analyzed: {total:,}")
            for title, label in (('Positive', 'positive'),
                                 ('Neutral', 'neutral'),
                                 ('Negative', 'negative')):
                count = int((df['sentiment_label'] == label).sum())
                insights.append(f"{title} Reviews: {count:,} ({_percent(count, total):.1f}%)")

        if 'sentiment_score' in df.columns:
            insights.append(f"\nSentiment Score Statistics:")
            insights.append(f"  Average: {df['sentiment_score'].mean():.3f}")
            insights.append(f"  Standard Deviation: {df['sentiment_score'].std():.3f}")
            insights.append(f"  Range: [{df['sentiment_score'].min():.3f}, {df['sentiment_score'].max():.3f}]")

    # Thematic insights
    if thematic_df is not None:
        df = thematic_df
        insights.append("\n\nüìä THEMATIC ANALYSIS INSIGHTS")
        insights.append("-"*40)

        if 'identified_themes' in df.columns:
            themes_per_review = [_split_themes(cell) for cell in df['identified_themes'].fillna('')]
            # Count reviews that carry at least one theme, so empty cells such
            # as "" or "[]" are not reported as themed.
            reviews_with_themes = sum(1 for themes in themes_per_review if themes)
            total = len(df)

            insights.append(
                f"Reviews with Themes Identified: {reviews_with_themes:,} "
                f"({_percent(reviews_with_themes, total):.1f}%)"
            )

            # Get top themes; percentages are shares of all theme mentions.
            all_themes = [theme for themes in themes_per_review for theme in themes]

            if all_themes:
                top_themes = Counter(all_themes).most_common(5)
                insights.append("\nTop 5 Themes Overall:")
                for theme, count in top_themes:
                    percentage = (count / len(all_themes)) * 100
                    insights.append(f"  ‚Ä¢ {theme}: {count:,} ({percentage:.1f}%)")

    # Bank performance insights (needs every column the aggregation touches)
    bank_columns = {'bank_name', 'sentiment_score', 'sentiment_label'}
    if sentiment_df is not None and bank_columns.issubset(sentiment_df.columns):
        df = sentiment_df
        bank_metrics = df.groupby('bank_name').agg({
            'sentiment_score': 'mean',
            'sentiment_label': lambda x: (x == 'positive').sum() / len(x) * 100
        }).round(3)

        bank_metrics.columns = ['avg_sentiment', 'positive_percentage']
        # Best bank first, so iloc[0] / iloc[-1] are the extremes.
        bank_metrics = bank_metrics.sort_values('avg_sentiment', ascending=False)

        insights.append("\n\nüè¶ BANK PERFORMANCE INSIGHTS")
        insights.append("-"*40)

        if len(bank_metrics) > 0:
            best = bank_metrics.iloc[0]
            worst = bank_metrics.iloc[-1]

            insights.append(f"Number of Banks Analyzed: {len(bank_metrics)}")
            insights.append(f"\nBest Performing Bank: {bank_metrics.index[0]}")
            insights.append(f"  Average Sentiment: {best['avg_sentiment']:.3f}")
            insights.append(f"  Positive Reviews: {best['positive_percentage']:.1f}%")

            insights.append(f"\nWorst Performing Bank: {bank_metrics.index[-1]}")
            insights.append(f"  Average Sentiment: {worst['avg_sentiment']:.3f}")
            insights.append(f"  Positive Reviews: {worst['positive_percentage']:.1f}%")

    # Combined insights
    insights.append("\n\nüîç RECOMMENDATIONS & NEXT STEPS")
    insights.append("-"*40)
    insights.append("1. Focus on improving themes with negative sentiment associations")
    insights.append("2. Investigate banks with low sentiment scores for root causes")
    insights.append("3. Monitor theme trends over time for early warning signs")
    insights.append("4. Use insights to prioritize feature development")
    insights.append("5. Consider targeted improvements based on bank-specific themes")

    # Save to file
    output_dir = "data/preprocessed"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "analysis_insights.txt")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(insights))

    print(f"‚úÖ Insights saved to: {output_path}")

    # Print to notebook
    print("\n" + "="*80)
    print("üìã KEY INSIGHTS SUMMARY")
    print("="*80)
    for line in insights[:20]:  # Print first 20 lines
        print(line)

# Run the export cell: write the insights report to
# data/preprocessed/analysis_insights.txt and echo its first lines.
export_key_insights()

# %% [markdown]
"""
## 10. Summary and Next Steps
"""

# %%
# Closing cell: recap which inputs were loaded and what the notebook produced.
rule = "=" * 80

print("\n" + rule)
print("üéâ ANALYSIS COMPLETE - SUMMARY")
print(rule)

# One status line per dataset that actually loaded: DataFrames report their
# row count, anything else is reported as JSON.
available_data = []
for file_type, data in data_files.items():
    if data is None:
        continue
    if isinstance(data, pd.DataFrame):
        status = f"‚úÖ {file_type}: {len(data):,} rows"
    else:
        status = f"‚úÖ {file_type}: JSON data loaded"
    available_data.append(status)

print("\nüìÅ Data Files Loaded:")
for item in available_data:
    print(f"  {item}")

# Static recap sections, printed in order as (heading, lines) pairs.
summary_sections = (
    ("\nüìä Analysis Completed:", (
        "  1. ‚úÖ Sentiment Analysis Visualization",
        "  2. ‚úÖ Thematic Analysis Visualization",
        "  3. ‚úÖ Combined Sentiment & Thematic Analysis",
        "  4. ‚úÖ Word Cloud Generation",
        "  5. ‚úÖ Bank Performance Dashboard",
        "  6. ‚úÖ Key Insights Export",
    )),
    ("\nüìà Key Outputs Generated:", (
        "  ‚Ä¢ Interactive dashboards with Plotly",
        "  ‚Ä¢ Comprehensive visualizations",
        "  ‚Ä¢ Statistical summaries",
        "  ‚Ä¢ Bank performance metrics",
        "  ‚Ä¢ Text file with key insights",
    )),
    ("\nüöÄ Next Steps:", (
        "  1. Review the interactive dashboards above",
        "  2. Check 'data/preprocessed/analysis_insights.txt' for key findings",
        "  3. Use insights for strategic decision making",
        "  4. Consider time-series analysis for trend identification",
        "  5. Explore machine learning for predictive analytics",
    )),
)

for heading, body_lines in summary_sections:
    print(heading)
    for body_line in body_lines:
        print(body_line)

print("\n" + rule)
print("Thank you for using the Bank Reviews Analysis Dashboard!")
print(rule)