In [None]:
# %% [markdown]
"""
# Sentiment & Thematic Analysis EDA
## Quick EDA for Bank Reviews with Sentiment and Theme Insights

**Note:** All data files are already available in `data/preprocessed/`
"""

# %% [markdown]
"""
## 1. Setup and Load Data
"""

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Configure global defaults for matplotlib/seaborn figures created in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped with matplotlib >= 3.6;
# confirm the installed version if this line raises an OSError.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Default figure size; assigned after the style sheet so the explicit value is the one kept.
plt.rcParams['figure.figsize'] = (12, 8)

# Print the working directory because the relative data path used by the loader
# is resolved against it.
print("üìÅ Current working directory:", os.getcwd())

# %%
# Input location. The notebook lives in notebooks/ while the files are stored in
# <project_root>/data/preprocessed, hence the leading "..".
DATA_DIR = "../data/preprocessed"

# Logical dataset name -> file name inside DATA_DIR.
files = dict(
    sentiment='sentiment_analyzed.csv',
    thematic='thematic_analysis.csv',
    thematic_summary='thematic_summary.csv',
    thematic_metrics='thematic_metrics.csv',
    processed='google_play_processed_reviews.csv',
    sentiment_report='sentiment_analyzed_report.json',
    thematic_report='thematic_analysis_report.json',
)

# Read every input. CSV files become DataFrames, JSON files become plain Python
# objects; anything that fails to load is reported and stored as None so that
# later cells can test for availability instead of crashing.
data = {}
for name, file in files.items():
    path = os.path.join(DATA_DIR, file)
    try:
        if file.endswith('.csv'):
            loaded = pd.read_csv(path)
            data[name] = loaded
            print(f"‚úÖ Loaded {name}: {len(loaded)} rows")
        elif file.endswith('.json'):
            with open(path, 'r', encoding='utf-8') as handle:
                data[name] = json.load(handle)
            print(f"‚úÖ Loaded {name}: JSON report")
    except Exception as error:
        print(f"‚ùå Error loading {file}: {error}")
        data[name] = None

# %%
# Quick preview of the two main tables; a table that failed to load (None) is skipped.
banner = "=" * 80
print("\n" + banner)
print("DATA PREVIEW")
print(banner)

sentiment_preview = data['sentiment']
if sentiment_preview is not None:
    print("\nüìä Sentiment Data:")
    print(f"Shape: {sentiment_preview.shape}")
    print(f"Columns: {list(sentiment_preview.columns)}")

    # Sample rows, restricted to the sentiment-related columns that actually exist
    sentiment_cols = ['sentiment_label', 'sentiment_score', 'rating', 'bank_name']
    available_cols = [col for col in sentiment_cols if col in sentiment_preview.columns]
    if available_cols:
        display(sentiment_preview[available_cols].head())

    # Share of each sentiment label across all rows
    if 'sentiment_label' in sentiment_preview.columns:
        n_rows = len(sentiment_preview)
        print("\nSentiment Distribution:")
        for label, count in sentiment_preview['sentiment_label'].value_counts().items():
            print(f"  {label}: {count:,} ({count / n_rows * 100:.1f}%)")

thematic_preview = data['thematic']
if thematic_preview is not None:
    print("\nüéØ Thematic Data:")
    print(f"Shape: {thematic_preview.shape}")

    # Sample rows, restricted to the theme-related columns that actually exist
    theme_cols = ['identified_themes', 'keywords', 'bank_name']
    available_cols = [col for col in theme_cols if col in thematic_preview.columns]
    if available_cols:
        display(thematic_preview[available_cols].head())

    # Theme coverage: rows whose theme cell is not missing
    if 'identified_themes' in thematic_preview.columns:
        with_themes = thematic_preview['identified_themes'].notna().sum()
        pct = with_themes / len(thematic_preview) * 100
        print(f"\nReviews with themes: {with_themes:,} ({pct:.1f}%)")



In [None]:
# %% [markdown]
"""
## 2. Sentiment Analysis Dashboard
"""

# %%
def sentiment_dashboard(df):
    """Render a 2x3 Plotly dashboard of sentiment results and print a text summary.

    Every panel and every summary section is optional: it is produced only when
    the columns it needs are present in ``df``, so a partially populated frame
    still yields a figure instead of an exception.

    Args:
        df: Review-level DataFrame. Columns read when present:
            ``sentiment_label``, ``sentiment_score``, ``rating``, ``bank_name``.
            The label-specific parts look for the exact values ``'positive'``,
            ``'neutral'`` and ``'negative'``.

    Returns:
        None. The figure is displayed with ``fig.show()`` and the summary is printed.
    """
    sentiment_order = ['positive', 'neutral', 'negative']
    colors = {'positive': '#2ecc71', 'neutral': '#3498db', 'negative': '#e74c3c'}
    fallback_color = '#95a5a6'  # used for any label outside the three expected ones

    def has(*cols):
        """True when every named column exists in df."""
        return all(col in df.columns for col in cols)

    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=('Sentiment Distribution', 'Avg Sentiment by Bank',
                       'Sentiment vs Rating', 'Score Distribution',
                       'Bank-wise Sentiment', 'Rating Heatmap'),
        specs=[[{'type': 'pie'}, {'type': 'bar'}, {'type': 'scatter'}],
               [{'type': 'histogram'}, {'type': 'bar'}, {'type': 'heatmap'}]],
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )

    # (1,1) Donut chart of label counts
    if has('sentiment_label'):
        counts = df['sentiment_label'].value_counts()
        fig.add_trace(
            go.Pie(labels=counts.index, values=counts.values, hole=0.3,
                  marker=dict(colors=[colors.get(label, fallback_color) for label in counts.index])),
            row=1, col=1
        )

    # (1,2) Mean sentiment score per bank, lowest first so the best bank ends up on top
    if has('bank_name', 'sentiment_score'):
        avg = df.groupby('bank_name')['sentiment_score'].mean().sort_values()
        fig.add_trace(
            go.Bar(x=avg.values, y=avg.index, orientation='h',
                  marker_color='rgba(52, 152, 219, 0.7)',
                  text=[f"{val:.3f}" for val in avg.values], textposition='auto'),
            row=1, col=2
        )
        fig.update_xaxes(title_text="Average Sentiment", row=1, col=2)

    # (1,3) Rating vs. score scatter; capped at 1000 points (fixed seed) to keep the plot light
    if has('rating', 'sentiment_score'):
        sample = df.sample(1000, random_state=42) if len(df) > 1000 else df
        fig.add_trace(
            go.Scatter(x=sample['rating'], y=sample['sentiment_score'],
                      mode='markers', opacity=0.6,
                      marker=dict(size=8, color=sample['sentiment_score'],
                                colorscale='RdYlGn', showscale=True)),
            row=1, col=3
        )
        fig.update_xaxes(title_text="Rating", row=1, col=3)
        fig.update_yaxes(title_text="Sentiment Score", row=1, col=3)

    # (2,1) Histogram of sentiment scores
    if has('sentiment_score'):
        fig.add_trace(
            go.Histogram(x=df['sentiment_score'], nbinsx=30,
                        marker_color='#9b59b6', opacity=0.7),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Sentiment Score", row=2, col=1)
        fig.update_yaxes(title_text="Count", row=2, col=1)

    # (2,2) Per-bank label shares (each bank's row sums to 1), stacked via barmode below
    if has('bank_name', 'sentiment_label'):
        sentiment_by_bank = pd.crosstab(df['bank_name'], df['sentiment_label'], normalize='index')
        for sentiment in sentiment_order:
            if sentiment in sentiment_by_bank.columns:
                fig.add_trace(
                    go.Bar(x=sentiment_by_bank.index, y=sentiment_by_bank[sentiment],
                          name=sentiment.capitalize(),
                          marker_color=colors.get(sentiment, fallback_color)),
                    row=2, col=2
                )
        fig.update_xaxes(title_text="Bank", row=2, col=2, tickangle=45)
        fig.update_yaxes(title_text="Percentage", row=2, col=2)

    # (2,3) Label share within each rating value
    if has('rating', 'sentiment_label'):
        heatmap = pd.crosstab(df['rating'], df['sentiment_label'], normalize='index')
        fig.add_trace(
            go.Heatmap(z=heatmap.values, x=heatmap.columns, y=heatmap.index,
                      colorscale='RdYlGn', text=heatmap.values,
                      texttemplate='%{text:.1%}', showscale=True),
            row=2, col=3
        )
        fig.update_xaxes(title_text="Sentiment", row=2, col=3)
        fig.update_yaxes(title_text="Rating", row=2, col=3)

    # barmode='stack' makes the bank-wise shares stack to 100%; the other bar panel
    # holds a single trace and is unaffected.
    fig.update_layout(height=800, showlegend=True, barmode='stack',
                     title_text="Sentiment Analysis Dashboard", title_font_size=16)
    fig.show()

    # Print summary
    print("="*80)
    print("SENTIMENT SUMMARY")
    print("="*80)
    if has('sentiment_label'):
        total = len(df)
        present_labels = set(df['sentiment_label'].unique())
        print(f"\nTotal Reviews: {total:,}")
        for label in sentiment_order:
            if label in present_labels:
                count = (df['sentiment_label'] == label).sum()
                print(f"  {label.title()}: {count:,} ({count/total*100:.1f}%)")

    if has('sentiment_score'):
        scores = df['sentiment_score']
        print(f"\nüìà Score Statistics:")
        print(f"  Mean: {scores.mean():.3f}")
        print(f"  Std: {scores.std():.3f}")
        print(f"  Min: {scores.min():.3f}")
        print(f"  Max: {scores.max():.3f}")

    # Bank-level summary. The positive share needs 'sentiment_label'; it is added
    # only when that column exists so a score-only frame no longer raises KeyError.
    if has('bank_name', 'sentiment_score'):
        bank_stats = df.groupby('bank_name')['sentiment_score'].mean().to_frame('avg_score')
        if has('sentiment_label'):
            is_positive = df['sentiment_label'].eq('positive')
            bank_stats['positive_pct'] = is_positive.groupby(df['bank_name']).mean() * 100
        bank_stats = bank_stats.round(3).sort_values('avg_score', ascending=False)

        print(f"\nüè¶ Bank Performance (Top 3):")
        for i, (bank, row) in enumerate(bank_stats.head(3).iterrows(), 1):
            print(f"{i}. {bank}")
            print(f"   Score: {row['avg_score']:.3f}")
            if 'positive_pct' in bank_stats.columns:
                print(f"   Positive: {row['positive_pct']:.1f}%")

# %%
# Render the sentiment dashboard only when the sentiment CSV was loaded; the
# loading cell stores None for any file it could not read.
if data['sentiment'] is not None:
    sentiment_dashboard(data['sentiment'])
else:
    print("‚ùå Sentiment data not available")



In [None]:
# %% [markdown]
"""
## 3. Thematic Analysis Dashboard
"""

# %%
def thematic_dashboard(df):
    """Render a 2x2 Plotly dashboard of review themes and print a text summary.

    Theme cells are treated as comma-separated lists of theme names. Each cell
    is split once and the resulting per-review lists drive all four panels.

    Args:
        df: Review-level DataFrame. ``identified_themes`` is required (the
            function prints a message and returns if it is missing);
            ``bank_name`` and ``rating`` enable the per-bank heatmap and the
            rating box plot respectively.

    Returns:
        None. The figure is displayed with ``fig.show()`` and the summary is printed.
    """
    if 'identified_themes' not in df.columns:
        print("‚ùå 'identified_themes' column not found in data")
        return

    def split_themes(cell):
        """Split one comma-separated cell into stripped theme names ([] for non-strings)."""
        if not isinstance(cell, str):
            return []
        return [part.strip() for part in cell.split(',') if part.strip()]

    def count_themes(lists):
        """Frequency of every theme across a sequence of per-review theme lists."""
        flat = [theme for themes in lists for theme in themes]
        # Explicit dtype keeps an empty input from triggering pandas' empty-Series warning.
        return pd.Series(flat, dtype='object').value_counts()

    # One list of themes per row, positionally aligned with df
    theme_lists = [split_themes(cell) for cell in df['identified_themes']]
    all_themes = [theme for themes in theme_lists for theme in themes]
    theme_counts = count_themes(theme_lists)

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Top Themes', 'Themes by Bank',
                       'Theme Co-occurrence', 'Theme-Rating Box'),
        specs=[[{'type': 'bar'}, {'type': 'heatmap'}],
               [{'type': 'heatmap'}, {'type': 'box'}]],
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )

    # (1,1) The 15 most frequent themes
    if all_themes:
        top_15 = theme_counts.head(15)
        fig.add_trace(
            go.Bar(x=top_15.values, y=top_15.index,
                  orientation='h', marker_color='rgba(46, 204, 113, 0.7)',
                  text=[f"{count:,}" for count in top_15.values], textposition='auto'),
            row=1, col=1
        )
        fig.update_xaxes(title_text="Number of Reviews", row=1, col=1)

    # (1,2) Heatmap of each bank's five most frequent themes
    if 'bank_name' in df.columns:
        lists_by_bank = {}
        for bank, themes in zip(df['bank_name'], theme_lists):
            if pd.isna(bank):
                continue  # rows without a bank cannot be attributed to a heatmap row
            lists_by_bank.setdefault(bank, []).append(themes)
        bank_themes = {bank: count_themes(lists).head(5).to_dict()
                       for bank, lists in lists_by_bank.items()}

        if bank_themes:
            all_unique_themes = sorted({theme for counts in bank_themes.values() for theme in counts})
            heat_data = [[counts.get(theme, 0) for theme in all_unique_themes]
                         for counts in bank_themes.values()]

            fig.add_trace(
                go.Heatmap(z=heat_data, x=all_unique_themes, y=list(bank_themes.keys()),
                          colorscale='Viridis', showscale=True,
                          colorbar=dict(title="Count")),
                row=1, col=2
            )
            fig.update_xaxes(title_text="Theme", row=1, col=2, tickangle=45)
            fig.update_yaxes(title_text="Bank", row=1, col=2)

    # (2,1) Co-occurrence among the ten most frequent themes, counted only over
    # reviews that mention at least two themes
    multi_theme_lists = [themes for themes in theme_lists if len(themes) >= 2]
    if multi_theme_lists:
        top_themes = count_themes(multi_theme_lists).head(10).index.tolist()
        position = {theme: i for i, theme in enumerate(top_themes)}

        co_matrix = np.zeros((len(top_themes), len(top_themes)))
        for themes in multi_theme_lists:
            # A set so a theme repeated inside one review is counted once
            present = {position[theme] for theme in themes if theme in position}
            for i in present:
                for j in present:
                    if i != j:
                        co_matrix[i, j] += 1

        fig.add_trace(
            go.Heatmap(z=co_matrix, x=top_themes, y=top_themes,
                      colorscale='Blues', showscale=True,
                      colorbar=dict(title="Co-occurrence")),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Theme", row=2, col=1, tickangle=45)
        fig.update_yaxes(title_text="Theme", row=2, col=1)

    # (2,2) Rating distribution for the five most frequent themes
    if 'rating' in df.columns and all_themes:
        top_5 = theme_counts.head(5).index.tolist()

        # Single pass over the rows. Membership is tested on the parsed theme list,
        # not on the raw string, so "Login" does not also match "Login Issues".
        ratings_by_theme = {theme: [] for theme in top_5}
        for themes, rating in zip(theme_lists, df['rating']):
            for theme in set(themes):
                if theme in ratings_by_theme:
                    ratings_by_theme[theme].append(rating)

        colors = px.colors.qualitative.Set2
        for i, theme in enumerate(top_5):
            theme_ratings = ratings_by_theme[theme]
            if theme_ratings:
                fig.add_trace(
                    go.Box(y=theme_ratings, name=theme,
                          marker_color=colors[i % len(colors)]),
                    row=2, col=2
                )
        fig.update_yaxes(title_text="Rating", row=2, col=2)
        fig.update_xaxes(title_text="Theme", row=2, col=2, tickangle=45)

    fig.update_layout(height=800, showlegend=False,
                     title_text="Thematic Analysis Dashboard", title_font_size=16)
    fig.show()

    # Print summary
    print("="*80)
    print("THEMATIC SUMMARY")
    print("="*80)

    total = len(df)
    with_themes = df['identified_themes'].notna().sum()
    coverage_pct = with_themes / total * 100 if total else 0.0  # avoid 0/0 on an empty frame
    print(f"\nTotal Reviews: {total:,}")
    print(f"Reviews with themes: {with_themes:,} ({coverage_pct:.1f}%)")

    if all_themes:
        avg_themes = len(all_themes) / with_themes if with_themes > 0 else 0
        print(f"Average themes per review: {avg_themes:.2f}")

        print(f"\nüéØ Top 5 Themes:")
        for theme, count in theme_counts.head(5).items():
            pct = count / len(all_themes) * 100
            print(f"  ‚Ä¢ {theme}: {count:,} ({pct:.1f}%)")

# %%
# Render the thematic dashboard only when the thematic CSV was loaded; the
# loading cell stores None for any file it could not read.
if data['thematic'] is not None:
    thematic_dashboard(data['thematic'])
else:
    print("‚ùå Thematic data not available")



In [None]:
# %% [markdown]
"""
## 4. Combined Sentiment & Thematic Analysis
"""

# %%
def combined_analysis(sentiment_df, thematic_df, min_theme_count=5):
    """Join sentiment and theme data, plot a 2x2 dashboard and print key insights.

    The frames are joined on ``review_id`` when both have it; otherwise they are
    aligned row by row, which is only attempted when their lengths match. In
    both cases only the columns that ``sentiment_df`` lacks are taken from
    ``thematic_df``, so shared columns such as ``sentiment_label`` keep their
    plain names instead of being renamed to ``*_x`` / ``*_y`` by the merge.

    Args:
        sentiment_df: Review-level DataFrame; ``sentiment_label`` and
            ``sentiment_score`` are used when present.
        thematic_df: Review-level DataFrame; ``identified_themes`` (a
            comma-separated string per review) is used when present.
        min_theme_count: Minimum number of scored occurrences a theme needs to
            appear in the average-sentiment bar chart and the scatter plot.

    Returns:
        None. Prints a message and returns early when the frames cannot be combined.
    """

    def split_themes(cell):
        """Split one comma-separated cell into stripped theme names ([] for non-strings)."""
        if not isinstance(cell, str):
            return []
        return [part.strip() for part in cell.split(',') if part.strip()]

    def count_themes(lists):
        """Frequency of every theme across a sequence of per-review theme lists."""
        flat = [theme for themes in lists for theme in themes]
        return pd.Series(flat, dtype='object').value_counts()

    # --- Combine the two frames -------------------------------------------------
    if 'review_id' in sentiment_df.columns and 'review_id' in thematic_df.columns:
        # NOTE(review): assumes review_id is unique in thematic_df; duplicates
        # would multiply rows in the join — confirm against the pipeline.
        new_cols = [col for col in thematic_df.columns if col not in sentiment_df.columns]
        combined = pd.merge(sentiment_df, thematic_df[['review_id'] + new_cols],
                            on='review_id', how='inner')
    elif len(sentiment_df) == len(thematic_df):
        # No shared key: fall back to positional alignment
        combined = sentiment_df.copy()
        for col in thematic_df.columns:
            if col not in combined.columns:
                combined[col] = thematic_df[col].values
    else:
        print("‚ö†Ô∏è Cannot merge: dataframes have different lengths")
        return

    print(f"Combined data: {len(combined)} reviews")

    has_themes = 'identified_themes' in combined.columns
    has_label = 'sentiment_label' in combined.columns
    has_score = 'sentiment_score' in combined.columns

    # One list of themes per row, positionally aligned with `combined`
    theme_lists = [split_themes(cell) for cell in combined['identified_themes']] if has_themes else []

    def lists_for(label):
        """Theme lists of the reviews carrying the given sentiment label."""
        return [themes for themes, row_label in zip(theme_lists, combined['sentiment_label'])
                if row_label == label]

    # Per-theme mean score and number of scored occurrences, shared by two panels
    theme_stats = None
    if has_themes and has_score:
        records = [(theme, score)
                   for themes, score in zip(theme_lists, combined['sentiment_score'])
                   for theme in themes]
        if records:
            scores_df = pd.DataFrame(records, columns=['theme', 'score'])
            theme_stats = (scores_df.groupby('theme')['score']
                           .agg(['mean', 'count'])
                           .rename(columns={'mean': 'avg_score'}))
            theme_stats = theme_stats[theme_stats['count'] >= min_theme_count]

    # Per-label theme frequencies, shared by the bottom-left panel and the printout
    pos_all = count_themes(lists_for('positive')) if has_themes and has_label else None
    neg_all = count_themes(lists_for('negative')) if has_themes and has_label else None

    # --- Figure -----------------------------------------------------------------
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Avg Sentiment by Theme', 'Themes by Sentiment',
                       'Positive vs Negative Themes', 'Theme-Sentiment Correlation'),
        specs=[[{'type': 'bar'}, {'type': 'bar'}],
               [{'type': 'bar'}, {'type': 'scatter'}]],
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )
    colors = {'positive': '#2ecc71', 'neutral': '#3498db', 'negative': '#e74c3c'}

    # (1,1) The ten themes with the highest mean sentiment score
    if theme_stats is not None and len(theme_stats) > 0:
        top_10 = theme_stats['avg_score'].sort_values().tail(10)
        fig.add_trace(
            go.Bar(x=top_10.values, y=top_10.index,
                  orientation='h', marker_color='rgba(52, 152, 219, 0.7)'),
            row=1, col=1
        )
        fig.update_xaxes(title_text="Avg Sentiment", row=1, col=1)

    # (1,2) Number of reviews per sentiment label for the five most frequent themes
    if has_themes and has_label:
        overall_counts = count_themes(theme_lists)
        if len(overall_counts) > 0:
            top_5 = overall_counts.head(5).index.tolist()
            for sentiment in ['positive', 'neutral', 'negative']:
                label_lists = lists_for(sentiment)
                # Exact membership on the parsed list; a review counts once per theme
                counts = [sum(1 for themes in label_lists if theme in themes) for theme in top_5]
                fig.add_trace(
                    go.Bar(x=top_5, y=counts, name=sentiment.capitalize(),
                          legendgroup=sentiment, marker_color=colors[sentiment]),
                    row=1, col=2
                )
            fig.update_xaxes(title_text="Theme", row=1, col=2, tickangle=45)
            fig.update_yaxes(title_text="Count", row=1, col=2)

    # (2,1) Top themes of positive vs. negative reviews, side by side
    if pos_all is not None and len(pos_all) > 0 and len(neg_all) > 0:
        # Union of both top-5 lists in a stable order (positive ranking first)
        themes = list(dict.fromkeys(list(pos_all.head(5).index) + list(neg_all.head(5).index)))
        # Look counts up in the full frequency tables so a theme that is top-5 on
        # one side only still shows its real count on the other side.
        pos_counts = [int(pos_all.get(theme, 0)) for theme in themes]
        neg_counts = [int(neg_all.get(theme, 0)) for theme in themes]

        # Same legend groups as the panel above; hidden here to avoid duplicate entries
        fig.add_trace(
            go.Bar(x=themes, y=pos_counts, name='Positive', legendgroup='positive',
                  showlegend=False, marker_color='#2ecc71'),
            row=2, col=1
        )
        fig.add_trace(
            go.Bar(x=themes, y=neg_counts, name='Negative', legendgroup='negative',
                  showlegend=False, marker_color='#e74c3c'),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Theme", row=2, col=1, tickangle=45)
        fig.update_yaxes(title_text="Count", row=2, col=1)

    # (2,2) Occurrences vs. mean score per theme; marker area grows with the count
    if theme_stats is not None and len(theme_stats) > 0:
        fig.add_trace(
            go.Scatter(x=theme_stats['count'], y=theme_stats['avg_score'],
                     mode='markers+text', text=theme_stats.index,
                     marker=dict(size=np.sqrt(theme_stats['count']) * 2,
                               color=theme_stats['avg_score'],
                               colorscale='RdYlGn', showscale=True),
                     textposition='top center'),
            row=2, col=2
        )
        fig.update_xaxes(title_text="Occurrences", row=2, col=2, type='log')
        fig.update_yaxes(title_text="Avg Sentiment", row=2, col=2)

    fig.update_layout(height=800, showlegend=True, barmode='group',
                     title_text="Combined Sentiment & Thematic Analysis", title_font_size=16)
    fig.show()

    # --- Printed insights -------------------------------------------------------
    print("="*80)
    print("COMBINED INSIGHTS")
    print("="*80)

    if has_themes and has_label:
        # Shares are relative to all theme mentions within the respective label
        if len(pos_all) > 0:
            pos_total = pos_all.sum()
            print("\nüéØ Top Themes in POSITIVE Reviews:")
            for theme, count in pos_all.head(3).items():
                pct = count / pos_total * 100
                print(f"  ‚Ä¢ {theme}: {count:,} ({pct:.1f}%)")

        if len(neg_all) > 0:
            neg_total = neg_all.sum()
            print("\n‚ö†Ô∏è  Top Themes in NEGATIVE Reviews:")
            for theme, count in neg_all.head(3).items():
                pct = count / neg_total * 100
                print(f"  ‚Ä¢ {theme}: {count:,} ({pct:.1f}%)")

# %%
# The combined view needs both tables; either one is None when its CSV failed to load.
if data['sentiment'] is not None and data['thematic'] is not None:
    combined_analysis(data['sentiment'], data['thematic'])
else:
    print("‚ùå Both sentiment and thematic data required for combined analysis")



In [None]:
# %% [markdown]
"""
## 5. Export Insights
"""

# %%
def export_insights(sentiment_df, thematic_df, output_dir="data/preprocessed"):
    """Write a plain-text insights report and print its first 20 lines.

    Each section is emitted only when the data it needs is available, so the
    function works with either frame set to ``None`` or missing optional columns.

    Args:
        sentiment_df: DataFrame or None. Uses ``sentiment_label``,
            ``sentiment_score`` and ``bank_name`` when present.
        thematic_df: DataFrame or None. Uses ``identified_themes`` (a
            comma-separated string per review) when present.
        output_dir: Directory for ``analysis_insights.txt``; created if needed.
            A relative path is resolved against the current working directory,
            so the default lands in ``notebooks/data/preprocessed`` when the
            notebook is run from ``notebooks/``.

    Returns:
        None. The report is written as UTF-8 and a short preview is printed.
    """
    insights = [
        "="*80,
        "BANK REVIEWS ANALYSIS - KEY INSIGHTS",
        "="*80,
        f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    # Sentiment insights
    if sentiment_df is not None:
        total = len(sentiment_df)
        insights.append("üìà SENTIMENT ANALYSIS")
        insights.append("-"*40)
        insights.append(f"Total Reviews Analyzed: {total:,}")

        if 'sentiment_label' in sentiment_df.columns:
            present_labels = set(sentiment_df['sentiment_label'].unique())
            for label in ['positive', 'neutral', 'negative']:
                if label in present_labels:
                    count = (sentiment_df['sentiment_label'] == label).sum()
                    insights.append(f"{label.title()}: {count:,} ({count/total*100:.1f}%)")

        if 'sentiment_score' in sentiment_df.columns:
            insights.append(f"Average Sentiment Score: {sentiment_df['sentiment_score'].mean():.3f}")

    # Thematic insights
    if thematic_df is not None and 'identified_themes' in thematic_df.columns:
        total = len(thematic_df)
        with_themes = thematic_df['identified_themes'].notna().sum()
        coverage_pct = with_themes / total * 100 if total else 0.0  # avoid 0/0 on an empty frame

        insights.append("\nüìä THEMATIC ANALYSIS")
        insights.append("-"*40)
        insights.append(f"Reviews with Themes: {with_themes:,} ({coverage_pct:.1f}%)")

        # Flatten the comma-separated theme cells into one list of theme mentions
        all_themes = [
            part.strip()
            for cell in thematic_df['identified_themes']
            if isinstance(cell, str)
            for part in cell.split(',')
            if part.strip()
        ]

        if all_themes:
            top_5 = pd.Series(all_themes).value_counts().head(5)
            insights.append("\nTop 5 Themes:")
            for theme, count in top_5.items():
                pct = count / len(all_themes) * 100
                insights.append(f"  ‚Ä¢ {theme}: {count:,} ({pct:.1f}%)")

    # Bank ranking. Needs both the bank name and the score column; banks whose
    # mean score is undefined (all scores missing) are left out of the ranking.
    if (sentiment_df is not None
            and 'bank_name' in sentiment_df.columns
            and 'sentiment_score' in sentiment_df.columns):
        bank_metrics = (sentiment_df.groupby('bank_name')['sentiment_score']
                        .mean()
                        .dropna()
                        .sort_values(ascending=False))

        insights.append("\nüè¶ BANK PERFORMANCE")
        insights.append("-"*40)
        if len(bank_metrics) > 0:
            insights.append(f"Best Performing Bank: {bank_metrics.index[0]} (Score: {bank_metrics.iloc[0]:.3f})")
            insights.append(f"Worst Performing Bank: {bank_metrics.index[-1]} (Score: {bank_metrics.iloc[-1]:.3f})")

    # Recommendations (static text, independent of the data)
    insights.append("\nüéØ RECOMMENDATIONS")
    insights.append("-"*40)
    insights.append("1. Address themes frequently mentioned in negative reviews")
    insights.append("2. Investigate low-performing banks for systemic issues")
    insights.append("3. Monitor theme trends for proactive improvements")
    insights.append("4. Use sentiment-theme correlation for targeted feature development")
    insights.append("5. Consider A/B testing for high-impact theme improvements")

    # Save to file
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "analysis_insights.txt")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(insights))

    print(f"‚úÖ Insights saved to: {output_path}")

    # Print summary
    print("\n" + "="*80)
    print("KEY INSIGHTS SUMMARY")
    print("="*80)
    for line in insights[:20]:  # Print first 20 lines
        print(line)

# %%
# Called unconditionally: export_insights checks each frame for None itself and
# skips the sections it cannot build.
export_insights(data['sentiment'], data['thematic'])



In [None]:
# %% [markdown]
"""
## 6. Summary
"""

# %%
# Closing banner: what was produced, which inputs were loaded, and suggested follow-ups.
rule = "=" * 80

print("\n" + rule)
print("üéâ ANALYSIS COMPLETE")
print(rule)

print("\nüìä Analysis Completed:")
for step in (
    "Sentiment Analysis Dashboard",
    "Thematic Analysis Dashboard",
    "Combined Sentiment & Thematic Analysis",
    "Key Insights Exported",
):
    print(f"  ‚úÖ {step}")

# Only inputs that loaded successfully are listed (failed loads are stored as None)
print("\nüìÅ Files Available:")
for name, loaded in data.items():
    if loaded is None:
        continue
    detail = f"{len(loaded):,} rows" if isinstance(loaded, pd.DataFrame) else "JSON report"
    print(f"  ‚Ä¢ {name}: {detail}")

print("\nüöÄ Next Steps:")
next_steps = [
    "Review interactive dashboards above",
    "Check notebooks/data/preprocessed/analysis_insights.txt",
    "Use insights for strategic decision making",
    "Consider time-series analysis for trends",
    "Explore deeper correlations with statistical tests",
]
for number, step in enumerate(next_steps, start=1):
    print(f"  {number}. {step}")

print("\n" + rule)
print("‚úÖ Ready for presentation and decision-making!")
print(rule)