In [3]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# NLP and ML libraries
#
# Every optional dependency is probed in its own try/except so that one
# missing package cannot hide the others. Previously spaCy, scikit-learn and
# NLTK shared a single `try`: a missing spaCy left `TfidfVectorizer`,
# `stopwords`, `word_tokenize`, ... undefined even when their packages were
# installed.
try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Transformers not available. Install with: pip install transformers torch")

try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_AVAILABLE = True
except ImportError:
    VADER_AVAILABLE = False
    print("⚠️ VADER not available. Install with: pip install vaderSentiment")

try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False
    print("⚠️ TextBlob not available. Install with: pip install textblob")

try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False

try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

if NLTK_AVAILABLE:
    # Download required NLTK data. 'punkt_tab' is what newer NLTK releases
    # look up for word_tokenize; 'punkt' is kept for older releases.
    for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
        nltk.download(_nltk_resource, quiet=True)

# Same meaning as before: True only when spaCy, scikit-learn and NLTK all import.
NLP_LIBRARIES_AVAILABLE = SPACY_AVAILABLE and SKLEARN_AVAILABLE and NLTK_AVAILABLE
if not NLP_LIBRARIES_AVAILABLE:
    _missing_nlp_packages = [
        package
        for package, available in (
            ('spacy', SPACY_AVAILABLE),
            ('scikit-learn', SKLEARN_AVAILABLE),
            ('nltk', NLTK_AVAILABLE),
        )
        if not available
    ]
    print(
        "⚠️ Some NLP libraries not available. "
        f"Install with: pip install {' '.join(_missing_nlp_packages)}"
    )


⚠️ Transformers not available. Install with: pip install transformers torch
⚠️ Some NLP libraries not available. Install with: pip install spacy scikit-learn nltk


In [6]:

class BankReviewAnalyzer:
    """Sentiment scoring and rule-based theme tagging for bank app reviews.

    The pipeline (see ``analyze_reviews``) loads a CSV of reviews, scores
    every review with whichever sentiment back ends are installed, extracts
    TF-IDF keywords per bank, maps keywords and review text onto a fixed set
    of themes, prints a summary report and writes the results to CSV.

    The class reads the module-level availability flags set by the import
    cell (``TRANSFORMERS_AVAILABLE``, ``VADER_AVAILABLE``,
    ``TEXTBLOB_AVAILABLE``, ``NLP_LIBRARIES_AVAILABLE``).
    """

    # Location the notebook historically read from regardless of `data_path`;
    # kept as a fallback so existing runs keep working.
    LEGACY_DATA_PATH = '../data/.combined_clean_bank_reviews.csv'
    OUTPUT_FILENAME = 'bank_reviews_analysis_results.csv'

    # Patterns used to score TF-IDF keywords per theme (identify_themes).
    THEME_PATTERNS = {
        'Account Access Issues': [
            'login', 'password', 'access', 'crash', 'error', 'bug', 'not working',
            'cant login', 'login problem', 'authentication', 'account lock'
        ],
        'Transaction Performance': [
            'transfer', 'payment', 'transaction', 'slow', 'fast', 'quick',
            'money transfer', 'send money', 'balance', 'deposit', 'withdraw'
        ],
        'User Interface & Experience': [
            'app', 'interface', 'design', 'easy', 'simple', 'navigation',
            'user friendly', 'ui', 'layout', 'menu', 'screen'
        ],
        'Customer Support': [
            'support', 'help', 'service', 'staff', 'customer', 'response',
            'assistance', 'call center', 'feedback'
        ],
        'Feature Requests': [
            'need', 'want', 'should', 'feature', 'update', 'improvement',
            'add', 'missing', 'wish', 'hope'
        ]
    }

    # Narrower patterns used to tag an individual review with a theme.
    REVIEW_THEME_PATTERNS = {
        'Account Access Issues': ['login', 'password', 'crash', 'error', 'bug'],
        'Transaction Performance': ['transfer', 'payment', 'slow', 'fast'],
        'User Interface & Experience': ['app', 'interface', 'easy', 'simple'],
        'Customer Support': ['support', 'help', 'service'],
        'Feature Requests': ['need', 'want', 'feature', 'update']
    }

    def __init__(self, data_path='combined_clean_bank_reviews.csv'):
        """
        Initialize the analyzer with cleaned review data

        Args:
            data_path: CSV file to load. If it does not exist, ``load_data``
                falls back to ``LEGACY_DATA_PATH``.
        """
        self.data_path = data_path
        self.df = None
        self.results_df = None

        # Initialize sentiment analyzers if available
        self.distilbert_analyzer = None
        self.vader_analyzer = None

        # Initialize NLP components
        self.stopwords = set()
        self.lemmatizer = None
        # Ensures a tokenizer failure is reported once, not once per review.
        self._tokenizer_warned = False

        self._setup_analyzers()

    def _setup_analyzers(self):
        """Setup sentiment analyzers and NLP components"""

        # Setup DistilBERT
        if TRANSFORMERS_AVAILABLE:
            try:
                self.distilbert_analyzer = pipeline(
                    "sentiment-analysis",
                    model="distilbert-base-uncased-finetuned-sst-2-english",
                    return_all_scores=True
                )
                print("✅ DistilBERT sentiment analyzer loaded")
            except Exception as e:
                print(f"⚠️ DistilBERT loading failed: {e}")

        # Setup VADER
        if VADER_AVAILABLE:
            self.vader_analyzer = SentimentIntensityAnalyzer()
            print("✅ VADER sentiment analyzer loaded")

        # Setup NLP components
        if NLP_LIBRARIES_AVAILABLE:
            try:
                self.stopwords = set(stopwords.words('english'))
                self.lemmatizer = WordNetLemmatizer()
                print("✅ NLP components loaded")
            except Exception as e:
                # Typically a missing NLTK corpus; fall back to basic
                # preprocessing but say so instead of failing silently.
                self.stopwords = set()
                self.lemmatizer = None
                print(f"⚠️ NLP components loading failed: {e}")

    def load_data(self):
        """Load the cleaned review data.

        Tries ``self.data_path`` first and then ``LEGACY_DATA_PATH``. The
        loaded frame must contain the ``review`` and ``bank`` columns.

        Returns:
            True when a file was loaded into ``self.df``, False otherwise.
        """
        candidates = [self.data_path]
        if self.LEGACY_DATA_PATH not in candidates:
            candidates.append(self.LEGACY_DATA_PATH)

        for path in candidates:
            try:
                frame = pd.read_csv(path)
            except FileNotFoundError:
                continue
            except Exception as e:
                print(f"❌ Error loading data: {e}")
                return False

            missing = [col for col in ('review', 'bank') if col not in frame.columns]
            if missing:
                print(f"❌ Error loading data: missing required column(s): {', '.join(missing)}")
                return False

            self.df = frame
            # Report the path that was actually read, not just the requested one.
            print(f"✅ Loaded {len(frame)} reviews from {path}")
            banks = [str(bank) for bank in frame['bank'].dropna().unique()]
            print(f"Banks: {', '.join(banks)}")
            return True

        print(f"❌ File {self.data_path} not found. Please run data cleaning first.")
        return False

    def analyze_sentiment_distilbert(self, text):
        """Analyze sentiment using DistilBERT.

        Returns:
            ``(label, confidence, {'positive': p, 'negative': n})`` or
            ``(None, None, None)`` when the model is unavailable or fails.
        """
        if not self.distilbert_analyzer:
            return None, None, None

        try:
            results = self.distilbert_analyzer(text[:512])  # Truncate for model limits

            # Depending on the transformers version the scores for a single
            # input are either nested ([[{...}, {...}]]) or flat ([{...}, {...}]).
            scores = results[0] if results and isinstance(results[0], list) else results
            by_label = {item['label']: item['score'] for item in scores}
            positive_score = by_label.get('POSITIVE', 0)
            negative_score = by_label.get('NEGATIVE', 0)

            # Determine dominant sentiment
            if positive_score > negative_score:
                sentiment_label = 'POSITIVE'
                confidence = positive_score
            else:
                sentiment_label = 'NEGATIVE'
                confidence = negative_score

            return sentiment_label, confidence, {'positive': positive_score, 'negative': negative_score}

        except Exception as e:
            print(f"DistilBERT error: {e}")
            return None, None, None

    def analyze_sentiment_vader(self, text):
        """Analyze sentiment using VADER.

        Returns:
            ``(label, abs(compound), scores)`` where label is POSITIVE for
            compound >= 0.05, NEGATIVE for compound <= -0.05, else NEUTRAL;
            ``(None, None, None)`` when VADER is unavailable or fails.
        """
        if not self.vader_analyzer:
            return None, None, None

        try:
            scores = self.vader_analyzer.polarity_scores(text)
            compound = scores['compound']

            # Determine sentiment label
            if compound >= 0.05:
                sentiment_label = 'POSITIVE'
            elif compound <= -0.05:
                sentiment_label = 'NEGATIVE'
            else:
                sentiment_label = 'NEUTRAL'

            return sentiment_label, abs(compound), scores

        except Exception as e:
            print(f"VADER error: {e}")
            return None, None, None

    def analyze_sentiment_textblob(self, text):
        """Analyze sentiment using TextBlob.

        Returns:
            ``(label, abs(polarity), {'polarity': ..., 'subjectivity': ...})``
            where label is POSITIVE for polarity > 0.1, NEGATIVE for
            polarity < -0.1, else NEUTRAL; ``(None, None, None)`` when
            TextBlob is unavailable or fails.
        """
        if not TEXTBLOB_AVAILABLE:
            return None, None, None

        try:
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity

            if polarity > 0.1:
                sentiment_label = 'POSITIVE'
            elif polarity < -0.1:
                sentiment_label = 'NEGATIVE'
            else:
                sentiment_label = 'NEUTRAL'

            return sentiment_label, abs(polarity), {'polarity': polarity, 'subjectivity': blob.sentiment.subjectivity}

        except Exception as e:
            print(f"TextBlob error: {e}")
            return None, None, None

    def preprocess_text(self, text):
        """Preprocess text for thematic analysis.

        Lowercases, strips everything except ASCII letters and whitespace,
        drops tokens of length <= 2 and known stopwords, and lemmatizes when
        the NLTK components are loaded. Non-string input yields ``""``.
        """
        if not isinstance(text, str):
            return ""

        # Lowercase, then remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

        if self.lemmatizer is not None and NLP_LIBRARIES_AVAILABLE:
            try:
                tokens = word_tokenize(text)
                # Remove stopwords and lemmatize
                return ' '.join(
                    self.lemmatizer.lemmatize(token)
                    for token in tokens
                    if token not in self.stopwords and len(token) > 2
                )
            except Exception as e:
                # Usually missing tokenizer data; report once and fall back.
                if not getattr(self, '_tokenizer_warned', False):
                    print(f"⚠️ NLTK preprocessing failed, using basic fallback: {e}")
                    self._tokenizer_warned = True

        # Fallback preprocessing (no lemmatization)
        return ' '.join(
            word for word in text.split()
            if len(word) > 2 and word not in self.stopwords
        )

    def extract_keywords_tfidf(self, texts, max_features=100):
        """Extract keywords using TF-IDF.

        Returns:
            List of ``(term, mean_tfidf)`` pairs sorted by descending score;
            an empty list when there is no input or vectorization fails.
        """
        if len(texts) == 0:
            return []

        try:
            vectorizer = TfidfVectorizer(
                max_features=max_features,
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.8
            )

            tfidf_matrix = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()

            # Get average TF-IDF scores
            mean_scores = np.array(tfidf_matrix.mean(axis=0)).flatten()

            # Create keyword-score pairs
            return sorted(zip(feature_names, mean_scores), key=lambda pair: pair[1], reverse=True)

        except Exception as e:
            print(f"TF-IDF error: {e}")
            return []

    @staticmethod
    def _build_matchers(theme_patterns):
        """Compile one regex per theme that matches any pattern at a word start.

        Anchoring at a word boundary keeps inflections ("crashing",
        "transfers") while rejecting mid-word hits such as 'ui' in "quick"
        or 'app' in "happy". Prefix hits like 'add' in "address" remain.
        """
        return {
            theme: re.compile(r'\b(?:' + '|'.join(re.escape(p) for p in patterns) + r')')
            for theme, patterns in theme_patterns.items()
        }

    def identify_themes(self, keywords, bank_name):
        """Rule-based theme identification from keywords.

        Args:
            keywords: ``(keyword, score)`` pairs, best first; only the top 50
                are considered.
            bank_name: Unused; kept for interface compatibility.

        Returns:
            Up to five ``(theme, score)`` pairs with score > 0, best first.
            Each keyword contributes its score at most once per theme.
        """
        matchers = self._build_matchers(self.THEME_PATTERNS)

        # Score themes based on keyword matches
        theme_scores = {theme: 0 for theme in matchers}
        for keyword, score in keywords[:50]:  # Top 50 keywords
            keyword_lower = str(keyword).lower()
            for theme, matcher in matchers.items():
                if matcher.search(keyword_lower):
                    theme_scores[theme] += score

        # Get top themes
        ranked = sorted(theme_scores.items(), key=lambda item: item[1], reverse=True)
        return [(theme, score) for theme, score in ranked if score > 0][:5]

    def _score_sentiments(self):
        """Return a copy of ``self.df`` with ``review_id`` and sentiment columns.

        ``review_id`` is the 1-based row position, independent of the frame's
        index labels, and the sentiment records are joined on that same id so
        each review keeps its own scores.
        """
        records = []
        for position, review in enumerate(self.df['review'], start=1):
            review_text = str(review)
            record = {'review_id': position}

            # DistilBERT analysis
            if self.distilbert_analyzer:
                label, score, _ = self.analyze_sentiment_distilbert(review_text)
                record['distilbert_sentiment'] = label
                record['distilbert_score'] = score

            # VADER analysis
            if self.vader_analyzer:
                label, score, _ = self.analyze_sentiment_vader(review_text)
                record['vader_sentiment'] = label
                record['vader_score'] = score

            # TextBlob analysis (columns are always emitted, None if unavailable)
            label, score, _ = self.analyze_sentiment_textblob(review_text)
            record['textblob_sentiment'] = label
            record['textblob_score'] = score

            records.append(record)

            if position % 50 == 0:
                print(f"Processed {position} reviews...")

        scored = self.df.copy()
        scored['review_id'] = range(1, len(scored) + 1)
        if records:
            scored = scored.merge(pd.DataFrame(records), on='review_id', how='left')
        return scored

    def _extract_bank_themes(self):
        """Compute top keywords and ranked themes for every bank in ``self.df``."""
        bank_themes = {}

        for bank in self.df['bank'].dropna().unique():
            print(f"\nAnalyzing themes for {bank}...")

            bank_reviews = self.df.loc[self.df['bank'] == bank, 'review'].tolist()

            # Preprocess reviews and drop those that end up empty
            processed_reviews = [self.preprocess_text(review) for review in bank_reviews]
            processed_reviews = [text for text in processed_reviews if text.strip()]
            if not processed_reviews:
                continue

            keywords = self.extract_keywords_tfidf(processed_reviews)
            themes = self.identify_themes(keywords, bank)

            bank_themes[bank] = {
                'keywords': keywords[:20],  # Top 20 keywords
                'themes': themes
            }

            print(f"  Top themes: {[theme for theme, score in themes[:3]]}")

        return bank_themes

    def _assign_review_themes(self, results_df, bank_themes):
        """Label each review with up to two of its bank's top themes.

        Returns:
            A list, in row order, of ``'; '``-joined theme names, or
            ``'General'`` when no theme pattern occurs in the review.
        """
        # Compiled once; the original rebuilt the pattern dict per row/theme.
        matchers = self._build_matchers(self.REVIEW_THEME_PATTERNS)

        labels = []
        for bank, review in zip(results_df['bank'], results_df['review']):
            review_text = str(review).lower()
            ranked_themes = bank_themes.get(bank, {}).get('themes', [])
            hits = [
                theme for theme, _ in ranked_themes
                if theme in matchers and matchers[theme].search(review_text)
            ]
            labels.append('; '.join(hits[:2]) if hits else 'General')
        return labels

    def analyze_reviews(self):
        """Main analysis pipeline.

        Returns:
            True on success, False when the data could not be loaded.
        """
        if not self.load_data():
            return False

        print("\n" + "="*50)
        print("STARTING SENTIMENT AND THEMATIC ANALYSIS")
        print("="*50)

        # Sentiment analysis
        print("\n1. SENTIMENT ANALYSIS")
        print("-" * 30)

        self.results_df = self._score_sentiments()

        print(f"✅ Sentiment analysis completed for {len(self.results_df)} reviews")

        # Thematic analysis by bank
        print(f"\n2. THEMATIC ANALYSIS")
        print("-" * 30)

        bank_themes = self._extract_bank_themes()

        # Add theme information to results
        self.results_df['identified_themes'] = self._assign_review_themes(self.results_df, bank_themes)

        # Generate summary statistics
        self.generate_summary_report(bank_themes)

        # Save results
        self.save_results()

        return True

    def generate_summary_report(self, bank_themes):
        """Generate comprehensive analysis report.

        Prints sentiment distributions (overall, per bank, per rating) based
        on the VADER columns when present, then per-bank themes/keywords and
        the overall theme distribution.

        Args:
            bank_themes: Mapping ``bank -> {'keywords': [...], 'themes': [...]}``
                as built by ``analyze_reviews``.
        """
        results = self.results_df
        total_reviews = len(results)
        has_vader = 'vader_sentiment' in results.columns

        print(f"\n" + "="*50)
        print("ANALYSIS SUMMARY REPORT")
        print("="*50)

        # Sentiment summary
        print(f"\n1. SENTIMENT ANALYSIS SUMMARY")
        print("-" * 30)

        # Overall sentiment distribution
        if has_vader:
            print(f"Overall Sentiment Distribution (VADER):")
            for sentiment, count in results['vader_sentiment'].value_counts().items():
                percentage = (count / total_reviews) * 100
                print(f"  {sentiment}: {count} ({percentage:.1f}%)")

        # Sentiment by bank
        print(f"\nSentiment by Bank:")
        if has_vader:
            # sort=False keeps banks in order of first appearance; rows with a
            # missing bank are skipped, so `total` is never zero.
            for bank, bank_data in results.groupby('bank', sort=False):
                pos_count = (bank_data['vader_sentiment'] == 'POSITIVE').sum()
                neg_count = (bank_data['vader_sentiment'] == 'NEGATIVE').sum()
                neu_count = (bank_data['vader_sentiment'] == 'NEUTRAL').sum()
                total = len(bank_data)

                print(f"  {bank}: Pos: {pos_count}({pos_count/total*100:.1f}%) "
                      f"Neg: {neg_count}({neg_count/total*100:.1f}%) "
                      f"Neu: {neu_count}({neu_count/total*100:.1f}%)")

        # Sentiment by rating
        print(f"\nSentiment by Rating:")
        if has_vader and 'rating' in results.columns:
            sentiment_rating = results.groupby(['rating', 'vader_sentiment']).size().unstack(fill_value=0)
            print(sentiment_rating)

        # Thematic analysis summary
        print(f"\n2. THEMATIC ANALYSIS SUMMARY")
        print("-" * 30)

        for bank, analysis in bank_themes.items():
            print(f"\n{bank}:")
            print(f"  Top Themes:")
            for i, (theme, score) in enumerate(analysis['themes'][:3], 1):
                print(f"    {i}. {theme} (score: {score:.3f})")

            print(f"  Top Keywords:")
            for i, (keyword, score) in enumerate(analysis['keywords'][:5], 1):
                print(f"    {i}. {keyword} ({score:.3f})")

        # Theme distribution
        if 'identified_themes' in results.columns:
            theme_dist = results['identified_themes'].value_counts()
            print(f"\nOverall Theme Distribution:")
            for theme, count in theme_dist.head(10).items():
                percentage = (count / total_reviews) * 100
                print(f"  {theme}: {count} ({percentage:.1f}%)")

    def save_results(self, output_dir='../data'):
        """Save analysis results to CSV.

        Args:
            output_dir: Directory to write ``OUTPUT_FILENAME`` into; created
                if it does not exist.

        Returns:
            The output file name (without directory), as before.
        """
        import os

        base_columns = ['review_id', 'review', 'rating', 'date', 'source', 'bank']
        optional_columns = [
            'distilbert_sentiment', 'distilbert_score',
            'vader_sentiment', 'vader_score',
            'textblob_sentiment', 'textblob_score',
            'identified_themes'
        ]

        # Keep only columns that exist so a dataset without e.g. 'date' or
        # 'source' still exports instead of raising KeyError.
        available = set(self.results_df.columns)
        output_columns = [col for col in base_columns + optional_columns if col in available]
        skipped = [col for col in base_columns if col not in available]
        if skipped:
            print(f"⚠️ Columns missing from results and skipped: {', '.join(skipped)}")

        # Create final output dataframe
        output_df = self.results_df[output_columns].copy()

        # Save to CSV
        output_filename = self.OUTPUT_FILENAME
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_filename)
        output_df.to_csv(output_path, index=False)

        print(f"\n✅ Analysis results saved to: {output_path}")
        print(f"   Total reviews analyzed: {len(output_df)}")
        print(f"   Columns included: {', '.join(output_columns)}")

        return output_filename

def main():
    """Run the review analysis end to end and print the final status."""
    print("🏦 BANK REVIEWS SENTIMENT & THEMATIC ANALYSIS")
    print("=" * 60)

    # Build the analyzer and run the whole pipeline in one go.
    pipeline_ok = BankReviewAnalyzer().analyze_reviews()

    # Failure path first: nothing more to report.
    if not pipeline_ok:
        print(f"\n❌ Analysis failed. Please check data and dependencies.")
        return

    print(f"\n🎉 Analysis completed successfully!")
    print(f"📊 Results saved to: bank_reviews_analysis_results.csv")
    print(f"📈 Ready for CX insights and recommendations!")


if __name__ == "__main__":
    main()

🏦 BANK REVIEWS SENTIMENT & THEMATIC ANALYSIS
✅ VADER sentiment analyzer loaded
✅ Loaded 3478 reviews from combined_clean_bank_reviews.csv
Banks: Bank of Abyssinia, Commercial Bank of Ethiopia, Dashen Bank

STARTING SENTIMENT AND THEMATIC ANALYSIS

1. SENTIMENT ANALYSIS
------------------------------
Processed 50 reviews...
Processed 100 reviews...
Processed 150 reviews...
Processed 200 reviews...
Processed 250 reviews...
Processed 300 reviews...
Processed 350 reviews...
Processed 400 reviews...
Processed 450 reviews...
Processed 500 reviews...
Processed 550 reviews...
Processed 600 reviews...
Processed 650 reviews...
Processed 700 reviews...
Processed 750 reviews...
Processed 800 reviews...
Processed 850 reviews...
Processed 900 reviews...
Processed 950 reviews...
Processed 1000 reviews...
Processed 1050 reviews...
Processed 1100 reviews...
Processed 1150 reviews...
Processed 1200 reviews...
Processed 1250 reviews...
Processed 1300 reviews...
Processed 1350 reviews...
Processed 1400 re