Task 2: Sentiment and Thematic Analysis

<ul>
<li>Analyzes sentiment using DistilBERT</li>
<li>Extracts keywords and identifies themes</li>
<li>Saves analysis results as CSV</li>
</ul>

In [5]:
# import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import logging

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [1]:
pip install spacy

Collecting spacy
  Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting weasel<0.5.0,>=0.1.0 (from spacy)
  Using cached weasel-0.4.1-py3-none-any.whl.metadata (4.6 kB)
Collecting langcodes<4.0.0,>=3.2.0 (from spacy)
  Using cached langcodes-3.5.0-py3-none-any.whl.metadata (29 kB)
Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Using cached blis-1.3.0-cp312-cp312-win_amd64.whl.metadata (7.6 kB)
Collecting confection<1.0.0,>=0.0.1 (from thinc<8.4.0,>=8.3.4->spacy)
  Using cached confection-0.1.5-py3-none-any.whl.metadata (19 kB)
Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl (13.9 MB)
Using cached langcodes-3.5.0-py3-none-any.whl (182 kB)
Using cached thinc-8.3.6-cp312-cp312-win_amd64.whl (1.7 MB)
Using cached weasel-0.4.1-py3-none-any.whl (50 kB)
Using cached blis-1.3.0-cp312-cp312-win_amd64.whl (6.3 MB)
Using cached confectio


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!python -m spacy download en_core_web_sm


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "D:\python\WPy64-31241\python-3.12.4.amd64\Lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "D:\python\WPy64-31241\python-3.12.4.amd64\Lib\site-packages\spacy\errors.py", line 3, in <module>
    from .compat import Literal
  File "D:\python\WPy64-31241\python-3.12.4.amd64\Lib\site-packages\spacy\compat.py", line 4, in <module>
  

In [None]:
# Download necessary NLTK resources.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; older releases use 'punkt'. Both are requested so the notebook
# works on either. By default nltk.download() reports a failed or unknown
# resource and returns False instead of raising, so the extra request is
# harmless on older installs.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:

# Route INFO-and-above log records to analysis.log; each record is prefixed
# with its timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    filename='analysis.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [None]:
def load_data(csv_path):
    """Read the cleaned review CSV at ``csv_path`` into a DataFrame.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. The failure is
        written to the log rather than raised, so callers must check for
        ``None``.
    """
    try:
        reviews = pd.read_csv(csv_path)
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
        return None
    else:
        logging.info(f"Loaded {len(reviews)} reviews from {csv_path}")
        return reviews

In [None]:
def _text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    # Loading the stop-word corpus and constructing a lemmatizer for every
    # review is wasted work when this runs once per DataFrame row, so the pair
    # is built once and stored on the function object.
    if not hasattr(_text_resources, "cached"):
        _text_resources.cached = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _text_resources.cached


def preprocess_text(text):
    """Normalize one review for keyword extraction.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space (digits are removed too), tokenize, drop English
    stop words, lemmatize, and re-join with single spaces.

    Args:
        text: The raw review. Missing values (``None``/NaN) and the empty
            string yield ``''``. Other non-string scalars are converted with
            ``str()`` instead of failing on ``.lower()``.

    Returns:
        The cleaned, space-separated token string (possibly empty).
    """
    if pd.isna(text) or text == '':
        return ''

    # Keep letters and whitespace only
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text).lower())

    stop_words, lemmatizer = _text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

In [None]:
def analyze_sentiment(df, batch_size=32):
    """Label every review with a DistilBERT sentiment and confidence score.

    Adds two columns to ``df`` in place and returns it:
    ``sentiment_label`` (the pipeline's label, or ``'NEUTRAL'`` for
    missing/blank reviews) and ``sentiment_score`` (the pipeline's score, or
    ``0.5`` for missing/blank reviews).

    Args:
        df: DataFrame with a ``review_text`` column.
        batch_size: Number of reviews handed to the model per call.

    Returns:
        The same DataFrame with the two sentiment columns added.
    """
    logging.info("Starting sentiment analysis...")

    # Initialize the sentiment analysis pipeline
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

    # Missing values become "" and any non-string cell is stringified so
    # .strip() below cannot fail.
    texts = ["" if pd.isna(text) else str(text) for text in df['review_text'].tolist()]

    # Pre-fill with the placeholder used for blank reviews and write model
    # results by row position. Appending results and placeholders separately
    # would misalign labels with rows whenever a blank review sits inside a
    # batch, and would leave the lists too short when a whole batch is blank.
    sentiments = ['NEUTRAL'] * len(texts)
    scores = [0.5] * len(texts)

    # Process reviews in batches to avoid memory issues
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        valid_positions = [start + offset for offset, text in enumerate(batch) if text.strip()]
        if not valid_positions:
            continue

        # truncation=True lets the tokenizer cut reviews that exceed the
        # model's maximum input length instead of failing on them.
        results = sentiment_analyzer([texts[pos] for pos in valid_positions], truncation=True)

        for pos, result in zip(valid_positions, results):
            sentiments[pos] = result['label']
            scores[pos] = result['score']

    df['sentiment_label'] = sentiments
    df['sentiment_score'] = scores

    logging.info(f"Sentiment analysis complete. Found {df['sentiment_label'].value_counts().to_dict()}")
    return df

In [None]:
def extract_keywords(df, top_n=30):
    """Extract the highest-scoring TF-IDF terms for each bank.

    Adds a ``processed_text`` column to ``df`` in place (the output of
    ``preprocess_text`` on ``review_text``), then fits one TF-IDF vectorizer
    per distinct ``bank_name`` and ranks terms by their summed TF-IDF score.

    Args:
        df: DataFrame with ``review_text`` and ``bank_name`` columns.
        top_n: Maximum number of keywords kept per bank.

    Returns:
        Dict mapping bank name to its keywords, best first. A bank whose
        reviews cannot be vectorized maps to an empty list.
    """
    logging.info("Extracting keywords...")

    # Preprocess text for TF-IDF
    df['processed_text'] = df['review_text'].apply(preprocess_text)

    all_keywords = {}

    # Group by bank to extract keywords per bank
    for bank in df['bank_name'].unique():
        bank_reviews = df.loc[df['bank_name'] == bank, 'processed_text']

        # Skip if no reviews
        if len(bank_reviews) == 0:
            continue

        vectorizer = TfidfVectorizer(max_features=100, min_df=2, max_df=0.7)
        try:
            tfidf_matrix = vectorizer.fit_transform(bank_reviews)
        except ValueError as exc:
            # The vectorizer raises ValueError when the min_df/max_df pruning
            # cannot be satisfied or leaves no vocabulary (very few or empty
            # reviews). One sparse bank should not abort the other banks.
            logging.warning(f"Could not extract keywords for {bank}: {exc}")
            all_keywords[bank] = []
            continue

        feature_names = vectorizer.get_feature_names_out()

        # Rank terms by their TF-IDF score summed over the bank's reviews
        tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
        top_indices = tfidf_scores.argsort()[::-1][:top_n]
        top_keywords = [feature_names[i] for i in top_indices]

        all_keywords[bank] = top_keywords
        logging.info(f"Extracted top keywords for {bank}: {', '.join(top_keywords[:10])}")

    return all_keywords

In [None]:
def identify_themes(df, keywords):
    """Tag each review with the themes whose keywords it mentions.

    Adds an ``identified_themes`` column to ``df`` in place and returns it.
    The value is the matching theme names joined by ``';'``,
    ``'Unclassified'`` when a review of a known bank matches nothing, and
    ``''`` when the review is missing/empty or its bank has no theme
    definitions.

    A theme matches when any of its keywords occurs at the *start of a word*
    in the lowercased review, so 'crash' matches "crashes" but 'ui' does not
    match "quick" and 'time' does not match "sometimes".

    Args:
        df: DataFrame with ``bank_name`` and ``review_text`` columns.
        keywords: Dict mapping bank name to extracted keywords. A keyword is
            added to a bank's theme when it shares a word prefix with one of
            that theme's seed words.

    Returns:
        The same DataFrame with ``identified_themes`` added.
    """
    logging.info("Identifying themes...")

    # Seed words per theme; every known bank starts from the same set.
    base_themes = {
        'UI/UX': ['interface', 'design', 'ui', 'user', 'friendly', 'navigation', 'screen', 'menu'],
        'Performance': ['slow', 'fast', 'speed', 'crash', 'bug', 'loading', 'response', 'time'],
        'Features': ['feature', 'transfer', 'payment', 'balance', 'transaction', 'service', 'option'],
        'Security': ['security', 'login', 'password', 'fingerprint', 'authentication', 'secure', 'protection'],
        'Support': ['support', 'customer', 'service', 'help', 'contact', 'assistance', 'resolve'],
    }
    known_banks = ('Dashen Bank', 'Commercial Bank of Ethiopia', 'Bank of Abyssinia')
    # Each bank gets its own copies so enrichment for one bank cannot leak
    # into another.
    theme_definitions = {
        bank: {theme: list(words) for theme, words in base_themes.items()}
        for bank in known_banks
    }

    # Enrich theme definitions with extracted keywords. Very short keywords
    # are ignored because as word prefixes they would match far too much.
    min_keyword_len = 3
    for bank, bank_keywords in keywords.items():
        if bank not in theme_definitions:
            continue
        for keyword in bank_keywords:
            keyword = str(keyword)
            if len(keyword) < min_keyword_len:
                continue
            for theme, seeds in base_themes.items():
                theme_words = theme_definitions[bank][theme]
                if keyword in theme_words:
                    continue  # avoid piling up duplicates
                if any(keyword.startswith(seed) or seed.startswith(keyword) for seed in seeds):
                    theme_words.append(keyword)

    # One compiled word-start pattern per (bank, theme)
    theme_patterns = {
        bank: {
            theme: re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')')
            for theme, words in themes.items()
        }
        for bank, themes in theme_definitions.items()
    }

    def _themes_for(bank, review):
        """Return the theme label string for one (bank, review) pair."""
        text = "" if pd.isna(review) else str(review).lower()
        if bank not in theme_patterns or not text:
            return ''
        matched = [theme for theme, pattern in theme_patterns[bank].items() if pattern.search(text)]
        return ';'.join(matched) if matched else 'Unclassified'

    # Built positionally, so a non-unique index cannot cause cross-row writes.
    df['identified_themes'] = [
        _themes_for(bank, review)
        for bank, review in zip(df['bank_name'], df['review_text'])
    ]

    # Log theme distribution
    for bank in theme_definitions:
        bank_df = df[df['bank_name'] == bank]
        logging.info(f"Theme distribution for {bank}: {bank_df['identified_themes'].value_counts().to_dict()}")

    return df

In [None]:
def save_results(df, output_path):
    """Write the analyzed DataFrame to ``output_path`` as CSV.

    The index is not written. A log entry records the destination once the
    file has been written.
    """
    df.to_csv(path_or_buf=output_path, index=False)
    logging.info(f"Analysis results saved to {output_path}")

In [None]:


def run_analysis(input_csv):
    """Run the complete sentiment and thematic analysis.

    Loads the reviews, adds sentiment labels, extracts per-bank keywords,
    tags themes, and saves the result next to the input file with
    ``_analyzed.csv`` in place of the original extension.

    Args:
        input_csv: Path to the cleaned reviews CSV.

    Returns:
        The path of the written results file, or ``None`` when the input
        could not be loaded.
    """
    import os  # local import: only needed here to derive the output path

    logging.info(f"Starting analysis on {input_csv}")

    # Load data
    df = load_data(input_csv)
    if df is None:
        return None

    # Analyze sentiment
    df = analyze_sentiment(df)

    # Extract keywords
    keywords = extract_keywords(df)

    # Identify themes
    df = identify_themes(df, keywords)

    # Swap only the final extension. A plain str.replace('.csv', ...) would
    # also rewrite ".csv" inside directory names, and would leave the path
    # unchanged (overwriting the input) for names such as "x.CSV".
    root, _ = os.path.splitext(input_csv)
    output_file = f"{root}_analyzed.csv"
    save_results(df, output_file)

    logging.info("Analysis complete")
    return output_file

if __name__ == "__main__":
    # Load the cleaned dataset and run the full pipeline.
    # The drive prefix previously read "aE:", which is not a valid drive
    # letter: the file could not be read and the run ended without writing any
    # results. The stray "a" is removed; the rest of the path is unchanged.
    input_file = r"E://KAIM//phase 2//Week 2//Customer-Experience-Analytics-for-Fintech-Apps//data//all_banks_reviews_clean_20250612_205612.csv"  # Update with your actual filename
    run_analysis(input_file)