# Sentiment Analysis of Bank Reviews

## 1. Data Loading and Initial Setup

In this section, we'll:
- Import necessary libraries
- Load the cleaned review data
- Set up the sentiment analyzer using TextBlob
- Perform initial data exploration

### Libraries Used:
- textblob: For sentiment analysis
- pandas: For data manipulation
- numpy: For numerical operations
- matplotlib: For visualization
- seaborn: For enhanced visualizations

### Data Overview:
We'll analyze reviews from three major Ethiopian banks:
1. Commercial Bank of Ethiopia (CBE)
2. Bank of Abyssinia (BOA)
3. Dashen Bank

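Each dataset contains the following fields (the code below reads the `review`, `rating`, and `bank` columns directly):
- Review text (`review`)
- Rating (1-5 stars, `rating`)
- Date
- Bank name (`bank`)
- Source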

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import logging
from datetime import datetime
import os

# Set up logging: INFO-and-above records are appended to
# 'sentiment_analysis.log' (relative to the current working directory).
logging.basicConfig(
    filename='sentiment_analysis.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so prefer the new name and fall
# back to the legacy one only on older installations.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")

# Function to load and combine all bank data
def load_bank_data(data_folder="data/cleaned", bank_files=None):
    """
    Load cleaned review data for all banks and combine into a single DataFrame.

    Files that cannot be read or parsed are logged and skipped, so one missing
    or malformed CSV does not abort the whole load.

    Args:
        data_folder: Directory that contains the cleaned CSV files.
        bank_files: Iterable of CSV file names inside ``data_folder``. When
            None, the three default bank files (CBE, BOA, Dashen) are used.

    Returns:
        pandas.DataFrame with the rows of every successfully loaded file,
        concatenated in the order given and re-indexed from 0.

    Raises:
        RuntimeError: If none of the files could be loaded.
    """
    if bank_files is None:
        bank_files = (
            "Commercial_Bank_of_Ethiopia_cleaned_data.csv",
            "Bank_of_Abyssinia_cleaned_data.csv",
            "Dashen_Bank_cleaned_data.csv",
        )

    all_data = []
    for file_name in bank_files:
        file_path = os.path.join(data_folder, file_name)
        try:
            bank_df = pd.read_csv(file_path)
        except (OSError, ValueError) as e:
            # OSError covers missing/unreadable files; ValueError covers
            # pandas' EmptyDataError/ParserError and text decoding failures.
            logging.error(f"Error loading {file_name}: {str(e)}")
            continue
        all_data.append(bank_df)
        logging.info(f"Loaded {len(bank_df)} reviews from {file_name}")

    if not all_data:
        raise RuntimeError("No data files were loaded successfully")

    # Combine all data; ignore_index gives the result a fresh 0..n-1 index
    return pd.concat(all_data, ignore_index=True)

# Read every bank's CSV into one DataFrame; on failure report it and re-raise
# so the remaining cells do not run against a missing `df`.
print("Loading review data...")
try:
    df = load_bank_data()
except Exception as e:
    print(f"Error loading data: {str(e)}")
    raise
else:
    print("Data loaded successfully!")

# --- Dataset overview: size and per-bank review counts ---
print("\nDataset Overview:")
print(f"Total number of reviews: {len(df)}")

reviews_per_bank = df['bank'].value_counts()
print("\nReviews per bank:")
print(reviews_per_bank)

# --- First rows, rendered as a rich table by the notebook ---
print("\nSample of the data:")
display(df.head())

# --- Rating counts, ordered by star value rather than by frequency ---
print("\nBasic Statistics:")
print("\nRating Distribution:")
rating_counts = df['rating'].value_counts().sort_index()
print(rating_counts)

# --- Grouped bar chart: one cluster per bank, one bar per rating ---
plt.figure(figsize=(12, 6))
rating_ax = sns.countplot(x='bank', hue='rating', data=df)
rating_ax.set_title('Review Distribution by Bank and Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Mean rating and review count per bank, rounded to 2 decimals ---
avg_ratings = (
    df.groupby('bank')['rating']
    .agg(['mean', 'count'])
    .round(2)
)
print("\nAverage Ratings by Bank:")
print(avg_ratings)

# Define sentiment analysis function
def get_sentiment(text, neutral_threshold=0.0):
    """
    Analyze sentiment of text using TextBlob.

    Args:
        text: Review text. Missing values (None / NaN) and blank strings are
            classified as 'neutral' without invoking TextBlob, instead of
            being analyzed as the literal strings "None" / "nan".
        neutral_threshold: Non-negative half-width of the neutral band around
            zero polarity. Polarity above it is 'positive', below its
            negation is 'negative'. The default 0.0 is a plain sign test.

    Returns: 'positive', 'negative', or 'neutral'. Any error raised while
    scoring is logged and reported as 'neutral' so one bad row cannot abort
    a whole ``DataFrame.apply`` run.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 'neutral'

    text = str(text)
    if not text.strip():
        return 'neutral'

    try:
        polarity = TextBlob(text).sentiment.polarity
    except Exception as e:
        logging.error(f"Error in sentiment analysis: {str(e)}")
        return 'neutral'

    if polarity > neutral_threshold:
        return 'positive'
    if polarity < -neutral_threshold:
        return 'negative'
    return 'neutral'

# Label every review with the class returned by get_sentiment
print("\nPerforming sentiment analysis...")
df['sentiment'] = df['review'].apply(get_sentiment)

# How many reviews fall into each sentiment class
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

# Bar chart of the overall class counts
plt.figure(figsize=(10, 6))
overall_ax = sns.countplot(x='sentiment', data=df)
overall_ax.set_title('Overall Sentiment Distribution')
plt.show()

# Same counts, split into one cluster of bars per bank
plt.figure(figsize=(12, 6))
per_bank_ax = sns.countplot(x='bank', hue='sentiment', data=df)
per_bank_ax.set_title('Sentiment Distribution by Bank')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Row-normalized cross-tabulation: each bank's row sums to 100 (%)
sentiment_by_bank = pd.crosstab(
    index=df['bank'],
    columns=df['sentiment'],
    normalize='index',
).mul(100)
print("\nSentiment Distribution by Bank (%):")
print(sentiment_by_bank.round(2))

ModuleNotFoundError: No module named 'matplotlib'