In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

### ================================================================
### 1. DATA LOADING AND INITIAL EXPLORATION
### ================================================================

In [19]:
def _bank_name_from_path(file_path):
    """Derive a readable bank name from a reviews CSV path.

    Only the file name is used (never the directory), the extension and a
    trailing ``_reviews`` are dropped, and underscores become spaces, e.g.
    ``'../data/Dashen_Bank_reviews.csv'`` -> ``'Dashen Bank'``.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    suffix = '_reviews'
    if stem.endswith(suffix):
        stem = stem[:-len(suffix)]
    return stem.replace('_', ' ')


def _clean_reviews_frame(df, file_path):
    """Clean a single raw reviews frame and report each step on stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from ``file_path``. Must contain ``review``,
        ``rating`` and ``date``; ``source`` and ``bank`` are optional.
    file_path : str
        Path the frame came from; used for error messages and to derive the
        bank name when the frame has no ``bank`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``review, rating, date, source, bank``.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the mandatory columns is absent.
    """
    missing_required = [c for c in ('review', 'rating', 'date') if c not in df.columns]
    if missing_required:
        raise KeyError(f"{file_path} is missing required column(s): {missing_required}")

    # 1. Remove duplicates
    initial_rows = len(df)
    df = df.drop_duplicates()
    print(f"Duplicates removed: {initial_rows - len(df)}")

    # 2. Handle missing data
    # Critical column: a row without review text is useless, drop it.
    # .copy() gives us an independent frame so the column assignments below
    # never write into a view of the caller's data.
    rows_before = len(df)
    df = df.dropna(subset=['review']).copy()
    print(f"Rows removed due to missing review text: {rows_before - len(df)}")

    # Ratings: coerce to numeric first so the median is computable even when
    # the column was read as strings. Only values that were genuinely missing
    # are filled with the median; unparseable values stay NaN and are removed
    # by the range filter further down.
    rating_missing = df['rating'].isna()
    numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
    if rating_missing.any():
        median_rating = numeric_rating.median()
        numeric_rating = numeric_rating.mask(rating_missing, median_rating)
        print(f"Missing ratings filled with median: {median_rating}")
    df['rating'] = numeric_rating

    # Dates: fill gaps with a fixed placeholder. Plain assignment is used
    # instead of `df[col].fillna(..., inplace=True)`, which is chained
    # assignment and stops updating the frame under pandas Copy-on-Write.
    if df['date'].isna().any():
        df['date'] = df['date'].fillna('2025-01-01')
        print("Missing dates filled with placeholder date")

    # Source: optional column. Create it when absent so the final column
    # selection cannot fail, otherwise fill the gaps.
    if 'source' not in df.columns:
        df['source'] = 'Unknown'
        print("Source column missing; filled with 'Unknown'")
    elif df['source'].isna().any():
        df['source'] = df['source'].fillna('Unknown')
        print("Missing sources filled with 'Unknown'")

    # 3. Standardize date format to YYYY-MM-DD (best effort: on a parsing
    # failure the column is left as it was and the error is reported).
    try:
        df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
        print("Date format standardized to YYYY-MM-DD")
    except Exception as e:
        print(f"Date formatting error: {e}")

    # 4. Clean text: trim whitespace, then drop reviews that became empty.
    df['review'] = df['review'].astype(str).str.strip()
    df = df[df['review'] != ''].copy()

    # 5. Keep only ratings inside the valid 1-5 range (NaN compares False).
    df = df[df['rating'].between(1, 5)]

    # 6. Add the bank column when the file does not provide one.
    if 'bank' not in df.columns:
        df = df.assign(bank=_bank_name_from_path(file_path))

    # Fixed column order (bank is kept for reference).
    return df[['review', 'rating', 'date', 'source', 'bank']]


def clean_bank_reviews(file_paths=None,
                       combined_output_path='../data/.combined_clean_bank_reviews.csv'):
    """Clean every bank review CSV, save the results and return them combined.

    For each existing input file the raw data is cleaned (duplicates removed,
    missing values handled, dates formatted as YYYY-MM-DD, review text
    trimmed, ratings restricted to 1-5) and written next to the input as
    ``cleaned_<original file name>``. All cleaned frames are then concatenated
    and written to ``combined_output_path``. Progress is printed to stdout.

    Parameters
    ----------
    file_paths : list of str, optional
        CSV files to process. Defaults to the three bank review files under
        ``../data``. Paths that do not exist are reported and skipped.
    combined_output_path : str, optional
        Destination of the combined CSV.
        NOTE(review): the default file name starts with a dot (hidden file on
        Unix-like systems); it is kept unchanged so existing readers of that
        file keep working - confirm whether the dot is intended.

    Returns
    -------
    pandas.DataFrame or None
        The combined cleaned data, or ``None`` when no input file was found.
    """
    if file_paths is None:
        file_paths = [
            '../data/Bank_of_Abyssinia_reviews.csv',
            '../data/Commercial_Bank_of_Ethiopia_reviews.csv',
            '../data/Dashen_Bank_reviews.csv',
        ]

    # List to store cleaned dataframes
    cleaned_dfs = []

    print("Starting data cleaning process...")
    print("=" * 50)

    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        print(f"\nProcessing: {file_path}")

        # Read the CSV file and show basic info about it
        raw_df = pd.read_csv(file_path)
        print(f"Original shape: {raw_df.shape}")
        print(f"Columns: {list(raw_df.columns)}")
        print(f"Missing values:\n{raw_df.isnull().sum()}")

        df = _clean_reviews_frame(raw_df, file_path)

        print(f"Final shape after cleaning: {df.shape}")
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")
        print(f"Rating distribution:\n{df['rating'].value_counts().sort_index()}")

        # Save the cleaned dataset next to its source file
        file_dir = os.path.dirname(file_path)    # ../data
        file_name = os.path.basename(file_path)  # Bank_of_Abyssinia_reviews.csv
        output_filename = os.path.join(file_dir, f"cleaned_{file_name}")

        # A bare file name has an empty dirname, which os.makedirs rejects.
        if file_dir:
            os.makedirs(file_dir, exist_ok=True)

        df.to_csv(output_filename, index=False)
        print(f"Cleaned data saved to: {output_filename}")
        cleaned_dfs.append(df)

    if not cleaned_dfs:
        print("No datasets were successfully processed.")
        return None

    # Combine all cleaned datasets
    combined_df = pd.concat(cleaned_dfs, ignore_index=True)

    print("\n" + "=" * 50)
    print("COMBINED DATASET SUMMARY")
    print("=" * 50)
    print(f"Total reviews: {len(combined_df)}")
    print(f"Banks included: {combined_df['bank'].unique()}")
    print(f"Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")
    print(f"Rating distribution:\n{combined_df['rating'].value_counts().sort_index()}")
    print(f"Source distribution:\n{combined_df['source'].value_counts()}")

    combined_dir = os.path.dirname(combined_output_path)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)
    combined_df.to_csv(combined_output_path, index=False)
    # Report the path that was actually written.
    print(f"\nCombined clean dataset saved to: {combined_output_path}")

    # Display sample of cleaned data
    print("\nSample of cleaned data:")
    print(combined_df.head())

    return combined_df

# Data quality check function
def data_quality_report(df):
    """
    Print a data quality overview for a reviews DataFrame.

    The report covers shape, deep memory usage, dtypes, missing values,
    duplicate rows, per-column distinct counts, review length statistics
    (reads the 'review' column) and rating statistics (reads 'rating').

    Returns the same DataFrame object that was passed in.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("DATA QUALITY REPORT")
    print(rule)

    # Overall size
    print(f"Dataset shape: {df.shape}")
    memory_kb = df.memory_usage(deep=True).sum() / 1024
    print(f"Memory usage: {memory_kb:.2f} KB")

    # Schema and completeness
    print("\nColumn data types:")
    print(df.dtypes)

    print("\nMissing values:")
    print(df.isnull().sum())

    duplicate_count = df.duplicated().sum()
    print(f"\nDuplicate rows: {duplicate_count}")

    # Cardinality of every column
    print("\nUnique values per column:")
    for column_name in df.columns:
        distinct = df[column_name].nunique()
        print(f"  {column_name}: {distinct}")

    # Character-length profile of the review text
    print("\nReview text statistics:")
    lengths = df['review'].str.len()
    print(f"  Average length: {lengths.mean():.1f} characters")
    print(f"  Min length: {lengths.min()}")
    print(f"  Max length: {lengths.max()}")
    short_reviews = (lengths < 10).sum()
    print(f"  Reviews with length < 10: {short_reviews}")

    # Summary statistics of the ratings
    print("\nRating statistics:")
    print(df['rating'].describe())

    return df

if __name__ == "__main__":
    # Run the cleaning pipeline; it returns None when no input file
    # could be processed.
    combined_data = clean_bank_reviews()

    if combined_data is None:
        print("\n❌ Data cleaning failed. Please check file paths and data.")
    else:
        # Cleaning produced data: print the quality report for it.
        data_quality_report(combined_data)

        print("\n✅ Data cleaning completed successfully!")
        print("Ready for sentiment and thematic analysis.")

Starting data cleaning process...

Processing: ../data/Bank_of_Abyssinia_reviews.csv
Original shape: (889, 5)
Columns: ['review', 'rating', 'date', 'bank', 'source']
Missing values:
review    0
rating    0
date      0
bank      0
source    0
dtype: int64
Duplicates removed: 0
Rows removed due to missing review text: 0
Date format standardized to YYYY-MM-DD
Final shape after cleaning: (889, 5)
Date range: 2024-01-10 to 2025-06-05
Rating distribution:
rating
1    412
2     51
3     56
4     38
5    332
Name: count, dtype: int64
Cleaned data saved to: ../data\cleaned_Bank_of_Abyssinia_reviews.csv

Processing: ../data/Commercial_Bank_of_Ethiopia_reviews.csv
Original shape: (2180, 5)
Columns: ['review', 'rating', 'date', 'bank', 'source']
Missing values:
review    0
rating    0
date      0
bank      0
source    0
dtype: int64
Duplicates removed: 0
Rows removed due to missing review text: 0
Date format standardized to YYYY-MM-DD
Final shape after cleaning: (2180, 5)
Date range: 2024-03-06 to