# Task 1: Data Collection and Preprocessing

In [None]:
# preprocessing_eda.ipynb

# ----------------------------------------
# Data Analysis Notebook
# ----------------------------------------
# This notebook integrates:
# 1. Scraping reviews from Google Play Store
# 2. Preprocessing and cleaning the data
# 3. Visualizing the results

import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --------------------------
# Resolve the project root and make it importable
# --------------------------
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# When the kernel was started inside a folder literally named "notebooks",
# the project root is its parent; from anywhere else the working directory
# itself is treated as the root.
parent_dir, leaf_name = os.path.split(current_dir)
project_root = parent_dir if leaf_name == "notebooks" else current_dir

# Put the root at the front of the import path, but only once, so re-running
# this cell does not keep adding duplicate entries.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

print(f"Project root set to: {project_root}")

# --------------------------
# Import project modules
# --------------------------
# Both names are used by later cells: `run_scraper` (Step 1) and
# `ReviewPreprocessor` (Step 2).
try:
    from src.scraping import main as run_scraper
    from src.preprocessing import ReviewPreprocessor
except ModuleNotFoundError as e:
    # `e.name` is the module that could not be located. Only blame the project
    # layout when that module is `src` itself or one of its submodules (or when
    # the name is unknown). If a dependency imported *inside* src.* is missing,
    # the original error already names it, and rewriting the message to
    # "make sure src exists" would send the reader to the wrong place.
    missing_module = e.name
    layout_problem = (
        missing_module is None
        or missing_module == "src"
        or missing_module.startswith("src.")
    )
    if not layout_problem:
        raise
    raise ModuleNotFoundError(
        f"Cannot import project modules. Make sure {project_root}/src exists and has __init__.py",
        name=missing_module,
    ) from e


## Step 1: Scraping the data

In [None]:

# --------------------------
# Step 1: Run Scraper
# --------------------------
# The status strings previously contained mis-decoded emoji bytes (mojibake);
# they are restored to the intended characters here.
print("🚀 Starting Google Play Scraper...")

# Run scraper to fetch reviews; the result is kept under `raw_df` untouched.
raw_df = run_scraper()

# Same validity check the preprocessing step applies to its own result:
# without it, a scraper that returns nothing would fail on `.head()` with an
# AttributeError that says nothing about the real problem.
# NOTE(review): assumes run_scraper() returns a DataFrame or None — confirm
# against src/scraping.
if raw_df is not None and not raw_df.empty:
    print("\n✅ Scraping Finished.")
    display(raw_df.head())
else:
    print("\n❌ Scraping returned no data.")


## Step 2: Preprocessing the data

In [None]:
# --------------------------
# Step 2: Run Preprocessing
# --------------------------
# The status strings previously contained mis-decoded emoji bytes (mojibake);
# they are restored to the intended characters here.
print("\n🧹 Starting Preprocessing...")

# Per-bank cap used in Step 2b. Defined before the branch so the name exists
# whichever path below is taken.
target_per_bank = 400

preprocessor = ReviewPreprocessor()
# Expected to hand back the processed DataFrame; a None or empty result is
# treated as failure by the check below.
df = preprocessor.process()

# Check if preprocessing returned a valid DataFrame
if df is not None and not df.empty:
    print("\n✅ Preprocessing finished successfully!")
    display(df.head())

    # --------------------------
    # Step 2b: Limit to 400 reviews per bank
    # --------------------------
    # groupby(...).head(n) keeps the first n rows of each bank in their
    # existing order; banks with fewer rows are kept in full.
    df = df.groupby("bank").head(target_per_bank).reset_index(drop=True)
    print(f"\nDataset limited to {target_per_bank} reviews per bank (if available).")
    print(f"Final dataset size after limiting: {len(df)}")
    display(df.head())
else:
    print("❌ Preprocessing failed or resulted in empty dataset.")
    # Fall back to an empty frame so later cells can test `df.empty` safely.
    df = pd.DataFrame()


## Step 3: Visualizing the preprocessed data

In [None]:

# --------------------------
# Step 3: Visualizations
# --------------------------
# Guard first: with an empty frame there is nothing to draw.
if df.empty:
    print("No data available for visualization.")
else:
    sns.set(style="whitegrid")

    # Figure 1 — two panels side by side: rating distribution (left) and
    # review volume per bank (right).
    # NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is given
    # without `hue`; the calls are kept as-is so output is unchanged — confirm
    # the pinned seaborn version before migrating.
    overview_fig, (rating_ax, bank_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.countplot(x='rating', data=df, palette='viridis', ax=rating_ax)
    rating_ax.set(title='Distribution of Ratings', xlabel='Star Rating', ylabel='Count')

    sns.countplot(x='bank', data=df, palette='Set2', ax=bank_ax)
    bank_ax.set(title='Number of Reviews per Bank', xlabel='Bank', ylabel='Count')

    overview_fig.tight_layout()
    plt.show()

    # Figure 2 — rating counts broken down by bank (one bar per star value).
    breakdown_fig, breakdown_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x='bank', hue='rating', data=df, palette='viridis', ax=breakdown_ax)
    breakdown_ax.set(title='Rating Count per Bank', xlabel='Bank', ylabel='Number of Reviews')
    breakdown_ax.legend(title='Star Rating')
    plt.show()

    # Figure 3 — review length histogram per bank. The length column is
    # derived from `clean_text` only when the frame does not already carry it;
    # it is written back onto `df`, so it remains available afterwards.
    has_length_column = 'text_length' in df.columns
    if not has_length_column:
        df['text_length'] = df['clean_text'].str.len()

    length_fig, length_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df, x='text_length', bins=50, kde=True, hue='bank', ax=length_ax)
    length_ax.set(
        title='Distribution of Review Lengths by Bank',
        xlabel='Review Length (characters)',
        ylabel='Count',
    )
    plt.show()
