# Data Analysis Notebook

This notebook combines data collection (scraping), data preprocessing, and visualization of the cleaned results.

Steps:
1. **Scrape** reviews from Google Play Store
2. **Preprocess** and clean the data
3. **Visualize** the results

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Record and report the directory the kernel was started from.
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Two launch locations are handled:
#   * anywhere other than `src/`: expose a `notebooks/` subfolder (if one exists) on sys.path
#   * inside `src/`: move up to the parent directory and keep `src/` importable
if os.path.basename(current_dir) != 'src':
    scripts_path = os.path.join(current_dir, 'notebooks')
    if os.path.exists(scripts_path):
        sys.path.append(scripts_path)
else:
    project_root = os.path.dirname(current_dir)
    os.chdir(project_root)
    print(f"Changed working directory to: {project_root}")
    # `current_dir` still holds the original `src/` path captured before the chdir.
    sys.path.append(current_dir)

import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Set project root dynamically ---
# The kernel's working directory is the only location signal used here.
# Note: os.path.abspath("") resolves to that same working directory, not to
# the path of the notebook file itself.
notebook_path = os.path.abspath("")
current_dir = os.getcwd()

# When launched from inside `notebooks/`, the project root is one level up;
# otherwise the working directory itself is treated as the project root.
project_root = (
    os.path.dirname(current_dir)
    if os.path.basename(current_dir) == "notebooks"
    else current_dir
)

# Add project root to sys.path (at the front) so `src.*` modules can be imported
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Project root set to: {project_root}")
print(f"Python sys.path: {sys.path[:3]} ...")  # Show first 3 entries for sanity

# --- Step 2: Import project modules ---
try:
    from src.preprocessing import ReviewPreprocessor
    from src.scraping import main as run_scraper
except ModuleNotFoundError as e:
    missing_module = e.name or ""
    if missing_module != "src" and not missing_module.startswith("src."):
        # The module that could not be found is not part of the `src` package
        # (e.g. a dependency imported by it), so the original error already
        # names the real problem; wrapping it would point at the wrong cause.
        raise
    raise ModuleNotFoundError(
        f"Cannot import project modules. Make sure {project_root}/src exists and __init__.py is present.",
        name=e.name,
    ) from e

# --- Step 3: Now you can run preprocessing or scraping ---
# Example usage (mirrors the cells below):
# raw_df = run_scraper()
# preprocessor = ReviewPreprocessor()
# success = preprocessor.process()


## 1. Run Scraper

This cell runs the scraping script to fetch the latest reviews from the Google Play Store.
The output will show the progress.

In [None]:
# Status messages originally contained mis-decoded emoji bytes; they are
# restored to the intended characters here.
print("🚀 Starting Scraper...")

# Run the main scraper function and keep its result for inspection
raw_df = run_scraper()

print("\n✅ Scraping Finished.")
# Preview only the first rows instead of dumping the whole result
display(raw_df.head())

## 2. Run Preprocessing Pipeline

Now we clean the scraped data using our `ReviewPreprocessor`.

In [None]:
# Initialize the preprocessor
# NOTE(review): it is constructed without arguments, so it presumably locates
# the scraped data on its own rather than receiving `raw_df` — confirm.
preprocessor = ReviewPreprocessor()

# Run the processing pipeline; the return value is used as a success flag
success = preprocessor.process()

if not success:
    # Fail loudly: merely printing would leave `df` undefined on a fresh
    # kernel (NameError in the plotting cells) or, worse, leave a stale `df`
    # from an earlier run that the plots would silently use.
    raise RuntimeError("❌ Preprocessing failed.")

print("\n✅ Preprocessing finished successfully!")
df = preprocessor.df  # processed data used by the visualization cells


## 3. Visualizations

Let's explore the cleaned data.

In [None]:
# Set plot style
sns.set(style="whitegrid")

# Every chart below is drawn on an explicitly created Axes. Relying on the
# implicit "current axes" is fragile: with backends that keep a figure open
# after plt.show(), the last chart would be painted over the second panel.
fig, (ax_ratings, ax_banks) = plt.subplots(1, 2, figsize=(12, 5))

# 1. Ratings Distribution
sns.countplot(x='rating', data=df, palette='viridis', ax=ax_ratings)
ax_ratings.set_title('Distribution of Ratings')
ax_ratings.set_xlabel('Star Rating')
ax_ratings.set_ylabel('Count')

# 2. Reviews per Bank
sns.countplot(x='bank_code', data=df, palette='Set2', ax=ax_banks)
ax_banks.set_title('Number of Reviews per Bank')
ax_banks.set_xlabel('Bank')
ax_banks.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Ratings count per bank (separate figure, default size)
fig_by_bank, ax_by_bank = plt.subplots()
sns.countplot(x='bank_code', hue='rating', data=df, palette='viridis', ax=ax_by_bank)
ax_by_bank.set_title('Rating Count per Bank')
ax_by_bank.set_xlabel('Bank')
ax_by_bank.set_ylabel('Number of Reviews')
ax_by_bank.legend(title='Star Rating')
plt.show()

In [None]:
# 3. Review Length Distribution
# One histogram of `text_length`, split by `bank_code`, with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='text_length', hue='bank_code', bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Lengths by Bank')
ax.set_xlabel('Review Length (characters)')
ax.set_ylabel('Count')
plt.show()