## Monzo User Sentiment & Feature Insights Dashboard

In [None]:
"""
============================================================
Notebook: 01_data_exploration.ipynb
Author: James O. Adeshina
Version: 1.0  |  October 2025
============================================================

Purpose:
--------
This notebook performs the initial data ingestion and exploratory
data analysis (EDA) for Monzo app reviews exported from AppFollow.

It aims to:
    • Validate the structure and quality of the raw review datasets
    • Unify Apple App Store and Google Play Store schemas
    • Identify cleaning requirements (duplicates, nulls, non-English)
    • Explore rating and review distributions
    • Provide summary insights to guide the next stages:
        - Data cleaning & normalization
        - Thematic tagging
        - Sentiment analysis
============================================================
"""

In [None]:
# ------------------------------------------------------------
# 1. Import Libraries
# ------------------------------------------------------------
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from langdetect import detect, DetectorFactory
from wordcloud import WordCloud

In [None]:
# Display settings: show up to 50 columns per DataFrame preview and apply
# one seaborn look to every chart in the notebook.
pd.options.display.max_columns = 50
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

# Fixed seed so language detection is reproducible between runs.
DetectorFactory.seed = 42


In [None]:
# ------------------------------------------------------------
# 2. Define File Paths
# ------------------------------------------------------------
# Single App Store export (2015-2025 according to the file name).
APPSTORE_PATH = "../data/raw/appstore/monzo_appstore_2015_2025.csv"
# Google Play reviews arrive as three separate exports, one per period
# (periods taken from the file names). Paths are relative to the notebook.
GOOGLEPLAY_PATHS = [
    "../data/raw/googleplay/monzo_googleplay_2015_2019.csv",
    "../data/raw/googleplay/monzo_googleplay_2019_2021.csv",
    "../data/raw/googleplay/monzo_googleplay_2022_2025.csv"
]

In [None]:
def read_appfollow_csv(path):
    """
    Load one AppFollow CSV export into a DataFrame.

    The first two lines of the file (the 'sep=' hint and the 'From:' line)
    are skipped, so the third line becomes the header row. Lines the parser
    cannot handle are skipped. A one-line shape summary is printed before
    the frame is returned.
    """
    read_options = {
        "sep": ",",
        "skiprows": 2,
        "engine": "python",
        "encoding": "utf-8",
        "on_bad_lines": "skip",
    }
    frame = pd.read_csv(path, **read_options)
    n_rows, n_cols = frame.shape
    print(f"{os.path.basename(path)} → {n_rows} rows × {n_cols} cols")
    return frame

In [None]:
# Load each raw export once for a first look. The paths come from
# APPSTORE_PATH / GOOGLEPLAY_PATHS so this cell cannot drift from the
# constants; GOOGLEPLAY_PATHS is ordered 2015-19, 2019-21, 2022-25.
appstore_df = read_appfollow_csv(APPSTORE_PATH)
gplay_15_19, gplay_19_21, gplay_22_25 = [
    read_appfollow_csv(gplay_path) for gplay_path in GOOGLEPLAY_PATHS
]


In [None]:
# ------------------------------------------------------------
# 3. Data Ingestion (Staged — Inspect Before Merge)
# ------------------------------------------------------------
"""
Purpose:
---------
This stage loads the raw Monzo App Store and Google Play Store
review datasets individually for inspection, without merging them yet.

Key Features:
-------------
- Skips the first two metadata lines in AppFollow exports ('sep=' and 'From:').
- Ensures UTF-8 encoding to preserve emojis and multilingual text.
- Prints shape, column names, and sample rows for validation.
- Returns a dictionary of DataFrames keyed by dataset name.
"""

import os
import pandas as pd


def read_appfollow_csv(path):
    """
    Read one AppFollow export, reporting the outcome instead of raising.

    - The first two lines ('sep=' and 'From:') are skipped; line 3 is the header.
    - The file is decoded as UTF-8.
    - Lines the parser rejects are skipped rather than aborting the read.

    Returns the DataFrame on success. On any error the message is printed
    and None is returned.
    """
    try:
        frame = pd.read_csv(
            path,
            sep=",",
            skiprows=2,
            engine="python",
            encoding="utf-8",
            on_bad_lines="skip",
        )
    except Exception as err:
        print(f"❌ Error reading {os.path.basename(path)}: {err}")
        return None
    else:
        n_rows, n_cols = frame.shape
        print(f"✅ {os.path.basename(path)} → {n_rows:,} rows × {n_cols} cols")
        return frame


def load_individual_datasets(appstore_path, google_paths):
    """
    Load App Store and Google Play review datasets individually for inspection.

    Each file is read with ``read_appfollow_csv``. A file whose read yields
    ``None`` is reported and left out of the result, so every value in the
    returned dictionary is a usable DataFrame.

    Parameters
    ----------
    appstore_path : str
        Path to the App Store CSV export.
    google_paths : list
        List of paths to the Google Play CSV exports.

    Returns
    -------
    dict
        DataFrames keyed by dataset name: "appstore" for the App Store file,
        and the file name without ".csv" for each Google Play file.
    """
    datasets = {}
    failed = []

    # --- Load App Store ---
    print("\n📂 Loading App Store dataset...")
    # Use the argument, not the module-level constant, so callers control the input.
    appstore_df = read_appfollow_csv(appstore_path)
    if appstore_df is None:
        failed.append("appstore")
    else:
        datasets["appstore"] = appstore_df
        print(f"   → Columns: {list(appstore_df.columns)}\n")

    # --- Load Google Play datasets ---
    print("📂 Loading Google Play datasets...")
    for path in google_paths:
        name = os.path.basename(path).replace(".csv", "")
        df_part = read_appfollow_csv(path)
        if df_part is None:
            failed.append(name)
            continue
        datasets[name] = df_part
        print(f"   → {name} Columns: {list(df_part.columns)}\n")

    # Only claim success when every file actually loaded.
    if failed:
        print(f"\n⚠️ Loaded {len(datasets)} dataset(s); could not load: {failed}\n")
    else:
        print("\n✅ All datasets successfully loaded and ready for schema review.\n")
    return datasets




In [None]:
# ------------------------------------------------------------
# Execute Data Loading
# ------------------------------------------------------------
# `datasets` is a dict of DataFrames keyed by dataset name; the schema-review
# and harmonisation cells below iterate over it.

datasets = load_individual_datasets(APPSTORE_PATH, GOOGLEPLAY_PATHS)


In [None]:
# ------------------------------------------------------------
# 4. Schema & Structural Review
# ------------------------------------------------------------
"""
Purpose:
---------
Before merging the datasets, this stage performs a detailed structural
audit of the individual App Store and Google Play Store review datasets.

Objectives:
------------
1. Compare and validate column schemas across all datasets.
2. Inspect column data types and ensure consistency for merging.
3. Identify missing values and structural anomalies.
4. Display representative samples to confirm review text quality.

Outcome:
---------
This ensures all datasets are harmonised, complete, and merge-ready.
"""


In [None]:
# ------------------------------------------------------------
# 4.1 Column Comparison Across Datasets
# ------------------------------------------------------------
print("🧾 Comparing column structures across datasets...\n")

# Alphabetically sorted column names for every dataset
schema_dict = {label: sorted(frame.columns.tolist()) for label, frame in datasets.items()}

# Width of the widest schema
max_len = max(map(len, schema_dict.values()))

# Right-pad the narrower schemas with empty strings so all rows align
for name, cols in schema_dict.items():
    cols.extend([""] * (max_len - len(cols)))

# One row per dataset, one positional column per (sorted) field name
schema_comparison = pd.DataFrame(schema_dict).T
schema_comparison.columns = [f"Col_{position}" for position in range(1, max_len + 1)]

display(schema_comparison.head(10))



In [None]:

# Identify common and unique columns
column_sets = {name: set(df.columns) for name, df in datasets.items()}
all_columns = set().union(*column_sets.values())
common_columns = set.intersection(*column_sets.values())
# Sorted so the report is identical on every run (set iteration order is not stable).
unique_columns = {name: sorted(cols - common_columns) for name, cols in column_sets.items()}

print(f"\n✅ Total unique columns across all datasets: {len(all_columns)}")
print(f"✅ Columns common to all datasets ({len(common_columns)}):")
print(sorted(common_columns))

print("\n⚙️ Platform-specific (unique) columns:")
for name, cols in unique_columns.items():
    if cols:
        print(f"   - {name}: {cols}")

In [None]:

# ------------------------------------------------------------
# 4.2 Data Type Inspection
# ------------------------------------------------------------
print("\n📊 Data Type Summary:\n")
for name, df in datasets.items():
    # Header line with the dataset size, then how many columns use each dtype
    n_rows, n_cols = df.shape
    print(f"🧩 {name.upper()} ({n_rows:,} rows × {n_cols} cols)")
    dtype_counts = df.dtypes.value_counts()
    print(dtype_counts)
    print("-" * 60)


In [None]:

# ------------------------------------------------------------
# 4.3 Missing Value Overview
# ------------------------------------------------------------
print("\n🔍 Missing Value Overview (Top 10 columns with missing counts per dataset):\n")
for name, df in datasets.items():
    # Null count per column, keeping the ten columns with the most gaps
    null_counts = df.isna().sum()
    top_ten = null_counts.sort_values(ascending=False).head(10)

    # Turn the Series into a two-column table with readable headers
    missing_summary = top_ten.reset_index()
    missing_summary = missing_summary.rename(columns={'index': 'Column', 0: 'Missing Count'})

    print(f"\n📂 {name.upper()} Dataset Missing Values:")
    display(missing_summary)
    print("-" * 60)


In [None]:

# ------------------------------------------------------------
# 4.4 Sample Review Inspection
# ------------------------------------------------------------
print("\n💬 Sampling reviews for content validation...\n")
for name, df in datasets.items():
    print(f"🗂️ {name.upper()} — Sample Reviews")
    # Cap the sample at the dataset size: sample() raises when asked for
    # more rows than exist (without replacement).
    n_samples = min(3, len(df))
    display(df[['Submission date', 'Rating', 'Review']].sample(n_samples, random_state=42))
    print("-" * 60)


In [None]:
# ------------------------------------------------------------
# 4.5 Summary Notes
# ------------------------------------------------------------
"""
Summary:
---------
- Confirmed which columns are shared across all datasets.
- Identified platform-specific fields to be harmonised or dropped.
- Inspected data types to ensure compatibility for merging.
- Validated data quality and presence of meaningful user reviews.

Next Step:
-----------
Proceed to Section 5 — Schema Harmonisation & Merging,
where columns will be standardised and all reviews unified into
a single master dataset: 'Monzo_Reviews_Master.csv'.
"""


In [None]:
# ------------------------------------------------------------
# 5. Schema Harmonisation & Merge Pipeline
# ------------------------------------------------------------
"""
Purpose:
---------
To harmonise column names, ensure consistent structure across all
review datasets, add platform identifiers, and merge them into one
master dataset for downstream analysis (sentiment, themes, Power BI).

Key Features:
--------------
- Aligns App Store and Google Play columns via a mapping dictionary.
- Drops redundant or empty columns (e.g., Notes, Tags, Semantic fields).
- Adds a 'platform' column for source tracking (iOS / Android).
- Converts key columns to proper datatypes (date, numeric).
- Exports the unified dataset as 'Monzo_Reviews_Master.csv'.
"""

In [None]:

# ------------------------------------------------------------
# 5.1 Column Mapping (Standardisation Dictionary)
# ------------------------------------------------------------
# Maps raw AppFollow export headers (keys) to the snake_case names used in
# the harmonised dataset (values). Headers not listed here keep their name.
column_mapping = {
    "Submission date": "review_date",
    "AppID": "app_id",
    "AppName": "app_name",
    "Country": "country",
    "Review Language": "review_language",
    "Version": "app_version",
    "Author": "author_name",
    "Rating": "rating",
    "Title": "review_title",
    "Review": "review_text",
    "Reply Date": "developer_reply_date",
    "Reply Delta": "developer_reply_delta",
    "Developer Reply": "developer_reply_text",
    "Translated title": "translated_title",
    "Translated review": "translated_review",
    "Link": "review_link",
    "Permalink": "review_permalink",
    "Updated": "updated_at",
    # Metadata or optional context
    "Device Name": "device_name",
    "VersionCode": "version_code",
    "OS": "os_version"
}


# Columns to drop entirely (empty or redundant in all datasets)
# These are raw header names, i.e. names as they appear before/after the
# rename above (none of them is a key of column_mapping).
drop_columns = [
    "Tags", "User", "Notes", "Semantic Tags",
    "Semantic Categories", "Semantic Sentiment",
    "Categories", "Likes", "Dislikes", "AF Link"
]


In [None]:

# ------------------------------------------------------------
# 5.2 Harmonisation Function
# ------------------------------------------------------------
def harmonise_dataset(df, platform_name, *, mapping=None, drop=None):
    """
    Standardise schema for App Store / Google Play datasets.

    Steps:
    - Rename columns using the mapping dictionary.
    - Drop redundant / empty columns (names that are absent are ignored).
    - Add 'platform' column (iOS / Android).
    - Convert 'review_date' to datetime and 'rating' to numeric; values that
      cannot be parsed become NaT / NaN.
    - Keep only the core columns that are present, in a fixed order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is not modified.
    platform_name : str
        Value written to the 'platform' column of every row.
    mapping : dict, optional
        Raw-header -> standard-name mapping. Defaults to the module-level
        ``column_mapping``.
    drop : list, optional
        Column names to remove. Defaults to the module-level ``drop_columns``.

    Returns
    -------
    pandas.DataFrame
        A new, independent DataFrame restricted to the core columns.
    """
    # Fall back to the notebook-wide tables only when no override is given.
    if mapping is None:
        mapping = column_mapping
    if drop is None:
        drop = drop_columns

    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    out = df.rename(columns=mapping)
    out = out.drop(columns=list(drop), errors="ignore")
    out["platform"] = platform_name

    # Clean data types
    if "review_date" in out.columns:
        out["review_date"] = pd.to_datetime(out["review_date"], errors="coerce")
    if "rating" in out.columns:
        out["rating"] = pd.to_numeric(out["rating"], errors="coerce")

    # Keep only core columns, in this order
    core_cols = [
        "review_date", "rating", "review_title", "review_text", "author_name",
        "app_version", "country", "review_language", "developer_reply_text",
        "developer_reply_date", "platform"
    ]
    present = [col for col in core_cols if col in out.columns]
    # copy() so later column assignments on the result never touch a slice.
    return out[present].copy()


In [None]:
# ------------------------------------------------------------
# 5.3 Apply Harmonisation to Each Dataset
# ------------------------------------------------------------
harmonised = []
for name, df in datasets.items():
    # The dataset key tells us the store: "appstore" -> iOS, anything else -> Android
    if "appstore" in name.lower():
        platform = "iOS"
    else:
        platform = "Android"

    cleaned_df = harmonise_dataset(df.copy(), platform)
    n_rows, n_cols = cleaned_df.shape
    print(f"✅ Harmonised {name} ({platform}) → {n_rows:,} rows, {n_cols} cols")
    harmonised.append(cleaned_df)


In [None]:

# ------------------------------------------------------------
# 5.4 Merge All Reviews into One Master Dataset
# ------------------------------------------------------------
# Stack the harmonised frames into one table with a fresh 0..n-1 index
monzo_reviews_master = pd.concat(harmonised, ignore_index=True)

combined_shape = monzo_reviews_master.shape
platforms_seen = monzo_reviews_master["platform"].unique().tolist()
print(f"\n🔗 Combined dataset shape: {combined_shape}")
print(f"   Unique platforms: {platforms_seen}")


In [None]:
# ------------------------------------------------------------
# 5.5 Export Cleaned & Harmonised Dataset
# ------------------------------------------------------------
# Destination of the merged dataset; the folder is created if it is missing.
output_dir = "../data/processed"
output_path = os.path.join(output_dir, "Monzo_Reviews_Master.csv")
os.makedirs(output_dir, exist_ok=True)

# Written without the index column; utf-8-sig prefixes the file with a BOM.
monzo_reviews_master.to_csv(output_path, encoding="utf-8-sig", index=False)
print(f"\n💾 Exported unified dataset to: {output_path}")

In [None]:
# ------------------------------------------------------------
# 5.6 Quick Sanity Checks
# ------------------------------------------------------------
print("\n📊 Sanity Checks:")

# Earliest and latest parsed review dates
review_dates = monzo_reviews_master["review_date"]
print("Date Range:", review_dates.min(), "→", review_dates.max())

# Mean star rating, rounded to two decimals
mean_rating = monzo_reviews_master["rating"].mean()
print("Average Rating:", round(mean_rating, 2))

print("Sample rows:")
display(monzo_reviews_master.sample(5, random_state=42))


In [None]:
"""
Outcome:
---------
✅ All Monzo App Store and Google Play datasets successfully harmonised.
✅ Final dataset ready for exploratory analysis and sentiment modelling.
✅ Exported to '../data/processed/Monzo_Reviews_Master.csv'.
"""

In [None]:
"""
Before Jumping to Section 6 (Sentiment)

The next step is data-readiness validation: handle missing values, focusing on review_text, rating and review_date, and optionally on review_language and platform.
"""

In [None]:
# Count NaNs in the columns the later stages depend on
print("Checking for missing values (NaNs)")
key_columns = ['review_text', 'rating', 'review_date', 'review_language']
monzo_reviews_master[key_columns].isna().sum()


In [None]:
# Apply light cleaning
# Drop rows that lack review text or a rating (missing ratings are dropped,
# not replaced). The explicit copy makes the result an independent frame so
# the column assignment below cannot hit a chained-assignment warning.
monzo_reviews_master = monzo_reviews_master.dropna(
    subset=['review_text', 'rating']
).copy()

# Normalise language codes to lower case, labelling unknown languages 'en'.
# NOTE(review): after this fill no NaN language labels remain, so later
# `.isna()` checks on review_language will never match — confirm that
# treating unlabelled reviews as English is intended.
monzo_reviews_master['review_language'] = (
    monzo_reviews_master['review_language']
    .fillna('en')
    .str.lower()
)


In [None]:
# Identify all non-English reviews
non_english_reviews = monzo_reviews_master[monzo_reviews_master['review_language'] != 'en']

# Basic info
print(f"🌍 Non-English reviews: {len(non_english_reviews):,}")
print(f"Languages present: {non_english_reviews['review_language'].unique()}")

# View a few examples. The sample is capped at the number of available rows
# because sample() raises when asked for more rows than exist.
n_examples = min(15, len(non_english_reviews))
display(non_english_reviews[['review_language', 'review_text']].sample(n_examples, random_state=42))


In [None]:
# Snapshot of the dataset after light cleaning
remaining_rows = len(monzo_reviews_master)
print("Remaining rows:", remaining_rows)

language_codes = monzo_reviews_master['review_language'].unique()
print("Unique languages:", language_codes)

null_counts = monzo_reviews_master.isna().sum()
print("Nulls per column:\n", null_counts)


### Reclassify & Annotate NaN or Misdetected Languages

In [None]:
import langdetect
from langdetect import DetectorFactory
DetectorFactory.seed = 42  # ensures reproducibility

def detect_language_safe(text):
    """
    Return the langdetect language code for `text`, or None.

    None is returned when `text` is not a string, when it has five or fewer
    characters after stripping whitespace, or when detection raises.
    """
    if not isinstance(text, str):
        return None
    if len(text.strip()) <= 5:
        return None
    try:
        return langdetect.detect(text)
    except Exception:
        return None


# Step 1: collect every review whose language label is missing or not 'en'
lang_labels = monzo_reviews_master['review_language']
needs_recheck = lang_labels.isna() | lang_labels.ne('en')
suspect_reviews = monzo_reviews_master.loc[needs_recheck].copy()

print(f"🔍 Reviewing {len(suspect_reviews):,} suspect language rows...")

# Step 2: run language detection on the text of those reviews
suspect_reviews['detected_lang'] = suspect_reviews['review_text'].apply(detect_language_safe)

# Step 3: keep the rows that detection labels as English
detected_english = suspect_reviews['detected_lang'].eq('en')
english_like = suspect_reviews.loc[detected_english]

print(f"✅ Reclassified {len(english_like):,} as English from the suspect set.")

# Step 4: write the corrected label back to the master dataset
monzo_reviews_master.loc[english_like.index, 'review_language'] = 'en'

# Step 5: show the ten most frequent language labels after the update
language_counts = monzo_reviews_master['review_language'].value_counts()
print("🔠 Updated language distribution:\n", language_counts.head(10))

# Step 6: export the reclassified rows for audit. `english_like` was taken
# from the copy made in step 1, so its review_language column still holds
# the label from before step 4.
audit_columns = ['review_text', 'review_language', 'detected_lang']
english_like[audit_columns].to_csv(
    "../data/processed/Monzo_Reclassified_English.csv",
    index=False, encoding="utf-8-sig"
)
print("💾 Saved reclassified English reviews for audit.")


In [None]:
# suspect_reviews = monzo_reviews_master[monzo_reviews_master['review_language'].isna()].copy()


In [None]:
# Rows whose language label is still something other than 'en'
still_not_english = monzo_reviews_master["review_language"] != "en"
non_en_remaining = monzo_reviews_master[still_not_english]

# Distinct non-null labels, in order of first appearance
remaining_langs = non_en_remaining["review_language"].dropna().unique()

print(f"🌍 Remaining non-English reviews: {len(non_en_remaining):,}")
print(f"Languages still present: {sorted(remaining_langs.tolist())}")

# Print up to three reviews per language, each cut to 200 characters
for lang in remaining_langs:
    print(f"\n🗣️ Language: {lang}")

    in_this_lang = non_en_remaining["review_language"] == lang
    lang_reviews = non_en_remaining.loc[in_this_lang, "review_text"]

    sample_size = min(3, len(lang_reviews))
    sample = lang_reviews.sample(sample_size, random_state=42)

    for text in sample:
        print(f"  - {text[:200]}")



In [None]:
import re
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Short list of app/banking keywords used as a cheap English signal.
# Compiled once instead of on every call.
_ENGLISH_HINT_RE = re.compile(
    r"\b(?:bank|money|monzo|love|good|great|best|app|amazing|easy|help|support|thank|brilliant|nice|card|account|customer|issue|login|transfer|recommend)\b"
)
_LATIN_WORD_RE = re.compile(r"[a-zA-Z]+")


def is_mostly_english(text, *, ratio_threshold=0.5):
    """
    Decide whether a review is likely written in English.

    Two signals are combined with OR:
    1. the share of alphabetic words that match a fixed keyword list is
       strictly greater than ``ratio_threshold``;
    2. langdetect labels the text as "en".

    langdetect is consulted only when the keyword ratio alone does not
    decide, which gives the same answer as always calling it.

    Parameters
    ----------
    text : object
        Review text. Non-strings and strings shorter than 3 characters
        return False.
    ratio_threshold : float, optional
        Keyword share that must be exceeded for signal 1 (default 0.5).

    Returns
    -------
    bool
        True if either signal indicates English, otherwise False.
    """
    if not isinstance(text, str) or len(text) < 3:
        return False

    # Keywords are matched case-insensitively; the denominator counts every
    # run of ASCII letters in the original text.
    keyword_hits = len(_ENGLISH_HINT_RE.findall(text.lower()))
    total_words = len(_LATIN_WORD_RE.findall(text))
    ratio = keyword_hits / total_words if total_words else 0

    if ratio > ratio_threshold:
        return True

    # Keyword evidence is inconclusive: fall back to langdetect.
    try:
        return detect(text) == "en"
    except LangDetectException:
        return False

# Apply the refined filter only on non-English labels: select those rows
# first so language detection is not run on reviews already tagged 'en'.
non_english_rows = monzo_reviews_master[monzo_reviews_master["review_language"] != "en"]
looks_english = non_english_rows["review_text"].apply(is_mostly_english).astype(bool)
suspect_idx = non_english_rows.index[looks_english.to_numpy()]

# Update the language tag
monzo_reviews_master.loc[suspect_idx, "review_language"] = "en"

print(f"✅ Reclassified {len(suspect_idx)} reviews as English (validated by ratio & langdetect).")

# Optional: audit export (utf-8-sig, matching the notebook's other CSV exports)
monzo_reviews_master.loc[suspect_idx].to_csv(
    "../data/processed/Monzo_Reclassified_English_Validated.csv",
    index=False,
    encoding="utf-8-sig"
)


In [None]:
# Preview the first rows (at most 10) that were just reclassified
print()
reclassified_rows = monzo_reviews_master.loc[suspect_idx]
reclassified_rows.head(10)



### Light Pre-Sentiment Cleaning

In [None]:
# # ------------------------------------------------------------
# # 5.X Pre-Sentiment Data Preparation
# # ------------------------------------------------------------
#
# # Drop rows with missing review text
# monzo_reviews_master = monzo_reviews_master.dropna(subset=['review_text'])
#
# # Fill missing author and language
# monzo_reviews_master['author_name'] = monzo_reviews_master['author_name'].fillna('Unknown')
# monzo_reviews_master['review_language'] = monzo_reviews_master['review_language'].fillna('en').str.lower()
#
# # Keep only English reviews for sentiment analysis
# monzo_reviews_master = monzo_reviews_master[monzo_reviews_master['review_language'] == 'en']
#
# # Strip whitespace and remove duplicates
# monzo_reviews_master['review_text'] = monzo_reviews_master['review_text'].astype(str).str.strip()
# monzo_reviews_master = monzo_reviews_master.drop_duplicates(subset=['review_text'])
#
# # Sanity check after cleaning
# print("✅ Cleaned dataset ready for sentiment analysis")
# print(f"Remaining rows: {len(monzo_reviews_master):,}")
# print("Unique languages:", monzo_reviews_master['review_language'].unique())
