## Monzo User Sentiment & Feature Insights Dashboard

In [92]:
"""
============================================================
Notebook: 01_data_exploration.ipynb
Author: James O. Adeshina
Version: 1.0  |  October 2025
============================================================

Purpose:
--------
This notebook performs the initial data ingestion and exploratory
data analysis (EDA) for Monzo app reviews exported from AppFollow.

It aims to:
    • Validate the structure and quality of the raw review datasets
    • Unify Apple App Store and Google Play Store schemas
    • Identify cleaning requirements (duplicates, nulls, non-English)
    • Explore rating and review distributions
    • Provide summary insights to guide the next stages:
        - Data cleaning & normalization
        - Thematic tagging
        - Sentiment analysis
============================================================
"""



In [93]:
# ------------------------------------------------------------
# 1. Import Libraries
# ------------------------------------------------------------
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from langdetect import detect, DetectorFactory
from wordcloud import WordCloud

In [94]:
# Reproducibility for language detection
DetectorFactory.seed = 42

# Display settings
pd.set_option("display.max_columns", 50)
sns.set(style="whitegrid", palette="muted", font_scale=1.1)


In [95]:
# ------------------------------------------------------------
# 2. Define File Paths
# ------------------------------------------------------------
APPSTORE_PATH = "../data/raw/appstore/monzo_appstore_2015_2025.csv"
GOOGLEPLAY_PATHS = [
    "../data/raw/googleplay/monzo_googleplay_2015_2019.csv",
    "../data/raw/googleplay/monzo_googleplay_2019_2021.csv",
    "../data/raw/googleplay/monzo_googleplay_2022_2025.csv"
]

In [96]:
def read_appfollow_csv(path):
    """
    Read AppFollow export:
    - skip 'sep=' and 'From:' lines
    - use line 3 as header
    """
    df = pd.read_csv(
        path,
        sep=",",
        skiprows=2,
        engine="python",
        encoding="utf-8",
        on_bad_lines="skip"
    )
    print(f"{os.path.basename(path)} → {df.shape[0]} rows × {df.shape[1]} cols")
    return df

In [97]:
appstore_df  = read_appfollow_csv("../data/raw/appstore/monzo_appstore_2015_2025.csv")
gplay_15_19  = read_appfollow_csv("../data/raw/googleplay/monzo_googleplay_2015_2019.csv")
gplay_19_21  = read_appfollow_csv("../data/raw/googleplay/monzo_googleplay_2019_2021.csv")
gplay_22_25  = read_appfollow_csv("../data/raw/googleplay/monzo_googleplay_2022_2025.csv")


monzo_appstore_2015_2025.csv → 9579 rows × 28 cols
monzo_googleplay_2015_2019.csv → 2821 rows × 31 cols
monzo_googleplay_2019_2021.csv → 9839 rows × 31 cols
monzo_googleplay_2022_2025.csv → 9984 rows × 31 cols


In [98]:
# ------------------------------------------------------------
# 3. Data Ingestion (Staged — Inspect Before Merge)
# ------------------------------------------------------------
"""
Purpose:
---------
This stage loads the raw Monzo App Store and Google Play Store
review datasets individually for inspection, without merging them yet.

Key Features:
-------------
- Skips the first two metadata lines in AppFollow exports ('sep=' and 'From:').
- Ensures UTF-8 encoding to preserve emojis and multilingual text.
- Prints shape, column names, and sample rows for validation.
- Returns a dictionary of DataFrames keyed by dataset name.
"""

import os
import pandas as pd


def read_appfollow_csv(path):
    """
    Read AppFollow export:
    - Skip 'sep=' and 'From:' lines (first two)
    - Use line 3 as the header
    - Preserve emojis and multilingual characters
    """
    try:
        df = pd.read_csv(
            path,
            sep=",",                # AppFollow exports use commas
            skiprows=2,             # skip metadata lines
            engine="python",
            encoding="utf-8",
            on_bad_lines="skip"     # skip problematic lines without stopping
        )
        print(f"✅ {os.path.basename(path)} → {df.shape[0]:,} rows × {df.shape[1]} cols")
        return df
    except Exception as e:
        print(f"❌ Error reading {os.path.basename(path)}: {e}")
        return None


def load_individual_datasets(appstore_path, google_paths):
    """
    Load App Store and Google Play review datasets individually for inspection.
    Returns a dictionary of DataFrames for each dataset.

    Parameters
    ----------
    appstore_path : str
        Path to the App Store CSV export.
    google_paths : list
        List of paths to the Google Play CSV exports.

    Returns
    -------
    dict
        Dictionary of DataFrames with dataset names as keys.
    """
    datasets = {}

    # --- Load App Store ---
    print("\n📂 Loading App Store dataset...")
    appstore_df = read_appfollow_csv(APPSTORE_PATH)
    datasets["appstore"] = appstore_df
    print(f"   → Columns: {list(appstore_df.columns)}\n")

    # --- Load Google Play datasets ---
    print("📂 Loading Google Play datasets...")
    for path in GOOGLEPLAY_PATHS:
        name = os.path.basename(path).replace(".csv", "")
        df_part = read_appfollow_csv(path)
        datasets[name] = df_part
        print(f"   → {name} Columns: {list(df_part.columns)}\n")

    print("\n✅ All datasets successfully loaded and ready for schema review.\n")
    return datasets




In [99]:
# ------------------------------------------------------------
# Execute Data Loading
# ------------------------------------------------------------

datasets = load_individual_datasets(APPSTORE_PATH, GOOGLEPLAY_PATHS)



📂 Loading App Store dataset...
✅ monzo_appstore_2015_2025.csv → 9,579 rows × 28 cols
   → Columns: ['Submission date', 'AppID', 'AppName', 'Country', 'Review Language', 'Version', 'Author', 'Rating', 'Title', 'Review', 'Translated title', 'Translated review', 'Reply Date', 'Reply Delta', 'Developer Reply', 'User', 'Tags', 'Categories', 'Updated', 'Semantic Tags', 'Semantic Categories', 'Semantic Sentiment', 'Notes', 'Likes', 'Dislikes', 'Link', 'Permalink', 'AF Link']

📂 Loading Google Play datasets...
✅ monzo_googleplay_2015_2019.csv → 2,821 rows × 31 cols
   → monzo_googleplay_2015_2019 Columns: ['Submission date', 'AppID', 'AppName', 'Country', 'Review Language', 'Version', 'Author', 'Rating', 'Title', 'Review', 'Translated title', 'Translated review', 'Reply Date', 'Reply Delta', 'Developer Reply', 'User', 'Tags', 'Categories', 'Updated', 'Semantic Tags', 'Semantic Categories', 'Semantic Sentiment', 'Notes', 'Likes', 'Dislikes', 'Link', 'Permalink', 'AF Link', 'Device Name', 'Vers

In [100]:
# ------------------------------------------------------------
# 4. Schema & Structural Review
# ------------------------------------------------------------
"""
Purpose:
---------
Before merging the datasets, this stage performs a detailed structural
audit of the individual App Store and Google Play Store review datasets.

Objectives:
------------
1. Compare and validate column schemas across all datasets.
2. Inspect column data types and ensure consistency for merging.
3. Identify missing values and structural anomalies.
4. Display representative samples to confirm review text quality.

Outcome:
---------
This ensures all datasets are harmonised, complete, and merge-ready.
"""


'\nPurpose:\n---------\nBefore merging the datasets, this stage performs a detailed structural\naudit of the individual App Store and Google Play Store review datasets.\n\nObjectives:\n------------\n1. Compare and validate column schemas across all datasets.\n2. Inspect column data types and ensure consistency for merging.\n3. Identify missing values and structural anomalies.\n4. Display representative samples to confirm review text quality.\n\nOutcome:\n---------\nThis ensures all datasets are harmonised, complete, and merge-ready.\n'

In [101]:
# ------------------------------------------------------------
# 4.1 Column Comparison Across Datasets
# ------------------------------------------------------------
print("🧾 Comparing column structures across datasets...\n")

# Extract and sort columns per dataset
schema_dict = {name: sorted(df.columns.tolist()) for name, df in datasets.items()}

# Find the maximum number of columns among all datasets
max_len = max(len(cols) for cols in schema_dict.values())

# Pad shorter lists with empty strings to equalize length
for name in schema_dict:
    schema_dict[name] += [""] * (max_len - len(schema_dict[name]))

# Build schema comparison DataFrame safely
schema_comparison = pd.DataFrame(schema_dict).T
schema_comparison.columns = [f"Col_{i+1}" for i in range(max_len)]

display(schema_comparison.head(10))



🧾 Comparing column structures across datasets...



Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,Col_11,Col_12,Col_13,Col_14,Col_15,Col_16,Col_17,Col_18,Col_19,Col_20,Col_21,Col_22,Col_23,Col_24,Col_25,Col_26,Col_27,Col_28,Col_29,Col_30,Col_31
appstore,AF Link,AppID,AppName,Author,Categories,Country,Developer Reply,Dislikes,Likes,Link,Notes,Permalink,Rating,Reply Date,Reply Delta,Review,Review Language,Semantic Categories,Semantic Sentiment,Semantic Tags,Submission date,Tags,Title,Translated review,Translated title,Updated,User,Version,,,
monzo_googleplay_2015_2019,AF Link,AppID,AppName,Author,Categories,Country,Developer Reply,Device Name,Dislikes,Likes,Link,Notes,OS,Permalink,Rating,Reply Date,Reply Delta,Review,Review Language,Semantic Categories,Semantic Sentiment,Semantic Tags,Submission date,Tags,Title,Translated review,Translated title,Updated,User,Version,VersionCode
monzo_googleplay_2019_2021,AF Link,AppID,AppName,Author,Categories,Country,Developer Reply,Device Name,Dislikes,Likes,Link,Notes,OS,Permalink,Rating,Reply Date,Reply Delta,Review,Review Language,Semantic Categories,Semantic Sentiment,Semantic Tags,Submission date,Tags,Title,Translated review,Translated title,Updated,User,Version,VersionCode
monzo_googleplay_2022_2025,AF Link,AppID,AppName,Author,Categories,Country,Developer Reply,Device Name,Dislikes,Likes,Link,Notes,OS,Permalink,Rating,Reply Date,Reply Delta,Review,Review Language,Semantic Categories,Semantic Sentiment,Semantic Tags,Submission date,Tags,Title,Translated review,Translated title,Updated,User,Version,VersionCode


In [102]:

# Identify common and unique columns
all_columns = set().union(*[df.columns for df in datasets.values()])
common_columns = set.intersection(*[set(df.columns) for df in datasets.values()])
unique_columns = {name: list(set(df.columns) - common_columns) for name, df in datasets.items()}

print(f"\n✅ Total unique columns across all datasets: {len(all_columns)}")
print(f"✅ Columns common to all datasets ({len(common_columns)}):")
print(sorted(common_columns))

print(f"\n⚙️ Platform-specific (unique) columns:")
for name, cols in unique_columns.items():
    if cols:
        print(f"   - {name}: {cols}")


✅ Total unique columns across all datasets: 31
✅ Columns common to all datasets (28):
['AF Link', 'AppID', 'AppName', 'Author', 'Categories', 'Country', 'Developer Reply', 'Dislikes', 'Likes', 'Link', 'Notes', 'Permalink', 'Rating', 'Reply Date', 'Reply Delta', 'Review', 'Review Language', 'Semantic Categories', 'Semantic Sentiment', 'Semantic Tags', 'Submission date', 'Tags', 'Title', 'Translated review', 'Translated title', 'Updated', 'User', 'Version']

⚙️ Platform-specific (unique) columns:
   - monzo_googleplay_2015_2019: ['OS', 'VersionCode', 'Device Name']
   - monzo_googleplay_2019_2021: ['OS', 'VersionCode', 'Device Name']
   - monzo_googleplay_2022_2025: ['OS', 'VersionCode', 'Device Name']


In [103]:

# ------------------------------------------------------------
# 4.2 Data Type Inspection
# ------------------------------------------------------------
print("\n📊 Data Type Summary:\n")
for name, df in datasets.items():
    print(f"🧩 {name.upper()} ({df.shape[0]:,} rows × {df.shape[1]} cols)")
    print(df.dtypes.value_counts())
    print("-" * 60)



📊 Data Type Summary:

🧩 APPSTORE (9,579 rows × 28 cols)
object     16
float64    10
int64       2
Name: count, dtype: int64
------------------------------------------------------------
🧩 MONZO_GOOGLEPLAY_2015_2019 (2,821 rows × 31 cols)
float64    16
object     14
int64       1
Name: count, dtype: int64
------------------------------------------------------------
🧩 MONZO_GOOGLEPLAY_2019_2021 (9,839 rows × 31 cols)
float64    16
object     14
int64       1
Name: count, dtype: int64
------------------------------------------------------------
🧩 MONZO_GOOGLEPLAY_2022_2025 (9,984 rows × 31 cols)
object     15
float64    15
int64       1
Name: count, dtype: int64
------------------------------------------------------------


In [104]:

# ------------------------------------------------------------
# 4.3 Missing Value Overview
# ------------------------------------------------------------
print("\n🔍 Missing Value Overview (Top 10 columns with missing counts per dataset):\n")
for name, df in datasets.items():
    missing_summary = (
        df.isna().sum()
        .sort_values(ascending=False)
        .head(10)
        .reset_index()
        .rename(columns={'index': 'Column', 0: 'Missing Count'})
    )
    print(f"\n📂 {name.upper()} Dataset Missing Values:")
    display(missing_summary)
    print("-" * 60)



🔍 Missing Value Overview (Top 10 columns with missing counts per dataset):


📂 APPSTORE Dataset Missing Values:


Unnamed: 0,Column,Missing Count
0,Semantic Categories,9579
1,Notes,9579
2,Tags,9579
3,User,9579
4,Semantic Tags,9579
5,Translated review,9579
6,Translated title,9579
7,Categories,9579
8,Likes,9579
9,Dislikes,9579


------------------------------------------------------------

📂 MONZO_GOOGLEPLAY_2015_2019 Dataset Missing Values:


Unnamed: 0,Column,Missing Count
0,User,2821
1,Tags,2821
2,VersionCode,2821
3,Device Name,2821
4,Link,2821
5,Dislikes,2821
6,Likes,2821
7,Notes,2821
8,Semantic Sentiment,2821
9,Semantic Categories,2821


------------------------------------------------------------

📂 MONZO_GOOGLEPLAY_2019_2021 Dataset Missing Values:


Unnamed: 0,Column,Missing Count
0,User,9839
1,Tags,9839
2,VersionCode,9839
3,Device Name,9839
4,Link,9839
5,Dislikes,9839
6,Likes,9839
7,Notes,9839
8,Semantic Sentiment,9839
9,Semantic Categories,9839


------------------------------------------------------------

📂 MONZO_GOOGLEPLAY_2022_2025 Dataset Missing Values:


Unnamed: 0,Column,Missing Count
0,User,9984
1,Tags,9984
2,VersionCode,9984
3,Device Name,9984
4,Link,9984
5,Dislikes,9984
6,Likes,9984
7,Notes,9984
8,Semantic Categories,9984
9,Semantic Tags,9984


------------------------------------------------------------


In [105]:

# ------------------------------------------------------------
# 4.4 Sample Review Inspection
# ------------------------------------------------------------
print("\n💬 Sampling reviews for content validation...\n")
for name, df in datasets.items():
    print(f"🗂️ {name.upper()} — Sample Reviews")
    display(df[['Submission date', 'Rating', 'Review']].sample(3, random_state=42))
    print("-" * 60)



💬 Sampling reviews for content validation...

🗂️ APPSTORE — Sample Reviews


Unnamed: 0,Submission date,Rating,Review
6723,2019-10-30T13:19:39+00:00,5,100% recommended
4629,2021-10-23T20:40:20+00:00,5,Brilliant Idea
1383,2024-07-20T14:17:48+00:00,2,I got sent £5.60 from my PayPal into my Monzo ...


------------------------------------------------------------
🗂️ MONZO_GOOGLEPLAY_2015_2019 — Sample Reviews


Unnamed: 0,Submission date,Rating,Review
1090,2018-12-25T17:56:35+00:00,2,The app is losing its simplicity. Overload of ...
2342,2017-04-07T13:43:36+00:00,5,Monzo are currently leading the way in the new...
772,2019-03-05T01:16:48+00:00,5,An absolutely great bank that has everything y...


------------------------------------------------------------
🗂️ MONZO_GOOGLEPLAY_2019_2021 — Sample Reviews


Unnamed: 0,Submission date,Rating,Review
6718,2020-04-24T17:59:09+00:00,2,To restricted Cash deposit are to restrictive ...
4254,2020-11-12T17:17:05+00:00,5,Very good
9027,2019-08-26T14:55:47+00:00,1,Does not give account info. Useless app


------------------------------------------------------------
🗂️ MONZO_GOOGLEPLAY_2022_2025 — Sample Reviews


Unnamed: 0,Submission date,Rating,Review
4896,2023-09-08T17:12:33+00:00,1,New app layout is awful
8655,2022-06-15T11:04:07+00:00,1,Poomtless
1780,2024-08-22T22:17:42+00:00,4,Modern


------------------------------------------------------------


In [106]:
# ------------------------------------------------------------
# 4.5 Summary Notes
# ------------------------------------------------------------
"""
Summary:
---------
- Confirmed which columns are shared across all datasets.
- Identified platform-specific fields to be harmonised or dropped.
- Inspected data types to ensure compatibility for merging.
- Validated data quality and presence of meaningful user reviews.

Next Step:
-----------
Proceed to Section 5 — Schema Harmonisation & Merging,
where columns will be standardised and all reviews unified into
a single master dataset: 'Monzo_Reviews_Master.csv'.
"""


"\nSummary:\n---------\n- Confirmed which columns are shared across all datasets.\n- Identified platform-specific fields to be harmonised or dropped.\n- Inspected data types to ensure compatibility for merging.\n- Validated data quality and presence of meaningful user reviews.\n\nNext Step:\n-----------\nProceed to Section 5 — Schema Harmonisation & Merging,\nwhere columns will be standardised and all reviews unified into\na single master dataset: 'Monzo_Reviews_Master.csv'.\n"

In [107]:
# ------------------------------------------------------------
# 5. Schema Harmonisation & Merge Pipeline
# ------------------------------------------------------------
"""
Purpose:
---------
To harmonise column names, ensure consistent structure across all
review datasets, add platform identifiers, and merge them into one
master dataset for downstream analysis (sentiment, themes, Power BI).

Key Features:
--------------
- Aligns App Store and Google Play columns via a mapping dictionary.
- Drops redundant or empty columns (e.g., Notes, Tags, Semantic fields).
- Adds a 'platform' column for source tracking (iOS / Android).
- Converts key columns to proper datatypes (date, numeric).
- Exports the unified dataset as 'Monzo_Reviews_Master.csv'.
"""

"\nPurpose:\n---------\nTo harmonise column names, ensure consistent structure across all\nreview datasets, add platform identifiers, and merge them into one\nmaster dataset for downstream analysis (sentiment, themes, Power BI).\n\nKey Features:\n--------------\n- Aligns App Store and Google Play columns via a mapping dictionary.\n- Drops redundant or empty columns (e.g., Notes, Tags, Semantic fields).\n- Adds a 'platform' column for source tracking (iOS / Android).\n- Converts key columns to proper datatypes (date, numeric).\n- Exports the unified dataset as 'Monzo_Reviews_Master.csv'.\n"

In [108]:

# ------------------------------------------------------------
# 5.1 Column Mapping (Standardisation Dictionary)
# ------------------------------------------------------------
column_mapping = {
    "Submission date": "review_date",
    "AppID": "app_id",
    "AppName": "app_name",
    "Country": "country",
    "Review Language": "review_language",
    "Version": "app_version",
    "Author": "author_name",
    "Rating": "rating",
    "Title": "review_title",
    "Review": "review_text",
    "Reply Date": "developer_reply_date",
    "Reply Delta": "developer_reply_delta",
    "Developer Reply": "developer_reply_text",
    "Translated title": "translated_title",
    "Translated review": "translated_review",
    "Link": "review_link",
    "Permalink": "review_permalink",
    "Updated": "updated_at",
    # Metadata or optional context
    "Device Name": "device_name",
    "VersionCode": "version_code",
    "OS": "os_version"
}


# Columns to drop entirely (empty or redundant in all datasets)
drop_columns = [
    "Tags", "User", "Notes", "Semantic Tags",
    "Semantic Categories", "Semantic Sentiment",
    "Categories", "Likes", "Dislikes", "AF Link"
]


In [109]:

# ------------------------------------------------------------
# 5.2 Harmonisation Function
# ------------------------------------------------------------
def harmonise_dataset(df, platform_name):
    """
    Standardise schema for App Store / Google Play datasets.

    Steps:
    - Rename columns using mapping dictionary.
    - Drop redundant / empty columns.
    - Add 'platform' column (iOS / Android).
    - Convert 'review_date' to datetime and 'rating' to numeric.
    """
    df = df.rename(columns=column_mapping)
    df = df.drop(columns=[col for col in drop_columns if col in df.columns], errors="ignore")
    df["platform"] = platform_name

    # Clean data types
    if "review_date" in df.columns:
        df["review_date"] = pd.to_datetime(df["review_date"], errors="coerce")
    if "rating" in df.columns:
        df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

    # Optional: Keep only core columns
    core_cols = [
        "review_date", "rating", "review_title", "review_text", "author_name",
        "app_version", "country", "review_language", "developer_reply_text",
        "developer_reply_date", "platform"
    ]
    return df[[col for col in core_cols if col in df.columns]]


In [110]:

# ------------------------------------------------------------
# 5.3 Apply Harmonisation to Each Dataset
# ------------------------------------------------------------
harmonised = []
for name, df in datasets.items():
    platform = "iOS" if "appstore" in name.lower() else "Android"
    cleaned_df = harmonise_dataset(df.copy(), platform)
    harmonised.append(cleaned_df)
    print(f"✅ Harmonised {name} ({platform}) → {cleaned_df.shape[0]:,} rows, {cleaned_df.shape[1]} cols")


✅ Harmonised appstore (iOS) → 9,579 rows, 11 cols
✅ Harmonised monzo_googleplay_2015_2019 (Android) → 2,821 rows, 11 cols
✅ Harmonised monzo_googleplay_2019_2021 (Android) → 9,839 rows, 11 cols
✅ Harmonised monzo_googleplay_2022_2025 (Android) → 9,984 rows, 11 cols


In [111]:

# ------------------------------------------------------------
# 5.4 Merge All Reviews into One Master Dataset
# ------------------------------------------------------------
monzo_reviews_master = pd.concat(harmonised, ignore_index=True)
print(f"\n🔗 Combined dataset shape: {monzo_reviews_master.shape}")
print(f"   Unique platforms: {monzo_reviews_master['platform'].unique().tolist()}")



🔗 Combined dataset shape: (32223, 11)
   Unique platforms: ['iOS', 'Android']


In [112]:
# ------------------------------------------------------------
# 5.5 Export Cleaned & Harmonised Dataset
# ------------------------------------------------------------
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "Monzo_Reviews_Master.csv")

monzo_reviews_master.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"\n💾 Exported unified dataset to: {output_path}")


💾 Exported unified dataset to: ../data/processed/Monzo_Reviews_Master.csv


In [113]:
# ------------------------------------------------------------
# 5.6 Quick Sanity Checks
# ------------------------------------------------------------
print("\n📊 Sanity Checks:")
print("Date Range:", monzo_reviews_master["review_date"].min(), "→", monzo_reviews_master["review_date"].max())
print("Average Rating:", round(monzo_reviews_master["rating"].mean(), 2))
print("Sample rows:")
display(monzo_reviews_master.sample(5, random_state=42))



📊 Sanity Checks:
Date Range: 2016-02-26 22:15:55+00:00 → 2025-10-01 06:03:52+00:00
Average Rating: 3.81
Sample rows:


Unnamed: 0,review_date,rating,review_title,review_text,author_name,app_version,country,review_language,developer_reply_text,developer_reply_date,platform
15397,2021-02-14 11:16:22+00:00,5,,never had an issue with the card from it's ear...,Dean Ross,3.68.0,,en,,,Android
23785,2024-09-23 05:22:06+00:00,5,,Get this ever growing system,Gpettoe Trillgod,,,en,,,Android
22761,2025-02-27 12:51:28+00:00,4,,Great app with some even greater functionality...,Viliani,,,en,,,Android
5001,2021-04-21 15:02:46+00:00,1,Refused to open me an account,Due to having an iva!!! I needed the account t...,jadfranswa,3.78.0,gb,en,,,iOS
27986,2023-05-30 15:09:47+00:00,5,,"Very convenient to use, especially for people ...",Zhehua Mao,5.25.0,,en,,,Android


In [114]:
"""
Outcome:
---------
✅ All Monzo App Store and Google Play datasets successfully harmonised.
✅ Final dataset ready for exploratory analysis and sentiment modelling.
✅ Exported to '../data/processed/Monzo_Reviews_Master.csv'.
"""

"\nOutcome:\n---------\n✅ All Monzo App Store and Google Play datasets successfully harmonised.\n✅ Final dataset ready for exploratory analysis and sentiment modelling.\n✅ Exported to '../data/processed/Monzo_Reviews_Master.csv'.\n"

In [115]:
"""
Before Jumping to Section 6 (Sentiment)

The next smart step is data readiness validation, Handle Missing Values and we'll focus on these review_text, rating, review_date, optionally (review_language and platform)
"""

"\nBefore Jumping to Section 6 (Sentiment)\n\nThe next smart step is data readiness validation, Handle Missing Values and we'll focus on these review_text, rating, review_date, optionally (review_language and platform)\n"

In [116]:
#Checking for missing values (NaNs)
print("Checking for missing values (NaNs)")
monzo_reviews_master[['review_text','rating','review_date','review_language']].isna().sum()


Checking for missing values (NaNs)


review_text          1
rating               0
review_date          0
review_language    218
dtype: int64

In [117]:
#Apply light cleaning
# Drop empty or invalid reviews
monzo_reviews_master = monzo_reviews_master.dropna(subset=['review_text'])

# Replace missing ratings with 0 or drop them
monzo_reviews_master = monzo_reviews_master.dropna(subset=['rating'])

# Normalise language codes (fallback to 'en')
monzo_reviews_master['review_language'] = (
    monzo_reviews_master['review_language']
    .fillna('en')
    .str.lower()
)


In [118]:
# Identify all non-English reviews
non_english_reviews = monzo_reviews_master[monzo_reviews_master['review_language'] != 'en']

# Basic info
print(f"🌍 Non-English reviews: {len(non_english_reviews):,}")
print(f"Languages present: {non_english_reviews['review_language'].unique()}")

# View a few examples
display(non_english_reviews[['review_language', 'review_text']].sample(15, random_state=42))


🌍 Non-English reviews: 148
Languages present: ['fa' 'et' 'de' 'ru' 'pt' 'it' 'so' 'no' 'tr' 'nl' 'la' 'es' 'bg' 'fy'
 'co' 'hu' 'sr' 'fr' 'zh' 'lb' 'ht' 'ro' 'ca' 'ia' 'da' 'vi' 'uz' 'sw'
 'sn' 'mt' 'ar' 'jv' 'fi' 'ceb' 'pl' 'ko' 'ku' 'bn' 'gl' 'ig']


Unnamed: 0,review_language,review_text
7409,sr,I love Monzo
2661,sn,Love my Monzo Account
8506,fr,"Logiciel pauvre, et compte uniquement en livre..."
1106,pt,Melhor banco que ja usei aqui em UK
5691,fy,Best bank app so far
805,ru,Благодарю Вас за финансовую помощь
4321,it,It’s actually amazing I love it
1490,sr,İm verry happy
4551,zh,申请好几天了还在审核中，为啥别人几分钟就通过
640,nl,Best online bank EVER!


In [119]:
print("Remaining rows:", len(monzo_reviews_master))
print("Unique languages:", monzo_reviews_master['review_language'].unique())
print("Nulls per column:\n", monzo_reviews_master.isna().sum())


Remaining rows: 32222
Unique languages: ['en' 'fa' 'et' 'de' 'ru' 'pt' 'it' 'so' 'no' 'tr' 'nl' 'la' 'es' 'bg'
 'fy' 'co' 'hu' 'sr' 'fr' 'zh' 'lb' 'ht' 'ro' 'ca' 'ia' 'da' 'vi' 'uz'
 'sw' 'sn' 'mt' 'ar' 'jv' 'fi' 'ceb' 'pl' 'ko' 'ku' 'bn' 'gl' 'ig']
Nulls per column:
 review_date                 0
rating                      0
review_title            22355
review_text                 0
author_name                 1
app_version              7354
country                 20215
review_language             0
developer_reply_text    29626
developer_reply_date    29626
platform                    0
dtype: int64


### Reclassify & Annotate NaN or Misdetected Languages

In [120]:
import langdetect
from langdetect import DetectorFactory
DetectorFactory.seed = 42  # ensures reproducibility

def detect_language_safe(text):
    """Safely detect language using langdetect with fallback."""
    try:
        if isinstance(text, str) and len(text.strip()) > 5:
            lang = langdetect.detect(text)
            return lang
        else:
            return None
    except Exception:
        return None


# 1️⃣ Extract the 366 non-English or NaN-labelled reviews
suspect_reviews = monzo_reviews_master[
    (monzo_reviews_master['review_language'].isna()) |
    (monzo_reviews_master['review_language'] != 'en')
].copy()

print(f"🔍 Reviewing {len(suspect_reviews):,} suspect language rows...")

# 2️⃣ Detect language again for these reviews
suspect_reviews['detected_lang'] = suspect_reviews['review_text'].apply(detect_language_safe)

# 3️⃣ Check where the new detection finds English content
english_like = suspect_reviews[suspect_reviews['detected_lang'] == 'en']

print(f"✅ Reclassified {len(english_like):,} as English from the suspect set.")

# 4️⃣ Update main dataset
monzo_reviews_master.loc[english_like.index, 'review_language'] = 'en'

# 5️⃣ Optional sanity check
print("🔠 Updated language distribution:\n", monzo_reviews_master['review_language'].value_counts().head(10))

# 6️⃣ Save reclassified subset for audit
english_like[['review_text', 'review_language', 'detected_lang']].to_csv(
    "../data/processed/Monzo_Reclassified_English.csv",
    index=False, encoding="utf-8-sig"
)
print("💾 Saved reclassified English reviews for audit.")


🔍 Reviewing 148 suspect language rows...
✅ Reclassified 34 as English from the suspect set.
🔠 Updated language distribution:
 review_language
en    32108
es       13
zh        9
pt        6
hu        6
sr        5
it        5
de        5
fy        5
nl        5
Name: count, dtype: int64
💾 Saved reclassified English reviews for audit.


In [121]:
# suspect_reviews = monzo_reviews_master[monzo_reviews_master['review_language'].isna()].copy()


In [122]:
# Filter remaining non-English reviews
non_en_remaining = monzo_reviews_master[monzo_reviews_master["review_language"] != "en"]

print(f"🌍 Remaining non-English reviews: {len(non_en_remaining):,}")
print(f"Languages still present: {sorted(non_en_remaining['review_language'].dropna().unique().tolist())}")

# Show random samples per language
for lang in non_en_remaining["review_language"].dropna().unique():
    print(f"\n🗣️ Language: {lang}")

    lang_reviews = non_en_remaining[non_en_remaining["review_language"] == lang]["review_text"]
    sample_size = min(3, len(lang_reviews))

    sample = lang_reviews.sample(sample_size, random_state=42)

    for text in sample:
        print(f"  - {text[:200]}")



🌍 Remaining non-English reviews: 114
Languages still present: ['ar', 'bg', 'bn', 'ca', 'ceb', 'co', 'da', 'de', 'es', 'et', 'fa', 'fi', 'fr', 'fy', 'ht', 'hu', 'ia', 'ig', 'it', 'ko', 'ku', 'la', 'lb', 'mt', 'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sn', 'so', 'sr', 'sw', 'tr', 'uz', 'vi', 'zh']

🗣️ Language: fa
  - ازااااادد

🗣️ Language: et
  - Please update

🗣️ Language: de
  - You’re amazing 🤩
  - In Deutschland nicht dabei
  - I love monzo

🗣️ Language: ru
  - мои деньги не уходят моему товарищу за такси, что за бред?
  - Благодарю Вас за финансовую  помощь
  - bloody brilliant!

🗣️ Language: pt
  - Melhor bank
  - Melhor banco que ja usei aqui em UK
  - Boa noite. Tento entrar no app da Monzo no meu novo telemóvel ou no tablet e aparece sempre erro, dá-me um valor de saldo muito desatualizado, não percebo. Será que podem ajudar-me a resolver isto?! O

🗣️ Language: so
  - NOOO NOOOO NOOOO
  - Sosososossoaoo goooooood

🗣️ Language: tr
  - Hiç bir bonusu vermiyorlar yalan söylüyorlar ken

In [128]:
import re
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def is_mostly_english(text):
    """
    Checks if >60% of words are likely English using dictionary and langdetect.
    Returns True if confidently English.
    """
    if not isinstance(text, str) or len(text) < 3:
        return False

    # Tokenise text and count English-like words
    english_words = re.findall(r"\b(?:bank|money|monzo|love|good|great|best|app|amazing|easy|help|support|thank|brilliant|nice|card|account|customer|issue|login|transfer|recommend)\b", text.lower())
    total_words = len(re.findall(r"[a-zA-Z]+", text))

    ratio = len(english_words) / total_words if total_words else 0

    # Confirm with langdetect if uncertain
    try:
        lang = detect(text)
    except LangDetectException:
        lang = None

    return (ratio > 0.5) or (lang == "en")

# Apply the refined filter only on non-English labels
suspect_idx = monzo_reviews_master[
    (monzo_reviews_master["review_language"] != "en") &
    (monzo_reviews_master["review_text"].apply(is_mostly_english))
].index

# Update the language tag
monzo_reviews_master.loc[suspect_idx, "review_language"] = "en"

print(f"✅ Reclassified {len(suspect_idx)} reviews as English (validated by ratio & langdetect).")

# Optional: audit export
monzo_reviews_master.loc[suspect_idx].to_csv(
    "../data/processed/Monzo_Reclassified_English_Validated.csv",
    index=False,
    encoding="utf-8"
)


✅ Reclassified 0 reviews as English (validated by ratio & langdetect).


In [127]:
# Show 10 rows of the reclassified data
print("")
monzo_reviews_master.loc[suspect_idx].head(10)






Unnamed: 0,review_date,rating,review_title,review_text,author_name,app_version,country,review_language,developer_reply_text,developer_reply_date,platform
283,2025-07-02 06:35:29+00:00,5,Gooood bank,Melhor bank,maisaandra,6.33.0,gb,en,,,iOS
601,2025-03-22 23:17:25+00:00,1,Dolandırıcılar,Hiç bir bonusu vermiyorlar yalan söylüyorlar k...,cio_.es,6.18.0,gb,en,,,iOS
860,2024-12-22 13:54:20+00:00,5,Rexhep Krasniqi,I love it,Qeaqina,6.6.0,gb,en,,,iOS
899,2024-12-07 05:26:24+00:00,5,You’re amazing 🤩,You’re amazing 🤩,cutenystery,6.4.0,gb,en,,,iOS
981,2024-11-08 22:44:30+00:00,5,Monzo big fan,Amazing,It orr,6.0.0,gb,en,,,iOS
1195,2024-09-12 15:51:47+00:00,5,R Ladiane diagne,Sun good,Thieno diagne,5.90.0,gb,en,,,iOS
1209,2024-09-09 15:03:06+00:00,5,Minta,Mint bank 💯💯💯,Minta0812,5.90.0,gb,en,,,iOS
1293,2024-08-12 17:22:38+00:00,5,Great app,Love monzo,Housekeeping buddy,5.86.0,gb,en,,,iOS
1425,2024-07-09 15:41:07+00:00,5,Je t’aime Monzo.,Je suis Monzo.,D.J Gonnella,5.81.0,gb,en,,,iOS
1457,2024-07-02 15:40:54+00:00,5,Mr West,I love monzo,Thenthing,5.80.0,gb,en,,,iOS


### Light Pre-Sentiment Cleaning

In [124]:
# # ------------------------------------------------------------
# # 5.X Pre-Sentiment Data Preparation
# # ------------------------------------------------------------
#
# # Drop rows with missing review text
# monzo_reviews_master = monzo_reviews_master.dropna(subset=['review_text'])
#
# # Fill missing author and language
# monzo_reviews_master['author_name'] = monzo_reviews_master['author_name'].fillna('Unknown')
# monzo_reviews_master['review_language'] = monzo_reviews_master['review_language'].fillna('en').str.lower()
#
# # Keep only English reviews for sentiment analysis
# monzo_reviews_master = monzo_reviews_master[monzo_reviews_master['review_language'] == 'en']
#
# # Strip whitespace and remove duplicates
# monzo_reviews_master['review_text'] = monzo_reviews_master['review_text'].astype(str).str.strip()
# monzo_reviews_master = monzo_reviews_master.drop_duplicates(subset=['review_text'])
#
# # Sanity check after cleaning
# print("✅ Cleaned dataset ready for sentiment analysis")
# print(f"Remaining rows: {len(monzo_reviews_master):,}")
# print("Unique languages:", monzo_reviews_master['review_language'].unique())
