In [1]:
# ------------------------------------------------------------
# 02. Sentiment Analysis Pipeline
# ------------------------------------------------------------
# Purpose: Compute sentiment polarity and categorical labels
# Input: ../data/processed/Monzo_Reviews_Master.csv
# Output: ../data/processed/Monzo_Sentiment_Scored.csv
# Author: James O. Adeshina
# Date: October 2025
# ------------------------------------------------------------

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from datetime import datetime

In [11]:

# === NLP & Sentiment ===
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [7]:
# Paths
# DATA_PATH is the master review CSV read by the first load cell. The later
# "Setup & Imports" cell rebinds this same name to the cleaned CSV, so any cell
# that reads DATA_PATH depends on which assignment ran last.
DATA_PATH = "../data/processed/Monzo_Reviews_Master.csv"
# NOTE(review): EXPORT_PATH and EXPORT_PARQUET are not referenced by any cell
# visible in this notebook (the final save uses OUTPUT_PATH) — confirm whether
# they are still needed.
EXPORT_PATH = "../data/processed/Monzo_Sentiment_Scored.csv"
EXPORT_PARQUET = "../data/processed/Monzo_Sentiment_Scored.parquet"

In [8]:

# Read the master review file and report its size before any filtering.
print("📂 Loading cleaned master dataset...")
monzo_df = pd.read_csv(DATA_PATH)

n_reviews = len(monzo_df)
n_platforms = monzo_df['platform'].nunique()
print(f"✅ Loaded {n_reviews:,} reviews across {n_platforms} platforms.")

# Last expression of the cell: rich preview of the first three rows.
monzo_df.head(3)

📂 Loading cleaned master dataset...
✅ Loaded 32,223 reviews across 2 platforms.


Unnamed: 0,review_date,rating,review_title,review_text,author_name,app_version,country,review_language,developer_reply_text,developer_reply_date,platform
0,2025-10-01 06:03:52+00:00,5,Read this truthful review,This banking app is the best for kids because ...,Thegamingmommyreviw,6.46.0,gb,en,,,iOS
1,2025-09-30 15:17:27+00:00,5,Cracking easy to use bank,Don’t tend to use the perks other than get sue...,Kar3n2,6.46.0,gb,en,,,iOS
2,2025-09-30 15:01:26+00:00,5,Monzo is great,The best bank I have ever use. It is honestly ...,Kira mella,6.46.0,gb,en,,,iOS


In [9]:
# ------------------------------------------------------------
# 2. Prepare & Sanity Check Text Data
# ------------------------------------------------------------
# Safety check: a row without review_text cannot be scored, so discard it.
monzo_df = monzo_df.dropna(subset=["review_text"])

# Report how many rows survive and which columns are available downstream.
remaining_rows = len(monzo_df)
column_names = list(monzo_df.columns)
print(f"Remaining reviews: {remaining_rows:,}")
print("Columns available:", column_names)

# Count rows whose text repeats an earlier row (the first occurrence of each
# text is not counted).
dup_count = monzo_df.duplicated(subset=["review_text"]).sum()
print(f"🔁 Duplicate review_text count: {dup_count}")


Remaining reviews: 32,222
Columns available: ['review_date', 'rating', 'review_title', 'review_text', 'author_name', 'app_version', 'country', 'review_language', 'developer_reply_text', 'developer_reply_date', 'platform']
🔁 Duplicate review_text count: 3950


In [5]:
# ------------------------------------------------------------
# 3A. Investigate Duplicate Reviews
# ------------------------------------------------------------

# Every row whose review_text occurs more than once (all copies are kept).
repeated_text_mask = monzo_df["review_text"].duplicated(keep=False)
dupes = monzo_df[repeated_text_mask]

print(f"🔎 Found {len(dupes)} duplicate review texts (including repeats).")

# Rows that match another row in every column, not only in the text.
exact_dupes = monzo_df.duplicated(keep=False).sum()
print(f"📦 Exact duplicates across all columns: {exact_dupes}")

# Quick sample: at most two rows per repeated text, then the first ten rows
# after ordering by review_date.
two_per_text = dupes.sort_values("review_text").groupby("review_text").head(2)
sample_dupes = two_per_text.sort_values("review_date").head(10)

preview_columns = ["review_date", "rating", "review_text", "platform"]
display(sample_dupes[preview_columns])


🔎 Found 4598 duplicate review texts (including repeats).
📦 Exact duplicates across all columns: 0


Unnamed: 0,review_date,rating,review_text,platform
12231,2016-10-10 15:56:37+00:00,5,So far so good!,Android
12227,2016-10-10 21:57:31+00:00,5,😍,Android
12223,2016-10-11 05:55:18+00:00,5,Good job,Android
12092,2016-12-06 13:45:54+00:00,5,Very Good,Android
12091,2016-12-06 14:26:06+00:00,5,Excellent App,Android
12006,2017-02-04 13:36:19+00:00,5,Amazing!!!,Android
11915,2017-04-11 16:46:36+00:00,5,Awesome!,Android
11892,2017-04-24 10:48:35+00:00,5,Fantastic!,Android
11883,2017-05-02 10:29:17+00:00,5,Class,Android
11854,2017-05-22 19:43:29+00:00,5,Great app!,Android


In [6]:
# ------------------------------------------------------------
# 4. Duplicate Classification & Safe Export
# ------------------------------------------------------------
# Find all duplicate text entries (every copy of a repeated text is kept).
dupes = monzo_df[monzo_df["review_text"].duplicated(keep=False)].copy()
print(f"🔎 Found {len(dupes)} potential duplicate reviews.")

# Tagging duplicate categories.
# Tags are applied in sequence and a later tag overwrites an earlier one, so
# each row ends up with the LAST category below that applies to it.
dupes["dup_type"] = "Other"

# Same text & same platform
dupes.loc[
    dupes.duplicated(subset=["review_text", "platform"], keep=False),
    "dup_type"
] = "Same platform"

# Same text but different platform
cross_platform_dupes = (
    dupes.groupby("review_text")["platform"].nunique().reset_index()
)
cross_platform_texts = cross_platform_dupes.loc[
    cross_platform_dupes["platform"] > 1, "review_text"
]
dupes.loc[dupes["review_text"].isin(cross_platform_texts), "dup_type"] = "Cross platform"

# Per-text count of distinct values, aligned row-by-row with `dupes`.
# A text "differs" on a column only when its copies carry more than one
# distinct value there. The previous duplicated(subset=[text, col]) check was
# true when another copy had the SAME value, i.e. the opposite of the label.
text_groups = dupes.groupby("review_text")

# Same text but different rating
rating_differs = text_groups["rating"].transform("nunique") > 1
dupes.loc[rating_differs, "dup_type"] = "Same text diff rating"

# Same text but different date
date_differs = text_groups["review_date"].transform("nunique") > 1
dupes.loc[date_differs, "dup_type"] = "Same text diff date"

# Save for audit
dupes.to_csv("../data/processed/Monzo_Duplicates_Audit.csv", index=False)
print("💾 Saved duplicate audit log → Monzo_Duplicates_Audit.csv")

# Create filtered dataset keeping one copy per (review_text, platform, rating)
monzo_dedup = monzo_df.drop_duplicates(subset=["review_text", "platform", "rating"])
print(f"✅ De-duplicated working dataset: {len(monzo_dedup):,} rows")

# Save clean working dataset
monzo_dedup.to_csv("../data/processed/Monzo_Reviews_Master_Dedup.csv", index=False)
print("💾 Exported cleaned dataset ready for sentiment analysis.")


🔎 Found 4598 potential duplicate reviews.
💾 Saved duplicate audit log → Monzo_Duplicates_Audit.csv
✅ De-duplicated working dataset: 28,626 rows
💾 Exported cleaned dataset ready for sentiment analysis.


In [10]:
# ------------------------------------------------------------
# Refined Duplicate Handling (Preserve Cross-User Texts)
# ------------------------------------------------------------
rows_before = len(monzo_df)
print(f"Initial dataset: {rows_before:,} rows")

# Two rows count as the same review only when all of these columns match, so
# identical short texts written by different authors are preserved.
# NOTE(review): pandas appears to treat missing values as equal when detecting
# duplicates, so rows with a missing author_name may match each other — confirm.
true_dupe_key = ["review_text", "author_name", "platform", "rating"]

# 1️⃣ Collect every copy of each true duplicate (keep=False marks all copies).
true_dupe_mask = monzo_df.duplicated(subset=true_dupe_key, keep=False)
true_dupes = monzo_df[true_dupe_mask]

print(f"🔍 Found {len(true_dupes)} true duplicate rows (same user, same text, same platform).")

# 2️⃣ Write the flagged rows out so the removal can be audited later.
true_dupes.to_csv("../data/processed/Monzo_True_Duplicates_Audit.csv", index=False)
print("💾 Saved true duplicates for audit.")

# 3️⃣ Keep the first row of each duplicate group and drop the rest.
monzo_clean = monzo_df.drop_duplicates(subset=true_dupe_key)
rows_after = len(monzo_clean)

print(f"✅ Cleaned dataset shape: {monzo_clean.shape}")
print(f"🔹 Rows removed: {rows_before - rows_after:,}")
print(f"🔹 Rows retained: {rows_after:,}")

# 4️⃣ Persist the cleaned frame; the sentiment cells reload it from this file.
monzo_clean.to_csv("../data/processed/Monzo_Reviews_Master_Cleaned.csv", index=False)
print("💾 Exported Monzo_Reviews_Master_Cleaned.csv (for sentiment pipeline).")


Initial dataset: 32,222 rows
🔍 Found 745 true duplicate rows (same user, same text, same platform).
💾 Saved true duplicates for audit.
✅ Cleaned dataset shape: (31625, 11)
🔹 Rows removed: 597
🔹 Rows retained: 31,625
💾 Exported Monzo_Reviews_Master_Cleaned.csv (for sentiment pipeline).


In [12]:
# ------------------------------------------------------------
# 1. Setup & Imports
# ------------------------------------------------------------
# The string below is a bare expression statement used as an in-cell banner;
# it has no effect when the cell runs.
"""
Notebook: 02_sentiment_analysis.ipynb
Author: James O. Adeshina
Date: October 2025

Objective:
----------
Perform sentiment scoring using VADER on cleaned Monzo review data.
Save the output as Monzo_Sentiment_Scored.csv for data modeling and BI visualization.
"""

# === Setup ===
# Relies on `nltk` having been imported by the earlier "NLP & Sentiment" cell.
# NOTE(review): the return value of nltk.download is not checked, so the
# "ready" message below prints regardless of whether the download succeeded.
nltk.download('vader_lexicon')
print("📦 NLTK VADER lexicon ready.")

# === File Paths ===
# DATA_PATH rebinds the name assigned in the earlier "Paths" cell: from here on
# it points at the cleaned CSV written by the refined duplicate-handling cell.
DATA_PATH = "../data/processed/Monzo_Reviews_Master_Cleaned.csv"
# Destination of the scored dataset; same path string as EXPORT_PATH above.
OUTPUT_PATH = "../data/processed/Monzo_Sentiment_Scored.csv"


📦 NLTK VADER lexicon ready.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/apple2015/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# ------------------------------------------------------------
# 2. Load Cleaned Dataset
# ------------------------------------------------------------

# Replace the working frame with the de-duplicated file referenced by DATA_PATH.
print("📂 Loading cleaned Monzo review dataset...")
monzo_df = pd.read_csv(DATA_PATH)

review_count = len(monzo_df)
available_columns = list(monzo_df.columns)
print(f"✅ Loaded {review_count:,} reviews.")
print("Columns:", available_columns)

# Last expression of the cell: rich preview of the first three rows.
monzo_df.head(3)


📂 Loading cleaned Monzo review dataset...
✅ Loaded 31,625 reviews.
Columns: ['review_date', 'rating', 'review_title', 'review_text', 'author_name', 'app_version', 'country', 'review_language', 'developer_reply_text', 'developer_reply_date', 'platform']


Unnamed: 0,review_date,rating,review_title,review_text,author_name,app_version,country,review_language,developer_reply_text,developer_reply_date,platform
0,2025-10-01 06:03:52+00:00,5,Read this truthful review,This banking app is the best for kids because ...,Thegamingmommyreviw,6.46.0,gb,en,,,iOS
1,2025-09-30 15:17:27+00:00,5,Cracking easy to use bank,Don’t tend to use the perks other than get sue...,Kar3n2,6.46.0,gb,en,,,iOS
2,2025-09-30 15:01:26+00:00,5,Monzo is great,The best bank I have ever use. It is honestly ...,Kira mella,6.46.0,gb,en,,,iOS


In [14]:
# ------------------------------------------------------------
# 3. Sentiment Scoring Function
# ------------------------------------------------------------

def compute_sentiment(text_series, pos_threshold=0.05, neg_threshold=-0.05, analyzer=None):
    """
    Score each text with a VADER-style analyzer and derive a categorical label.

    Parameters
    ----------
    text_series : pandas.Series
        Texts to score. Missing values (None / NaN) are scored as empty text
        rather than as the literal string "nan".
    pos_threshold : float, default 0.05
        Compound scores greater than or equal to this are labelled 'positive'.
    neg_threshold : float, default -0.05
        Compound scores less than or equal to this are labelled 'negative'.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a 'compound' entry. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created for this call.

    Returns
    -------
    (scores, labels) : tuple of pandas.Series
        ``scores`` holds the compound score per row as floats; ``labels`` holds
        'positive', 'negative' or 'neutral'. Both share the index of
        ``text_series``.

    Raises
    ------
    ValueError
        If ``neg_threshold`` is greater than ``pos_threshold``, which would
        make the label bands overlap.
    """
    if neg_threshold > pos_threshold:
        raise ValueError(
            "neg_threshold must not be greater than pos_threshold "
            f"(got neg_threshold={neg_threshold}, pos_threshold={pos_threshold})"
        )

    # Build the analyzer once per call unless the caller supplies one to reuse.
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def compound_score(text):
        # str(NaN) would be the word "nan"; score missing text as empty instead.
        cleaned = "" if pd.isna(text) else str(text)
        return analyzer.polarity_scores(cleaned)['compound']

    def categorize(score):
        if score >= pos_threshold:
            return 'positive'
        if score <= neg_threshold:
            return 'negative'
        return 'neutral'

    # astype(float) keeps a numeric dtype even when the input Series is empty.
    scores = text_series.apply(compound_score).astype(float)
    labels = scores.apply(categorize)
    return scores, labels


In [15]:
# ------------------------------------------------------------
# 4. Apply Sentiment Analysis
# ------------------------------------------------------------

print("🧠 Running sentiment analysis on review_text...")

# Score every review, then attach the two result Series as new columns.
review_scores, review_labels = compute_sentiment(monzo_df['review_text'])
monzo_df['sentiment_score'] = review_scores
monzo_df['sentiment_label'] = review_labels

# Quick overview: number of reviews per sentiment label.
sentiment_counts = monzo_df['sentiment_label'].value_counts()
print("✅ Sentiment label distribution:")
print(sentiment_counts)


🧠 Running sentiment analysis on review_text...
✅ Sentiment label distribution:
sentiment_label
positive    22297
negative     5934
neutral      3394
Name: count, dtype: int64


In [16]:
# ------------------------------------------------------------
# 5. Summary Statistics
# ------------------------------------------------------------
print("\n📊 Sentiment Summary by Platform:")

# Mean, standard deviation and row count of the compound score per platform.
scores_by_platform = monzo_df.groupby("platform")["sentiment_score"]
summary = scores_by_platform.agg(["mean", "std", "count"])
display(summary)

print("\n📈 Rating vs. Sentiment Overview:")

# Correlation between the star rating and the compound sentiment score.
rating_values = monzo_df["rating"]
sentiment_values = monzo_df["sentiment_score"]
corr = rating_values.corr(sentiment_values)
print(f"🔗 Correlation between rating and sentiment score: {corr:.3f}")



📊 Sentiment Summary by Platform:


Unnamed: 0_level_0,mean,std,count
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Android,0.360884,0.498473,22046
iOS,0.367136,0.562958,9579



📈 Rating vs. Sentiment Overview:
🔗 Correlation between rating and sentiment score: 0.666


In [17]:
# ------------------------------------------------------------
# 6. Save Processed Dataset
# ------------------------------------------------------------
# Write the scored frame without the index column, then report where it went.
monzo_df.to_csv(OUTPUT_PATH, index=False)
print(
    f"💾 Sentiment-scored dataset saved to: {OUTPUT_PATH}",
    "✅ Ready for next phase → 03_data_model_preparation.ipynb",
    sep="\n",
)


💾 Sentiment-scored dataset saved to: ../data/processed/Monzo_Sentiment_Scored.csv
✅ Ready for next phase → 03_data_model_preparation.ipynb
