## 1) Install & Import

In [None]:
%pip install pandas numpy matplotlib --quiet

import json
import glob
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

# Download VADER lexicon (only needed once; comment out after the first successful run)
nltk.download("vader_lexicon", quiet=True)

# For reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 2) Load & Flatten Sri Lanka JSONL -> Compute VADER -> Drop “Mismatches”

In [None]:
# Initialize the VADER analyzer once; `sia` is reused by the USA cell as well.
sia = SentimentIntensityAnalyzer()

# Load all SL JSONL files and flatten into a DataFrame.
# The path uses forward slashes only: the earlier "C:\Users..." spelling placed
# "\U" inside a normal string literal, which Python rejects as a truncated
# \UXXXXXXXX escape (SyntaxError), so the cell could not be parsed at all.
sl_glob = "C:/Users/indur/OneDrive - University of Westminster/GitHub/GoogleMapsReviewsScraper/category_outputs/*.jsonl"

# glob order depends on the file system; sorting fixes the row order so the
# seeded sampling done later picks the same rows on every machine.
sl_files = sorted(glob.glob(sl_glob))
if not sl_files:
    raise FileNotFoundError(f"No SL JSONL files matched: {sl_glob}")

sl_columns = ["source", "place", "author", "date", "text", "rating"]
sl_rows = []

for fp in sl_files:
    with open(fp, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            place_name = rec.get("name", "")
            # `or []` also covers records whose "reviews" key is present but null
            for rev in rec.get("reviews") or []:
                text = rev.get("text") or ""
                if not text.strip():
                    continue
                # Parse rating: the first whitespace-separated token is the number (e.g., "4.0" → 4.0)
                raw_rating = rev.get("rating", "0")
                try:
                    rating = float(str(raw_rating).split()[0])
                except (ValueError, IndexError):
                    # ValueError: first token is not numeric; IndexError: rating is empty/blank.
                    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
                    continue
                sl_rows.append({
                    "source": "SL",
                    "place":   place_name,
                    "author":  rev.get("author", ""),
                    "date":    rev.get("date", ""),
                    "text":    text.strip(),
                    "rating":  rating
                })

# Passing the column list keeps the frame well-formed even if no review survived
# the filters above (otherwise the "text" lookup below raises KeyError).
sl_df = pd.DataFrame(sl_rows, columns=sl_columns)
print(f"Sri Lanka reviews loaded (raw): {len(sl_df):,}")

# Compute VADER “compound” score and binary flags for pos/neg/neu.
# |compound| >= 0.3 counts as a clear polarity; everything in between is neutral.
sl_compound = sl_df["text"].astype(str).map(lambda t: sia.polarity_scores(t)["compound"])
sl_df["vader_compound"] = sl_compound
sl_df["vader_pos"]      = (sl_compound >= 0.3).astype("int64")
sl_df["vader_neg"]      = (sl_compound <= -0.3).astype("int64")
sl_df["vader_neu"]      = ((sl_compound > -0.3) & (sl_compound < 0.3)).astype("int64")

# Identify SL “mismatch” reviews:
#       – rating >= 4 but VADER says negative (vader_neg == 1)
#       – rating <= 2 but VADER says positive (vader_pos == 1)
sl_bad = sl_df[
    ((sl_df["rating"] >= 4.0) & (sl_df["vader_neg"] == 1)) |
    ((sl_df["rating"] <= 2.0) & (sl_df["vader_pos"] == 1))
]
print(f"SL mismatches to drop: {len(sl_bad):,}")

# Drop those mismatches → sl_clean
sl_clean = sl_df.drop(sl_bad.index).reset_index(drop=True)
print(f"SL reviews after dropping mismatches: {len(sl_clean):,}")

# Display a couple of example SL mismatches
if not sl_bad.empty:
    print("\nExample SL mismatches:")
    print(
        sl_bad.head(2)[["rating", "vader_compound", "text"]]
        .to_string(index=False)
    )

## 3) Load & Flatten USA JSON -> Compute VADER -> Drop “Mismatches”

In [None]:
# Flatten the single USA (Nevada) file, read as one JSON object per line, into usa_df.
usa_path = "C:/Users/indur/Downloads/Compressed/review-Nevada/review-Nevada.json"
usa_rows = []

with open(usa_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        if raw_line.strip() == "":
            continue
        rec = json.loads(raw_line)

        # A missing rating (None -> TypeError) or a non-numeric one (ValueError)
        # means the record is skipped.
        try:
            rating = float(rec.get("rating", None))
        except (TypeError, ValueError):
            continue

        # Reviews without any visible text are skipped as well.
        text = (rec.get("text") or "").strip()
        if not text:
            continue

        usa_rows.append(dict(
            source="USA",
            place=rec.get("gmap_id", ""),
            author=rec.get("name", ""),
            date=rec.get("time", ""),
            text=text,
            rating=rating,
        ))

usa_df = pd.DataFrame(usa_rows)
print(f"USA reviews loaded (raw): {len(usa_df):,}")

# VADER compound score per review, then 0/1 indicator columns derived from it:
# positive at >= 0.3, negative at <= -0.3, neutral strictly in between.
usa_compound = usa_df["text"].astype(str).map(lambda t: sia.polarity_scores(t)["compound"])
usa_df["vader_compound"] = usa_compound
usa_df["vader_pos"] = usa_compound.ge(0.3).astype("int64")
usa_df["vader_neg"] = usa_compound.le(-0.3).astype("int64")
usa_df["vader_neu"] = (usa_compound.gt(-0.3) & usa_compound.lt(0.3)).astype("int64")

# A review is a "mismatch" when the stars and VADER disagree:
#       – rating >= 4 but VADER says negative (vader_neg == 1)
#       – rating <= 2 but VADER says positive (vader_pos == 1)
high_stars_negative_text = usa_df["rating"].ge(4.0) & usa_df["vader_neg"].eq(1)
low_stars_positive_text = usa_df["rating"].le(2.0) & usa_df["vader_pos"].eq(1)
usa_mismatch_mask = high_stars_negative_text | low_stars_positive_text

usa_bad = usa_df[usa_mismatch_mask]
print(f"USA mismatches to drop: {len(usa_bad):,}")

# Keep everything that is not a mismatch -> usa_clean
usa_clean = usa_df[~usa_mismatch_mask].reset_index(drop=True)
print(f"USA reviews after dropping mismatches: {len(usa_clean):,}")

# Show up to two of the dropped rows as a sanity check
if len(usa_bad) > 0:
    print("\nExample USA mismatches:")
    usa_examples = usa_bad[["rating", "vader_compound", "text"]].head(2)
    print(usa_examples.to_string(index=False))

## 4) Combine “Clean” SL + USA into a Single DataFrame (Before Balancing)


In [None]:
# Stack the two mismatch-filtered frames into one, purely for inspection;
# no sampling or balancing happens in this cell.
combined_raw = pd.concat((sl_clean, usa_clean), ignore_index=True)
print(f"Total mismatch‐filtered reviews (SL + USA): {combined_raw.shape[0]:,}")

# Review count per star value in ascending star order; the plotting cell reuses raw_counts.
raw_counts = combined_raw["rating"].value_counts().sort_index()
print("\nStar‐level counts before balancing:", raw_counts, sep="\n")

## 5) Balance the Dataset by Star (SL‐priority, then USA) -> `balanced_df`


In [None]:
# Overall size of the balanced set and the equal share each star level receives.
# The per-star share is derived from the star list, so the two cannot drift apart.
TOTAL_TARGET = 100_000
STAR_LEVELS = [1.0, 2.0, 3.0, 4.0, 5.0]
per_star_target = TOTAL_TARGET // len(STAR_LEVELS)
print(f"→ Target: {per_star_target:,} reviews per star (1.0 through 5.0).")

# Check SL_clean and USA_clean counts for each star
print("\nSL_clean counts per star:")
print(sl_clean["rating"].value_counts().sort_index())
print("\nUSA_clean counts per star:")
print(usa_clean["rating"].value_counts().sort_index())

# Build balanced subsets for each star: SL rows have priority, USA rows only
# top up whatever SL cannot supply.
balanced_parts = []

for star in STAR_LEVELS:
    # All SL_clean rows of this star
    sl_star_df = sl_clean[sl_clean["rating"] == star]
    num_sl_available = len(sl_star_df)

    if num_sl_available >= per_star_target:
        # SL alone suffices -> just subsample
        sl_taken = sl_star_df.sample(n=per_star_target, random_state=RANDOM_SEED)
        # Zero USA rows for this star. An empty slice keeps usa_clean's dtypes,
        # unlike pd.DataFrame(columns=...), whose all-object columns make
        # pd.concat warn (pandas >= 2.1) and eventually degrade numeric columns.
        usa_taken = usa_clean.iloc[0:0]
        print(f"Star={star:.1f}: took {per_star_target:,} from SL (SL had {num_sl_available:,}).")

    else:
        # SL has fewer than needed → take all SL, then fill with USA
        sl_taken = sl_star_df
        needed_from_usa = per_star_target - num_sl_available

        # Gather USA_clean rows of this star
        usa_star_df = usa_clean[usa_clean["rating"] == star]
        if len(usa_star_df) < needed_from_usa:
            raise ValueError(
                f"Not enough USA_clean reviews of star={star:.1f} to fill {needed_from_usa:,}. "
                f"USA_clean has only {len(usa_star_df):,}."
            )
        usa_taken = usa_star_df.sample(n=needed_from_usa, random_state=RANDOM_SEED)
        print(f"Star={star:.1f}: took {num_sl_available:,} from SL + {needed_from_usa:,} from USA "
              f"(SL had only {num_sl_available:,}).")

    # Queue SL rows first, then USA rows, skipping empty frames so they never
    # take part in the concat below. Row order matches the previous per-star
    # concat, so the seeded shuffle produces the same permutation.
    balanced_parts.extend(part for part in (sl_taken, usa_taken) if not part.empty)

# Concatenate all star‐level subsets
balanced_df = pd.concat(balanced_parts, ignore_index=True)

# Shuffle entire DataFrame so stars are interleaved
balanced_df = balanced_df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

# Verify final counts (the expected value is derived from per_star_target
# instead of being hard-coded, so it stays correct if TOTAL_TARGET changes).
balanced_counts = balanced_df["rating"].value_counts().sort_index()
print(f"\nFinal balanced counts per star (should all equal {per_star_target:,}):")
print(balanced_counts)
print(f"\nTotal balanced reviews = {len(balanced_df):,} (should be {TOTAL_TARGET:,}).")
assert (balanced_counts == per_star_target).all(), "per-star counts are not balanced"

## 6) (Optional) Plot “Before vs. After” Star Distributions

In [None]:
# Two bar charts side by side: star distribution before and after balancing.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (counts to draw, bar colour, panel title): left panel first, right panel second
panels = [
    (raw_counts, "skyblue", "Before Balancing (mismatch‐filtered)"),
    (balanced_counts, "orange", "After Balancing (SL‐priority)"),
]

for ax, (star_counts, bar_color, panel_title) in zip(axes, panels):
    star_counts.plot(kind="bar", color=bar_color, alpha=0.7, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Star Rating")
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()

## 7) Re‐nest & Save Balanced JSON to Disk

In [None]:
# Re-nest the flat balanced frame: one entry per place, each carrying the list
# of its reviews, with the rating rendered back as "<rating> stars".
output_balanced = []
for place_name, grp in balanced_df.groupby("place"):
    place_reviews = []
    for row in grp.itertuples(index=False):
        place_reviews.append(dict(
            source=row.source,
            author=row.author,
            date=row.date,
            text=row.text,
            rating=f"{row.rating} stars",
        ))
    output_balanced.append({"name": place_name, "reviews": place_reviews})

# ensure_ascii=False keeps non-ASCII review text readable in the UTF-8 output file
with open("correct_reviews_balanced.json", "w", encoding="utf-8") as f:
    json.dump(output_balanced, f, ensure_ascii=False, indent=2)

print("Written balanced JSON to `correct_reviews_balanced.json`")