In [2]:
import json

# Build the crawler input file (startUrls + review options) from a previous
# Google Places crawl export.
PLACES_IN = "dataset_crawler-google-places_2025-08-28_15-58-50-060.json"
START_URLS_OUT = "clementi_places_deduped.json"

# Read as UTF-8 explicitly: the platform default encoding can fail on
# non-ASCII place names.
with open(PLACES_IN, encoding="utf-8") as f:
    places = json.load(f)

startUrls = []
seen_urls = set()  # the output is meant to be de-duplicated; keep first occurrence
for p in places:
    pid = p.get("placeId")
    url = p.get("url")
    if pid:
        # Prefer the stable place_id link over whatever URL was crawled.
        target = f"https://www.google.com/maps/place/?q=place_id:{pid}"
    elif url:
        target = url
    else:
        continue  # nothing usable to point the crawler at
    if target in seen_urls:
        continue
    seen_urls.add(target)
    startUrls.append({"url": target})

print(f"Built {len(startUrls)} startUrls")

out = {
    "startUrls": startUrls,
    "maxReviews": 0,
    "reviewsSort": "newest",
    "language": "en"
}

with open(START_URLS_OUT, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

# Reusing the enclosing quote character inside an f-string replacement field is
# a SyntaxError before Python 3.12, so the file name goes through a variable.
print(f"Wrote {START_URLS_OUT} with {len(startUrls)} startUrls.")


Built 50 startUrls
Wrote clementi_places_deduped.json with 50 startUrls.


In [3]:
import pandas as pd
import json, re, unicodedata
import urllib.parse as urlq

# === EDIT THESE to match your CSV ===
# Every later step in this cell reads these names, so adjust them here only.
# CSV_IN is the path handed to pd.read_csv below.
CSV_IN = "google_reviews_singapore.csv"
COL_NAME = "place_name"          # restaurant name
COL_PLACEID = "place_id"   # Google place_id (if available)
COL_ADDR = "address"       # optional: street/address column (or set to None)
COL_URL = None             # optional: if you already have a Google Maps URL column

# === 1) load ===
# Read the CSV with pandas defaults; `df` is the frame the following steps
# add their helper columns to.
df = pd.read_csv(CSV_IN)

# === 2) build a normalized name for fuzzy dedupe (used when place_id is missing) ===
def normalize(s: str) -> str:
    """Return a comparison-friendly form of ``s`` for fuzzy de-duplication.

    The text is case-folded, accents are removed (``é`` -> ``e``), punctuation
    is replaced by spaces and runs of whitespace are collapsed.

    Non-Latin scripts (Chinese, Tamil, Thai, ...) are preserved. Forcing the
    text through ASCII would erase them, turning every such name into ``""``
    and making unrelated places share one fallback key.

    Args:
        s: Raw text. Any non-string value (``None``, ``NaN``, numbers) is
            treated as missing.

    Returns:
        The normalized string, or ``""`` when the input is missing.
    """
    if not isinstance(s, str):
        return ""
    # Decompose, case-fold, then decompose again: compatibility characters can
    # expand to upper-case letters, and case-folding can itself produce
    # combining marks (e.g. dotted capital I).
    decomposed = unicodedata.normalize("NFKD", s).casefold()
    decomposed = unicodedata.normalize("NFKD", decomposed)
    # Drop only the combining marks (accents), keeping the base characters of
    # every script.
    base_chars = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Punctuation of any kind becomes a space, so ASCII and typographic
    # apostrophes normalize the same way.
    spaced = re.sub(r"[^\w\s]", " ", base_chars)
    return re.sub(r"\s+", " ", spaced).strip()

# Normalized helper columns. The name column is always used; the address is
# used only when a column is configured and actually present in the CSV,
# otherwise every row gets an empty address.
has_addr_col = bool(COL_ADDR) and COL_ADDR in df.columns
df["_name_norm"] = df[COL_NAME].apply(normalize)
df["_addr_norm"] = df[COL_ADDR].apply(normalize) if has_addr_col else ""

# "<name> | <address>": the composite key used for rows without a place_id.
df["_fallback_key"] = df["_name_norm"].add(" | ").add(df["_addr_norm"])

# === 3) choose a dedupe key: prefer place_id, else normalized name+address ===
def dedupe_key(row):
    """Return the de-duplication key for one row.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) that contains
            ``"_fallback_key"`` and, optionally, the ``COL_PLACEID`` column.

    Returns:
        ``("PID", place_id)`` when the row has a usable place_id, otherwise
        ``("FALLBACK", row["_fallback_key"])``. The tag keeps the two key
        spaces from colliding.
    """
    raw = row.get(COL_PLACEID) if COL_PLACEID in row else None
    # Test for missing values on the raw object: stringifying first would turn
    # None / pd.NA into the bogus IDs "None" / "<NA>" and merge all such rows.
    if raw is not None and not pd.isna(raw):
        pid = str(raw).strip()
        # A literal "nan" string is still treated as missing.
        if pid and pid.lower() != "nan":
            return ("PID", pid)
    return ("FALLBACK", row["_fallback_key"])

# Tag every row with its key: place_id when usable, otherwise the normalized
# name+address fallback.
df["_dedupe_key"] = df.apply(dedupe_key, axis=1)

# === 4) decide which duplicate to keep ===
# Strategy used here: the first row seen for each key wins.
is_repeat = df.duplicated(subset=["_dedupe_key"], keep="first")
dedup = df.loc[~is_repeat].copy()

# Alternative ("most reviews wins"): if the CSV has a review-count column, e.g.
# SCORE_COL = "reviews_count", select df.loc[df.groupby("_dedupe_key")[SCORE_COL].idxmax()]
# instead of the keep-first selection above.

print(f"Original rows: {len(df)} | After dedupe: {len(dedup)}")

# === 5) (optional) create Google Maps URL if you need the startUrls JSON next ===
def build_gmaps_url(name, place_id=None, url_from_csv=None):
    """Build the Google Maps URL for one place.

    Args:
        name: Place name, used as the search query text.
        place_id: Optional Google place_id; appended as ``query_place_id``
            when present.
        url_from_csv: Optional ready-made URL; returned as-is (stripped) when
            present.

    Returns:
        ``url_from_csv`` if it holds a value, otherwise a
        ``https://www.google.com/maps/search/?api=1`` URL built from the name
        and, when available, the place_id.
    """
    def _text(value):
        # Missing cells arrive as None / NaN / pd.NA. NaN is truthy and
        # stringifies to "nan", so it needs an explicit test.
        if value is None or pd.isna(value):
            return ""
        return str(value).strip()

    ready_url = _text(url_from_csv)
    if ready_url and ready_url.lower() != "nan":
        return ready_url

    # A missing name becomes an empty query rather than the literal text "nan".
    search_url = f"https://www.google.com/maps/search/?api=1&query={urlq.quote(_text(name))}"

    pid = _text(place_id)
    if pid and pid.lower() != "nan":
        # Escape the id so stray characters cannot break the query string;
        # well-formed place_ids pass through unchanged.
        return f"{search_url}&query_place_id={urlq.quote(pid, safe='')}"
    # No place_id: fall back to a name-only search URL.
    return search_url

# Work out once which optional columns exist, then derive one URL per kept row.
pid_col_present = COL_PLACEID in dedup.columns
url_col_present = bool(COL_URL) and COL_URL in dedup.columns


def _row_to_url(r):
    """Feed one de-duplicated row's name / place_id / URL to build_gmaps_url."""
    place_name = r.get(COL_NAME, "")
    place_id = r.get(COL_PLACEID, "") if pid_col_present else None
    csv_url = r.get(COL_URL, "") if url_col_present else None
    return build_gmaps_url(place_name, place_id, csv_url)


dedup["gmaps_url"] = dedup.apply(_row_to_url, axis=1)

# Crawler input: same layout as the startUrls JSON written by the earlier cell.
start_urls = [{"url": link} for link in dedup["gmaps_url"]]
json_obj = dict(
    startUrls=start_urls,
    maxReviews=0,
    reviewsSort="newest",
    language="en",
)
with open("restaurants_dedup_startUrls.json", "w", encoding="utf-8") as fh:
    fh.write(json.dumps(json_obj, indent=2, ensure_ascii=False))
print("✔ Saved JSON -> restaurants_dedup_startUrls.json")



Original rows: 3120 | After dedupe: 643
✔ Saved JSON -> restaurants_dedup_startUrls.json
