In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from hashlib import md5
from collections import Counter
from sklearn.cluster import DBSCAN
from scipy.stats import zscore
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from langdetect import detect

In [2]:
# Ensure output directory exists
# Every cleaned CSV produced by this notebook is written under OUTPUT_DIR
# (a path relative to the kernel's working directory).
OUTPUT_DIR = "data_cleaned"
# exist_ok=True keeps this cell safe to re-run when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# ---------- Helper Functions ----------
def parse_and_clean_cats(cat_field):
    """Turn a comma-separated category string into a list of normalised names.

    Each entry is stripped of surrounding whitespace and lower-cased; empty
    entries are skipped. A trailing " food" (with the leading space) is cut
    off, so "Fast Food" becomes "fast" while a bare "Food" stays "food".

    Returns an empty list when ``cat_field`` is not a string (e.g. None/NaN).
    """
    if not isinstance(cat_field, str):
        return []

    suffix = " food"
    cleaned = []
    for piece in cat_field.split(","):
        name = piece.strip()
        if not name:
            # blank entry such as the gap in "a, ,b"
            continue
        name = name.lower()
        cleaned.append(name[: -len(suffix)] if name.endswith(suffix) else name)
    return cleaned

def _clock_fields(token):
    """Split a clock string on ':' and return its fields as a tuple of ints.

    "9:30" -> (9, 30); a colon-less value such as "0900" -> (900,).
    Raises ValueError when any field is not an integer.
    """
    return tuple(int(field) for field in token.strip().split(":"))


def valid_hours(day_hours):
    """Return True when ``day_hours`` ("start-end") opens strictly before it closes.

    The two sides are compared field by field (hours, then minutes, ...), so
    unpadded times such as "9:30-10:0" are ordered correctly. Simply deleting
    the colon and comparing the digits as one integer would rank 9:30 (930)
    after 10:0 (100).

    Ranges that do not end later than they start (e.g. "0:0-0:0", "17:0-2:0")
    yield False, as does anything that cannot be parsed: non-strings, a
    missing or extra "-", or non-numeric fields.
    """
    try:
        start, end = day_hours.split("-")
        return _clock_fields(start) < _clock_fields(end)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string; ValueError: wrong number of
        # "-"-separated parts or a non-integer field.
        return False

def valid_hours_dict(h):
    """Check that ``h`` is a dict whose every value is a valid hours string.

    Returns False for anything that is not a dict, and for a dict containing a
    non-string value or a value rejected by ``valid_hours``. An empty dict
    has nothing to reject and therefore returns True.
    """
    if not isinstance(h, dict):
        return False
    for span in h.values():
        if not isinstance(span, str):
            return False
        if not valid_hours(span):
            return False
    return True

def parse_dates(s):
    """Parse a comma-separated string of "%Y-%m-%d %H:%M:%S" timestamps.

    Entries that do not match the format are skipped silently. The result is
    a list of unique ``datetime`` objects in ascending order.
    """
    stamps = set()
    for token in s.split(","):
        try:
            stamps.add(datetime.strptime(token.strip(), "%Y-%m-%d %H:%M:%S"))
        except ValueError:
            # not a timestamp in the expected layout; ignore it
            pass
    return sorted(stamps)

def sentiment_ok(row, analyzer):
    """Return False when a review's star rating contradicts its text sentiment.

    ``row`` must expose ``.text`` and ``.stars``; ``analyzer`` must provide
    ``polarity_scores(text)`` returning a mapping with a "compound" entry.

    A row is rejected when it has 4+ stars but a negative compound score, or
    2 or fewer stars but a compound score above 0.5. Everything else passes.
    """
    compound = analyzer.polarity_scores(row.text)["compound"]
    praised_but_negative = row.stars >= 4 and compound < 0
    panned_but_positive = row.stars <= 2 and compound > 0.5
    return not (praised_but_negative or panned_but_positive)



In [5]:
# ---------- 1) Business ----------
# Reads Data/business.json (JSON lines), filters/normalises it and writes
# business_clean.csv under OUTPUT_DIR. Each step rebinds `df` to a new frame
# instead of mutating in place, so no step writes into a filtered slice.
print("Cleaning business.json...")
df = pd.read_json("Data/business.json", lines=True)
# Drop incomplete
mandatory = ["business_id","name","city","state","latitude","longitude","stars","review_count"]
df = df.dropna(subset=mandatory)
# Unique IDs by review_count: after the descending sort, keep="first" retains
# the duplicate with the highest review_count.
df = (
    df.sort_values("review_count", ascending=False)
      .drop_duplicates("business_id", keep="first")
)
# Filter review_count >=3
df = df[df.review_count >= 3]
# Stars bounds & rounding (to the nearest half star)
df = df[df.stars.between(0,5)]
df = df.assign(stars=(df.stars * 2).round() / 2.0)
# Hours: valid_hours_dict is False for non-dict values, so rows without an
# hours dict are removed here as well. astype(bool) keeps the mask boolean
# even when the frame is already empty.
df = df[df.hours.apply(valid_hours_dict).astype(bool)]
# Numeric outliers: drop rows more than 3 standard deviations from the mean.
# Written as "not (|z| > 3)" so that NaN z-scores (zero or undefined std,
# e.g. a constant column or a single row) keep the row instead of wiping the
# whole frame.
for col in ["review_count","stars"]:
    z = (df[col] - df[col].mean())/df[col].std()
    df = df[~(np.abs(z) > 3)]
# Categories
df = df.assign(categories=df.categories.apply(parse_and_clean_cats))

def clean_cats(cat_list):
    """Remove the standalone word "food" from each category name.

    Works on whole words only, so names that merely contain the letters
    (e.g. "seafood") are left intact. Whitespace is collapsed, names are
    lower-cased, and a category that consisted solely of "food" is dropped
    rather than kept as an empty string.
    """
    cleaned = []
    for cat in cat_list:
        words = [w for w in cat.strip().lower().split() if w != "food"]
        if words:
            cleaned.append(" ".join(words))
    return cleaned

df = df.assign(categories=df.categories.apply(clean_cats))
# Write
business_out = df.reset_index(drop=True)
business_out.to_csv(os.path.join(OUTPUT_DIR, "business_clean.csv"), index=False)
print("business_clean.csv saved")

Cleaning business.json...
business_clean.csv saved


In [6]:
# ---------- 2) Check-in ----------
# Reads Data/checkin.json (JSON lines), expands each business's
# comma-separated timestamp string into one row per check-in, and writes
# checkin_clean.csv under OUTPUT_DIR.
print("Cleaning checkin.json...")
raw = pd.read_json("Data/checkin.json", lines=True)
# Merge multiple rows for the same business into one comma-separated string.
raw = raw.groupby("business_id").agg({"date": lambda parts: ",".join(parts)}).reset_index()
# parse_dates returns the sorted, de-duplicated timestamps of each business.
raw["checkins"] = raw.date.apply(parse_dates)
raw = raw[raw.checkins.str.len() > 0]
# One row per (business, timestamp). explode() replaces a Python-level
# iterrows loop and, unlike building from an empty list of dicts, still
# yields both columns when nothing survives the filter above.
events = (
    raw[["business_id", "checkins"]]
    .explode("checkins")
    .rename(columns={"checkins": "checkin_time"})
    .reset_index(drop=True)
)
# explode() leaves an object column of datetime objects; restore datetime64.
events["checkin_time"] = pd.to_datetime(events["checkin_time"])
# Filter >=5 per business
agg = events.groupby("business_id").checkin_time.count().rename("n_checkins")
good = agg[agg >= 5].index
events = events[events.business_id.isin(good)]
# Write
events.to_csv(os.path.join(OUTPUT_DIR, "checkin_clean.csv"), index=False)
print("checkin_clean.csv saved")

Cleaning checkin.json...
checkin_clean.csv saved


In [7]:
# ---------- 3) Reviews ----------
# Streams Data/review.json (JSON lines) in chunks, filters each chunk and
# appends the survivors to review_clean.csv under OUTPUT_DIR.
# NOTE(review): de-duplication and z-score statistics are computed per chunk,
# so duplicates that land in different chunks are not removed and the outlier
# threshold varies from chunk to chunk.
print("Cleaning review.json (may take a while)...")
analyzer = SentimentIntensityAnalyzer()
chunksize = 100_000
first = True
mandatory = ["review_id","user_id","business_id","stars","date","text"]
vote_cols = ["useful","funny","cool"]
for chunk in pd.read_json("Data/review.json", lines=True, chunksize=chunksize):
    # mandatory + dedupe
    chunk = chunk.dropna(subset=mandatory).drop_duplicates("review_id", keep="first")
    # date parse & range. No explicit format: a fixed "%Y-%m-%d" combined with
    # errors="coerce" turns any string that also carries a time of day into
    # NaT. The upper bound is exclusive 2021-01-01 so that timestamps later
    # than midnight on 2020-12-31 are still kept.
    date_parsed = pd.to_datetime(chunk["date"], errors="coerce")
    chunk = chunk[(date_parsed >= "2010-01-01") & (date_parsed < "2021-01-01")]
    # stars & text length
    chunk = chunk[chunk["stars"].between(1,5)]
    chunk = chunk[chunk["text"].str.len() >= 5]
    # text hash dedupe (first occurrence of each identical text wins)
    text_hash = chunk["text"].map(lambda t: md5(t.encode('utf8')).hexdigest())
    chunk = chunk[~text_hash.duplicated(keep="first")]
    # vote counts
    for col in vote_cols:
        chunk = chunk[chunk[col] >= 0]
    # z-score outliers. "not (z >= 3)" keeps rows whose z-score is NaN, which
    # is what zscore yields for a constant column; "z < 3" would drop the
    # entire chunk in that case.
    for col in vote_cols:
        zs = np.asarray(zscore(chunk[col].to_numpy(dtype=float)))
        chunk = chunk[~(zs >= 3)]
    # sentiment mismatch. itertuples gives attribute access to .text/.stars
    # and, unlike DataFrame.apply(axis=1), always produces a 1-D boolean mask
    # even when the chunk is empty.
    keep = np.fromiter(
        (sentiment_ok(row, analyzer) for row in chunk.itertuples(index=False)),
        dtype=bool,
        count=len(chunk),
    )
    chunk = chunk[keep]
    # write: the first chunk creates the file with a header, later ones append
    mode = 'w' if first else 'a'
    header = first
    chunk.to_csv(os.path.join(OUTPUT_DIR, "review_clean.csv"), mode=mode, header=header, index=False)
    first = False
print("review_clean.csv saved.")

Cleaning review.json (may take a while)...
review_clean.csv saved.


In [9]:
# ---------- 4) Tips ----------
# Reads Data/tip.json (JSON lines), filters it and writes tips_clean.csv under
# OUTPUT_DIR. Helper columns (date_parsed, text_len, text_hash) stay on df_t
# but are excluded from the CSV.
print("Cleaning tip.json...")
df_t = pd.read_json("Data/tip.json", lines=True)
# drop missing
mandatory = ["text","date","compliment_count","business_id","user_id"]
df_t = df_t.dropna(subset=mandatory)
# parse date & filter. No explicit format: a fixed "%Y-%m-%d" combined with
# errors="coerce" turns any string that also carries a time of day into NaT.
# The upper bound is exclusive 2021-01-01 so that timestamps later than
# midnight on 2020-12-31 are still kept.
df_t = df_t.assign(date_parsed=pd.to_datetime(df_t["date"], errors='coerce'))
mask = (df_t.date_parsed >= "2010-01-01") & (df_t.date_parsed < "2021-01-01")
df_t = df_t[mask]
# compliments & outliers. compliment_count has no NaN left at this point
# (dropna above plus the >= 0 filter), so no nan_policy is needed.
df_t = df_t[df_t.compliment_count >= 0]
z_scores = np.asarray(zscore(df_t.compliment_count.to_numpy(dtype=float)))
if not df_t.empty:
    # "not (|z| >= 3)" keeps rows whose z-score is NaN (constant column);
    # "|z| < 3" would discard every row in that case.
    df_t = df_t[~(np.abs(z_scores) >= 3)]
# text length & dedupe
df_t = df_t.assign(text_len=df_t.text.str.len().fillna(0))
df_t = df_t[df_t.text_len >= 5]
df_t = df_t.assign(text_hash=df_t.text.map(lambda t: md5(t.encode('utf-8')).hexdigest()))
df_t = df_t.drop_duplicates(subset=["text_hash"])
# drop helpers and write
cols = [c for c in df_t.columns if c not in ["text_hash","text_len","date_parsed"]]
df_t[cols].to_csv(os.path.join(OUTPUT_DIR, "tips_clean.csv"), index=False)
print("tips_clean.csv saved")

Cleaning tip.json...
tips_clean.csv saved


In [6]:
# ----------- 5) Users (Chunked) -----------
# Streams Data/user.json (JSON lines) in chunks, filters each chunk and
# appends the survivors to user_clean.csv under OUTPUT_DIR.
# NOTE(review): de-duplication on user_id is per chunk only; a user_id that
# appears in two different chunks is written twice.
print("Cleaning user.json in chunks...")
chunk_size = 100000
first = True
mand_u = ["user_id","review_count","yelping_since","useful","funny","cool","fans","average_stars"]
for chunk in pd.read_json("Data/user.json", lines=True, chunksize=chunk_size):
    # 1) mandatory
    chunk = chunk.dropna(subset=mand_u)
    # 2) parse date. The upper bound is exclusive 2021-01-01: comparing
    #    against "2020-12-31" means midnight of that day, which would drop
    #    any join timestamp later on 31 Dec 2020.
    joined = pd.to_datetime(chunk.yelping_since, errors='coerce')
    chunk = chunk[(joined >= '2010-01-01') & (joined < '2021-01-01')]
    # 3) non-neg
    for col in ['review_count','useful','funny','cool','fans','average_stars']:
        chunk = chunk[chunk[col] >= 0]
    # 4) dedupe
    chunk = chunk.drop_duplicates('user_id')
    # 5) drop helpers
    # NOTE(review): yelping_since is source data, not a derived helper, yet it
    # is removed from the output here - confirm this is intended.
    chunk = chunk.drop(columns=['yelping_since'])
    # 6) write: the first chunk creates the file with a header, later ones append
    mode = 'w' if first else 'a'
    header = first
    chunk.to_csv(os.path.join(OUTPUT_DIR, "user_clean.csv"), mode=mode, header=header, index=False)
    print(f"Chunk: wrote {len(chunk)} users")
    first = False
print("Users cleaned.")

Cleaning user.json in chunks...
Chunk: wrote 82748 users
Chunk: wrote 95253 users
Chunk: wrote 92513 users
Chunk: wrote 92748 users
Chunk: wrote 90169 users
Chunk: wrote 92816 users
Chunk: wrote 92662 users
Chunk: wrote 90450 users
Chunk: wrote 93594 users
Chunk: wrote 90633 users
Chunk: wrote 93601 users
Chunk: wrote 90748 users
Chunk: wrote 94419 users
Chunk: wrote 90868 users
Chunk: wrote 95075 users
Chunk: wrote 88095 users
Chunk: wrote 94758 users
Chunk: wrote 89777 users
Chunk: wrote 95235 users
Chunk: wrote 80557 users
Users cleaned.
