In [1]:
"""
Yelp data cleaning script for a much smaller sample dataset.
This version aggressively filters and samples to reduce size.
"""
import os
import pandas as pd
import numpy as np
from datetime import datetime
from hashlib import md5
from sklearn.cluster import DBSCAN
from scipy.stats import zscore
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Configuration: sample sizes and thresholds
OUTPUT_DIR = "data_filtered"
os.makedirs(OUTPUT_DIR, exist_ok=True)

BUSINESS_SAMPLE = 1000     # number of businesses to keep
MIN_BUSINESS_REVIEWS = 20 # min reviews per business
REVIEW_SAMPLE_PER_BIZ = 50
MIN_USER_REVIEWS = 5
MIN_TIPS_PER_BIZ = 5
MIN_CHECKINS_PER_BIZ = 5
DATE_MIN, DATE_MAX = "2011-01-01", "2016-12-31"

In [3]:

# Helper functions

def is_valid_hours(h):
    if not isinstance(h, dict): return False
    ok = []
    for v in h.values():
        try:
            s,e = v.split("-")
            ok.append(int(s.replace(":", "")) < int(e.replace(":", "")))
        except:
            ok.append(False)
    return all(ok)

In [4]:
# 1) Businesses
biz = pd.read_json("Data/business.json", lines=True)
# Drop missing critical fields
biz = biz.dropna(subset=["business_id", "name", "city", "state", "latitude", "longitude"])
# Only open businesses with enough reviews
biz = biz[(biz.is_open == 1) & (biz.review_count >= MIN_BUSINESS_REVIEWS)]
# Valid hours only
biz = biz[biz.hours.apply(is_valid_hours)]
# Round stars
biz.stars = (biz.stars * 2).round() / 2
# Select top by review_count then sample
biz = biz.sort_values("review_count", ascending=False).head(BUSINESS_SAMPLE)
biz.to_csv(os.path.join(OUTPUT_DIR, "business_filtered.csv"), index=False)
print(f"Saved {len(biz)} businesses")

Saved 1000 businesses


In [5]:
# 2) Reviews
analyzer = SentimentIntensityAnalyzer()
biz_ids = set(biz.business_id)
reader = pd.read_json("Data/review.json", lines=True, chunksize=100000)
out_reviews = []
for chunk in reader:
    # filter by business
    chunk = chunk[chunk.business_id.isin(biz_ids)]
    if chunk.empty:
        continue
    # parse date
    chunk["dts"] = pd.to_datetime(chunk.date, errors='coerce')
    chunk = chunk[(chunk.dts >= DATE_MIN) & (chunk.dts <= DATE_MAX)]
    # length & stars
    chunk = chunk[(chunk.text.str.len() >= 20) & (chunk.stars.between(1,5))]
    # sentiment filter
    chunk['s'] = chunk.text.map(lambda t: analyzer.polarity_scores(t)['compound'])
    # keep neutral to mixed
    chunk = chunk[chunk.s.abs() < 0.8]
    # sample per business
    chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
    out_reviews.append(chunk)

  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_index(drop=True)
  chunk = chunk.groupby('business_id').apply(lambda df: df.sample(min(len(df), REVIEW_SAMPLE_PER_BIZ))).reset_i

In [6]:
reviews = pd.concat(out_reviews, ignore_index=True)
reviews.to_csv(os.path.join(OUTPUT_DIR, "review_small.csv"), index=False)
print(f"Saved {len(reviews)} reviews")

Saved 60935 reviews


In [7]:
# 3) Users
usr_ids = set(reviews.user_id)
reader = pd.read_json("Data/user.json", lines=True, chunksize=50000)
out_users = []
for chunk in reader:
    chunk = chunk[chunk.user_id.isin(usr_ids)]
    # enforce activity
    chunk = chunk[chunk.review_count >= MIN_USER_REVIEWS]
    out_users.append(chunk)
users = pd.concat(out_users, ignore_index=True)
users.to_csv(os.path.join(OUTPUT_DIR, "user_small.csv"), index=False)
print(f"Saved {len(users)} users")

Saved 43368 users


In [13]:
# 4) Checkins (fast, limited)
chk = pd.read_json("Data/checkin.json", lines=True)
chk = chk[chk.business_id.isin(biz_ids)]
chk = chk.head(1000)  # limit for speed
# extract ISO timestamps per row
chk['times'] = chk.date.str.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")
# explode lists
chk = chk.explode('times').dropna(subset=['times'])
chk['times'] = pd.to_datetime(chk['times'], errors='coerce')
chk = chk.dropna(subset=['times'])
# split into date and time columns
chk['date'] = chk['times'].dt.date
chk['time'] = chk['times'].dt.time
chk = chk[['business_id', 'date', 'time']]
# keep businesses with enough
chk = chk.groupby('business_id').filter(lambda df: len(df) >= MIN_CHECKINS_PER_BIZ)
chk.to_csv(os.path.join(OUTPUT_DIR, "checkin_small.csv"), index=False)
print(f"Saved {len(chk)} checkins")


Saved 1304861 checkins


In [14]:
# 5) Tips
tips = pd.read_json("Data/tip.json", lines=True)
# filter
tips = tips[tips.business_id.isin(biz_ids)]
tips['d'] = pd.to_datetime(tips.date, errors='coerce')
tips = tips[(tips.d >= DATE_MIN) & (tips.d <= DATE_MAX)]
tips = tips.groupby('business_id').filter(lambda df: len(df) >= MIN_TIPS_PER_BIZ)
tips.to_csv(os.path.join(OUTPUT_DIR, "tips_small.csv"), index=False)
print(f"Saved {len(tips)} tips")

Saved 56044 tips
