# 4 | Merchant Review Interruption & Rating Decline Analysis

1. **Imports & parameters**
2. **Read `business.json` → basic merchant information**
3. **Aggregate `review.json` → temporal features for each merchant**
4. **Parse `checkin.json` → total check-ins for each merchant**
5. **Merge the tables and calculate the interruption & rating-decline flags**
6. **Output the profile table `business_churn_profile.csv`, run a quick sanity check, and (optionally) export state/category-level summaries for visualization**


In [1]:
# 1 | Parameters & Functions
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# === Directories ===
# RAW is the input folder read by the later cells; PROC receives every CSV
# this notebook writes and is created up front if it does not exist yet.
RAW = Path("../data/raw")
PROC = Path("../data/processed")
PROC.mkdir(parents=True, exist_ok=True)

# === Parameters ===
# Reference "end of observation" date; keep consistent with the user side.
T_END = pd.Timestamp("2022-01-19")
# No reviews in the last WINDOW_DAYS days → review interruption.
WINDOW_DAYS = 365
# Recent average rating must be lower than the historical one by more than this many stars.
RATING_DROP = 0.3
# Minimum number of reviews a merchant needs before it can be flagged for "rating decline".
MIN_HIST_CNT = 10
# Length of the "recent" window, in months.
RECENT_MONTHS = 12
# Number of rows per chunk when streaming large JSON files.
CHUNKSIZE = 200_000

In [23]:
# 2 | Read business.json basic information
# Only these columns are carried forward into the merchant profile.
biz_cols = [
    "business_id", "name", "city", "state", "categories",
    "stars", "review_count", "is_open", "latitude", "longitude",
]

# Load the line-delimited JSON, keep the selected columns, and normalise the
# state code in one chain: anything that is not exactly two capital letters
# (missing values included) is replaced by the placeholder "XX".
biz_df = (
    pd.read_json(
        RAW / "yelp_academic_dataset_business.json",
        lines=True,
        encoding="utf-8",
    )
    .loc[:, biz_cols]
    .assign(
        state=lambda frame: frame["state"].where(
            frame["state"].str.match(r"^[A-Z]{2}$", na=False), "XX"
        )
    )
)

print(biz_df.shape)
biz_df.head()

(150346, 10)


Unnamed: 0,business_id,name,city,state,categories,stars,review_count,is_open,latitude,longitude
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,CA,"Doctors, Traditional Chinese Medicine, Naturop...",5.0,7,0,34.426679,-119.711197
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton,MO,"Shipping Centers, Local Services, Notaries, Ma...",3.0,15,1,38.551126,-90.335695
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,AZ,"Department Stores, Shopping, Fashion, Home & G...",3.5,22,0,32.223236,-110.880452
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,80,1,39.955505,-75.155564
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane,PA,"Brewpubs, Breweries, Food",4.5,13,1,40.338183,-75.471659


In [24]:
# 3 | Aggregate review.json by business_id
#
# Streams the review file in chunks and reduces it to one row per business:
#   date_first / date_last : earliest / latest parseable review date
#   cnt_hist   / star_hist : number and mean rating of ALL reviews
#                            (the "historical" figures include recent reviews)
#   cnt_recent / star_recent : number and mean rating of reviews dated on or
#                              after `recent_cut` (star_recent is NaN if none)

# Start of the "recent" window: RECENT_MONTHS calendar months before T_END.
# (A 30-day-per-month approximation would give only 360 days for 12 months.)
recent_cut = T_END - pd.DateOffset(months=RECENT_MONTHS)

reader = pd.read_json(
    RAW / "yelp_academic_dataset_review.json",
    lines=True,
    chunksize=CHUNKSIZE,
    encoding="utf-8",
)

# One partial aggregate (indexed by business_id) per chunk; a business whose
# reviews span several chunks appears in several partials and is combined below.
partials = []

for chunk in tqdm(reader, desc="Scanning reviews"):
    chunk = chunk[["business_id", "stars", "date"]].copy()
    chunk["date"] = pd.to_datetime(chunk["date"], errors="coerce")

    # Unparseable dates become NaT; NaT >= recent_cut is False, so such rows
    # count towards cnt_hist but never towards the recent window.
    is_recent = chunk["date"] >= recent_cut
    chunk["recent_n"] = is_recent.astype("int64")
    chunk["recent_stars"] = chunk["stars"].where(is_recent, 0)

    partials.append(
        chunk.groupby("business_id").agg(
            date_first=("date", "min"),
            date_last=("date", "max"),
            star_sum_hist=("stars", "sum"),
            cnt_hist=("stars", "size"),
            star_sum_recent=("recent_stars", "sum"),
            cnt_recent=("recent_n", "sum"),
        )
    )

# Combine the per-chunk partials: min/max skip NaT, sums and counts add up.
rev_agg = (
    pd.concat(partials)
      .groupby(level=0)
      .agg(
          date_first=("date_first", "min"),
          date_last=("date_last", "max"),
          star_sum_hist=("star_sum_hist", "sum"),
          cnt_hist=("cnt_hist", "sum"),
          star_sum_recent=("star_sum_recent", "sum"),
          cnt_recent=("cnt_recent", "sum"),
      )
      .reset_index()
)

# Averages; star_recent stays NaN when there is no recent review.
rev_agg["star_hist"] = rev_agg["star_sum_hist"] / rev_agg["cnt_hist"]
rev_agg["star_recent"] = (
    rev_agg["star_sum_recent"] / rev_agg["cnt_recent"]
).where(rev_agg["cnt_recent"] > 0)

rev_agg = rev_agg.drop(columns=["star_sum_hist", "star_sum_recent"])

# Exactly one row per business is required for the later left-merge.
assert rev_agg["business_id"].is_unique

print(rev_agg.shape)
rev_agg.head(2)


Scanning reviews: 35it [02:46,  4.76s/it]


(150346, 7)


Unnamed: 0,business_id,date_first,date_last,cnt_hist,cnt_recent,star_hist,star_recent
0,--ZVrH2X2QXBFdCilbirsw,2007-08-31 17:35:11,2018-02-24 00:53:41,36,0,4.722222,
1,--_9CAxgfXZmoFdNIRrhHA,2010-07-13 20:40:56,2021-08-21 21:44:14,12,1,3.25,5.0


In [26]:
# 4 | Parse checkin.json -> Total number of check-ins
#
# Each line of the file is one JSON object whose "date" field is a
# comma-separated string of check-in timestamps. The result `ckn_df` has one
# row per business_id with the number of timestamps found. If the file is
# absent, an empty frame with the same two columns is produced so the later
# merge still works.
ckn_path = RAW / "yelp_academic_dataset_checkin.json"

if ckn_path.exists():
    counts = {}
    with open(ckn_path, encoding="utf-8") as f:
        for line in tqdm(f, desc="Parsing checkins"):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            d = json.loads(line)
            # Drop empty pieces so an empty/missing "date" counts as 0 check-ins
            # ("".split(", ") would otherwise yield one empty element).
            stamps = [s for s in (d.get("date") or "").split(",") if s.strip()]
            # Accumulate instead of overwrite in case a business_id repeats.
            bid = d["business_id"]
            counts[bid] = counts.get(bid, 0) + len(stamps)
    ckn_df = pd.DataFrame(list(counts.items()),
                          columns=["business_id", "checkins_total"])
else:
    ckn_df = pd.DataFrame(columns=["business_id", "checkins_total"])

print(ckn_df.shape)
ckn_df.head()

Parsing checkins: 131930it [00:00, 170197.00it/s]


(131930, 2)


Unnamed: 0,business_id,checkins_total
0,---kPU91CF4Lq2-WlRu9Lw,11
1,--0iUa4sNDFiZFrAdIWhZQ,10
2,--30_8IhuyMHbSOcNWd6DQ,2
3,--7PUidqRWpRSpXebiyxTg,10
4,--7jw19RH9JKXgFohspgQw,26


In [32]:
# 5 | Merge & Flags
# Left-join the review aggregates and the check-in totals onto the business
# table, so every business keeps its row even without reviews or check-ins
# (the joined columns are then missing and all comparisons below are False).
df = biz_df.merge(rev_agg, on="business_id", how="left")
df = df.merge(ckn_df, on="business_id", how="left")

# --- No-review flag ---
# True when the latest review is older than WINDOW_DAYS before T_END.
interruption_cut = T_END - pd.Timedelta(days=WINDOW_DAYS)
df["no_review_recent"] = df["date_last"].lt(interruption_cut)

# --- Rating drop flag ---
# Only businesses with at least MIN_HIST_CNT reviews are eligible.
enough_history = df["cnt_hist"].ge(MIN_HIST_CNT)
# Case A: No reviews in the recent window ⇒ directly marked as "drop".
silent_recently = df["cnt_recent"].eq(0)
# Case B: Has recent reviews, but their average is more than RATING_DROP
# stars below the historical average.
rated_lower = (
    df["cnt_recent"].gt(0)
    & df["star_recent"].notna()
    & (df["star_recent"] + RATING_DROP).lt(df["star_hist"])
)
df["rating_drop"] = enough_history & (silent_recently | rated_lower)

# --- Combined churn flag (for still-operating businesses only) ---
# NOTE(review): no_review_recent requires date_last to be older than
# T_END - WINDOW_DAYS. If the recent-review window behind cnt_recent reaches
# back no further than that (it is defined where rev_agg is built — confirm),
# every such row also has cnt_recent == 0, so Case B can never influence
# churn_flag and the flag reduces to
# is_open & no_review_recent & (cnt_hist >= MIN_HIST_CNT). Confirm this is intended.
df["churn_flag"] = df["is_open"].eq(1) & df["no_review_recent"] & df["rating_drop"]

print("⚑ Number of businesses with both churn and rating drop =", int(df["churn_flag"].sum()))
df.loc[df["churn_flag"]].head()


⚑ Number of businesses with both churn and rating drop = 14132


Unnamed: 0,business_id,name,city,state,categories,stars,review_count,is_open,latitude,longitude,date_first,date_last,cnt_hist,cnt_recent,star_hist,star_recent,checkins_total,no_review_recent,rating_drop,churn_flag
6,n_0UpQx1hsNbnPUSlodU8w,Famous Footwear,Brentwood,MO,"Sporting Goods, Fashion, Shoe Stores, Shopping...",2.5,13,1,38.627695,-90.340465,2010-09-20 15:26:47,2019-12-30 19:59:54,13,0,2.307692,,79.0,True,True,True
26,noByYNtDLQAra9ccqxdfDw,H&M,Santa Barbara,CA,"Women's Clothing, Accessories, Children's Clot...",3.0,24,1,34.420209,-119.70046,2011-06-24 03:17:08,2020-07-26 18:51:31,24,0,2.958333,,289.0,True,True,True
32,8sshLb4UU7emeUDvtJWnpA,DanceLine,Paoli,PA,"Dance Wear, Sports Wear, Children's Clothing, ...",4.5,11,1,40.041585,-75.484953,2011-10-16 13:43:29,2019-08-31 23:27:26,11,0,4.454545,,27.0,True,True,True
38,LcAozWCMLGjwRbokaJAKMg,Edwardsville Children's Museum,Edwardsville,IL,"Museums, Kids Activities, Arts & Entertainment...",4.5,12,1,38.804395,-89.949733,2011-10-19 17:35:47,2020-05-12 18:44:57,13,0,4.384615,,7.0,True,True,True
46,JX4tUpd09YFchLBuI43lGw,Naked Cyber Cafe & Espresso Bar,Edmonton,AB,"Arts & Entertainment, Music Venues, Internet S...",4.0,12,1,53.544682,-113.506589,2008-10-09 06:52:43,2020-06-30 20:23:03,13,0,4.0,,14.0,True,True,True


In [33]:
# 6 | Save profile & sanity check
# Persist the full merchant profile (all merged columns plus the three flags).
out_path = PROC / "business_churn_profile.csv"
df.to_csv(out_path, index=False)
print("✅ business_churn_profile.csv saved →", out_path)

# Basic statistics: how many businesses are open, how many are flagged.
open_total = (df["is_open"] == 1).sum()
churn_total = df["churn_flag"].sum()
print("\nTotal number of operating businesses:", open_total)
print("Number of businesses marked as churn:", churn_total)

# Top 5 states by churn: count flagged businesses per state, largest first.
churned_only = df[df["churn_flag"]]
per_state = churned_only.groupby("state")["business_id"].count()
state_stats = per_state.sort_values(ascending=False).head(5)
print("\n⚑ Top 5 states with the most churned businesses:")
print(state_stats)

✅ business_churn_profile.csv saved → ..\data\processed\business_churn_profile.csv

Total number of operating businesses: 119698
Number of businesses marked as churn: 14132

⚑ Top 5 states with the most churned businesses:
state
PA    3789
FL    1797
AB    1134
LA    1091
TN    1025
Name: business_id, dtype: int64


In [34]:
# 6 | Optional – Output category/state-level aggregation
## 6-A By state
# Per state: number of businesses, number flagged as churn, and their ratio.
state_df = df.groupby("state").agg(
    total_business=("business_id", "count"),
    churned=("churn_flag", "sum"),
)
state_df["churn_rate"] = state_df["churned"] / state_df["total_business"]
state_df = state_df.reset_index()
state_df.to_csv(PROC / "state_business_churn.csv", index=False)

## 6-B By main category (using the first tag in the categories string)
def pick_first(cat):
    """Return the first comma-separated tag of a categories string.

    Parameters
    ----------
    cat : object
        The raw ``categories`` value: normally a string such as
        ``"Restaurants, Food"``, but possibly ``None``/NaN when missing.

    Returns
    -------
    str
        The first tag with surrounding whitespace removed, or ``"Unknown"``
        when ``cat`` is not a string or its first tag is empty/blank (so an
        empty string never becomes its own unnamed category).
    """
    if not isinstance(cat, str):
        return "Unknown"
    first_tag = cat.split(",", 1)[0].strip()
    return first_tag or "Unknown"

# Derive each business's main category from its categories string.
df["main_cat"] = df["categories"].map(pick_first)

# Per main category: number of businesses, number flagged as churn, and their
# ratio, ordered by the number of churned businesses (largest first).
cat_df = df.groupby("main_cat").agg(
    total_business=("business_id", "count"),
    churned=("churn_flag", "sum"),
)
cat_df["churn_rate"] = cat_df["churned"] / cat_df["total_business"]
cat_df = cat_df.sort_values("churned", ascending=False).reset_index()
cat_df.to_csv(PROC / "business_churn_by_category.csv", index=False)

print("✅ Tableau summary files exported.")
cat_df.head(10)

✅ Tableau summary files exported.


Unnamed: 0,main_cat,total_business,churned,churn_rate
0,Restaurants,15290,808,0.052845
1,Beauty & Spas,4385,607,0.138426
2,Shopping,5480,596,0.108759
3,Food,6783,529,0.077989
4,Health & Medical,3058,376,0.122956
5,Home Services,3793,355,0.093593
6,Automotive,3449,350,0.101479
7,Active Life,1919,286,0.149036
8,Local Services,2642,258,0.097653
9,Event Planning & Services,2067,247,0.119497
