## 2 | User Churn Labeling & Feature Engineering

This section labels users as churned or active based on their review activity and engineers both static and behavioral features for downstream modeling. It is organized into the following steps:

| Step | Section                                | Description                                                                                                                                                |
| ---- | -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| 0    | Imports & Global Constants             | Load libraries, define raw/processed paths, set project‑wide constants and helper functions (`clean_state`, `safe_read_json`).                              |
| 1    | Churn Label Generation                 | Stream through the review data to extract each user’s last review date, then assign a binary `churn_label` based on a 365‑day inactivity cutoff.          |
| 2    | User Static Features                   | Read `user.json`, select key fields (`review_count`, `average_stars`, `fans`, etc.), derive `friends_count`, `elite_years`, and `member_years`.               |
| 3    | Behavioral Features                    | Chunk‑aggregate per-user reaction counts (`useful`, `funny`, `cool`), review statistics (count/mean/variance), and average text length.                      |
| 4    | Merging & Missing Value Handling       | Left‑join churn labels, static features, and behavioral aggregates into a single DataFrame, drop unneeded columns, and fill numeric NaNs with zeros.        |
| 5    | Save Features                          | Write the final feature table to `data/processed/user_churn_features.csv`.                                                                                 |
| 6    | Train/Test Split (Time‑based Stratify) | Split users into train and test sets by last review date relative to the cutoff, and save the ID lists to `train_user_ids.txt` and `test_user_ids.txt`. |





In [1]:
# 0 | Imports & Global Constants
from pathlib import Path
import pandas as pd
from tqdm import tqdm

RAW   = Path("../data/raw")
PROC  = Path("../data/processed");  PROC.mkdir(exist_ok=True)

T_END        = pd.Timestamp("2022-01-19")   # by part 1
WINDOW_DAYS  = 365
CUTOFF_DATE  = T_END - pd.Timedelta(days=WINDOW_DAYS)

In [2]:
def clean_state(df, col="state"):
    mask = ~df[col].str.match(r"^[A-Z]{2}$", na=False)
    df.loc[mask, col] = "XX"
    return df

In [3]:
def safe_read_json(fname, usecols=None, chunksize=None):
    path = RAW / fname
    if chunksize:
        # Read in chunks and yield them
        reader = pd.read_json(path, lines=True, encoding="utf-8", chunksize=chunksize)
        for chunk in reader:
            yield chunk[usecols] if usecols else chunk
    else:
        df = pd.read_json(path, lines=True, encoding="utf-8")
        return df[usecols] if usecols else df

In [4]:
# 1 | gey `churn_label`
last_review = {}
for chunk in tqdm(
    safe_read_json(
        "yelp_academic_dataset_review.json",
        usecols=["user_id", "date"],
        chunksize=200_000
    ),
    desc="Scanning reviews"
):
    chunk = chunk.copy()
    chunk.loc[:, "date"] = pd.to_datetime(chunk["date"], errors="coerce")

    grp = chunk.groupby("user_id")["date"].max()

    for uid, dt_ in grp.items():
        if (uid not in last_review) or (dt_ > last_review[uid]):
            last_review[uid] = dt_

user_last_df = (
    pd.Series(last_review, name="last_review_date")
      .to_frame()
      .reset_index()
      .rename(columns={"index": "user_id"})
)
user_last_df["churn_label"] = (user_last_df["last_review_date"] < CUTOFF_DATE).astype(int)
user_last_df.head()


Scanning reviews: 35it [00:59,  1.69s/it]


Unnamed: 0,user_id,last_review_date,churn_label
0,---2PmXbF47D870stH1jqA,2019-04-27 17:35:51,1
1,---UgP94gokyCDuB5zUssA,2021-09-17 17:36:13,0
2,--4AjktZiHowEIBCMd4CZA,2019-12-26 17:46:25,1
3,--6PFZka7og6Khaw6oyjvQ,2017-10-15 02:19:43,1
4,--E0uVPphTORm_OiZ5KCvA,2018-04-05 23:30:41,1


In [5]:
# 1.5 Re-label churn based on fixed anchor (T_END)
ANCHOR = pd.Timestamp("2021-01-19")          # T_END - 365 days

user_last_df["churn_label"] = (user_last_df["last_review_date"] < ANCHOR).astype(int)
print("label Global Distribution:\n", user_last_df["churn_label"].value_counts())

label Global Distribution:
 churn_label
1    1670849
0     317080
Name: count, dtype: int64


In [6]:
# 2 | User Static Features (from `user.json`)
user_cols = [
    "user_id", "review_count", "average_stars",
    "fans", "friends", "elite", "yelping_since", "useful", "funny", "cool"
]
user_df = pd.read_json(
    RAW/"yelp_academic_dataset_user.json",
    lines=True,
    encoding="utf-8"
)[user_cols].copy()

user_df["friends"] = user_df["friends"].fillna("")
user_df["friends_count"] = user_df["friends"].str.split(", ").str.len()

s = user_df["elite"].str.split(",").str.len().fillna(0)
s = s.infer_objects(copy=False)
user_df["elite_years"] = s.astype(int)

user_df["yelping_since"] = pd.to_datetime(user_df["yelping_since"], errors="coerce")
user_df["member_years"] = (T_END.year - user_df["yelping_since"].dt.year + 1)

user_df = user_df.drop(columns=["friends", "elite", "yelping_since"])

user_df.head()

Unnamed: 0,user_id,review_count,average_stars,fans,useful,funny,cool,friends_count,elite_years,member_years
0,qVc8ODYU5SZjKXVBgXdI7w,585,3.91,267,7217,1259,5994,14995,1,16
1,j14WgRoU_-2ZE1aw1dXrJg,4333,3.74,3138,43091,13066,27281,4646,14,14
2,2WnXYQFK0hXEoTxPtV2zvg,665,3.32,52,2086,1010,1003,381,5,15
3,SZDeASXq7o05mMNLshsdIA,224,4.27,28,512,330,299,131,3,18
4,hA5lMy-EnncsH4JoR-hFGQ,79,3.54,1,29,15,7,27,1,16


In [36]:
# 3 | Behavioral Features (`review.json`)
behav_chunks = []
for chunk in tqdm(
    safe_read_json("yelp_academic_dataset_review.json",
                   usecols=["user_id","stars","useful","funny","cool","text"],
                   chunksize=200_000),
    desc="Behavior aggs"
):
    chunk = chunk.copy()
    chunk["text_len"] = chunk["text"].str.len()

    aggs = {
        "useful":   "sum",
        "funny":    "sum",
        "cool":     "sum",
        "stars":    ["count", "mean", "var"],
        "text_len": "sum",
    }
    g = chunk.groupby("user_id").agg(aggs)
    g.columns = ["_".join(map(str, c)).strip() for c in g.columns]
    behav_chunks.append(g)

behav_df = (
    pd.concat(behav_chunks)
      .groupby("user_id")
      .sum(numeric_only=True)
      .reset_index()
)

behav_df["avg_len"]    = behav_df["text_len_sum"] / behav_df["stars_count"]
behav_df["rating_std"] = (behav_df["stars_var"] / behav_df["stars_count"]).pow(0.5)

behav_df = behav_df.drop(columns=["text_len_sum", "stars_var"])

behav_df = behav_df.rename(columns=lambda x: x.replace("text_<lambda_0>", "avg_len"))

behav_df.head()

Behavior aggs: 35it [01:01,  1.76s/it]


Unnamed: 0,user_id,useful_sum,funny_sum,cool_sum,stars_count,stars_mean,avg_len,rating_std
0,---1lKK3aKOuomHnwAkAow,0,0,0,1,5.0,637.0,0.0
1,---2PmXbF47D870stH1jqA,45,3,23,28,95.0,469.928571,0.0
2,---UgP94gokyCDuB5zUssA,7,0,2,11,36.5,459.636364,0.213201
3,---fa6ZK37T9NjkGKI4oSg,1,0,0,1,1.0,175.0,0.0
4,---r61b7EpVPkb4UVme5tA,8,2,3,5,15.0,429.0,0.0


In [47]:
# 4 | Merging & Missing Value Handling
feat_df = (
    user_last_df
      .merge(user_df,  on="user_id", how="left")
      .merge(behav_df, on="user_id", how="left")
)

feat_df = feat_df.drop(columns=["stars_count"])

num_cols = feat_df.select_dtypes("number").columns
feat_df[num_cols] = feat_df[num_cols].fillna(0)

leak_cols = ["days_since_last_review"]
feat_df = feat_df.drop(columns=[c for c in leak_cols if c in feat_df.columns])

feat_df.head()

Unnamed: 0,user_id,last_review_date,churn_label,review_count,average_stars,fans,useful,funny,cool,friends_count,elite_years,member_years,useful_sum,funny_sum,cool_sum,stars_mean,avg_len,rating_std
0,---2PmXbF47D870stH1jqA,2019-04-27 17:35:51,1,36.0,4.98,3.0,63.0,4.0,36.0,420.0,1.0,11.0,45,3,23,95.0,469.928571,0.0
1,---UgP94gokyCDuB5zUssA,2021-09-17 17:36:13,0,16.0,3.44,1.0,8.0,0.0,3.0,3.0,1.0,9.0,7,0,2,36.5,459.636364,0.213201
2,--4AjktZiHowEIBCMd4CZA,2019-12-26 17:46:25,1,57.0,4.07,0.0,47.0,1.0,5.0,1.0,1.0,8.0,40,1,3,54.233333,488.291667,0.538774
3,--6PFZka7og6Khaw6oyjvQ,2017-10-15 02:19:43,1,25.0,4.96,3.0,33.0,12.0,23.0,14.0,1.0,15.0,5,1,4,39.0,422.125,0.0
4,--E0uVPphTORm_OiZ5KCvA,2018-04-05 23:30:41,1,6.0,3.67,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0,0,0,11.0,805.0,0.0


In [54]:
print(feat_df["churn_label"].value_counts(dropna=False))

churn_label
1    1670849
0     317080
Name: count, dtype: int64


In [55]:
# 6-A | Use stratified sampling to split the training and test sets based on labels.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    feat_df, test_size=0.15, stratify=feat_df["churn_label"], random_state=42
)

train_ids = train_df["user_id"];  test_ids = test_df["user_id"]

# ave the list for Part 3 to read
train_ids.to_csv(PROC/"train_user_ids.txt", index=False, header=False)
test_ids .to_csv(PROC/"test_user_ids.txt",  index=False, header=False)
print("✅ ID files updated")

# Visualize the new distribution
print("train rows:", len(train_ids), " test rows:", len(test_ids))
print("train label\n", train_df["churn_label"].value_counts())
print("test  label\n",  test_df["churn_label"].value_counts())

✅ ID files updated
train rows: 1689739  test rows: 298190
train label
 churn_label
1    1420221
0     269518
Name: count, dtype: int64
test  label
 churn_label
1    250628
0     47562
Name: count, dtype: int64


In [52]:
# 6-B | Save the new training and testing user ID lists
train_ids.to_csv(PROC/"train_user_ids.txt", index=False, header=False)
test_ids.to_csv( PROC/"test_user_ids.txt",  index=False, header=False)
print("✅ ID files updated: train_user_ids.txt & test_user_ids.txt")

✅ ID files updated: train_user_ids.txt & test_user_ids.txt
