# Labeled Dataset Builder — Google Trends (pytrends)

This notebook creates a **labeled dataset** suitable for training classification models (e.g., XGBoost) to detect **emerging** topics.
It focuses only on **data creation & labeling** (no training).

**Pipeline**  
1. Configure seeds, timeframe, and labeling rules.  
2. Fetch Trends for each seed **plus related queries** (cohort context).  
3. Build **leakage-safe, time-indexed features** (only past data to predict the future).  
4. Create labels using a simple rule (e.g., **future-window lift** — the mean over the next `FUTURE_HORIZON` observations relative to the recent short-window mean — > threshold).  
5. Save a single table (`CSV` + `Parquet`) for modeling.


In [None]:
!pip -q install pytrends statsmodels pandas numpy pyarrow

In [None]:
import os, time, math, json, random, datetime as dt
import numpy as np
import pandas as pd
from typing import List, Optional, Callable, Any

from pytrends.request import TrendReq
from statsmodels.tsa.seasonal import STL

## 0. Parameters — seeds, scope, windows, labels

In [None]:
# ==== INPUT KEYWORDS ===
# Seed search terms. Each seed is fetched from Google Trends (optionally together
# with its related queries) and is later used as a `keyword` when building features.
# The inline headings below only group the seeds for readability; they have no
# effect on how the list is processed.
SEED_KEYWORDS = [

    # Genres
    "romance movie", "action movie", "science fiction movie",
    "superhero movie", "horror movie", "psychological thriller",
    "mystery movie", "fantasy movie", "crime drama",
    "biographical film", "historical drama", "animated movie",
    "coming of age movie", "musical film", "family movie",

    # Themes / Topics
    "climate change movie", "ai movie", "time travel movie",
    "multiverse movie", "apocalypse movie", "zombie movie",
    "vampire movie", "robot movie", "alien invasion movie",
    "supernatural horror", "serial killer movie",
    "cyberpunk movie", "dystopian movie", "romantic comedy",
    "heist movie", "detective story", "teen romance movie",

    # Movie industry timing signals
    "movie release date", "summer blockbuster", "oscar season",
    "film festival", "movie trailer", "movie premiere", "box office",
    "movie marketing", "streaming release",

    # Character tropes
    "anti hero movie", "classic hero story", "villain origin story",
    "strong female lead", "teen protagonist", "supernatural villain",
    "chosen one trope", "mentor archetype", "love triangle",

    # Settings
    "space exploration movie", "medieval fantasy movie",
    "cyberpunk city", "post apocalyptic world", "haunted house movie",
    "island survival movie", "small town mystery",
    "new york city movie", "high school romance",

    # Narrative tropes
    "enemies to lovers", "found footage movie", "time loop movie",
    "plot twist ending", "sci fi thriller", "origin story",
    "true crime film",

    # Culture → film pipeline topics
    "urban legend", "conspiracy thriller", "cryptid sightings",
    "viral internet story", "space discoveries", "true crime stories",
    "missing person case", "reddit horror stories"
]

# ==== PYTRENDS SCOPE ===
TIMEFRAME = "today 5-y"     # e.g. 'today 12-m', 'today 5-y', 'now 7-d'
GEO       = "US"            # e.g. '' for worldwide
CATEGORY  = 23              # 0 means all categories; NOTE(review): confirm which Trends category id 23 denotes
INCLUDE_RELATED = True      # also fetch related queries for every seed and add them to the cohort
TOPN_RELATED = 5            # max related queries kept per seed
RETRIES = 3                 # attempts per request before giving up
PAUSE   = 10.0              # base seconds for retry backoff and for the sleep between requests

# ==== FEATURE WINDOWS ===
# Window sizes are counted in ROWS of the fetched series, not calendar days.
# The output captured in this notebook for 'today 5-y' is dated at 7-day steps,
# so with that timeframe one row is one week.
FEAT_WINDOW = 28   # short window for slope/variance/mean
REF_WINDOW  = 90   # longer baseline window for ref mean

# ==== LABELING ===
FUTURE_HORIZON = 14     # number of future rows summarised for the target (see note on row spacing above)
THRESH_LIFT   = 0.35    # minimum relative growth of the future mean over the recent mean for a positive label
BINARY_LABEL  = True    # True -> 0/1 target; False -> three quantile-based classes

# ==== OUTPUT ===
OUT_PARQUET = "labeled_trends.parquet"
OUT_CSV     = "labeled_trends.csv"
# Seed both RNGs used in this notebook (stdlib `random` drives the sleep jitter).
RANDOM_SEED = 13
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## 1. Fetch trends for seeds (+ related cohort)

In [None]:
def with_backoff(fn: Callable, retries: int = RETRIES, pause: float = PAUSE, **kwargs) -> Any:
    """Call ``fn(**kwargs)``, retrying with exponential backoff plus jitter.

    Args:
        fn: Callable to invoke. It does not need a ``__name__`` attribute
            (e.g. ``functools.partial`` objects are accepted).
        retries: Maximum number of attempts. With ``retries <= 0`` the callable
            is never invoked and ``None`` is returned.
        pause: Base delay in seconds. The wait after attempt ``i`` (0-based) is
            ``pause * 2**i`` plus a random jitter in ``[0, pause / 2]``.
        **kwargs: Forwarded to ``fn`` on every attempt.

    Returns:
        Whatever ``fn`` returns on the first successful attempt, or ``None`` when
        a ``KeyError`` is raised (treated as "no data", not retried) or when all
        attempts fail.
    """
    # Resolve a printable label once; fn.__name__ is missing on partials and
    # callable instances, and failing here would hide the real error.
    label = getattr(fn, "__name__", repr(fn))

    for i in range(retries):
        try:
            return fn(**kwargs)
        except KeyError as e:
            # A KeyError is treated as a data-not-found condition rather than a
            # rate limit, so retrying would not help.
            print(f"  Data not found (KeyError): {e}. Skipping retries for this item.")
            return None
        except Exception as e:
            print(f"  Attempt {i+1}/{retries} failed: {e}")
            if i == retries - 1:
                print(f"  All {retries} attempts failed for {label}.")
                return None  # Return None on total failure
            # Exponential backoff with jitter
            sleep_time = pause * (2**i) + random.uniform(0, pause * 0.5)
            print(f"  Retrying in {sleep_time:.2f}s...")
            time.sleep(sleep_time)

    # Only reached when retries <= 0.
    return None

# --- Modified Original Function ---
def fetch_trends_timeseries(keywords: List[str],
                             timeframe: str = TIMEFRAME,
                             geo: str = GEO,
                             category: int = CATEGORY,
                             include_related: bool = INCLUDE_RELATED,
                             topn_related: int = TOPN_RELATED,
                             pause: float = PAUSE, # Use new PAUSE default
                             retries: int = RETRIES # Use new RETRIES default
                             ) -> dict:
    """Fetch interest-over-time for a cohort: seeds plus optional related queries.

    Args:
        keywords: Seed terms. Entries are stripped; empty entries and duplicates
            are dropped while preserving order.
        timeframe, geo, category: Passed to ``TrendReq.build_payload``.
        include_related: When True, the "top" and "rising" related queries of
            every seed are looked up and appended to the cohort.
        topn_related: Maximum number of related queries kept per seed.
        pause: Base seconds for retry backoff and for the sleep between requests.
        retries: Attempts per request.

    Returns:
        ``{"wide": DataFrame, "related_map": dict}``. ``wide`` has a ``date``
        column plus one column per cohort term that returned at least one
        non-zero value; it is an empty DataFrame when nothing could be fetched.
        ``related_map`` maps each seed to its kept related queries.

    NOTE(review): terms are requested in batches of five and the batches are
    concatenated as-is. Google Trends is understood to scale each request
    independently, so values from different batches may not share a scale —
    confirm before comparing columns across batches.
    """

    # Using 'gprop'='' for web search, as in the user's new example
    py = TrendReq(hl="en-US", tz=0)

    # Strip first, then filter, so whitespace-only entries do not survive as "".
    stripped = (k.strip() for k in keywords if k)
    all_kws = list(dict.fromkeys(k for k in stripped if k))
    related_map = {}

    def polite_pause():
        # Sleep between consecutive requests, with jitter.
        time.sleep(pause + random.uniform(0, pause * 0.5))

    if include_related:
        print("Fetching related queries...")
        for kw in all_kws:
            print(f"  - for: {kw}")
            # Use with_backoff for the payload + related_queries
            def fetch_related():
                # gprop='' for web search. cat=category
                py.build_payload([kw], timeframe=timeframe, geo=geo, cat=category, gprop='')
                return py.related_queries()

            rq = with_backoff(fetch_related, retries=retries, pause=pause)

            if rq is None:
                print(f"  Failed to get related queries for {kw}.")
            else:
                rel_tbl = rq.get(kw, {})
                rel = []
                for kind in ("top", "rising"):
                    dfk = rel_tbl.get(kind)
                    if dfk is not None and len(dfk):
                        rel.extend(dfk["query"].astype(str).tolist())
                rel = [r for r in rel if r.lower() != kw.lower()]
                related_map[kw] = list(dict.fromkeys(rel))[:topn_related]

            # Pause after every lookup, failed ones included: a failed request
            # is exactly when hitting the service again immediately hurts most.
            polite_pause()

        related_flat = [q for queries in related_map.values() for q in queries]
        cohort = list(dict.fromkeys(all_kws + related_flat))
    else:
        cohort = all_kws

    print(f"\nFetching time series for {len(cohort)} total keywords...")

    # Fetch in batches of <= 5
    def batched(xs, n=5):
        for i in range(0, len(xs), n):
            yield xs[i:i+n]

    frames = []
    for i, grp in enumerate(batched(cohort, 5)):
        print(f"  - Batch {i+1} ({', '.join(grp)})")

        # Use with_backoff for the time series fetch
        def fetch_interest():
            # gprop='' for web search. cat=category
            py.build_payload(grp, timeframe=timeframe, geo=geo, cat=category, gprop='')
            return py.interest_over_time()

        df = with_backoff(fetch_interest, retries=retries, pause=pause)

        if df is None or df.empty:
            print("    -> No data returned for this batch.")
        else:
            if "isPartial" in df.columns:
                df = df.drop(columns=["isPartial"])
            frames.append(df)

        # Pause before the *next* request, whether or not this one succeeded.
        polite_pause()

    if not frames:
        print("No time series data was successfully fetched for any batch.")
        return {"wide": pd.DataFrame(), "related_map": related_map}

    print("Combining fetched data...")
    wide = pd.concat(frames, axis=1)
    wide = wide.loc[:, ~wide.columns.duplicated(keep="first")]
    wide = wide.loc[:, (wide != 0).any(axis=0)] # Drop cols that are all 0
    # The frames are indexed by date; expose that index as a regular column.
    wide = wide.reset_index()
    wide["date"] = pd.to_datetime(wide["date"])

    keep_cols = ["date"] + cohort
    wide = wide[[c for c in keep_cols if c in wide.columns]].copy()

    print(f"Fetch complete. Shape: {wide.shape}")
    return {"wide": wide, "related_map": related_map}

# Fetch once for the union of seeds. This is network-bound and slow because of
# the deliberate pauses between requests.
fetched = fetch_trends_timeseries(SEED_KEYWORDS)
wide = fetched["wide"]
related_map = fetched["related_map"]

# Guard: every later cell needs a non-empty frame with a "date" column, so fail
# here with a clear message instead of letting a later cell crash obscurely.
if wide is None or wide.empty or ("date" not in wide.columns):
    print("ERROR: Google Trends returned no usable time series.")
    print("Check:")
    print("  - keyword spellings (Google Trends may not have them)")
    print("  - timeframe (try 'today 5-y' or 'today 3-m')")
    print("  - geo restrictions")
    display(wide)
    print("Stopping execution due to fetch error.")
    # Actually stop: raising halts "Run All" at this cell.
    raise RuntimeError("Google Trends returned no usable time series.")
else:
    print("Fetch successful. Displaying tail of the DataFrame:")
    display(wide.tail())

Fetching related queries...
  - for: romance movie
  - for: action movie
  - for: science fiction movie
  - for: superhero movie
  - for: horror movie
  - for: psychological thriller
  - for: mystery movie
  - for: fantasy movie
  - for: crime drama
  - for: biographical film
  - for: historical drama
  - for: animated movie
  - for: coming of age movie
  - for: musical film
  - for: family movie
  - for: climate change movie
  - for: ai movie
  - for: time travel movie
  - for: multiverse movie
  - for: apocalypse movie
  - for: zombie movie
  - for: vampire movie
  - for: robot movie
  - for: alien invasion movie
  - for: supernatural horror
  - for: serial killer movie
  - for: cyberpunk movie
  - for: dystopian movie
  - for: romantic comedy
  - for: heist movie
  - for: detective story
  - for: teen romance movie
  - for: movie release date
  - for: summer blockbuster
  - for: oscar season
  - for: film festival
  - for: movie trailer
  - for: movie premiere
  - for: box office
  

  df = df.fillna(False)


  - Batch 2 (psychological thriller, mystery movie, fantasy movie, crime drama, biographical film)


  df = df.fillna(False)


  - Batch 3 (historical drama, animated movie, coming of age movie, musical film, family movie)


  df = df.fillna(False)


  - Batch 4 (climate change movie, ai movie, time travel movie, multiverse movie, apocalypse movie)


  df = df.fillna(False)


  - Batch 5 (zombie movie, vampire movie, robot movie, alien invasion movie, supernatural horror)


  df = df.fillna(False)


  - Batch 6 (serial killer movie, cyberpunk movie, dystopian movie, romantic comedy, heist movie)


  df = df.fillna(False)


  - Batch 7 (detective story, teen romance movie, movie release date, summer blockbuster, oscar season)


  df = df.fillna(False)


  - Batch 8 (film festival, movie trailer, movie premiere, box office, movie marketing)


  df = df.fillna(False)


  - Batch 9 (streaming release, anti hero movie, classic hero story, villain origin story, strong female lead)


  df = df.fillna(False)


  - Batch 10 (teen protagonist, supernatural villain, chosen one trope, mentor archetype, love triangle)


  df = df.fillna(False)


  - Batch 11 (space exploration movie, medieval fantasy movie, cyberpunk city, post apocalyptic world, haunted house movie)


  df = df.fillna(False)


  - Batch 12 (island survival movie, small town mystery, new york city movie, high school romance, enemies to lovers)


  df = df.fillna(False)


  - Batch 13 (found footage movie, time loop movie, plot twist ending, sci fi thriller, origin story)


  df = df.fillna(False)


  - Batch 14 (true crime film, urban legend, conspiracy thriller, cryptid sightings, viral internet story)


  df = df.fillna(False)


  - Batch 15 (space discoveries, true crime stories, missing person case, reddit horror stories, romance movies)


  df = df.fillna(False)


  - Batch 16 (action movies, wicked live action movie, rocky horror movie, horror movies, american horror story)


  df = df.fillna(False)


  - Batch 17 (horror movie monologues, ballerina horror movie, psychological thriller movies, murder mystery movie, mystery movies)


  df = df.fillna(False)


  - Batch 18 (murder mystery movies, new murder mystery movie, murder mystery movie 2022, historical drama definition, lord of the rings animated movie)


  df = df.fillna(False)


  - Batch 19 (the lord of the rings animated movie, fire and ice animated movie, scrooge animated movie, lord of the rings animated movie 2024, wicked musical film)


  df = df.fillna(False)


  - Batch 20 (rent musical film, cats musical film, les miserables musical film, hair musical film, family movies)


  df = df.fillna(False)


  - Batch 21 (addams family movie, the addams family movie, family movie night, character ai, ai detector)


  df = df.fillna(False)


  - Batch 22 (ai checker, vampire movies, philip ridley vampire movie, tom hiddleston vampire movie, wild robot movie)


  df = df.fillna(False)


  - Batch 23 (the wild robot movie, robin williams robot movie, my teen romantic comedy, romantic comedy movies, romantic comedy definition)


  df = df.fillna(False)


  - Batch 24 (the heist movie, heist movies, wicked 2 movie release date, wicked the movie release date, in the heights movie release date)


  df = df.fillna(False)


  - Batch 25 (mean girls' musical movie release date, merrily we roll along movie release date, tribeca film festival, new york film festival, dance film festival)


  df = df.fillna(False)


  - Batch 26 (jewish film festival, sundance film festival, wicked trailer, wicked movie, wicked the movie trailer)


  df = df.fillna(False)


  - Batch 27 (phantom of the opera movie trailer, wicked movie premiere, box office wicked, box office broadway, little mermaid box office)


  df = df.fillna(False)


  - Batch 28 (the little mermaid box office, jurassic world box office, wicked streaming release, wicked streaming release date, west side story 2021 streaming release date)


  df = df.fillna(False)


  - Batch 29 (in the heights streaming release date, see how they run streaming release date, olivia rodrigo love triangle, twelfth night love triangle, love triangle in twelfth night)


  df = df.fillna(False)


  - Batch 30 (what is a love triangle, wicked love triangle, enemies to lovers books, enemies to lovers trope, enemies to lovers movies)


  df = df.fillna(False)


  - Batch 31 (urban legend movie)
    -> No data returned for this batch.
Combining fetched data...
Fetch complete. Shape: (262, 122)
Fetch successful. Displaying tail of the DataFrame:


Unnamed: 0,date,romance movie,action movie,science fiction movie,superhero movie,horror movie,mystery movie,fantasy movie,crime drama,biographical film,...,wicked streaming release date,west side story 2021 streaming release date,in the heights streaming release date,see how they run streaming release date,olivia rodrigo love triangle,twelfth night love triangle,what is a love triangle,wicked love triangle,enemies to lovers books,enemies to lovers movies
257,2025-10-19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
258,2025-10-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
259,2025-11-02,4,11,0,0,32,19,4,3,0,...,0,0,0,0,0,0,0,0,0,0
260,2025-11-09,5,10,0,0,22,21,6,0,0,...,0,0,0,0,0,0,5,0,0,0
261,2025-11-16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Leakage-safe feature builder
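We compute features at each time **t** using only data up to and including **t** (no future). The `future_*` columns stored in the same rows are label inputs computed from later observations; exclude them (and anything derived from them) from the model's feature set.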

In [None]:
# ------------ helper functions ------------

def _ols_slope(arr: np.ndarray,
               min_len: int = 3,
               eps: float = 1e-9) -> float:
    """
    Robust slope of a 1D array using OLS on index vs value.

    - Returns NaN if there are fewer than min_len points.
    - Returns 0.0 if the window is (almost) flat (very small variance).
    """
    arr = np.asarray(arr, dtype=float)
    # Drop non-finite values if any
    arr = arr[np.isfinite(arr)]
    n = arr.size
    if n < min_len:
        return np.nan

    # x = 0,1,2,...,n-1
    x = np.arange(n, dtype=float)

    # center x and y to improve numerical stability
    y = arr - arr.mean()
    x = x - x.mean()

    # If y has almost no variance, treat as flat
    var_y = (y ** 2).mean()
    if var_y < eps:
        return 0.0

    denom = (x ** 2).sum()
    if denom < eps:
        return 0.0

    slope = (x * y).sum() / denom
    return float(slope)



def _zscore_last(arr: np.ndarray) -> float:
    if len(arr) < 3:
        return np.nan
    mu = np.mean(arr[:-1])
    sd = np.std(arr[:-1]) + 1e-9
    return float((arr[-1] - mu) / sd)


def _days_since_peak(series: pd.Series) -> float:
    if series.isna().all():
        return np.nan
    idx = int(series.values.argmax())
    return float(len(series) - 1 - idx)


def _gini(arr: np.ndarray) -> float:
    arr = np.asarray(arr, dtype=float)
    if np.all(arr == 0):
        return 0.0
    if np.amin(arr) < 0:
        arr -= np.amin(arr)
    arr += 1e-9
    arr.sort()
    n = arr.size
    cum = np.cumsum(arr)
    return float((n + 1 - 2 * np.sum(cum) / cum[-1]) / n)


def _stl_strength(y: np.ndarray, period: int = 7):
    try:
        stl = STL(y, period=period, robust=True)
        res = stl.fit()
        resid_var = np.var(res.resid)
        trend_var = np.var(res.trend)
        seas_var  = np.var(res.seasonal)
        trend_strength = trend_var / (trend_var + resid_var + 1e-9)
        seas_strength  = seas_var  / (seas_var  + resid_var + 1e-9)
        return float(trend_strength), float(seas_strength)
    except Exception:
        return np.nan, np.nan


# ------------ main feature builder ------------

def build_time_indexed_features(
    wide: pd.DataFrame,
    keyword: str,
    cohort_cols: Optional[List[str]] = None,
    feat_window: int = FEAT_WINDOW,
    ref_window: int = REF_WINDOW,
    stl_period: int = 7,
    future_horizon: int = FUTURE_HORIZON,
) -> pd.DataFrame:
    """
    Build time-indexed, leakage-safe features for a single keyword.

    For each row position t (after sorting by date):
      - features use history up to and including t
      - label inputs use rows t+1 .. t+future_horizon

    All window sizes (`feat_window`, `ref_window`, `stl_period`,
    `future_horizon`) are counted in rows of `wide`, not calendar days.
    The `future_*_7d` column names are kept because downstream code reads
    them, but they summarise `future_horizon` rows, whatever that value is.

    Args:
        wide: Frame with a "date" column and one numeric column per term.
        keyword: Column to build features for.
        cohort_cols: Columns used as the comparison cohort; defaults to every
            column other than "date" and `keyword`.
        feat_window: Short rolling window (mean / std / median / slope).
        ref_window: Longer rolling window for the reference mean.
        stl_period: Period passed to the STL decomposition.
        future_horizon: Number of future rows summarised for labeling.

    Returns:
        One row per usable position t, or an empty DataFrame when `keyword`
        is not a column of `wide`. Positions without enough history or
        without a full future window are skipped.
    """
    df = wide.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
    if keyword not in df.columns:
        return pd.DataFrame()

    if cohort_cols is None:
        cohort_cols = [c for c in df.columns if c not in ["date", keyword]]

    # Minimum index with enough history for the windows and for STL.
    min_history = max(5, feat_window // 3, stl_period * 3)

    out_rows = []

    for i in range(len(df)):
        # need enough history (for windows + STL) and enough future for the label
        if i < min_history:
            continue
        if i + future_horizon >= len(df):
            break

        hist = df.iloc[: i + 1].copy()
        fut  = df.iloc[i + 1 : i + 1 + future_horizon].copy()

        ts = hist[keyword].astype(float).fillna(0.0)
        last = float(ts.iloc[-1])

        # --- rolling windows for short-term stats ---
        roll = ts.rolling(feat_window, min_periods=max(5, feat_window // 3))
        mean_w_series = roll.mean()
        std_w_series  = roll.std()
        med_w_series  = roll.median()

        mean_w_val = float(mean_w_series.iloc[-1]) if not np.isnan(mean_w_series.iloc[-1]) else np.nan
        std_w_val  = float(std_w_series.iloc[-1])  if not np.isnan(std_w_series.iloc[-1])  else np.nan
        med_w_val  = float(med_w_series.iloc[-1])  if not np.isnan(med_w_series.iloc[-1])  else np.nan

        # --- longer reference window ---
        ref_series = ts.rolling(ref_window, min_periods=max(7, ref_window // 4)).mean()
        ref_val = float(ref_series.iloc[-1]) if not np.isnan(ref_series.iloc[-1]) else np.nan

        # --- slope and z-score ---
        slope = _ols_slope(ts.iloc[-feat_window:].values) if len(ts) >= feat_window else np.nan
        z_last = _zscore_last(ts.values)

        # --- lifts & basic dynamics ---
        # A zero baseline makes the relative lift undefined, so it is left missing.
        if not np.isnan(mean_w_val) and mean_w_val != 0:
            lift_vs_mean = (last - mean_w_val) / (mean_w_val + 1e-9)
        else:
            lift_vs_mean = np.nan

        if not np.isnan(ref_val) and ref_val != 0:
            lift_vs_ref = (last - ref_val) / (ref_val + 1e-9)
        else:
            lift_vs_ref = np.nan

        momentum_1 = float(ts.diff(1).iloc[-1]) if len(ts) >= 2 else np.nan
        momentum_7 = float(ts.diff(7).iloc[-1]) if len(ts) >= 8 else np.nan

        coefvar = (
            float(std_w_val / (mean_w_val + 1e-9))
            if not np.isnan(mean_w_val)
            else np.nan
        )

        # --- shape / burstiness stats ---
        dsp        = _days_since_peak(ts)
        peak       = float(ts.max())
        # Hand the helper its own copy: ts.values can be a view of `ts`, and an
        # in-place sort inside the helper would silently reorder the series used
        # by the STL and cohort-correlation features below.
        peak_gini  = _gini(ts.to_numpy(copy=True))
        if ts.notna().any():
            q75, q25 = np.percentile(ts.values, [75, 25])
        else:
            q75, q25 = np.nan, np.nan
        iqr = float(q75 - q25) if not (np.isnan(q75) or np.isnan(q25)) else np.nan
        burstiness = (
            iqr / (med_w_val + 1e-9)
            if not np.isnan(iqr) and not np.isnan(med_w_val)
            else np.nan
        )

        # --- STL-based trend/seasonality strength ---
        t_strength, s_strength = _stl_strength(ts.values, period=stl_period)

        row = {
            "date": hist["date"].iloc[-1],
            "keyword": keyword,
            "level_last": last,
            "mean_w": mean_w_val,            # <- used by add_labels
            "slope_w": slope,
            "z_last": z_last,
            "lift_vs_mean_w": lift_vs_mean,
            "lift_vs_ref": lift_vs_ref,
            "momentum_1": momentum_1,
            "momentum_7": momentum_7,
            "coefvar_w": coefvar,
            "days_since_peak": dsp,
            "peak": peak,
            "peak_gini": peak_gini,
            "burstiness": burstiness,
            "trend_strength": t_strength,
            "seasonality_strength": s_strength,
        }

        # --- cohort features, if any ---
        if cohort_cols:
            cohort_hist = hist[cohort_cols].astype(float).fillna(0.0)
            # Row-wise median across the cohort columns.
            cohort_med_series = cohort_hist.median(axis=1)
            cohort_last_med = float(cohort_med_series.iloc[-1])

            if cohort_last_med != 0:
                row["lift_vs_cohort_med"] = (last - cohort_last_med) / (cohort_last_med + 1e-9)
            else:
                row["lift_vs_cohort_med"] = np.nan

            ys = (
                cohort_med_series.iloc[-feat_window:].values
                if len(cohort_med_series) >= feat_window
                else cohort_med_series.values
            )
            xs = ts.iloc[-len(ys):].values

            # Correlation is undefined when either side is constant.
            if (
                len(xs) > 3
                and len(ys) > 3
                and np.std(xs) > 1e-9
                and np.std(ys) > 1e-9
            ):
                row["corr_with_cohort_med"] = float(np.corrcoef(xs, ys)[0, 1])
            else:
                row["corr_with_cohort_med"] = np.nan
        else:
            row["lift_vs_cohort_med"] = np.nan
            row["corr_with_cohort_med"] = np.nan

        # --- FUTURE window (for labels) ---
        # These are label inputs, not features: they summarise rows after t.
        fut_vals = fut[keyword].astype(float).values
        row["future_mean_7d"] = float(np.mean(fut_vals))
        row["future_max_7d"]  = float(np.max(fut_vals))
        row["future_sum_7d"]  = float(np.sum(fut_vals))

        out_rows.append(row)

    return pd.DataFrame(out_rows)


# ------------ build features for all seed keywords ------------

# One feature frame per seed; seeds that yield no rows (e.g. no column in
# `wide`) are left out of the final table.
all_rows = []
for seed in SEED_KEYWORDS:
    cohort_cols = [col for col in wide.columns if col != "date" and col != seed]
    feats = build_time_indexed_features(wide, seed, cohort_cols=cohort_cols)
    if feats.empty:
        continue
    all_rows.append(feats)

if all_rows:
    features_df = pd.concat(all_rows, ignore_index=True)
else:
    features_df = pd.DataFrame()
features_df.head()


Unnamed: 0,date,keyword,level_last,mean_w,slope_w,z_last,lift_vs_mean_w,lift_vs_ref,momentum_1,momentum_7,...,peak,peak_gini,burstiness,trend_strength,seasonality_strength,lift_vs_cohort_med,corr_with_cohort_med,future_mean_7d,future_max_7d,future_sum_7d
0,2021-04-11,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0
1,2021-04-18,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0
2,2021-04-25,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0
3,2021-05-02,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0
4,2021-05-09,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0


In [None]:
# Preview the first 25 rows of the combined feature table.
features_df.head(25)

Unnamed: 0,date,keyword,level_last,mean_w,slope_w,z_last,lift_vs_mean_w,lift_vs_ref,momentum_1,momentum_7,...,peak,peak_gini,burstiness,trend_strength,seasonality_strength,lift_vs_cohort_med,corr_with_cohort_med,future_mean_7d,future_max_7d,future_sum_7d
0,2025-04-06,marvel movies,5.0,4.863636,,0.114708,0.028037,0.028037,0.0,-4.0,...,9.0,0.117672,0.2,0.673889,0.668931,-0.545455,0.020872,5.357143,8.0,75.0
1,2025-04-13,marvel movies,5.0,4.869565,,0.112037,0.026786,0.026786,0.0,0.0,...,9.0,0.11413,0.2,0.550491,0.599816,-0.615385,0.075842,5.5,8.0,77.0
2,2025-04-20,marvel movies,5.0,4.875,,0.109545,0.025641,0.025641,0.0,0.0,...,9.0,0.110755,0.2,0.614023,0.656058,-0.5,-0.109513,5.785714,9.0,81.0
3,2025-04-27,marvel movies,6.0,4.92,,0.964901,0.219512,0.219512,1.0,2.0,...,9.0,0.11252,0.2,0.705021,0.660484,-0.5,-0.072655,5.785714,9.0,81.0
4,2025-05-04,marvel movies,8.0,5.038462,,2.647315,0.587786,0.587786,2.0,4.0,...,9.0,0.12478,0.2,0.767148,0.614138,-0.466667,0.075031,5.571429,9.0,78.0
5,2025-05-11,marvel movies,6.0,5.074074,,0.74802,0.182482,0.182482,-2.0,1.0,...,9.0,0.124899,0.2,0.806503,0.563422,-0.571429,0.161341,5.428571,9.0,76.0
6,2025-05-18,marvel movies,5.0,5.071429,0.067323,-0.058124,-0.014085,-0.014085,-1.0,0.0,...,9.0,0.12173,0.2,0.797639,0.548374,-0.642857,0.217961,5.428571,9.0,76.0
7,2025-05-25,marvel movies,6.0,5.142857,0.065681,0.741949,0.166667,0.175676,1.0,1.0,...,9.0,0.121622,0.2,0.805856,0.468743,-0.73913,0.485581,5.357143,9.0,75.0
8,2025-06-01,marvel movies,5.0,5.178571,0.055556,-0.083333,-0.034483,-0.019608,-1.0,0.0,...,9.0,0.118736,0.2,0.760987,0.464304,-0.8,0.67208,5.214286,9.0,73.0
9,2025-06-08,marvel movies,5.0,5.214286,0.044882,-0.081923,-0.041096,-0.018987,0.0,0.0,...,9.0,0.115966,0.2,0.743368,0.496411,-0.583333,0.454911,5.071429,9.0,71.0


## 3. Label creation
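Default: **binary 'Emerging'** if `future_mean_7d` shows ≥35% lift over the current short-window mean. Despite its name, `future_mean_7d` is the mean over the next `FUTURE_HORIZON` rows of the series, not necessarily seven days.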

In [None]:
def add_labels(df: pd.DataFrame,
               binary: bool = BINARY_LABEL,
               thresh_lift: float = THRESH_LIFT) -> pd.DataFrame:
    """
    Add a `growth_label_metric` column and a `target` column to a copy of `df`.

    growth = (future_mean_7d - baseline) / (baseline + 1e-9), where baseline is
    the `mean_w` column when present; otherwise it is reconstructed as
    level_last / (1 + lift_vs_mean_w).

    If binary=True:
        1 => emerging (growth >= thresh_lift)
        0 => not emerging
        Rows whose growth is NaN get 0, because the comparison is False.
        A zero baseline followed by any positive future mean gives a very
        large growth and therefore label 1.

    If binary=False (buckets from the 33% / 66% quantiles of growth):
        0 => growth <= 33% quantile
        1 => growth <= 66% quantile
        2 => above the 66% quantile
        NaN growth stays NaN.

    Args:
        df: Feature table containing `future_mean_7d` and either `mean_w` or
            both `level_last` and `lift_vs_mean_w`.
        binary: Select binary or three-class labeling.
        thresh_lift: Growth threshold for the binary label.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # Only touch the fallback columns when they are actually needed, so a frame
    # that already has `mean_w` does not also have to carry them.
    if "mean_w" in df.columns:
        mean_window = df["mean_w"]
    else:
        mean_window = df["level_last"] / (1 + df["lift_vs_mean_w"].fillna(-1.0))

    # guard against division by zero
    denom = (mean_window + 1e-9)

    # growth ratio used for labeling
    growth = (df["future_mean_7d"] - mean_window) / denom
    df["growth_label_metric"] = growth

    if binary:
        # True/False → 1/0
        df["target"] = (growth >= thresh_lift).astype(int)
    else:
        # 3-class labeling based on quantiles
        # (Series.quantile skips missing values)
        q1, q2 = growth.quantile([0.33, 0.66])

        def bucket(g):
            if pd.isna(g):
                return np.nan
            if g <= q1:
                return 0
            if g <= q2:
                return 1
            return 2

        df["target"] = growth.apply(bucket)

    return df
# Label every feature row using the function's defaults (BINARY_LABEL, THRESH_LIFT).
labeled = add_labels(features_df)
labeled.head()

Unnamed: 0,date,keyword,level_last,mean_w,slope_w,z_last,lift_vs_mean_w,lift_vs_ref,momentum_1,momentum_7,...,burstiness,trend_strength,seasonality_strength,lift_vs_cohort_med,corr_with_cohort_med,future_mean_7d,future_max_7d,future_sum_7d,growth_label_metric,target
0,2021-04-11,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0
1,2021-04-18,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0
2,2021-04-25,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0
3,2021-05-02,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0
4,2021-05-09,romance movie,0.0,0.0,,0.0,,,0.0,0.0,...,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0


In [None]:
# Number of rows per target class.
labeled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,11596
1,1797


## 4. Quality checks — class balance & missing values

In [None]:
# --- size and time coverage ---
print("Rows:", len(labeled))
date_values = labeled["date"]
print("Date range:", date_values.min(), "->", date_values.max())

# --- label balance (missing targets are counted as their own class) ---
print("\nClass distribution:")
class_counts = labeled["target"].value_counts(dropna=False)
print(class_counts.sort_index())

# --- fraction of missing values per column, worst first ---
missing = labeled.isna().mean().sort_values(ascending=False)
print("\nColumns with >10% missing:")
print(missing.loc[missing > 0.10].head(20))


Rows: 13393
Date range: 2021-04-11 00:00:00 -> 2025-08-10 00:00:00

Class distribution:
target
0    11596
1     1797
Name: count, dtype: int64

Columns with >10% missing:
lift_vs_cohort_med      1.000000
corr_with_cohort_med    1.000000
lift_vs_mean_w          0.714254
lift_vs_ref             0.483910
dtype: float64


## 5. Save artifacts

In [None]:
# Write both artifacts to the configured paths. Pass the variables themselves:
# a plain "{OUT_PARQUET}" string is not an f-string and would create a file
# literally named "{OUT_PARQUET}".
labeled.to_parquet(OUT_PARQUET, index=False)
labeled.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_PARQUET, "and", OUT_CSV)

Saved: labeled_trends.parquet and labeled_trends.csv
