In [15]:

import sys, numpy as np, pandas as pd
sys.path.append(".")

from bayes_core import (
    load_data, build_features, detect_channels,
    enumerate_models_BMA, lasso_cd, student_t_irls, mean_shift_bma,
    random_intercept_by, ols_diagnostics
)
from meridian_helper import fit_meridian_and_summarize  
# --- Display config, data load, and first-look sanity checks ---
pd.set_option("display.max_columns", 120)  # raise the column display limit

CSV_PATH = "MMM_Takehome_Dataset.csv"
df = load_data(CSV_PATH)

# Shape plus a preview of the first rows.
print(df.shape)
display(df.head(10))

# Date coverage and the column inventory.
first_date = df["date"].min().date()
last_date = df["date"].max().date()
print("Date span:", first_date, "→", last_date)
print("Columns:", df.columns.tolist())

# Summary statistics of the subscriptions column.
subs_summary = df["subscriptions"].describe()
print("Subscriptions summary:\n", subs_summary)

# Missing-value counts per column, largest first; only columns with gaps are shown.
null_counts = df.isna().sum()
nulls = null_counts.sort_values(ascending=False)
display(nulls[nulls.gt(0)])


(74, 26)


Unnamed: 0,date,subscriptions,meta_spend,meta_impressions,google_spend,google_impressions,snapchat_spend,snapchat_impressions,tiktok_spend,tiktok_impressions,engine_spend,engine_impressions,moloco_spend,moloco_impressions,liveintent_spend,liveintent_impressions,roku_spend,roku_impressions,beehiiv_spend,beehiiv_impressions,amazon_spend,amazon_impressions,walmart_spend,billboards_spend,emails_spend,tv_spend
0,2024-03-11,8466,24453.32013,1334082,48184.00505,2686578,54700.64863,2266423,0.0,0,11397.63242,466002,23641.67681,1246014,3285.905595,189071,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
1,2024-03-18,11438,12056.40304,724253,43667.36138,2090050,60680.8297,3491582,0.0,0,6641.258838,255007,9275.368834,301812,4060.587295,157247,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
2,2024-03-25,10697,0.0,0,44484.47785,2426908,72718.80076,2816049,0.0,0,8237.150123,394318,6295.163207,209783,4437.924958,247443,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
3,2024-04-01,11401,0.0,0,50505.08565,3033946,73616.60488,4104614,0.0,0,8414.940046,283026,4828.646059,223832,4463.799368,213650,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,2024-04-08,12042,0.0,0,55324.65109,2889430,60141.06131,2878530,0.0,0,9431.563691,519155,12370.87312,411710,6375.286002,347811,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
5,2024-04-15,12243,0.0,0,65756.68238,3589663,78238.53728,4268404,0.0,0,6455.424826,228904,15862.45248,612069,6515.640889,391408,0,0,0.0,0,0.0,0,0.0,0.0,0.0,15082.34943
6,2024-04-22,10758,0.0,0,63406.0832,3028597,82316.62829,4944931,0.0,0,3656.627562,139687,15808.39581,864474,4296.886506,224412,0,0,0.0,0,0.0,0,0.0,0.0,0.0,14205.49025
7,2024-04-29,9084,0.0,0,64243.6849,4026826,97255.61028,5079351,0.0,0,3302.396142,174049,13653.28286,492589,4334.577226,236624,0,0,0.0,0,0.0,0,0.0,0.0,0.0,4763.885819
8,2024-05-06,9015,0.0,0,63533.83205,2240726,74327.78997,4057560,0.0,0,1913.497754,62263,4128.631775,241292,3332.655726,159184,0,0,0.0,0,0.0,0,0.0,0.0,0.0,5012.15105
9,2024-05-13,10782,0.0,0,64872.21643,2575712,62022.12201,2962492,0.0,0,2788.668617,92930,23317.2047,1412033,4937.259131,309469,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0


Date span: 2024-03-11 → 2025-08-04
Columns: ['date', 'subscriptions', 'meta_spend', 'meta_impressions', 'google_spend', 'google_impressions', 'snapchat_spend', 'snapchat_impressions', 'tiktok_spend', 'tiktok_impressions', 'engine_spend', 'engine_impressions', 'moloco_spend', 'moloco_impressions', 'liveintent_spend', 'liveintent_impressions', 'roku_spend', 'roku_impressions', 'beehiiv_spend', 'beehiiv_impressions', 'amazon_spend', 'amazon_impressions', 'walmart_spend', 'billboards_spend', 'emails_spend', 'tv_spend']
Subscriptions summary:
 count       74.000000
mean     12387.702703
std       2549.657005
min       8427.000000
25%      10496.750000
50%      11580.500000
75%      14489.500000
max      20113.000000
Name: subscriptions, dtype: float64


Series([], dtype: int64)

In [20]:
# --- Build & sanitize features ---
# build_features returns the design matrix, the response and the column names.
# NOTE(review): decay / max_lag / sat_pct presumably control adstock and
# saturation transforms inside build_features — confirm against bayes_core.
STD_FLOOR = 1e-8            # columns with population std at or below this are dropped
DUP_CORR_THRESHOLD = 0.999  # |corr| above this marks a near-duplicate column

X, y, names = build_features(
    df,
    decay=0.5,
    max_lag=8,
    sat_pct=0.60
)

# 1) Drop constant/near-constant columns
std = X.std(axis=0, ddof=0)
keep = std > STD_FLOOR
X, names = X[:, keep], [n for n, k in zip(names, keep) if k]

# 2) Standardize (column-wise z-score; the clip guards against division by ~0)
Xn = (X - X.mean(0)) / np.clip(X.std(0, ddof=0), STD_FLOOR, None)

# 3) Prune near-duplicate features (|corr| > 0.999)
# Greedy scan: the first column of each near-duplicate group is kept, later ones dropped.
# atleast_2d: np.corrcoef collapses to a 0-d value when only one column remains.
C = np.atleast_2d(np.corrcoef(Xn, rowvar=False))
to_drop = set()
for i in range(C.shape[0]):
    if i in to_drop:
        continue
    for j in range(i + 1, C.shape[0]):
        if j in to_drop:
            continue
        # Non-finite correlations are never treated as duplicates.
        if np.isfinite(C[i, j]) and abs(C[i, j]) > DUP_CORR_THRESHOLD:
            to_drop.add(j)

keep2 = np.array([k not in to_drop for k in range(Xn.shape[1])], dtype=bool)
# Filter the raw X too, so X, Xn and names keep describing the same columns.
X, Xn, names = X[:, keep2], Xn[:, keep2], [n for n, k in zip(names, keep2) if k]

print("Final X shape:", Xn.shape, "| features kept:", len(names))

# Intercept for some models (column of ones prepended to the standardized features)
X_int = np.column_stack([np.ones(len(Xn)), Xn])



# ---- 1) Lasso shortlist ----
# Score each feature by the sum of |coefficient| over a grid of penalties and
# keep the TOPK highest-scoring columns for the model enumeration step.
# (lasso_cd comes from the notebook's top import cell; no re-import needed.)
# NOTE(review): whether this grid shrinks anything depends on how lasso_cd
# scales its penalty relative to y — confirm the coefficients actually move.
y_centered = y - y.mean()
lambdas = np.geomspace(0.5, 50, 12)

coef_mag = np.zeros(Xn.shape[1])

for lam in lambdas:
    beta = lasso_cd(Xn, y_centered, lam=lam, max_iter=1500)
    coef_mag += np.abs(beta)

TOPK = min(12, Xn.shape[1])          # shortlist size
# Stable sort so tied scores always resolve in original column order.
top_idx = np.argsort(-coef_mag, kind="stable")[:TOPK]
Xn_sub = Xn[:, top_idx]
names_sub = [names[i] for i in top_idx]

print(f"Shortlisted to {len(names_sub)} features:", names_sub)

# ---- 2) Exact hyper-g BMA on shortlist ----
# (enumerate_models_BMA comes from the notebook's top import cell.)
# NOTE(review): the lasso step is fit on the mean-centered response while the
# raw y is passed here — presumably enumerate_models_BMA handles the intercept
# itself; confirm in bayes_core.
MAX_K = min(6, Xn_sub.shape[1], len(y)-1)  # smaller K keeps things tractable

bma = enumerate_models_BMA(
    Xn_sub, y,
    feature_names=names_sub,
    prior_inclusion=0.50,
    hyper_g_a=3.0,
    max_k=MAX_K
)

# bma["pip"] maps feature name -> inclusion probability; list it highest first.
pip_sorted = sorted(bma["pip"].items(), key=lambda kv: -kv[1])
print("Top PIPs:")
for k, (feat, pip) in enumerate(pip_sorted[:15], 1):
    print(f"{k:>2}. {feat:35s}  PIP={pip:0.3f}")


Final X shape: (74, 22) | features kept: 22
Shortlisted to 12 features: ['beehiiv_spend_adstock', 'beehiiv_impr_sat', 'google_spend_adstock', 'liveintent_spend_adstock', 'engine_spend_adstock', 'moloco_spend_adstock', 'meta_impr_sat', 'moloco_impr_sat', 'amazon_impr_sat', 'google_impr_sat', 'tiktok_spend_adstock', 'tiktok_impr_sat']
Top PIPs:
 1. beehiiv_impr_sat                     PIP=0.437
 2. google_spend_adstock                 PIP=0.427
 3. beehiiv_spend_adstock                PIP=0.424
 4. google_impr_sat                      PIP=0.419
 5. engine_spend_adstock                 PIP=0.417
 6. liveintent_spend_adstock             PIP=0.416
 7. moloco_spend_adstock                 PIP=0.413
 8. tiktok_spend_adstock                 PIP=0.403
 9. moloco_impr_sat                      PIP=0.403
10. tiktok_impr_sat                      PIP=0.401
11. meta_impr_sat                        PIP=0.400
12. amazon_impr_sat                      PIP=0.395


In [21]:
# Lasso coefficient path: one fit per penalty value on the standardized
# features against the mean-centered response.
y_centered = y - y.mean()
lambdas = np.geomspace(0.1, 100, 8)

path_rows = []
for lam in lambdas:
    beta = lasso_cd(Xn, y_centered, lam=lam, max_iter=2000)
    path_rows.append(beta)
lasso_paths = np.vstack(path_rows)  # fits stacked in lambda order

# Show strongest coefs at a mid lambda
idx = 3
by_magnitude = sorted(zip(names, lasso_paths[idx]), key=lambda pair: -abs(pair[1]))
coefs = by_magnitude[:10]
print(f"Lasso @ lambda={lambdas[idx]:.3f} (top 10 by |coef|):")
for feat_name, coef_val in coefs:
    print(f"{feat_name:35s}  {coef_val:+.4f}")


Lasso @ lambda=1.931 (top 10 by |coef|):
beehiiv_spend_adstock                -4866.1583
beehiiv_impr_sat                     +3333.1491
google_spend_adstock                 +1690.6697
liveintent_spend_adstock             +1585.1232
engine_spend_adstock                 +1153.7697
moloco_spend_adstock                 +858.4268
meta_impr_sat                        -598.7274
moloco_impr_sat                      +540.3861
amazon_impr_sat                      +538.8747
google_impr_sat                      -508.1827


In [22]:
# Robust t on intercept+standardized features
beta_t, s2_t, w_t = student_t_irls(X_int, y, nu=4.0)
print("Student-t residual variance:", s2_t)

# Outlier mean-shift BMA
out_bma = mean_shift_bma(X_int, y, top_k=6, prior_pi=0.20)
candidates = out_bma["candidates"]
# out_bma["pip"] maps candidate row index -> inclusion probability; highest first.
pip_idx = sorted(out_bma["pip"].items(), key=lambda kv: -kv[1])

# The fits only see X_int / y, so the indices they return are row positions.
# Look dates up by position (.iloc), not by index label (.loc), and make sure
# the design matrix still has one row per row of df.
assert len(df) == X_int.shape[0], "df and X_int row counts differ; date lookup would be misaligned"

print("\nOutlier candidate weeks (index → date):")
for i in candidates:
    print(i, "→", df["date"].iloc[i].date())

print("\nOutlier inclusion probabilities:")
for ix, prob in pip_idx:
    print(f"idx={ix:3d}  date={df['date'].iloc[ix].date()}  PIP={prob:0.3f}")


Student-t residual variance: 1370357.5075127746

Outlier candidate weeks (index → date):
70 → 2025-07-14
60 → 2025-05-05
0 → 2024-03-11
43 → 2025-01-06
40 → 2024-12-16
69 → 2025-07-07

Outlier inclusion probabilities:
idx= 70  date=2025-07-14  PIP=0.992
idx= 60  date=2025-05-05  PIP=0.923
idx=  0  date=2024-03-11  PIP=0.904
idx= 43  date=2025-01-06  PIP=0.739
idx= 40  date=2024-12-16  PIP=0.365
idx= 69  date=2025-07-07  PIP=0.064


In [23]:
# Grouping label per row: the calendar month of `date`, as a string.
month_period = df["date"].dt.to_period("M")
month = month_period.astype(str)

# NOTE(review): presumably fits a random intercept per month group on top of
# the X_int fixed effects — confirm against bayes_core.random_intercept_by.
mixed_fit = random_intercept_by(y, X_int, month)
beta_mx, tau2, sigma2, V_inv, fitted, resid = mixed_fit

print("Random-intercept variance (tau^2):", tau2)
print("Residual variance (sigma^2):", sigma2)
resid_summary = pd.Series(resid).describe()
print(resid_summary)


Random-intercept variance (tau^2): 4528.439838808681
Residual variance (sigma^2): 2929700.139578532
count      74.000000
mean       -0.009657
std       959.748954
min     -2210.567661
25%      -631.896606
50%        20.397369
75%       598.893957
max      2871.610055
dtype: float64


In [24]:
# OLS diagnostics on the intercept + standardized design; the result is a
# dict from which leverage, Cook's distance and t_ext are pulled per row.
diag = ols_diagnostics(X_int, y)
lever, cooks, t_ext = diag["leverage"], diag["cooks"], diag["t_ext"]

print("sigma^2 (OLS):", diag["sigma2"])

# np.argsort yields row positions, so dates are looked up with .iloc
# (positional) rather than .loc (label-based).
assert len(df) == len(lever), "df and diagnostics row counts differ; date lookup would be misaligned"

print("\nTop 5 leverage:")
for i in np.argsort(-lever)[:5]:
    print(i, df["date"].iloc[i].date(), f"lev={lever[i]:.3f}", f"Cook={cooks[i]:.3f}", f"t_ext={t_ext[i]:+.2f}")

print("\nTop 5 Cook's distance:")
for i in np.argsort(-cooks)[:5]:
    print(i, df["date"].iloc[i].date(), f"Cook={cooks[i]:.3f}", f"lev={lever[i]:.3f}", f"t_ext={t_ext[i]:+.2f}")


sigma^2 (OLS): 1321237.3178489828

Top 5 leverage:
34 2024-11-04 lev=0.727 Cook=0.016 t_ext=-0.37
35 2024-11-11 lev=0.628 Cook=0.070 t_ext=+0.98
73 2025-08-04 lev=0.622 Cook=0.000 t_ext=+0.06
65 2025-06-09 lev=0.588 Cook=0.139 t_ext=+1.52
50 2025-02-24 lev=0.555 Cook=0.043 t_ext=+0.89

Top 5 Cook's distance:
0 2024-03-11 Cook=0.235 lev=0.510 t_ext=-2.38
70 2025-07-14 Cook=0.140 lev=0.273 t_ext=+3.18
65 2025-06-09 Cook=0.139 lev=0.588 t_ext=+1.52
40 2024-12-16 Cook=0.132 lev=0.447 t_ext=+1.99
60 2025-05-05 Cook=0.127 lev=0.341 t_ext=-2.49


In [25]:
# Run the Meridian helper on the loaded frame, passing "reports" as its output
# directory, and print whatever status object it returns.
out = fit_meridian_and_summarize(
    df,
    outdir="reports",
)
print(out)


{'ok': False, 'reason': "Meridian unavailable: cannot import name 'api' from 'meridian' (/Users/justinhuang/monthlymocha/.venv/lib/python3.12/site-packages/meridian/__init__.py)"}


In [26]:
# OLS on standardized features + intercept
beta_ols = diag["beta"]  # includes intercept as beta_ols[0]

# `names` and `diag` are set in different cells; zip() would silently truncate
# and mislabel coefficients if they came from different runs.
assert len(beta_ols) == len(names) + 1, "diag['beta'] and names are out of sync; re-run the feature and diagnostics cells"

coefs = list(zip(["Intercept"] + names, beta_ols))
# Rank the slope coefficients (intercept excluded) by absolute size.
coefs_sorted = sorted(coefs[1:], key=lambda kv: -abs(kv[1]))[:12]

print("OLS coefficient magnitudes (top 12):")
for n, c in coefs_sorted:
    print(f"{n:35s}  {c:+.4f}")

# In-sample fitted values and residuals; the R^2 below is computed on the
# same rows used for the fit and is not adjusted for the number of features.
y_hat = X_int @ beta_ols
resid_ols = y - y_hat
print("\nRough R^2:", 1 - np.var(resid_ols) / np.var(y))
# Save top PIPs and lasso summary
# Single source of truth for which lasso-path row is exported: it drives the
# data slice, the file name and the final message. Keep it equal to the `idx`
# used when printing the lasso path.
LASSO_ROW_IDX = 3
PIP_CSV = "bma_pips.csv"
LASSO_CSV = f"lasso_coefs_lambda_idx{LASSO_ROW_IDX}.csv"
OUTLIER_CSV = "outlier_pips.csv"

pip_df = pd.DataFrame(pip_sorted, columns=["feature", "pip"])
pip_df.to_csv(PIP_CSV, index=False)

lasso_row = pd.DataFrame({"feature": names, "coef": lasso_paths[LASSO_ROW_IDX]})
# Largest |coef| first.
lasso_row.sort_values("coef", key=np.abs, ascending=False).to_csv(LASSO_CSV, index=False)

# Save outlier PIPs
# pip_idx holds (row position, probability) pairs, so dates are looked up by
# position (.iloc) rather than by index label (.loc).
outlier_rows = [
    {"index": int(ix), "date": str(df["date"].iloc[ix].date()), "pip": float(prob)}
    for ix, prob in pip_idx
]
pd.DataFrame(outlier_rows).to_csv(OUTLIER_CSV, index=False)

print("Wrote: " + ", ".join([PIP_CSV, LASSO_CSV, OUTLIER_CSV]))


OLS coefficient magnitudes (top 12):
beehiiv_spend_adstock                -4881.7482
beehiiv_impr_sat                     +3344.6791
google_spend_adstock                 +1692.2236
liveintent_spend_adstock             +1586.2147
engine_spend_adstock                 +1158.1465
moloco_spend_adstock                 +859.2435
meta_impr_sat                        -601.6837
moloco_impr_sat                      +541.1882
amazon_impr_sat                      +539.8775
google_impr_sat                      -509.4056
tiktok_impr_sat                      -491.1704
tiktok_spend_adstock                 +486.3031

Rough R^2: 0.8580077546720764
Wrote: bma_pips.csv, lasso_coefs_lambda_idx3.csv, outlier_pips.csv
