In [17]:
import pandas as pd
import numpy as np


# Base model

In [13]:


final = pd.read_csv("cigarettes_monthly.csv")


final["market_size_month"] = 6.685 * final["custcount_monthly"]*0.2325
final["prod_mkt_share"] = final["total_packs"] / final["market_size_month"]

In [21]:



df = final.copy()




def series_flag(df, col, dtype="int8"):
    if col in df.columns:
        return pd.to_numeric(df[col], errors="coerce").fillna(0).astype(dtype)
    return pd.Series(0, index=df.index, dtype=dtype)


"""

counts = (
    df.groupby(["store", "month_idx"], observed=True)["prod_key"]
                 .transform("count")
)
df = prod_market_m.loc[counts > 2].copy()
"""

df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store","month_idx"]

sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()
df["s0"]    = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)
df["log_s"] = np.log(df["prod_mkt_share"]) - np.log(df["s0"])

if "prod_id" not in df.columns:
    if "upc_norm" in df.columns:
        df["prod_id"] = df["upc_norm"].astype("string")
    elif "upc" in df.columns:
        df["prod_id"] = df["upc"].astype("string").str.replace(r"\D","", regex=True)
    else:
        df["prod_id"] = df.index.astype("string")

brand_raw = df.get("brand", pd.Series("", index=df.index))
brand_key = (brand_raw.astype("string").str.strip().str.lower()
             .mask(lambda s: s.eq("") | s.isna(), "generic"))
is_generic = (series_flag(df, "generic_hardcoded") > 0) | brand_key.eq("generic")
brand_key  = brand_key.mask(is_generic, "generic")

In [35]:

import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# ## Nested Logit

# In[ ]:


import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# -----------------------
# helpers
# -----------------------
def series_flag(df, col, dtype="int8"):
    if col in df.columns:
        return pd.to_numeric(df[col], errors="coerce").fillna(0).astype(dtype)
    return pd.Series(0, index=df.index, dtype=dtype)

def demean_within(frame, cols, keys):
    means = frame.groupby(keys, observed=True)[cols].transform("mean")
    return frame[cols] - means

def prune_instruments_for_full_rank(exog_df, Z_df, tol=1e-10):
    W, keep = exog_df.copy(), []
    for c in Z_df.columns:
        r_old = np.linalg.matrix_rank(W.to_numpy(), tol)
        W_try = pd.concat([W, Z_df[[c]]], axis=1)
        r_new = np.linalg.matrix_rank(W_try.to_numpy(), tol)
        if r_new > r_old:
            keep.append(c); W = W_try
    return Z_df[keep]

def standardize_cols(df):
    out = df.copy()
    for c in out.columns:
        s = float(out[c].std(skipna=True))
        if np.isfinite(s) and s > 0:
            out[c] = out[c] / s
    return out

# ========================
# 0) data & keys
# ========================
df = final.copy()
df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store","month_idx"]

# inside/outside shares
sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()
df["s0"] = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)

# ids
if "prod_id" not in df.columns:
    if "upc_norm" in df.columns:
        df["prod_id"] = df["upc_norm"].astype("string")
    elif "upc" in df.columns:
        df["prod_id"] = df["upc"].astype("string").str.replace(r"\D","", regex=True)
    else:
        df["prod_id"] = df.index.astype("string")

# brand vs generic; nest = branded/generic
brand_raw = df.get("brand", pd.Series("", index=df.index))
brand_key = (brand_raw.astype("string").str.strip().str.lower()
             .mask(lambda s: s.eq("") | s.isna(), "generic"))
is_generic = (series_flag(df, "generic_hardcoded") > 0) | brand_key.eq("generic")
brand_key  = brand_key.mask(is_generic, "generic")
df["nest"] = np.where(brand_key.eq("generic"), "generic", "branded")

# ========================
# 1) nested-logit shares (RAW)
# ========================
# ln s_j - ln s_0
df["ln_s"] = np.log(df["prod_mkt_share"]) - np.log(df["s0"])
# within-nest share s_{j|g} and its log
sg = df.groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")
df = df[sg > 0].copy()
df["ln_s_within"] = np.log(df["prod_mkt_share"]) - np.log(sg)

# regressors
dummy_cols = [c for c in ["dlx","supslim","slim","value","premium","flavored","carton"] if c in df.columns]
cont_cols  = [c for c in ["tar_mean","nic_mean","co_mean"] if c in df.columns]
Xnames     = ["price","ln_s_within"] + dummy_cols + cont_cols

need = ["ln_s","price","store","month_idx","nest","prod_id"] + Xnames
df2  = df.dropna(subset=need).copy()
df2["brand_for_iv"] = brand_key.loc[df2.index].astype("string")

# ========================
# 2) instruments (RAW) – Hausman + BLP(cont) + Nest(cont)
# ========================
# --- Hausman (coalesced z1 -> z2 -> z3) ---
g_uq = df2.groupby(["prod_id","month_idx"], observed=True)
cnt_uq = g_uq["price"].transform("count"); sum_uq = g_uq["price"].transform("sum")
z1_raw = np.where(cnt_uq.gt(1), (sum_uq - df2["price"]) / (cnt_uq - 1), np.nan)

g_u  = df2.groupby(["prod_id"], observed=True)
cnt_u = g_u["price"].transform("count"); sum_u = g_u["price"].transform("sum")
g_us = df2.groupby(["prod_id","store"], observed=True)
cnt_us = g_us["price"].transform("count"); sum_us = g_us["price"].transform("sum")
z2_raw = np.where((cnt_u - cnt_us).gt(0), (sum_u - sum_us) / (cnt_u - cnt_us), np.nan)

gbm  = df2.groupby(["brand_for_iv","month_idx"], observed=True)
cnt_bm = gbm["price"].transform("count"); sum_bm = gbm["price"].transform("sum")
gbms = df2.groupby(["brand_for_iv","month_idx","store"], observed=True)
cnt_bms = gbms["price"].transform("count"); sum_bms = gbms["price"].transform("sum")
z3_raw = np.where((cnt_bm - cnt_bms).gt(0), (sum_bm - sum_bms) / (cnt_bm - cnt_bms), np.nan)

z_haus_raw = pd.Series(z1_raw, index=df2.index)
z_haus_raw = z_haus_raw.where(z_haus_raw.notna(), pd.Series(z2_raw, index=df2.index))
z_haus_raw = z_haus_raw.where(z_haus_raw.notna(), pd.Series(z3_raw, index=df2.index))

Z_raw = pd.DataFrame({"z_haus": z_haus_raw}, index=df2.index)

# --- BLP(cont) by brand (as in simple logit) ---
gm  = df2.groupby(mkt, observed=True)
gfb = df2.groupby(mkt + ["brand_for_iv"], observed=True)
for c in cont_cols:
    tot = gm[c].transform("sum")
    own = gfb[c].transform("sum")
    Z_raw[f"iv_brand_riv_sum_{c}"] = tot - own
    Z_raw[f"iv_brand_own_sum_{c}"] = own - df2[c]
# rival count in market (helps 1st stage)
Z_raw["iv_rival_count"] = gm["price"].transform("size") - gfb["price"].transform("size")

# --- Nest(cont) proxies for ln_s_within (same-nest & other-nest sums/counts) ---
gmn = df2.groupby(mkt + ["nest"], observed=True)
for c in cont_cols:
    nest_tot = gmn[c].transform("sum")
    Z_raw[f"iv_nest_same_sum_{c}"]  = nest_tot - df2[c]
    Z_raw[f"iv_nest_other_sum_{c}"] = gm[c].transform("sum") - nest_tot
# counts
Z_raw["iv_nest_same_cnt"]  = gmn["price"].transform("size") - 1
Z_raw["iv_nest_other_cnt"] = gm["price"].transform("size") - gmn["price"].transform("size")

# ========================
# 3) ONE FE removal for y, X, Z
# ========================
X_tilde = demean_within(df2, Xnames, mkt)
y_tilde = (df2["ln_s"] - df2.groupby(mkt, observed=True)["ln_s"].transform("mean")).rename("ln_s")
Z_tilde = demean_within(pd.concat([df2[mkt], Z_raw], axis=1), list(Z_raw.columns), mkt)

# single estimation sample
all_parts = pd.concat([y_tilde, X_tilde, Z_tilde], axis=1).replace([np.inf,-np.inf], np.nan).dropna()
y_iv  = all_parts["ln_s"]
X_iv  = all_parts[Xnames]
Z_iv  = all_parts[Z_tilde.columns]

# drop zero-variance cols; standardize IVs; prune for rank
X_iv = X_iv.loc[:, X_iv.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]
Z_iv = standardize_cols(Z_iv.loc[:, Z_iv.apply(lambda s: np.nanstd(s.to_numpy()) > 0)])

# endogenous: price & ln_s_within
exog  = X_iv.drop(columns=["price","ln_s_within"])
Z_iv  = prune_instruments_for_full_rank(exog, Z_iv)

clusters_iv = pd.to_numeric(df2.loc[all_parts.index, "store"], errors="coerce").astype(int).to_numpy()

# ========================
# 4) Estimation
# ========================
ols = sm.OLS(y_iv, X_iv).fit(cov_type="cluster", cov_kwds={"groups": clusters_iv})
print("\n[OLS Nested Logit | Market FE absorbed]")
print(ols.summary().tables[1])

iv = IV2SLS(
    dependent=y_iv,
    exog=exog,
    endog=X_iv[["price","ln_s_within"]],
    instruments=Z_iv
).fit(cov_type="clustered", clusters=clusters_iv)
print("\n[IV Nested Logit | Market FE absorbed | Hausman + BLP(cont) + Nest(cont)]")
#print(iv.summary)
try:
  #  print("\n[First stages]"); print(iv.first_stage.summary)
except Exception:
    pass

# quick elasticity diagnostic
sigma = float(iv.params.get("ln_s_within", np.nan))
alpha = float(iv.params.get("price", np.nan))
sbar   = float(df2.loc[all_parts.index, "prod_mkt_share"].mean())
sjgbar = float((df2.loc[all_parts.index, "prod_mkt_share"] /
                df2.loc[all_parts.index].groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")).mean())
pbar   = float(df2.loc[all_parts.index, "price"].mean())
eps    = -alpha * pbar * (1 - sigma*(1 - sjgbar) - sbar)
print(f"\nσ (nesting): {sigma:.3f} | implied avg own-price elasticity ≈ {eps:.2f}")


# In[ ]:




[OLS Nested Logit | Market FE absorbed]
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
price           0.0333      0.021      1.615      0.106      -0.007       0.074
ln_s_within     0.9337      0.003    309.996      0.000       0.928       0.940
slim            0.3884      0.071      5.434      0.000       0.248       0.528
value          -1.0275      0.051    -20.114      0.000      -1.128      -0.927
premium         0.1379      0.013     10.575      0.000       0.112       0.163
flavored       -0.2744      0.020    -13.476      0.000      -0.314      -0.234
carton          0.0370      0.016      2.323      0.020       0.006       0.068
tar_mean       -0.0099      0.021     -0.465      0.642      -0.052       0.032
nic_mean        0.5580      0.449      1.244      0.214      -0.321       1.437
co_mean        -0.2358      0.023    -10.114      0.000      -0.282      -0.190

In [52]:
# ========================
# 4) Present Brand × Brand elasticities nicely
# ========================
alpha  = float(iv.params.get("price", np.nan))
beta_f = float(iv.params.get("flavored", 0.0))   # 0 if not in Xnames

df_calc = df2.loc[df2.index].copy()
df_calc["brand_key"] = brand_key.loc[df_calc.index].astype("string")
brands = sorted(list(df_calc["brand_key"].unique()))

E_price_sum={}; E_price_cnt={}; E_flav_sum={}; E_flav_cnt={}
for b in brands:
    for c in brands:
        E_price_sum[(b,c)]=0.0; E_price_cnt[(b,c)]=0
        E_flav_sum [(b,c)]=0.0; E_flav_cnt [(b,c)]=0

for key, g in df_calc.groupby(mkt, observed=True):
    s = g["prod_mkt_share"].to_numpy()
    p = g["price"].to_numpy()
    B = g["brand_key"].to_numpy()
    if s.size < 2: continue

    Jp = alpha * (np.diag(s) - np.outer(s, s))
    if "flavored" in g.columns:
        f = g["flavored"].to_numpy().astype(float)
        Jf = beta_f * (np.diag(s * f) - np.outer(s, s * f))
    else:
        Jf = np.zeros((s.size, s.size))
    JpP = Jp * p[None, :]  # % change

    brands_here = np.unique(B)
    S_b = {bk: float(s[B==bk].sum()) for bk in brands_here}

    for b in brands_here:
        Sb = S_b[b]
        if Sb <= 0: continue
        rows = (B == b)
        rJpP = JpP[rows, :].sum(axis=0)
        rJf  = Jf [rows, :].sum(axis=0)
        for c in brands_here:
            cols = (B == c)
            E_bc_price = (rJpP[cols].sum()) / Sb
            E_bc_flav  = - rJf[cols].sum()
            E_price_sum[(b,c)] += E_bc_price; E_price_cnt[(b,c)] += 1
            E_flav_sum [(b,c)] += E_bc_flav;  E_flav_cnt [(b,c)] += 1

def avg_matrix(sum_d, cnt_d, brands):
    M = np.full((len(brands), len(brands)), np.nan)
    for i,b in enumerate(brands):
        for j,c in enumerate(brands):
            if cnt_d[(b,c)] > 0:
                M[i,j] = sum_d[(b,c)] / cnt_d[(b,c)]
    return pd.DataFrame(M, index=brands, columns=brands)
    
E_price = avg_matrix(E_price_sum, E_price_cnt, brands)  # %
E_flav  = avg_matrix(E_flav_sum,  E_flav_cnt,  brands)  # share levels

# 1) Ensure consistent brand order and build a % version for price
brands = sorted(E_price.index.tolist())
E_price = E_price.reindex(index=brands, columns=brands)
E_flav  = E_flav .reindex(index=brands, columns=brands)
E_price_pct = E_price 
E_price_pct = E_price_pct.fillna(0)

# 2) Continuous (long/tidy) table — prints once without "row-by-row" updates
elasticities_long = (
    E_price_pct.stack().rename("price_elasticity_pct")
    .to_frame()
    .join(E_flav.stack().rename("flavored_semi_delta"))
    .rename_axis(index=["affected_brand","shocked_brand"])
    .reset_index()
    .sort_values(["affected_brand","shocked_brand"], kind="stable")
)

# Pretty print in console
pd.set_option("display.width", 160)
pd.set_option("display.max_rows", 100000)
"""
print("\nBrand × Brand elasticities (continuous long format)")
print(elasticities_long.to_string(
    index=False,
    formatters={
        "price_elasticity_pct": lambda v: f"{v:8.3f}",   # % ΔS_b for 1% ↑ price of brand c
        "flavored_semi_delta":  lambda v: f"{v: .5f}",   # ΔS_b when brand c flavored is banned
    }
))
"""
try:
    from IPython.display import display
    # price elasticities
    display(
        E_price_pct.style
            .format("{:.2f}%")
            .background_gradient(cmap="coolwarm", axis=None)
            .set_caption("Brand × Brand PRICE Elasticities (% ΔS_b for 1% ↑ price of brand c)")
            .set_table_styles([
                {"selector": "th.col_heading", "props": "text-align:center;"},
                {"selector": "th.row_heading", "props": "text-align:right;"},
            ])
    )
    # flavored semi-elasticities
    display(
        E_flav.style
            .format("{:.4f}")
            .background_gradient(cmap="Greens", axis=None)
            .set_caption("Brand × Brand FLAVORED Semi-Elasticities (ΔS_b when brand c flavored is banned)")
            .set_table_styles([
                {"selector": "th.col_heading", "props": "text-align:center;"},
                {"selector": "th.row_heading", "props": "text-align:right;"},
            ])
    )
except Exception:
    # safe no-op if running outside a notebook
    pass


Unnamed: 0,benson & hedges,doral,generic,kool,marlboro,virginia slims,winston,winston select
benson & hedges,-1.66%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
doral,0.00%,-2.87%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
generic,0.00%,0.00%,-1.52%,0.00%,0.00%,0.00%,0.00%,0.00%
kool,0.00%,0.00%,0.00%,-1.74%,0.00%,0.00%,0.00%,0.00%
marlboro,0.00%,0.00%,0.00%,0.00%,-1.72%,0.00%,0.00%,0.00%
virginia slims,0.00%,0.00%,0.00%,0.00%,0.00%,-1.68%,0.00%,0.00%
winston,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,-1.07%,0.00%
winston select,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,-1.41%


Unnamed: 0,benson & hedges,doral,generic,kool,marlboro,virginia slims,winston,winston select
benson & hedges,-0.0,,0.0,0.0,0.0,0.0,0.0,0.0
doral,,-0.0,0.0,,,,,
generic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kool,0.0,,0.0,-0.0,0.0,0.0,0.0,0.0
marlboro,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
virginia slims,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
winston,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
winston select,0.0,,0.0,0.0,0.0,0.0,0.0,0.0


# Last ditch attempts at getting better elasticisties

## Aggregate quarterly instead of monthly

In [82]:
########### 1: aggregate quarterly instead of monthly ##############
import pandas as pd
import numpy as np
import re

final = pd.read_csv("cigarettes_monthly.csv")

base = int(final["month_idx"].min())
q_idx = ((final["month_idx"] - base) // 3 + 1).astype("Int64")   # running quarter index
final["year_quarter"] = "Q" + q_idx.astype(str)                  # simple label
#final["year"] = pd.NA
final["quarter_id"] = q_idx
final["custcount"] = final["custcount_monthly"]
final = final.drop('custcount_monthly', axis=1)

known_dummies = [
    "menthol","dlx","special","supslim","slim","generic","single","carton","pack_kw","value",
    "generic_automated","generic_hardcoded","cigar","snuff","loose_tobacco","flavored","premium",
    "cigarettes","ok","sale"
]
dummy_cols = [c for c in known_dummies if c in final.columns]
#for c in dummy_cols:
#    final[c] = coerce_binary(final[c])

#if isinstance(final.columns, pd.MultiIndex):
#    final.columns = ['_'.join(map(str, c)).strip('_') for c in final.columns]

known_continuous = [
    "tar_mean","nic_mean","co_mean",
    "income","educ","hsizeavg","age9","age60","ethnic","nocar"
]
if "implied discount" in final.columns:
    known_continuous.append("implied discount")

cat_cols = [c for c in ["brand","size","pack"] if c in final.columns]

time_cols = ["month_idx", "year52", "m4", "month_label"]

group_cols = ["store", "quarter_id", "prod_key", "prod_type", "prod_id"] + [c for c in time_cols if c != "month_idx"]

agg_dict = {
    "total_packs": 'sum',
    "total_rev": 'sum',
    "packs_per_item_wavg": 'mean',
    "avg_pack_price": 'mean',
    "custcount": 'sum'}


for c in dummy_cols:
    agg_dict[c] = "max"
for c in known_continuous:
    agg_dict[c] = first_nonnull
for c in cat_cols + ["brand_clean","upc_norm"]:
    if c in final.columns:
        agg_dict[c] = first_nonnull
    
#agg_dict = {k:(k,v) for k,v in agg_dict.items()}

quarterly = (
    final.groupby(group_cols, as_index=False, observed=True)
           .agg(agg_dict, numeric_only=False)
)


final["market_size_quarter"] = 6.685 * final["custcount"]*0.2325
final["prod_mkt_share"] = final["total_packs"] / final["market_size_quarter"]


print("\n✅ Done.")
print("Final shape:", final.shape)

quarterly.to_csv("cigarettes_quarterly.csv", index = False)




✅ Done.
Final shape: (55690, 52)


In [88]:
################## Rerun the code 


import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# -----------------------
# helpers
# -----------------------
def series_flag(df, col, dtype="int8"):
    if col in df.columns:
        return pd.to_numeric(df[col], errors="coerce").fillna(0).astype(dtype)
    return pd.Series(0, index=df.index, dtype=dtype)

def demean_within(frame, cols, keys):
    means = frame.groupby(keys, observed=True)[cols].transform("mean")
    return frame[cols] - means

def prune_instruments_for_full_rank(exog_df, Z_df, tol=1e-10):
    W, keep = exog_df.copy(), []
    for c in Z_df.columns:
        r_old = np.linalg.matrix_rank(W.to_numpy(), tol)
        W_try = pd.concat([W, Z_df[[c]]], axis=1)
        r_new = np.linalg.matrix_rank(W_try.to_numpy(), tol)
        if r_new > r_old:
            keep.append(c); W = W_try
    return Z_df[keep]

def standardize_cols(df):
    out = df.copy()
    for c in out.columns:
        s = float(out[c].std(skipna=True))
        if np.isfinite(s) and s > 0:
            out[c] = out[c] / s
    return out

# ========================
# 0) data & keys
# ========================
df = final.copy()
df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store","quarter_id"]

# inside/outside shares
sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()
df["s0"] = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)

# ids
if "prod_id" not in df.columns:
    if "upc_norm" in df.columns:
        df["prod_id"] = df["upc_norm"].astype("string")
    elif "upc" in df.columns:
        df["prod_id"] = df["upc"].astype("string").str.replace(r"\D","", regex=True)
    else:
        df["prod_id"] = df.index.astype("string")

# brand vs generic; nest = branded/generic
brand_raw = df.get("brand", pd.Series("", index=df.index))
brand_key = (brand_raw.astype("string").str.strip().str.lower()
             .mask(lambda s: s.eq("") | s.isna(), "generic"))
is_generic = (series_flag(df, "generic_hardcoded") > 0) | brand_key.eq("generic")
brand_key  = brand_key.mask(is_generic, "generic")
df["nest"] = np.where(brand_key.eq("generic"), "generic", "branded")

# ========================
# 1) nested-logit shares (RAW)
# ========================
# ln s_j - ln s_0
df["ln_s"] = np.log(df["prod_mkt_share"]) - np.log(df["s0"])
# within-nest share s_{j|g} and its log
sg = df.groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")
df = df[sg > 0].copy()
df["ln_s_within"] = np.log(df["prod_mkt_share"]) - np.log(sg)

# regressors
dummy_cols = [c for c in ["dlx","supslim","slim","value","premium","flavored","carton"] if c in df.columns]
cont_cols  = [c for c in ["tar_mean","nic_mean","co_mean"] if c in df.columns]
Xnames     = ["price","ln_s_within"] + dummy_cols + cont_cols

need = ["ln_s","price","store","quarter_id","nest","prod_id"] + Xnames
df2  = df.dropna(subset=need).copy()
df2["brand_for_iv"] = brand_key.loc[df2.index].astype("string")

# ========================
# 2) instruments (RAW) – Hausman + BLP(cont) + Nest(cont)
# ========================
# --- Hausman (coalesced z1 -> z2 -> z3) ---
g_uq = df2.groupby(["prod_id","quarter_id"], observed=True)
cnt_uq = g_uq["price"].transform("count"); sum_uq = g_uq["price"].transform("sum")
z1_raw = np.where(cnt_uq.gt(1), (sum_uq - df2["price"]) / (cnt_uq - 1), np.nan)

g_u  = df2.groupby(["prod_id"], observed=True)
cnt_u = g_u["price"].transform("count"); sum_u = g_u["price"].transform("sum")
g_us = df2.groupby(["prod_id","store"], observed=True)
cnt_us = g_us["price"].transform("count"); sum_us = g_us["price"].transform("sum")
z2_raw = np.where((cnt_u - cnt_us).gt(0), (sum_u - sum_us) / (cnt_u - cnt_us), np.nan)

gbm  = df2.groupby(["brand_for_iv","quarter_id"], observed=True)
cnt_bm = gbm["price"].transform("count"); sum_bm = gbm["price"].transform("sum")
gbms = df2.groupby(["brand_for_iv","quarter_id","store"], observed=True)
cnt_bms = gbms["price"].transform("count"); sum_bms = gbms["price"].transform("sum")
z3_raw = np.where((cnt_bm - cnt_bms).gt(0), (sum_bm - sum_bms) / (cnt_bm - cnt_bms), np.nan)

z_haus_raw = pd.Series(z1_raw, index=df2.index)
z_haus_raw = z_haus_raw.where(z_haus_raw.notna(), pd.Series(z2_raw, index=df2.index))
z_haus_raw = z_haus_raw.where(z_haus_raw.notna(), pd.Series(z3_raw, index=df2.index))

Z_raw = pd.DataFrame({"z_haus": z_haus_raw}, index=df2.index)

# --- BLP(cont) by brand (as in simple logit) ---
gm  = df2.groupby(mkt, observed=True)
gfb = df2.groupby(mkt + ["brand_for_iv"], observed=True)
for c in cont_cols:
    tot = gm[c].transform("sum")
    own = gfb[c].transform("sum")
    Z_raw[f"iv_brand_riv_sum_{c}"] = tot - own
    Z_raw[f"iv_brand_own_sum_{c}"] = own - df2[c]
# rival count in market (helps 1st stage)
Z_raw["iv_rival_count"] = gm["price"].transform("size") - gfb["price"].transform("size")

# --- Nest(cont) proxies for ln_s_within (same-nest & other-nest sums/counts) ---
gmn = df2.groupby(mkt + ["nest"], observed=True)
for c in cont_cols:
    nest_tot = gmn[c].transform("sum")
    Z_raw[f"iv_nest_same_sum_{c}"]  = nest_tot - df2[c]
    Z_raw[f"iv_nest_other_sum_{c}"] = gm[c].transform("sum") - nest_tot
# counts
Z_raw["iv_nest_same_cnt"]  = gmn["price"].transform("size") - 1
Z_raw["iv_nest_other_cnt"] = gm["price"].transform("size") - gmn["price"].transform("size")

# ========================
# 3) ONE FE removal for y, X, Z
# ========================
X_tilde = demean_within(df2, Xnames, mkt)
y_tilde = (df2["ln_s"] - df2.groupby(mkt, observed=True)["ln_s"].transform("mean")).rename("ln_s")
Z_tilde = demean_within(pd.concat([df2[mkt], Z_raw], axis=1), list(Z_raw.columns), mkt)

# single estimation sample
all_parts = pd.concat([y_tilde, X_tilde, Z_tilde], axis=1).replace([np.inf,-np.inf], np.nan).dropna()
y_iv  = all_parts["ln_s"]
X_iv  = all_parts[Xnames]
Z_iv  = all_parts[Z_tilde.columns]

# drop zero-variance cols; standardize IVs; prune for rank
X_iv = X_iv.loc[:, X_iv.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]
Z_iv = standardize_cols(Z_iv.loc[:, Z_iv.apply(lambda s: np.nanstd(s.to_numpy()) > 0)])

# endogenous: price & ln_s_within
exog  = X_iv.drop(columns=["price","ln_s_within"])
Z_iv  = prune_instruments_for_full_rank(exog, Z_iv)

clusters_iv = pd.to_numeric(df2.loc[all_parts.index, "store"], errors="coerce").astype(int).to_numpy()

# ========================
# 4) Estimation
# ========================
ols = sm.OLS(y_iv, X_iv).fit(cov_type="cluster", cov_kwds={"groups": clusters_iv})
print("\n[OLS Nested Logit | Market FE absorbed]")
print(ols.summary().tables[1])

iv = IV2SLS(
    dependent=y_iv,
    exog=exog,
    endog=X_iv[["price","ln_s_within"]],
    instruments=Z_iv
).fit(cov_type="clustered", clusters=clusters_iv)
print("\n[IV Nested Logit | Market FE absorbed | Hausman + BLP(cont) + Nest(cont)]")
#print(iv.summary)
#try:
  #  print("\n[First stages]"); print(iv.first_stage.summary)
#except:
#    pass

# quick elasticity diagnostic
sigma = float(iv.params.get("ln_s_within", np.nan))
alpha = float(iv.params.get("price", np.nan))
sbar   = float(df2.loc[all_parts.index, "prod_mkt_share"].mean())
sjgbar = float((df2.loc[all_parts.index, "prod_mkt_share"] /
                df2.loc[all_parts.index].groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")).mean())
pbar   = float(df2.loc[all_parts.index, "price"].mean())
eps    = -alpha * pbar * (1 - sigma*(1 - sjgbar) - sbar)
print(f"\nσ (nesting): {sigma:.3f} | implied avg own-price elasticity ≈ {eps:.2f}")


[OLS Nested Logit | Market FE absorbed]
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
price          -0.1143      0.026     -4.361      0.000      -0.166      -0.063
ln_s_within     0.9270      0.003    286.288      0.000       0.921       0.933
slim            0.3760      0.049      7.710      0.000       0.280       0.472
value          -0.9105      0.059    -15.394      0.000      -1.026      -0.795
premium         0.0860      0.011      7.731      0.000       0.064       0.108
flavored       -0.1001      0.020     -5.026      0.000      -0.139      -0.061
carton         -0.0027      0.018     -0.155      0.877      -0.037       0.032
tar_mean        0.0917      0.015      6.071      0.000       0.062       0.121
nic_mean       -1.8413      0.325     -5.666      0.000      -2.478      -1.204
co_mean        -0.1350      0.019     -7.272      0.000      -0.171      -0.099

In [90]:
alpha  = float(iv.params.get("price", np.nan))
beta_f = float(iv.params.get("flavored", 0.0))   # 0 if not in Xnames

df_calc = df2.loc[df2.index].copy()
df_calc["brand_key"] = brand_key.loc[df_calc.index].astype("string")
brands = sorted(list(df_calc["brand_key"].unique()))

E_price_sum={}; E_price_cnt={}; E_flav_sum={}; E_flav_cnt={}
for b in brands:
    for c in brands:
        E_price_sum[(b,c)]=0.0; E_price_cnt[(b,c)]=0
        E_flav_sum [(b,c)]=0.0; E_flav_cnt [(b,c)]=0

for key, g in df_calc.groupby(mkt, observed=True):
    s = g["prod_mkt_share"].to_numpy()
    p = g["price"].to_numpy()
    B = g["brand_key"].to_numpy()
    if s.size < 2: continue

    Jp = alpha * (np.diag(s) - np.outer(s, s))
    if "flavored" in g.columns:
        f = g["flavored"].to_numpy().astype(float)
        Jf = beta_f * (np.diag(s * f) - np.outer(s, s * f))
    else:
        Jf = np.zeros((s.size, s.size))
    JpP = Jp * p[None, :]  # % change

    brands_here = np.unique(B)
    S_b = {bk: float(s[B==bk].sum()) for bk in brands_here}

    for b in brands_here:
        Sb = S_b[b]
        if Sb <= 0: continue
        rows = (B == b)
        rJpP = JpP[rows, :].sum(axis=0)
        rJf  = Jf [rows, :].sum(axis=0)
        for c in brands_here:
            cols = (B == c)
            E_bc_price = (rJpP[cols].sum()) / Sb
            E_bc_flav  = - rJf[cols].sum()
            E_price_sum[(b,c)] += E_bc_price; E_price_cnt[(b,c)] += 1
            E_flav_sum [(b,c)] += E_bc_flav;  E_flav_cnt [(b,c)] += 1

def avg_matrix(sum_d, cnt_d, brands):
    M = np.full((len(brands), len(brands)), np.nan)
    for i,b in enumerate(brands):
        for j,c in enumerate(brands):
            if cnt_d[(b,c)] > 0:
                M[i,j] = sum_d[(b,c)] / cnt_d[(b,c)]
    return pd.DataFrame(M, index=brands, columns=brands)
    
E_price = avg_matrix(E_price_sum, E_price_cnt, brands)  # %
E_flav  = avg_matrix(E_flav_sum,  E_flav_cnt,  brands)  # share levels

# 1) Ensure consistent brand order and build a % version for price
brands = sorted(E_price.index.tolist())
E_price = E_price.reindex(index=brands, columns=brands)
E_flav  = E_flav .reindex(index=brands, columns=brands)
E_price_pct = E_price 
E_price_pct = E_price_pct.fillna(0)

# 2) Continuous (long/tidy) table — prints once without "row-by-row" updates
elasticities_long = (
    E_price_pct.stack().rename("price_elasticity_pct")
    .to_frame()
    .join(E_flav.stack().rename("flavored_semi_delta"))
    .rename_axis(index=["affected_brand","shocked_brand"])
    .reset_index()
    .sort_values(["affected_brand","shocked_brand"], kind="stable")
)

# Pretty print in console
pd.set_option("display.width", 160)
pd.set_option("display.max_rows", 100000)
"""
print("\nBrand × Brand elasticities (continuous long format)")
print(elasticities_long.to_string(
    index=False,
    formatters={
        "price_elasticity_pct": lambda v: f"{v:8.3f}",   # % ΔS_b for 1% ↑ price of brand c
        "flavored_semi_delta":  lambda v: f"{v: .5f}",   # ΔS_b when brand c flavored is banned
    }
))
"""
try:
    from IPython.display import display
    # price elasticities
    display(
        E_price_pct.style
            .format("{:.2f}%")
            .background_gradient(cmap="coolwarm", axis=None)
            .set_caption("Brand × Brand PRICE Elasticities (% ΔS_b for 1% ↑ price of brand c)")
            .set_table_styles([
                {"selector": "th.col_heading", "props": "text-align:center;"},
                {"selector": "th.row_heading", "props": "text-align:right;"},
            ])
    )
    # flavored semi-elasticities
    display(
        E_flav.style
            .format("{:.4f}")
            .background_gradient(cmap="Greens", axis=None)
            .set_caption("Brand × Brand FLAVORED Semi-Elasticities (ΔS_b when brand c flavored is banned)")
            .set_table_styles([
                {"selector": "th.col_heading", "props": "text-align:center;"},
                {"selector": "th.row_heading", "props": "text-align:right;"},
            ])
    )
except Exception:
    # safe no-op if running outside a notebook
    pass


Unnamed: 0,benson & hedges,doral,generic,kool,marlboro,virginia slims,winston,winston select
benson & hedges,-2.24%,0.00%,0.02%,0.00%,0.03%,0.00%,0.00%,0.00%
doral,0.00%,-3.87%,0.02%,0.00%,0.00%,0.00%,0.00%,0.00%
generic,0.01%,0.00%,-2.07%,0.00%,0.03%,0.00%,0.00%,0.00%
kool,0.01%,0.00%,0.02%,-2.35%,0.03%,0.00%,0.00%,0.00%
marlboro,0.01%,0.00%,0.02%,0.00%,-2.30%,0.00%,0.00%,0.00%
virginia slims,0.01%,0.00%,0.02%,0.00%,0.03%,-2.25%,0.00%,0.00%
winston,0.01%,0.00%,0.01%,0.00%,0.04%,0.00%,-1.44%,0.00%
winston select,0.00%,0.00%,0.02%,0.00%,0.02%,0.00%,0.01%,-1.91%


Unnamed: 0,benson & hedges,doral,generic,kool,marlboro,virginia slims,winston,winston select
benson & hedges,-0.0003,,0.0,0.0,0.0,0.0,0.0,0.0
doral,,-0.0,0.0,,,,0.0,
generic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kool,0.0,,0.0,-0.0002,0.0,0.0,0.0,0.0
marlboro,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
virginia slims,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
winston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
winston select,0.0,,0.0,0.0,0.0,0.0,0.0,0.0


## Incorporate wholesale prices from Maxwell report 
https://www.industrydocuments.ucsf.edu/tobacco/documents/viewer/?iid=kzkx0049&id=kzkx0049&q=q%3Dnull%2Call%2Ccontains%2Cbox%3A%221125%22&db-set=documents&industry=tobacco&sort=relevance&pg=1&npp=20&rtool=metadata

The OCR the did not pan out :(

# Counterfactuals

In the first counterfactual, the tax does *not* get raised to 44c in 1993, and thus cigs are 14c cheaper. In the second counterfactual, the Illinois government is as paternalistic (revenue-hungry?) as their neighbors in Michigan, and sets the tax rate at 75c. We count govt revenue and social surplus separatedly. 

Social cost" refers to a quantification of the externality/tax burden created by a smoker. This varies from state to state depending on local conditions (eg welfare, employment, avg age). We do not have data for 1990, but according to a study conducted in 2018, the difference between the social cost of an active smoker and those of a former smoker was 2986$/yr in 2018 dollars, or 1,810.63/yr in 1995 dollars. 

We want to notice that 1. the study only counts productivity losses associated with smoking, the actual welfare loss is necessarily much higher 2. Both the quantity consumed by the average smoker, and the tolerance for smoking in public places, has declined significantly since 2018. Thus this recent estimation is likely very conservative and provides a lower-bound for the social surplus. 

https://pmc.ncbi.nlm.nih.gov/articles/PMC10108669/

In [166]:
df.columns

Index(['store', 'month_idx', 'prod_key', 'prod_type', 'prod_id', 'year52', 'm4', 'month_label', 'total_packs', 'total_rev', 'packs_per_item_wavg', 'price',
       'menthol', 'dlx', 'special', 'supslim', 'slim', 'single', 'carton', 'pack_kw', 'value', 'generic_automated', 'generic_hardcoded', 'cigar', 'snuff',
       'flavored', 'premium', 'cigarettes', 'ok', 'sale', 'tar_mean', 'nic_mean', 'co_mean', 'income', 'educ', 'hsizeavg', 'age9', 'age60', 'ethnic', 'nocar',
       'implied discount', 'brand', 'size', 'pack', 'brand_clean', 'upc_norm', 'market_size_month', 'prod_mkt_share', 'year_quarter', 'quarter_id',
       'custcount', 'market_size_quarter', 's0', 'nest', 'ln_s', 'ln_s_within'],
      dtype='object')

In [174]:
import numpy as np, pandas as pd

# ------------------------------------------------------------
# Inputs expected in df2 (your estimation sample):
# - keys: ["store","quarter_id","nest","prod_id"]
# - baseline cols: "prod_mkt_share" (s_j), "s0", "price"
# Optional:
# - market size column, e.g. "market_size" (units of potential consumers or total quantity base)
# - counterfactual prices in "price_cf"
# If "price_cf" is missing, you can pass a scalar tax or a dict of deltas to the function.
# ------------------------------------------------------------

def simulate_nested_welfare(
    df2,
    alpha,                 # price coefficient (negative)
    sigma,                 # nesting (0 < sigma < 1)
    *,
    tax_scalar=None,       # e.g., +0.50 (add-on to all prices); mutually exclusive with price_delta_map
    price_delta_map=None,  # dict {prod_id: Δp} to add product-specific deltas
    price_cf_col="price_cf",
    market_keys=("store","quarter_id"),
    market_size_col="market_size_quarter"   # if provided, Δ outside *units* is returned; else Δ outside share
):
    if not (0 < sigma < 1):
        raise ValueError("sigma should be in (0,1) for nested logit aggregation.")
    one_minus_sigma = 1.0 - sigma

    # Work copy
    d = df2.copy()

    # Ensure required logs exist (recompute robustly)
    # ln s_j - ln s_0
    d["ln_s"] = np.log(np.clip(d["prod_mkt_share"], 1e-15, 1)) - np.log(np.clip(d["s0"], 1e-15, 1))
    # ln s_{j|g} = ln s_j - ln (sum_{k in g} s_k)
    sg = d.groupby(list(market_keys)+["nest"], observed=True)["prod_mkt_share"].transform("sum")
    d = d[sg > 0].copy()
    d["ln_s_within"] = np.log(np.clip(d["prod_mkt_share"], 1e-15, 1)) - np.log(np.clip(sg, 1e-15, 1))

    # Invert baseline mean utility δ_hat from shares: δ_j = (ln s_j - ln s_0) - sigma * ln s_{j|g}
    d["delta_base"] = d["ln_s"] - sigma * d["ln_s_within"]

    # Build counterfactual prices
    p_new = d["price"].astype(float).copy()
    if price_cf_col in d.columns:
        p_new = d[price_cf_col].astype(float)
    else:
        if tax_scalar is not None:
            p_new = p_new + float(tax_scalar)
        if price_delta_map is not None:
            # add product-specific deltas where provided
            idxer = d["prod_id"].map(price_delta_map).fillna(0.0).astype(float)
            p_new = p_new + idxer

    # Counterfactual δ: δ_new = δ_base - α * (p_new - p_base)
    d["delta_new"] = d["delta_base"] + alpha * (p_new - d["price"].astype(float))
    # Helper to compute nested-logit shares from δ within each market:
    def nl_shares_for_market(frame):
        # frame has rows for a single market with columns: nest, delta_base, delta_new
        out = frame.copy()

        def shares_from_delta(colname):
            # D_g = sum_{j in g} exp( δ_j / (1-σ) )
            ex = np.exp(out[colname] / one_minus_sigma)
            Dg = ex.groupby(out["nest"]).transform("sum")
            # Group mass S_g = D_g^{1-σ} / (1 + sum_h D_h^{1-σ})
            Dg_group = ex.groupby(out["nest"]).sum()
            group_mass = (Dg_group ** one_minus_sigma).sum()
            s0 = 1.0 / (1.0 + group_mass)
            Sg = (Dg ** one_minus_sigma) * s0  # broadcast S_g to rows via Dg

            # Within-nest share s_{j|g} = exp(δ/(1-σ)) / D_g
            sj_given_g = ex / Dg

            # Product share: s_j = s_{j|g} * S_g
            sj = sj_given_g * Sg
            return sj, s0

        sj_base, s0_base = shares_from_delta("delta_base")
        sj_new,  s0_new  = shares_from_delta("delta_new")

        out["s_base_pred"] = sj_base
        out["s_new_pred"]  = sj_new
        # attach s0 once per market (duplicate per row for convenience)
        out["s0_base_pred"] = s0_base
        out["s0_new_pred"]  = s0_new
        return out

    # Apply per market
    d = d.groupby(list(market_keys), observed=True, group_keys=False).apply(nl_shares_for_market)

    # Consumer surplus per market (per consumer):
    # CS = (1/(-alpha)) * ln( 1 + sum_g [ sum_{j in g} exp(δ_j/(1-σ)) ]^{1-σ} )
    def cs_from_delta(frame, colname):
        ex = np.exp(frame[colname] / one_minus_sigma)
        Dg = ex.groupby(frame["nest"]).sum()
        logsum = np.log(1.0 + np.power(Dg, one_minus_sigma).sum())
        return float(logsum / (-alpha))

    cs_rows = []
    for mk, g in d.groupby(list(market_keys), observed=True):
        cs_base = cs_from_delta(g, "delta_base")
        cs_new  = cs_from_delta(g, "delta_new")
        s0_b    = float(g["s0_base_pred"].iloc[0])
        s0_n    = float(g["s0_new_pred"].iloc[0])

        # Δ outside option (units if market_size_col provided, else share)
        if (market_size_col is not None) and (market_size_col in g.columns):
            M = float(g[market_size_col].iloc[0])
            delta_outside_units = (s0_n - s0_b) * M
        else:
            M = np.nan
            delta_outside_units = (s0_n - s0_b)  # share-point change

        cs_rows.append({
            **{k:v for k,v in zip(market_keys, mk if isinstance(mk, tuple) else (mk,))},
            "CS_base": cs_base,
            "CS_new": cs_new,
            "dCS": cs_new - cs_base,
            "s0_base_pred": s0_b,
            "s0_new_pred": s0_n,
            "delta_outside": s0_n - s0_b,
            "market_size_used": M
        })
    market_summary = pd.DataFrame(cs_rows)

    # Totals
    if (market_size_col is not None) and (market_size_col in d.columns):
        # Unit-weighted (by market size) totals
        w = market_summary["market_size_used"].fillna(0.0)
        total_dCS = float((market_summary["dCS"] * w).sum())
        total_delta_outside = float(market_summary["delta_outside"].sum())
        total_note = "Weighted by provided market sizes"
    else:
        # Simple averages across markets (per consumer CS)
        total_dCS = float(market_summary["dCS"].mean())
        total_delta_outside = float(market_summary["delta_outside"].mean())
        total_note = "Averaged across markets (no market size provided)"

    totals = {
        "total_dCS": total_dCS,
        "total_delta_outside": total_delta_outside,
        "note": total_note
    }

    # Also return product-level predicted shares if you want them
    product_shares = d[list(market_keys)+["nest","prod_id","s_base_pred","s_new_pred","s0_base_pred","s0_new_pred"]].copy()

    return market_summary, totals, product_shares

# -----------------------------
# Example usage
# -----------------------------
alpha = float(iv.params["price"])
sigma = float(iv.params["ln_s_within"])

# Option A: everyone gets a +$0.50 tax
mkt_sum, totals, sh = simulate_nested_welfare(df2, alpha, sigma, tax_scalar=0.50)

# Option B: per-product deltas (e.g., a dict of UPC->Δp)
# price_delta = {"012345678901": 0.20, "098765432109": 0.10}
# mkt_sum, totals, sh = simulate_nested_welfare(df2, alpha, sigma, price_delta_map=price_delta)

# Option C: if you pre-computed df2["price_cf"], just call:
# mkt_sum, totals, sh = simulate_nested_welfare(df2, alpha, sigma)

print("\nPer-market summary (first rows):")
print(mkt_sum.head())
print("\nTotals:")
print(totals)



Per-market summary (first rows):
   store  quarter_id   CS_base    CS_new       dCS  s0_base_pred  s0_new_pred  delta_outside  market_size_used
0      2           1  0.039945  0.025569 -0.014376      0.964482     0.977117       0.012635     130056.541300
1      2           2  0.027723  0.017710 -0.010013      0.975214     0.984094       0.008880     723371.352870
2      2           3  0.036980  0.023659 -0.013320      0.967075     0.978808       0.011733     107830.291500
3      2           4  0.024935  0.015922 -0.009013      0.977679     0.985689       0.008010     677430.358277
4      2           5  0.018595  0.011861 -0.006734      0.983307     0.989319       0.006013     615094.484748

Totals:
{'total_dCS': -7988221.85213534, 'total_delta_outside': 12.480918828988365, 'note': 'Weighted by provided market sizes'}


In [None]:
import pandas as pd

# -----------------------------
# 1) Compute per-capita averages
# -----------------------------
# market_summary is the per-market output from simulate_nested_welfare(...)

mkt_sum, totals, sh = simulate_nested_welfare(df2, alpha, sigma, tax_scalar = 0.40)


# weighted average ΔCS per consumer (decrease in CS PER PACK, times average packs per month illinois)
wa_dCS = (
    (mkt_sum["dCS"] * mkt_sum["market_size_used"]).sum()
    / mkt_sum["market_size_used"].sum()
)*(6.685*12 *0.2325)

# weighted average change in outside-option share (Δ in share of non-bought PACKS 
wa_dS0 = (
    (mkt_sum["delta_outside"] * mkt_sum["market_size_used"]).sum()
    / mkt_sum["market_size_used"].sum()
)

# -----------------------------
# 2) Scale up to Illinois totals
# -----------------------------
ILLINOIS_SMOKERS = 11_450_000 * 0.2325          # ≈ 2.661 million smokers
COST_PER_SMOKER  = 1_810.63                     # yearly external social cost (USD)

# total consumer surplus change (aggregate for all consumers)
total_CS = wa_dCS * ILLINOIS_SMOKERS            # USD

# change in number of smokers (negative if fewer smokers)
delta_smokers = wa_dS0 * ILLINOIS_SMOKERS       # persons

# social surplus gain from reduced smoking
social_gain = delta_smokers * COST_PER_SMOKER  # USD (minus sign → fewer smokers increases welfare)

# net welfare effect = social gain − consumer surplus loss
net_effect = social_gain + total_CS

# -----------------------------
# 3) Build publication-ready table
# -----------------------------
rows = [
    ["Weighted avg. ΔCS (per consumer, $)", f"{wa_dCS:,.2f}"],
    ["Weighted avg. Δ outside share", f"{wa_dS0:.5f}"],
    ["Illinois smokers (millions × share)", f"{ILLINOIS_SMOKERS/1e6:,.3f}"],
    ["Aggregate ΔCS (Illinois, $)", f"{total_CS:,.0f}"],
    ["Δ smokers (individuals)", f"{delta_smokers:,.0f}"],
    ["Social gain from reduced smoking ($)", f"{social_gain:,.0f}"],
    ["Net welfare = social − CS ($)", f"{net_effect:,.0f}"],
]
table = pd.DataFrame(rows, columns=["Component", "Value"])

# pretty print
print("\n=== Welfare Decomposition: Statewide Effects of Cigarette Tax (Nested Logit) ===")
print(table.to_string(index=False))
