# $\text{Statistics}$

In [175]:
import pandas as pd

# === set your path ===
df = pd.read_csv("cigarettes.csv", low_memory=False, encoding="utf-8")

df

Unnamed: 0,upc,store,week,move,price,qty,profit,sale,ok,quantity,...,premium,implied discount,custcount,income,educ,hsizeavg,age9,age60,ethnic,nocar
0,190,8,195,2.0,21.51,1.0,50.49,,1,2.0,...,0.0,0.0,3395.0,10.597010,0.095173,2.769603,0.123155,0.252394,0.035243,0.075113
1,190,21,195,1.0,2.00,1.0,22.86,,1,1.0,...,0.0,0.0,2034.0,10.716194,0.177503,3.110391,0.175926,0.066896,0.105039,0.017598
2,190,32,195,2.0,21.51,1.0,50.49,,1,2.0,...,0.0,0.0,4109.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
3,190,32,196,1.0,21.51,1.0,50.49,,1,1.0,...,0.0,0.0,4339.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
4,190,32,198,1.0,21.51,1.0,50.48,,1,1.0,...,0.0,0.0,4740.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801465,8640912356,139,384,3.0,1.69,1.0,1.42,,1,3.0,...,1.0,0.0,2244.0,,,,,,,
1801466,8640912356,139,386,2.0,1.69,1.0,1.42,,1,2.0,...,1.0,0.0,2388.0,,,,,,,
1801467,8640912356,139,387,1.0,1.69,1.0,1.42,,1,1.0,...,1.0,0.0,2586.0,,,,,,,
1801468,8640912356,139,392,2.0,1.69,1.0,1.42,,1,2.0,...,1.0,0.0,2645.0,,,,,,,


In [62]:
# normalize brand to string and strip spaces
df["brand"] = df["brand"].astype("string").str.strip()

# coverage before drop
total = len(df)
has_brand = df["brand"].notna() & (df["brand"] != "")
has_category = (
    has_brand
    | (df["generic_hardcoded"] > 0)
    | (df["cigar"] > 0)
    | (df["snuff"] > 0)
    | (df["loose tobacco"] > 0)   # column name with a space is fine in []
)
n_has = has_category.sum()
print(f"Total rows: {total:,}")
print(f"Rows with brand (non-blank): {n_has:,}  ({n_has/total:.2%})")

# drop unlabeled (blank) rows
df = df.loc[has_category]
df

Total rows: 1,801,470
Rows with brand (non-blank): 1,698,355  (94.28%)


Unnamed: 0,upc,store,week,move,price,qty,profit,sale,ok,quantity,...,premium,implied discount,custcount,income,educ,hsizeavg,age9,age60,ethnic,nocar
0,190,8,195,2.0,21.51,1.0,50.49,,1,2.0,...,0.0,0.0,3395.0,10.597010,0.095173,2.769603,0.123155,0.252394,0.035243,0.075113
1,190,21,195,1.0,2.00,1.0,22.86,,1,1.0,...,0.0,0.0,2034.0,10.716194,0.177503,3.110391,0.175926,0.066896,0.105039,0.017598
2,190,32,195,2.0,21.51,1.0,50.49,,1,2.0,...,0.0,0.0,4109.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
3,190,32,196,1.0,21.51,1.0,50.49,,1,1.0,...,0.0,0.0,4339.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
4,190,32,198,1.0,21.51,1.0,50.48,,1,1.0,...,0.0,0.0,4740.0,10.674475,0.198260,2.401154,0.099061,0.254953,0.031939,0.071701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801465,8640912356,139,384,3.0,1.69,1.0,1.42,,1,3.0,...,1.0,0.0,2244.0,,,,,,,
1801466,8640912356,139,386,2.0,1.69,1.0,1.42,,1,2.0,...,1.0,0.0,2388.0,,,,,,,
1801467,8640912356,139,387,1.0,1.69,1.0,1.42,,1,1.0,...,1.0,0.0,2586.0,,,,,,,
1801468,8640912356,139,392,2.0,1.69,1.0,1.42,,1,2.0,...,1.0,0.0,2645.0,,,,,,,


In [15]:
# --- brand distribution ---
brand_counts = df["brand"].value_counts(dropna=True)
brand_share = (brand_counts / brand_counts.sum() * 100).round(2)

summary = pd.DataFrame({
    "count": brand_counts,
    "share(%)": brand_share
})

# display top 3 brands
print("Top brands by frequency:")
display(summary)

# summary stats
print("\nNumber of unique brands:", summary.shape[0])
print("Total observations:", len(df))
print("Top 3 share cumulative:", brand_share.head(3).sum().round(2), "%")

Top brands by frequency:


Unnamed: 0_level_0,count,share(%)
brand,Unnamed: 1_level_1,Unnamed: 2_level_1
Marlboro,41518,41.36
Benson & Hedges,22232,22.15
Kool,14660,14.6
Virginia Slims,13798,13.74
Winston,6625,6.6
Basic,1553,1.55
Doral,1,0.0



Number of unique brands: 7
Total observations: 100387
Top 3 share cumulative: 78.11 %


# $\text{Benchmark}$

In [102]:
import pandas as pd
import numpy as np

# ================================
# Helpers
# ================================
def to_num_col(df, col, default=0):
    """Always return a numeric Series (even if the column is missing)."""
    if col in df.columns:
        return pd.to_numeric(df[col], errors="coerce").fillna(default)
    return pd.Series(default, index=df.index, dtype="float64")

def coerce_binary(series):
    """Coerce possibly-object/boolean string to numeric 0/1 int8."""
    s_num = pd.to_numeric(series, errors="coerce")
    s_txt = (series.astype("string").str.upper().str.strip()
             .map({"Y":1, "YES":1, "T":1, "TRUE":1,
                   "N":0, "NO":0, "F":0, "FALSE":0}))
    s = s_num.fillna(s_txt).fillna(0)
    return (s > 0).astype("int8")

def first_nonnull(s):
    return s.dropna().iloc[0] if s.notna().any() else np.nan

def pick_col(cols, *cands):
    for c in cands:
        if c in cols: return c
    low = {c.lower(): c for c in cols}
    for c in cands:
        if c.lower() in low: return low[c.lower()]
    return None

# ================================
# 1) Filter to tobacco categories you want
# ================================
brand_nonempty = df["brand"].astype("string").str.strip().ne("").fillna(False)
has_category = (
      brand_nonempty
    | (to_num_col(df, "generic_hardcoded") > 0)
    | (to_num_col(df, "cigar") > 0)
    | (to_num_col(df, "snuff") > 0)
    | (to_num_col(df, "loose tobacco") > 0)
)
df = df.loc[has_category].copy()

# generic are cigarettes by definition; make a clean cigarettes flag
cig_f = (to_num_col(df, "cigarettes") > 0).astype(int)
is_generic = (to_num_col(df, "generic_hardcoded") > 0)
cig_f = np.where(is_generic, 1, cig_f)
df["cigarettes"] = cig_f

# ================================
# 2) Normalize keys & validity
# ================================
df["store"] = pd.to_numeric(df["store"], errors="coerce").astype("Int64")
df["week"]  = pd.to_numeric(df["week"],  errors="coerce").astype("Int64")
# UPC (digits only) – this is the product id for branded cigarettes
df["upc_norm"] = df["upc"].astype("string").str.replace(r"\D", "", regex=True)
df = df.loc[df["store"].notna() & df["week"].notna() & df["upc_norm"].notna()].copy()

# ================================
# 3) Convert to PACKS (handle cartons / 10PK / 10CT)
# ================================
carton_col = pick_col(df.columns, "carton", "Carton", "CARTON")

s = df["size"].astype("string").str.upper().str.replace(r"\s+", " ", regex=True).str.strip()
is_10ct   = s.str.contains(r"\b10\s*CT\b", regex=True, na=False)
is_10pk   = s.str.contains(r"\b10\s*PK\b", regex=True, na=False)
is_carton = s.str.contains(r"\bCARTON\b",  regex=True, na=False)
looks_10  = is_10ct | is_10pk | is_carton

if carton_col is None:
    df["packs_per_item"] = np.where(looks_10, 10.0, 1.0)
else:
    c = to_num_col(df, carton_col).astype(int)
    df["packs_per_item"] = np.where(c == 1, 10.0, 1.0)
    df.loc[(c != 1) & looks_10, "packs_per_item"] = 10.0

df["price"] = to_num_col(df, "price")
df["move"]  = to_num_col(df, "move")
df["qty"]   = to_num_col(df, "qty")

df = df.loc[(df["qty"] > 0) & (df["packs_per_item"] > 0)].copy()
df["row_revenue"] = df["price"] * df["move"] / df["qty"]
df["pack_sales"]  = df["move"]  * df["packs_per_item"]

# ================================
# 4) Time: 13-week quarters aligned to min week
# ================================
base_week = int(df["week"].min())
df["year52"] = ((df["week"] - base_week) // 52 + 1).astype("Int64")
df["qtr13"]  = (((df["week"] - base_week) % 52) // 13 + 1).astype("Int64")
df["quarter_idx"] = ((df["year52"] - 1) * 4 + df["qtr13"]).astype("Int64")

# ================================
# 5) Product definition
#    - Branded cigarettes:   product_id = UPC
#    - Generic cigarettes:   product_id = "generic"
#    - Cigar/Snuff/Loose:    product_id = "cigar"/"snuff"/"loose_tobacco"
# ================================
df["brand_clean"] = (
    df["brand"].astype("string")
      .str.strip()
      .str.replace(r"\s+", " ", regex=True)
      .str.lower()
)
has_brand = df["brand_clean"].ne("").fillna(False)

is_cigar = (to_num_col(df, "cigar") > 0)
is_snuff = (to_num_col(df, "snuff") > 0)
is_loose = (to_num_col(df, "loose tobacco") > 0)
is_cig   = (df["cigarettes"] > 0) | is_generic | has_brand

# product family (lower-case tokens)
df["prod_type"] = np.select(
    [is_cigar,  is_snuff,  is_loose,  is_cig],
    ["cigar",   "snuff",   "loose_tobacco", "cigarette"],
    default="cigarette",
)

# product id per your rule
df["prod_id"] = np.where(
    (df["prod_type"] == "cigarette") & has_brand, df["upc_norm"],
    np.where((df["prod_type"] == "cigarette") & (~has_brand) & is_generic, "generic",
             np.where(df["prod_type"] == "cigar", "cigar",
                      np.where(df["prod_type"] == "snuff", "snuff",
                               np.where(df["prod_type"] == "loose_tobacco", "loose_tobacco", "unbranded"))))
)

df["prod_key"] = df["prod_type"] + "|" + df["prod_id"]

# ================================
# 6) Prepare characteristics (coerce dummies to 0/1)
# ================================
known_dummies = [
    "menthol","dlx","special","supslim","slim","generic","single","carton","pack_kw","value",
    "generic_automated","generic_hardcoded","cigar","snuff","loose_tobacco","flavored","premium",
    "cigarettes","ok","sale"
]
dummy_cols = [c for c in known_dummies if c in df.columns]
for c in dummy_cols:
    df[c] = coerce_binary(df[c])

known_continuous = [
    "tar_mean","nic_mean","co_mean",
    "income","educ","hsizeavg","age9","age60","ethnic","nocar","custcount"
]
if "implied discount" in df.columns:
    known_continuous.append("implied discount")

cat_cols = [c for c in ["brand","size","pack"] if c in df.columns]

# ================================
# 7) Aggregate to product × (store, quarter)
# ================================
group_cols = ["store","quarter_idx","year52","qtr13","prod_key","prod_type","prod_id"]

agg_dict = {
    "pack_sales": ("pack_sales","sum"),
    "row_revenue": ("row_revenue","sum"),
}
for c in dummy_cols:
    agg_dict[c] = (c, "max")
for c in known_continuous:
    agg_dict[c] = (c, first_nonnull)
for c in cat_cols + ["brand_clean","upc_norm"]:
    if c in df.columns:
        agg_dict[c] = (c, first_nonnull)

prod_market_q = (
    df.groupby(group_cols, as_index=False, observed=True)
      .agg(**agg_dict)
      .rename(columns={"pack_sales":"total_packs","row_revenue":"total_rev"})
)

# sales-weighted packs_per_item (handles mix of packs/cartons)
if "packs_per_item" in df.columns:
    w = (
        df.groupby(group_cols, observed=True)
          .apply(lambda g: np.average(g["packs_per_item"], weights=g["pack_sales"])
                 if g["pack_sales"].sum() > 0 else np.nan)
          .reset_index(name="packs_per_item_wavg")
    )
    prod_market_q = prod_market_q.merge(w, on=group_cols, how="left")

# price per pack
prod_market_q["avg_pack_price"] = prod_market_q["total_rev"] / prod_market_q["total_packs"]
prod_market_q.loc[~np.isfinite(prod_market_q["avg_pack_price"]), "avg_pack_price"] = np.nan

# ================================
# 8) Market size per quarter = 1.5 × (max store total in that quarter across stores)
# ================================
store_qtr_total = (
    prod_market_q.groupby(["store","quarter_idx"], observed=True)["total_packs"]
                 .sum()
                 .reset_index(name="store_quarter_total_packs")
)
qtr_max_store = (
    store_qtr_total.groupby("quarter_idx", observed=True)["store_quarter_total_packs"]
                   .max()
                   .reset_index(name="max_store_total_in_quarter")
)
qtr_max_store["market_size_quarter"] = 1.5 * qtr_max_store["max_store_total_in_quarter"]

prod_market_q = prod_market_q.merge(
    qtr_max_store[["quarter_idx","market_size_quarter"]],
    on="quarter_idx", how="left", validate="many_to_one"
)

prod_market_q["prod_mkt_share"] = prod_market_q["total_packs"] / prod_market_q["market_size_quarter"]

# ================================
# 9) Diagnostics
# ================================
n_markets = df[["store","quarter_idx"]].drop_duplicates().shape[0]
n_rows = prod_market_q.shape[0]
print(f"Product–market rows: {n_rows:,}")
print(f"Store–quarter markets: {n_markets:,}")
print(f"Avg products per market: {n_rows / max(n_markets,1):.2f}")

counts = prod_market_q.groupby(["store","quarter_idx"]).size()
print(counts.describe())

sum_share_storeq = (
    prod_market_q.groupby(["store","quarter_idx"], observed=True)["prod_mkt_share"]
                 .sum()
)
print("Mean ∑ shares per (store,quarter):", float(sum_share_storeq.mean()))
print("Max  ∑ shares per (store,quarter):",  float(sum_share_storeq.max()))

# Final DataFrame: prod_market_q
prod_market_q


Product–market rows: 34,565
Store–quarter markets: 2,556
Avg products per market: 13.52
count    2556.000000
mean       13.523083
std        21.670885
min         1.000000
25%         1.000000
50%         1.000000
75%        16.000000
max        83.000000
dtype: float64
Mean ∑ shares per (store,quarter): 0.27195963274752066
Max  ∑ shares per (store,quarter): 0.6666666666666667


  .apply(lambda g: np.average(g["packs_per_item"], weights=g["pack_sales"])


Unnamed: 0,store,quarter_idx,year52,qtr13,prod_key,prod_type,prod_id,total_packs,total_rev,menthol,...,implied discount,brand,size,pack,brand_clean,upc_norm,packs_per_item_wavg,avg_pack_price,market_size_quarter,prod_mkt_share
0,2,1,1,1,cigarette|generic,cigarette,generic,15875.0,30036.42,0,...,0.0,,1 CT,UNK,,193,2.672441,1.892058,83493.0,0.190136
1,2,2,1,2,cigarette|generic,cigarette,generic,16143.0,31889.01,0,...,0.0,,1 CT,UNK,,193,2.616800,1.975408,66259.5,0.243633
2,2,3,1,3,cigarette|generic,cigarette,generic,12707.0,25139.76,0,...,0.0,,1 CT,UNK,,193,2.494452,1.978418,66882.0,0.189991
3,2,4,1,4,cigarette|generic,cigarette,generic,13441.0,26578.98,0,...,0.0,,1 CT,UNK,,193,2.359274,1.977456,69153.0,0.194366
4,2,5,2,1,cigarette|generic,cigarette,generic,8809.0,17573.58,0,...,0.0,,1 CT,UNK,,193,1.572142,1.994957,64165.5,0.137286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34560,146,31,8,3,cigarette|2820011110,cigarette,2820011110,20.0,43.98,0,...,0.0,Benson & Hedges,10 PK,UNK,benson & hedges,2820011110,10.000000,2.199000,13425.0,0.001490
34561,146,31,8,3,cigarette|2820011600,cigarette,2820011600,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011600,10.000000,2.199000,13425.0,0.000745
34562,146,31,8,3,cigarette|2820011620,cigarette,2820011620,30.0,65.97,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011620,10.000000,2.199000,13425.0,0.002235
34563,146,31,8,3,cigarette|2820012100,cigarette,2820012100,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820012100,10.000000,2.199000,13425.0,0.000745


In [156]:
# Impute the tar, nic, co for generic

# Work on a copy
pm = prod_market_q.copy()

# Target TNCO columns (keep only those present)
targets = [c for c in ["tar_mean","nic_mean","co_mean"] if c in pm.columns]
if not targets:
    raise ValueError("None of tar_mean/nic_mean/co_mean are present in prod_market_q.")

# Ensure numeric dtype
for c in targets:
    pm[c] = pd.to_numeric(pm[c], errors="coerce")

# --- Identify generic cigarettes (robust) ---
is_cig = pm["prod_type"].astype(str).str.lower().eq("cigarette")

is_generic_pid  = ("prod_id" in pm.columns) and pm["prod_id"].astype(str).str.lower().eq("generic")
is_generic_tok  = ("brand_token" in pm.columns) and pm["brand_token"].astype(str).str.lower().eq("generic")
is_generic_flag = ("generic_hardcoded" in pm.columns) and pd.to_numeric(pm["generic_hardcoded"], errors="coerce").fillna(0).gt(0)

# Convert potential booleans/False into Series of False if needed
def _ensure_series(x):
    return x if isinstance(x, pd.Series) else pd.Series(False, index=pm.index)

is_generic = is_cig & (_ensure_series(is_generic_pid) | _ensure_series(is_generic_tok) | _ensure_series(is_generic_flag))
donors     = is_cig & ~is_generic

# --- Global means from non-generic cigarettes only ---
global_means = pm.loc[donors, targets].mean(skipna=True)

# Optional sanity check
print("Global TNCO means from non-generic cigarettes:")
print(global_means.to_string())

# --- Assign time-invariant TNCO to ALL generic cigarette rows ---
for c in targets:
    pm.loc[is_generic, c] = float(global_means[c])
    # add a provenance flag (0/1) so you can track these later
    pm.loc[is_generic, f"{c}_imputed_global_nongeneric_mean"] = 1
    pm.loc[~is_generic, f"{c}_imputed_global_nongeneric_mean"] = 0

# Resulting frame with generic TNCO set to global non-generic means
prod_market_q_imputed = pm

# Quick report
n_gen = int(is_generic.sum())
print(f"\nGeneric cigarette rows updated: {n_gen:,}")
for c in targets:
    after_na = prod_market_q_imputed.loc[is_generic, c].isna().mean()
    print(f"{c}: generic NA rate AFTER = {after_na:.1%} (should be 0.0% if mean was defined)")

Global TNCO means from non-generic cigarettes:
tar_mean    13.646174
nic_mean     0.991526
co_mean     12.732814

Generic cigarette rows updated: 3,436
tar_mean: generic NA rate AFTER = 0.0% (should be 0.0% if mean was defined)
nic_mean: generic NA rate AFTER = 0.0% (should be 0.0% if mean was defined)
co_mean: generic NA rate AFTER = 0.0% (should be 0.0% if mean was defined)


In [169]:
import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# =========================
# 0) Start from your quarter panel
# =========================
df = prod_market_q.copy()
df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store","quarter_idx"]  # market = store × quarter

# Keep valid inside share and build log share ratio
sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()
df["s0"]    = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)                # outside share
df["log_s"] = np.log(df["prod_mkt_share"]) - np.log(df["s0"])

# Controls (use only those that exist in your frame)
ctrl_cands = ["dlx","supslim","slim","value","premium","flavored",
                       "tar_mean","nic_mean","co_mean", "carton"]
Xnames = ["price"] + [c for c in ctrl_cands if c in df.columns]

# Keep rows with no NA in used columns
need = ["log_s","price","store","quarter_idx"] + Xnames
df2 = df.dropna(subset=need).copy()

# A safe product id for IVs (UPC-level for branded)
if "prod_id" not in df2.columns:
    if "upc_norm" in df2.columns:
        df2["prod_id"] = df2["upc_norm"].astype("string")
    elif "upc" in df2.columns:
        df2["prod_id"] = df2["upc"].astype("string").str.replace(r"\D","", regex=True)
    else:
        # fallback (weak for IVs, but avoids a crash)
        df2["prod_id"] = df2.index.astype("string")

# =========================
# 1) MARKET FE (store×quarter) — absorb by within transform
# =========================
def demean_within(frame, cols, keys):
    """Within transform to absorb market FE."""
    means = frame.groupby(keys, observed=True)[cols].transform("mean")
    return frame[cols] - means

# Build X (price + controls), then demean X and y within market
X_raw   = df2[Xnames].copy()
y_raw   = df2["log_s"].copy()

X_tilde = demean_within(df2, Xnames, mkt)
y_tilde = (y_raw - df2.groupby(mkt, observed=True)["log_s"].transform("mean")).rename("log_s")

# Drop rows with NA after demeaning; drop any zero-variance columns
keep_idx = X_tilde.join(y_tilde).dropna().index
X_tilde, y_tilde, df2 = X_tilde.loc[keep_idx], y_tilde.loc[keep_idx], df2.loc[keep_idx].copy()
X_tilde = X_tilde.loc[:, X_tilde.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]

clusters = pd.to_numeric(df2["store"], errors="coerce").astype(int).to_numpy()

# -------------------------
# (A) OLS with MARKET FE
# -------------------------
ols = sm.OLS(y_tilde, X_tilde).fit(cov_type="cluster", cov_kwds={"groups": clusters})
print("\n[Simple Logit OLS | Market FE absorbed]")
print(ols.summary().tables[1])

alpha_ols = ols.params.get("price", np.nan)
eps_ols   = alpha_ols * df2["price"].mean() * (1 - df2["prod_mkt_share"].mean())
print(f"\nImplied avg own-price elasticity (OLS, simple logit): {eps_ols:.2f}")

# =========================
# 2) IV (2SLS) with MARKET FE
# Instruments:
#   z1: same UPC × same quarter, other-store mean price (best when available)
#   z2: same UPC, other-store mean price across ALL quarters (fallback)
#   z3: brand×quarter other-store mean (for branded cigs) else prod_type×quarter (fallback)
# All demeaned within market to match FE absorption.
# =========================

# Safe brand string (if you preserved it)
if "brand_clean" in df2.columns:
    brand_series = df2["brand_clean"]
elif "brand" in df2.columns:
    brand_series = df2["brand"]
else:
    brand_series = pd.Series("", index=df2.index)
brand = brand_series.astype("string").str.strip().str.lower().fillna("")
is_branded_cig = df2["prod_type"].str.lower().eq("cigarette") & brand.ne("")

# --- z1: UPC × quarter, leave-one-store mean price ---
g_uq = df2.groupby(["prod_id","quarter_idx"])
cnt_uq = g_uq["price"].transform("count")
sum_uq = g_uq["price"].transform("sum")
z1_raw = np.where(cnt_uq > 1, (sum_uq - df2["price"]) / (cnt_uq - 1), np.nan)
Z1 = demean_within(pd.concat([df2[mkt], pd.Series(z1_raw, index=df2.index, name="z1")], axis=1),
                   ["z1"], mkt)["z1"]

# --- z2: UPC across all quarters, leave-one-STORE mean ---
g_u  = df2.groupby(["prod_id"])
cnt_u, sum_u = g_u["price"].transform("count"), g_u["price"].transform("sum")
g_us = df2.groupby(["prod_id","store"])
cnt_us, sum_us = g_us["price"].transform("count"), g_us["price"].transform("sum")
z2_raw = np.where((cnt_u - cnt_us) > 0, (sum_u - sum_us) / (cnt_u - cnt_us), np.nan)
Z2 = demean_within(pd.concat([df2[mkt], pd.Series(z2_raw, index=df2.index, name="z2")], axis=1),
                   ["z2"], mkt)["z2"]

# --- z3: brand×quarter (branded cigs) or prod_type×quarter (others), leave-one-store mean ---
# brand×quarter
gbq     = df2.groupby(["brand_clean","quarter_idx"]) if "brand_clean" in df2.columns else None
if gbq is not None:
    cnt_bq, sum_bq = gbq["price"].transform("count"), gbq["price"].transform("sum")
    gbqs           = df2.groupby(["brand_clean","quarter_idx","store"])
    cnt_bqs, sum_bqs = gbqs["price"].transform("count"), gbqs["price"].transform("sum")
    z3b_raw = np.where(is_branded_cig & ((cnt_bq - cnt_bqs) > 0),
                       (sum_bq - sum_bqs) / (cnt_bq - cnt_bqs), np.nan)
else:
    z3b_raw = np.full(len(df2), np.nan)

# prod_type×quarter
gtq     = df2.groupby(["prod_type","quarter_idx"])
cnt_tq, sum_tq = gtq["price"].transform("count"), gtq["price"].transform("sum")
gtqs    = df2.groupby(["prod_type","quarter_idx","store"])
cnt_tqs, sum_tqs = gtqs["price"].transform("count"), gtqs["price"].transform("sum")
z3t_raw = np.where((~is_branded_cig) & ((cnt_tq - cnt_tqs) > 0),
                   (sum_tq - sum_tqs) / (cnt_tq - cnt_tqs), np.nan)

z3_raw = np.where(is_branded_cig, z3b_raw, z3t_raw)
Z3 = demean_within(pd.concat([df2[mkt], pd.Series(z3_raw, index=df2.index, name="z3")], axis=1),
                   ["z3"], mkt)["z3"]

# Combine instruments, drop non-varying/empty ones
Z = pd.concat([Z1, Z2, Z3], axis=1)
goodZ = [c for c in Z.columns if Z[c].notna().any() and Z[c].std(skipna=True) > 0]
Z = Z[goodZ]

# Align IV matrices and drop rows with NA in y/X/Z
data_iv = pd.concat([y_tilde, X_tilde, Z], axis=1).dropna()
y_iv    = data_iv["log_s"]
X_iv    = data_iv[X_tilde.columns]          # includes price + controls (demeaned)
Z_iv    = data_iv[goodZ]
clusters_iv = pd.to_numeric(df2.loc[data_iv.index, "store"], errors="coerce").astype(int).to_numpy()

# Guard against collinearity in exog (demeaned, no constant)
exog = X_iv.drop(columns=["price"])
exog = exog.loc[:, exog.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]

iv = IV2SLS(
    dependent=y_iv,
    exog=exog,                # no constant; within transform removed it
    endog=X_iv[["price"]],
    instruments=Z_iv
).fit(cov_type="clustered", clusters=clusters_iv)

print("\n[Simple Logit IV (2SLS) | Market FE absorbed | multi-source instruments]")
print(iv.summary)

# First-stage (for price), if available
try:
    print("\n[First Stage for price]")
    print(iv.first_stage["price"].summary)
except Exception:
    pass

alpha_iv = iv.params.get("price", np.nan)
eps_iv   = alpha_iv * df2.loc[data_iv.index, "price"].mean() * (1 - df2.loc[data_iv.index, "prod_mkt_share"].mean())
print(f"\nImplied avg own-price elasticity (IV, simple logit): {eps_iv:.2f}")



[Simple Logit OLS | Market FE absorbed]
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
price         -0.4810      0.064     -7.560      0.000      -0.606      -0.356
slim           0.7850      0.152      5.180      0.000       0.488       1.082
value         -1.3156      0.098    -13.423      0.000      -1.508      -1.124
premium       -0.4195      0.037    -11.397      0.000      -0.492      -0.347
flavored      -0.2469      0.046     -5.380      0.000      -0.337      -0.157
tar_mean       0.0743      0.053      1.401      0.161      -0.030       0.178
nic_mean      -3.1642      1.040     -3.042      0.002      -5.203      -1.126
co_mean       -0.1346      0.036     -3.762      0.000      -0.205      -0.064
carton         0.5972      0.052     11.512      0.000       0.495       0.699

Implied avg own-price elasticity (OLS, simple logit): -1.16

[Simple Logit IV (2SLS) | Ma

In [164]:
import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# =========================
# 0) Start & basic fields
# =========================
df = prod_market_q.copy()
df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store", "quarter_idx"]

# 2-way nest: cigarettes vs non-cigarettes
df["nest"] = np.where(df["prod_type"].str.lower().eq("cigarette"), "cig", "noncig")

# =========================
# 1) Shares for nested logit
# =========================
sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()

df["s0"] = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)
df["sg"] = df.groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")
df = df[df["sg"] > 0].copy()

df["ln_s"]        = np.log(df["prod_mkt_share"]) - np.log(df["s0"])
df["ln_s_within"] = np.log(df["prod_mkt_share"]) - np.log(df["sg"])   # ln(s_{j|g})

# controls (use only those that exist)
controls = [c for c in ["dlx","supslim","slim","value","premium","flavored",
                        "tar_mean","nic_mean","co_mean","carton"] if c in df.columns]
use_cols = ["price", "ln_s_within"] + controls

# =========================
# 2) Absorb MARKET FE: within transform
# =========================
def demean_within(df_in, cols, key_cols):
    means = df_in.groupby(key_cols, observed=True)[cols].transform("mean")
    return df_in[cols] - means

X_tilde = demean_within(df, use_cols, mkt)
y_tilde = (df["ln_s"] - df.groupby(mkt, observed=True)["ln_s"].transform("mean")).rename("ln_s")

# drop rows with NA post-demean
keep_idx = X_tilde.join(y_tilde).dropna().index
X_tilde, y_tilde, df2 = X_tilde.loc[keep_idx], y_tilde.loc[keep_idx], df.loc[keep_idx].copy()

# drop any columns with ~zero variance after demeaning
nzv = [c for c in X_tilde.columns if np.nanstd(X_tilde[c].to_numpy()) > 0]
X_tilde = X_tilde[nzv]
use_cols = [c for c in use_cols if c in nzv]  # keep in sync

clusters = pd.to_numeric(df2["store"], errors="coerce").astype(int).to_numpy()

# =========================
# 3) Instruments that survive coverage gaps
# =========================
# (a) UPC×quarter, leave-one-store mean price (z1)
g = df2.groupby(["prod_id", "quarter_idx"])
cnt_uq = g["price"].transform("count")
sum_uq = g["price"].transform("sum")
z1_raw = np.where(cnt_uq > 1, (sum_uq - df2["price"]) / (cnt_uq - 1), np.nan)
Z1 = demean_within(pd.concat([df2[mkt], pd.Series(z1_raw, index=df2.index, name="z1")], axis=1),
                   ["z1"], mkt)["z1"]

# (b) UPC, leave-one-STORE mean price over all quarters (z2)
gu  = df2.groupby(["prod_id"])
cnt_u, sum_u = gu["price"].transform("count"), gu["price"].transform("sum")
gus = df2.groupby(["prod_id","store"])
cnt_us, sum_us = gus["price"].transform("count"), gus["price"].transform("sum")
z2_raw = np.where((cnt_u - cnt_us) > 0, (sum_u - sum_us) / (cnt_u - cnt_us), np.nan)
Z2 = demean_within(pd.concat([df2[mkt], pd.Series(z2_raw, index=df2.index, name="z2")], axis=1),
                   ["z2"], mkt)["z2"]

# (c) brand×quarter (branded cigs) else prod_type×quarter (z3)
# safe brand series (no 'or' on Series)
if "brand_clean" in df2.columns:
    brand_series = df2["brand_clean"]
elif "brand" in df2.columns:
    brand_series = df2["brand"]
else:
    brand_series = pd.Series("", index=df2.index)

brand = brand_series.astype("string").str.strip().str.lower().fillna("")
is_branded_cig = df2["prod_type"].str.lower().eq("cigarette") & brand.ne("")

# brand×quarter leave-one-store mean
gbq = df2.groupby(["brand_clean","quarter_idx"])
cnt_bq, sum_bq = gbq["price"].transform("count"), gbq["price"].transform("sum")
gbqs = df2.groupby(["brand_clean","quarter_idx","store"])
cnt_bqs, sum_bqs = gbqs["price"].transform("count"), gbqs["price"].transform("sum")
z3b_raw = np.where(is_branded_cig & ((cnt_bq - cnt_bqs) > 0),
                   (sum_bq - sum_bqs) / (cnt_bq - cnt_bqs), np.nan)

# prod_type×quarter leave-one-store mean
gtq = df2.groupby(["prod_type","quarter_idx"])
cnt_tq, sum_tq = gtq["price"].transform("count"), gtq["price"].transform("sum")
gtqs = df2.groupby(["prod_type","quarter_idx","store"])
cnt_tqs, sum_tqs = gtqs["price"].transform("count"), gtqs["price"].transform("sum")
z3t_raw = np.where((~is_branded_cig) & ((cnt_tq - cnt_tqs) > 0),
                   (sum_tq - sum_tqs) / (cnt_tq - cnt_tqs), np.nan)

z3_raw = np.where(is_branded_cig, z3b_raw, z3t_raw)
Z3 = demean_within(pd.concat([df2[mkt], pd.Series(z3_raw, index=df2.index, name="z3")], axis=1),
                   ["z3"], mkt)["z3"]

# combine and keep instruments that exist and vary
Z = pd.concat([Z1, Z2, Z3], axis=1)
goodZ = [c for c in Z.columns if Z[c].notna().any() and Z[c].std(skipna=True) > 0]
Z = Z[goodZ]

# align for IV (drop rows with NA in y/X/Z)
data_iv = pd.concat([y_tilde, X_tilde, Z], axis=1).dropna()
y_iv    = data_iv["ln_s"]
X_iv    = data_iv[use_cols]                # includes price & ln_s_within + controls (demeaned)
Z_iv    = data_iv[goodZ]
clusters_iv = pd.to_numeric(df2.loc[data_iv.index, "store"], errors="coerce").astype(int).to_numpy()

# =========================
# 4) OLS (market FE absorbed) — reference
# =========================
ols = sm.OLS(y_iv, sm.add_constant(X_iv)).fit(cov_type="cluster",
                                              cov_kwds={"groups": clusters_iv})
print("\n[OLS Nested Logit | Market FE absorbed]")
print(ols.summary().tables[1])

# =========================
# 5) IV 2SLS (market FE absorbed) — NO constant (demeaned data)
# =========================
# guard against collinearity: exog must have full column rank
exog = X_iv.drop(columns=["price"])
# drop any zero-variance exog columns (rare but safe)
exog = exog.loc[:, exog.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]

iv = IV2SLS(
    dependent=y_iv,
    exog=exog,               # no constant: within transform removed it
    endog=X_iv[["price"]],
    instruments=Z_iv
).fit(cov_type="clustered", clusters=clusters_iv)
print("\n[IV Nested Logit | Market FE absorbed | multi-source instruments]")
print(iv.summary)

# First-stage for 'price' (if available)
try:
    print("\n[First Stage for price]")
    print(iv.first_stage["price"].summary)
except Exception:
    pass

# quick elasticity
sigma = iv.params.get("ln_s_within", np.nan)
alpha = iv.params.get("price", np.nan)
sbar   = df2.loc[data_iv.index, "prod_mkt_share"].mean()
sjgbar = (df2.loc[data_iv.index, "prod_mkt_share"] / df2.loc[data_iv.index, "sg"]).mean()
pbar   = df2.loc[data_iv.index, "price"].mean()
eps    = -alpha * pbar * (1 - sigma*(1 - sjgbar) - sbar)
print(f"\nσ (nesting): {sigma:.3f} | implied avg own-price elasticity ≈ {eps:.2f}")



[OLS Nested Logit | Market FE absorbed]
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0246      0.005      4.915      0.000       0.015       0.034
price           0.0129      0.003      3.943      0.000       0.007       0.019
ln_s_within     0.9982      0.000   2544.174      0.000       0.997       0.999
slim            0.0594      0.013      4.719      0.000       0.035       0.084
value           0.0036      0.003      1.204      0.229      -0.002       0.010
premium         0.0009      0.002      0.561      0.575      -0.002       0.004
flavored       -0.0046      0.002     -2.253      0.024      -0.009      -0.001
tar_mean        0.0181      0.004      4.624      0.000       0.010       0.026
nic_mean       -0.3636      0.078     -4.669      0.000      -0.516      -0.211
co_mean        -0.0117      0.003     -4.380      0.000      -0.017      -0.006

# $\text{Benchmark}:$ Alternative Market Definition (Drop those ill markets)

In [177]:
import pandas as pd
import numpy as np

# ================================
# Helpers
# ================================
def to_num_col(df, col, default=0):
    """Always return a numeric Series (even if the column is missing)."""
    if col in df.columns:
        return pd.to_numeric(df[col], errors="coerce").fillna(default)
    return pd.Series(default, index=df.index, dtype="float64")

def coerce_binary(series):
    """Coerce possibly-object/boolean string to numeric 0/1 int8."""
    s_num = pd.to_numeric(series, errors="coerce")
    s_txt = (series.astype("string").str.upper().str.strip()
             .map({"Y":1, "YES":1, "T":1, "TRUE":1,
                   "N":0, "NO":0, "F":0, "FALSE":0}))
    s = s_num.fillna(s_txt).fillna(0)
    return (s > 0).astype("int8")

def first_nonnull(s):
    return s.dropna().iloc[0] if s.notna().any() else np.nan

def pick_col(cols, *cands):
    for c in cands:
        if c in cols: return c
    low = {c.lower(): c for c in cols}
    for c in cands:
        if c.lower() in low: return low[c.lower()]
    return None

# ================================
# 1) Filter to tobacco categories you want
# ================================
brand_nonempty = df["brand"].astype("string").str.strip().ne("").fillna(False)
has_category = (
      brand_nonempty
    | (to_num_col(df, "generic_hardcoded") > 0)
    | (to_num_col(df, "cigar") > 0)
    | (to_num_col(df, "snuff") > 0)
    | (to_num_col(df, "loose tobacco") > 0)
)
df = df.loc[has_category].copy()

# generic are cigarettes by definition; make a clean cigarettes flag
cig_f = (to_num_col(df, "cigarettes") > 0).astype(int)
is_generic = (to_num_col(df, "generic_hardcoded") > 0)
cig_f = np.where(is_generic, 1, cig_f)
df["cigarettes"] = cig_f

# ================================
# 2) Normalize keys & validity
# ================================
df["store"] = pd.to_numeric(df["store"], errors="coerce").astype("Int64")
df["week"]  = pd.to_numeric(df["week"],  errors="coerce").astype("Int64")
# UPC (digits only) – this is the product id for branded cigarettes
df["upc_norm"] = df["upc"].astype("string").str.replace(r"\D", "", regex=True)
df = df.loc[df["store"].notna() & df["week"].notna() & df["upc_norm"].notna()].copy()

# ================================
# 3) Convert to PACKS (handle cartons / 10PK / 10CT)
# ================================
carton_col = pick_col(df.columns, "carton", "Carton", "CARTON")

s = df["size"].astype("string").str.upper().str.replace(r"\s+", " ", regex=True).str.strip()
is_10ct   = s.str.contains(r"\b10\s*CT\b", regex=True, na=False)
is_10pk   = s.str.contains(r"\b10\s*PK\b", regex=True, na=False)
is_carton = s.str.contains(r"\bCARTON\b",  regex=True, na=False)
looks_10  = is_10ct | is_10pk | is_carton

if carton_col is None:
    df["packs_per_item"] = np.where(looks_10, 10.0, 1.0)
else:
    c = to_num_col(df, carton_col).astype(int)
    df["packs_per_item"] = np.where(c == 1, 10.0, 1.0)
    df.loc[(c != 1) & looks_10, "packs_per_item"] = 10.0

df["price"] = to_num_col(df, "price")
df["move"]  = to_num_col(df, "move")
df["qty"]   = to_num_col(df, "qty")

df = df.loc[(df["qty"] > 0) & (df["packs_per_item"] > 0)].copy()
df["row_revenue"] = df["price"] * df["move"] / df["qty"]
df["pack_sales"]  = df["move"]  * df["packs_per_item"]

# ================================
# 4) Time: 13-week quarters aligned to min week
# ================================
base_week = int(df["week"].min())
df["year52"] = ((df["week"] - base_week) // 52 + 1).astype("Int64")
df["qtr13"]  = (((df["week"] - base_week) % 52) // 13 + 1).astype("Int64")
df["quarter_idx"] = ((df["year52"] - 1) * 4 + df["qtr13"]).astype("Int64")

# ================================
# 5) Product definition
#    - Branded cigarettes:   product_id = UPC
#    - Generic cigarettes:   product_id = "generic"
#    - Cigar/Snuff/Loose:    product_id = "cigar"/"snuff"/"loose_tobacco"
# ================================
df["brand_clean"] = (
    df["brand"].astype("string")
      .str.strip()
      .str.replace(r"\s+", " ", regex=True)
      .str.lower()
)
has_brand = df["brand_clean"].ne("").fillna(False)

is_cigar = (to_num_col(df, "cigar") > 0)
is_snuff = (to_num_col(df, "snuff") > 0)
is_loose = (to_num_col(df, "loose tobacco") > 0)
is_cig   = (df["cigarettes"] > 0) | is_generic | has_brand

# product family (lower-case tokens)
df["prod_type"] = np.select(
    [is_cigar,  is_snuff,  is_loose,  is_cig],
    ["cigar",   "snuff",   "loose_tobacco", "cigarette"],
    default="cigarette",
)

# product id per your rule
df["prod_id"] = np.where(
    (df["prod_type"] == "cigarette") & has_brand, df["upc_norm"],
    np.where((df["prod_type"] == "cigarette") & (~has_brand) & is_generic, "generic",
             np.where(df["prod_type"] == "cigar", "cigar",
                      np.where(df["prod_type"] == "snuff", "snuff",
                               np.where(df["prod_type"] == "loose_tobacco", "loose_tobacco", "unbranded"))))
)

df["prod_key"] = df["prod_type"] + "|" + df["prod_id"]

# ================================
# 6) Prepare characteristics (coerce dummies to 0/1)
# ================================
known_dummies = [
    "menthol","dlx","special","supslim","slim","generic","single","carton","pack_kw","value",
    "generic_automated","generic_hardcoded","cigar","snuff","loose_tobacco","flavored","premium",
    "cigarettes","ok","sale"
]
dummy_cols = [c for c in known_dummies if c in df.columns]
for c in dummy_cols:
    df[c] = coerce_binary(df[c])

known_continuous = [
    "tar_mean","nic_mean","co_mean",
    "income","educ","hsizeavg","age9","age60","ethnic","nocar","custcount"
]
if "implied discount" in df.columns:
    known_continuous.append("implied discount")

cat_cols = [c for c in ["brand","size","pack"] if c in df.columns]

# ================================
# 7) Aggregate to product × (store, quarter)
# ================================
group_cols = ["store","quarter_idx","year52","qtr13","prod_key","prod_type","prod_id"]

agg_dict = {
    "pack_sales": ("pack_sales","sum"),
    "row_revenue": ("row_revenue","sum"),
}
for c in dummy_cols:
    agg_dict[c] = (c, "max")
for c in known_continuous:
    agg_dict[c] = (c, first_nonnull)
for c in cat_cols + ["brand_clean","upc_norm"]:
    if c in df.columns:
        agg_dict[c] = (c, first_nonnull)

prod_market_q = (
    df.groupby(group_cols, as_index=False, observed=True)
      .agg(**agg_dict)
      .rename(columns={"pack_sales":"total_packs","row_revenue":"total_rev"})
)

# sales-weighted packs_per_item (handles mix of packs/cartons)
if "packs_per_item" in df.columns:
    w = (
        df.groupby(group_cols, observed=True)
          .apply(lambda g: np.average(g["packs_per_item"], weights=g["pack_sales"])
                 if g["pack_sales"].sum() > 0 else np.nan)
          .reset_index(name="packs_per_item_wavg")
    )
    prod_market_q = prod_market_q.merge(w, on=group_cols, how="left")

# price per pack
prod_market_q["avg_pack_price"] = prod_market_q["total_rev"] / prod_market_q["total_packs"]
prod_market_q.loc[~np.isfinite(prod_market_q["avg_pack_price"]), "avg_pack_price"] = np.nan

# ================================
# 8) Market size per quarter = 1.5 × (max store total in that quarter across stores)
# ================================
store_qtr_total = (
    prod_market_q.groupby(["store","quarter_idx"], observed=True)["total_packs"]
                 .sum()
                 .reset_index(name="store_quarter_total_packs")
)
qtr_max_store = (
    store_qtr_total.groupby("quarter_idx", observed=True)["store_quarter_total_packs"]
                   .max()
                   .reset_index(name="max_store_total_in_quarter")
)
qtr_max_store["market_size_quarter"] = 1.5 * qtr_max_store["max_store_total_in_quarter"]

prod_market_q = prod_market_q.merge(
    qtr_max_store[["quarter_idx","market_size_quarter"]],
    on="quarter_idx", how="left", validate="many_to_one"
)

prod_market_q["prod_mkt_share"] = prod_market_q["total_packs"] / prod_market_q["market_size_quarter"]

# ================================
# 9) Diagnostics
# ================================
n_markets = df[["store","quarter_idx"]].drop_duplicates().shape[0]
n_rows = prod_market_q.shape[0]
print(f"Product–market rows: {n_rows:,}")
print(f"Store–quarter markets: {n_markets:,}")
print(f"Avg products per market: {n_rows / max(n_markets,1):.2f}")

counts = prod_market_q.groupby(["store","quarter_idx"]).size()
print(counts.describe())

sum_share_storeq = (
    prod_market_q.groupby(["store","quarter_idx"], observed=True)["prod_mkt_share"]
                 .sum()
)
print("Mean ∑ shares per (store,quarter):", float(sum_share_storeq.mean()))
print("Max  ∑ shares per (store,quarter):",  float(sum_share_storeq.max()))

# Final DataFrame: prod_market_q
prod_market_q

  .apply(lambda g: np.average(g["packs_per_item"], weights=g["pack_sales"])


Product–market rows: 34,565
Store–quarter markets: 2,556
Avg products per market: 13.52
count    2556.000000
mean       13.523083
std        21.670885
min         1.000000
25%         1.000000
50%         1.000000
75%        16.000000
max        83.000000
dtype: float64
Mean ∑ shares per (store,quarter): 0.27195963274752066
Max  ∑ shares per (store,quarter): 0.6666666666666667


Unnamed: 0,store,quarter_idx,year52,qtr13,prod_key,prod_type,prod_id,total_packs,total_rev,menthol,...,implied discount,brand,size,pack,brand_clean,upc_norm,packs_per_item_wavg,avg_pack_price,market_size_quarter,prod_mkt_share
0,2,1,1,1,cigarette|generic,cigarette,generic,15875.0,30036.42,0,...,0.0,,1 CT,UNK,,193,2.672441,1.892058,83493.0,0.190136
1,2,2,1,2,cigarette|generic,cigarette,generic,16143.0,31889.01,0,...,0.0,,1 CT,UNK,,193,2.616800,1.975408,66259.5,0.243633
2,2,3,1,3,cigarette|generic,cigarette,generic,12707.0,25139.76,0,...,0.0,,1 CT,UNK,,193,2.494452,1.978418,66882.0,0.189991
3,2,4,1,4,cigarette|generic,cigarette,generic,13441.0,26578.98,0,...,0.0,,1 CT,UNK,,193,2.359274,1.977456,69153.0,0.194366
4,2,5,2,1,cigarette|generic,cigarette,generic,8809.0,17573.58,0,...,0.0,,1 CT,UNK,,193,1.572142,1.994957,64165.5,0.137286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34560,146,31,8,3,cigarette|2820011110,cigarette,2820011110,20.0,43.98,0,...,0.0,Benson & Hedges,10 PK,UNK,benson & hedges,2820011110,10.000000,2.199000,13425.0,0.001490
34561,146,31,8,3,cigarette|2820011600,cigarette,2820011600,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011600,10.000000,2.199000,13425.0,0.000745
34562,146,31,8,3,cigarette|2820011620,cigarette,2820011620,30.0,65.97,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011620,10.000000,2.199000,13425.0,0.002235
34563,146,31,8,3,cigarette|2820012100,cigarette,2820012100,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820012100,10.000000,2.199000,13425.0,0.000745


In [179]:
counts = (
    prod_market_q.groupby(["store", "quarter_idx"], observed=True)["prod_key"]
                 .transform("count")
)
prod_market_q = prod_market_q.loc[counts > 1].copy()

# Final DataFrame: prod_market_q
prod_market_q

Unnamed: 0,store,quarter_idx,year52,qtr13,prod_key,prod_type,prod_id,total_packs,total_rev,menthol,...,implied discount,brand,size,pack,brand_clean,upc_norm,packs_per_item_wavg,avg_pack_price,market_size_quarter,prod_mkt_share
14,2,15,4,3,cigarette|1230011039,cigarette,1230011039,9.0,20.61,0,...,0.0,Winston,1 CT,UNK,winston,1230011039,1.000000,2.290000,22800.0,0.000395
15,2,15,4,3,cigarette|1230011339,cigarette,1230011339,1.0,2.29,0,...,0.0,Winston,1 CT,UNK,winston,1230011339,1.000000,2.290000,22800.0,0.000044
16,2,15,4,3,cigarette|1230011436,cigarette,1230011436,30.0,54.42,0,...,0.0,Winston,10 CT,UNK,winston,1230011436,10.000000,1.814000,22800.0,0.001316
17,2,15,4,3,cigarette|generic,cigarette,generic,3535.0,9007.57,0,...,0.0,,1 CT,UNK,,193,3.062235,2.548110,22800.0,0.155044
18,2,16,4,4,cigarette|1230011039,cigarette,1230011039,60.0,138.10,0,...,0.0,Winston,1 CT,UNK,winston,1230011039,1.000000,2.301667,21768.0,0.002756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34560,146,31,8,3,cigarette|2820011110,cigarette,2820011110,20.0,43.98,0,...,0.0,Benson & Hedges,10 PK,UNK,benson & hedges,2820011110,10.000000,2.199000,13425.0,0.001490
34561,146,31,8,3,cigarette|2820011600,cigarette,2820011600,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011600,10.000000,2.199000,13425.0,0.000745
34562,146,31,8,3,cigarette|2820011620,cigarette,2820011620,30.0,65.97,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820011620,10.000000,2.199000,13425.0,0.002235
34563,146,31,8,3,cigarette|2820012100,cigarette,2820012100,10.0,21.99,0,...,0.0,Virginia Slims,10 PK,UNK,virginia slims,2820012100,10.000000,2.199000,13425.0,0.000745


In [187]:
n_markets = df[["store","quarter_idx"]].drop_duplicates().shape[0]
n_rows = prod_market_q.shape[0]
print(f"Product–market rows: {n_rows:,}")
print(f"Store–quarter markets: {n_markets:,}")
print(f"Avg products per market: {n_rows / max(n_markets,1):.2f}")

counts = prod_market_q.groupby(["store","quarter_idx"]).size()
print(counts.describe())

Product–market rows: 32,987
Store–quarter markets: 978
Avg products per market: 33.73
count    978.000000
mean      33.729039
std       23.793835
min        2.000000
25%        8.000000
50%       37.000000
75%       54.750000
max       83.000000
dtype: float64


In [183]:
import numpy as np, pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# =========================
# 0) Start & basic fields
# =========================
df = prod_market_q.copy()
df = df.rename(columns={"avg_pack_price": "price"})
mkt = ["store", "quarter_idx"]

# 2-way nest: cigarettes vs non-cigarettes
df["nest"] = np.where(df["prod_type"].str.lower().eq("cigarette"), "cig", "noncig")

# =========================
# 1) Shares for nested logit
# =========================
sum_inside = df.groupby(mkt, observed=True)["prod_mkt_share"].transform("sum")
df = df[(df["prod_mkt_share"] > 0) & (sum_inside < 1)].copy()

df["s0"] = np.clip(1.0 - sum_inside, 1e-12, 1 - 1e-12)
df["sg"] = df.groupby(mkt + ["nest"], observed=True)["prod_mkt_share"].transform("sum")
df = df[df["sg"] > 0].copy()

df["ln_s"]        = np.log(df["prod_mkt_share"]) - np.log(df["s0"])
df["ln_s_within"] = np.log(df["prod_mkt_share"]) - np.log(df["sg"])   # ln(s_{j|g})

# controls (use only those that exist)
controls = [c for c in ["dlx","supslim","slim","value","premium","flavored",
                        "tar_mean","nic_mean","co_mean","carton"] if c in df.columns]
use_cols = ["price", "ln_s_within"] + controls

# =========================
# 2) Absorb MARKET FE: within transform
# =========================
def demean_within(df_in, cols, key_cols):
    means = df_in.groupby(key_cols, observed=True)[cols].transform("mean")
    return df_in[cols] - means

X_tilde = demean_within(df, use_cols, mkt)
y_tilde = (df["ln_s"] - df.groupby(mkt, observed=True)["ln_s"].transform("mean")).rename("ln_s")

# drop rows with NA post-demean
keep_idx = X_tilde.join(y_tilde).dropna().index
X_tilde, y_tilde, df2 = X_tilde.loc[keep_idx], y_tilde.loc[keep_idx], df.loc[keep_idx].copy()

# drop any columns with ~zero variance after demeaning
nzv = [c for c in X_tilde.columns if np.nanstd(X_tilde[c].to_numpy()) > 0]
X_tilde = X_tilde[nzv]
use_cols = [c for c in use_cols if c in nzv]  # keep in sync

clusters = pd.to_numeric(df2["store"], errors="coerce").astype(int).to_numpy()

# =========================
# 3) Instruments that survive coverage gaps
# =========================
# (a) UPC×quarter, leave-one-store mean price (z1)
g = df2.groupby(["prod_id", "quarter_idx"])
cnt_uq = g["price"].transform("count")
sum_uq = g["price"].transform("sum")
z1_raw = np.where(cnt_uq > 1, (sum_uq - df2["price"]) / (cnt_uq - 1), np.nan)
Z1 = demean_within(pd.concat([df2[mkt], pd.Series(z1_raw, index=df2.index, name="z1")], axis=1),
                   ["z1"], mkt)["z1"]

# (b) UPC, leave-one-STORE mean price over all quarters (z2)
gu  = df2.groupby(["prod_id"])
cnt_u, sum_u = gu["price"].transform("count"), gu["price"].transform("sum")
gus = df2.groupby(["prod_id","store"])
cnt_us, sum_us = gus["price"].transform("count"), gus["price"].transform("sum")
z2_raw = np.where((cnt_u - cnt_us) > 0, (sum_u - sum_us) / (cnt_u - cnt_us), np.nan)
Z2 = demean_within(pd.concat([df2[mkt], pd.Series(z2_raw, index=df2.index, name="z2")], axis=1),
                   ["z2"], mkt)["z2"]

# (c) brand×quarter (branded cigs) else prod_type×quarter (z3)
# safe brand series (no 'or' on Series)
if "brand_clean" in df2.columns:
    brand_series = df2["brand_clean"]
elif "brand" in df2.columns:
    brand_series = df2["brand"]
else:
    brand_series = pd.Series("", index=df2.index)

brand = brand_series.astype("string").str.strip().str.lower().fillna("")
is_branded_cig = df2["prod_type"].str.lower().eq("cigarette") & brand.ne("")

# brand×quarter leave-one-store mean
gbq = df2.groupby(["brand_clean","quarter_idx"])
cnt_bq, sum_bq = gbq["price"].transform("count"), gbq["price"].transform("sum")
gbqs = df2.groupby(["brand_clean","quarter_idx","store"])
cnt_bqs, sum_bqs = gbqs["price"].transform("count"), gbqs["price"].transform("sum")
z3b_raw = np.where(is_branded_cig & ((cnt_bq - cnt_bqs) > 0),
                   (sum_bq - sum_bqs) / (cnt_bq - cnt_bqs), np.nan)

# prod_type×quarter leave-one-store mean
gtq = df2.groupby(["prod_type","quarter_idx"])
cnt_tq, sum_tq = gtq["price"].transform("count"), gtq["price"].transform("sum")
gtqs = df2.groupby(["prod_type","quarter_idx","store"])
cnt_tqs, sum_tqs = gtqs["price"].transform("count"), gtqs["price"].transform("sum")
z3t_raw = np.where((~is_branded_cig) & ((cnt_tq - cnt_tqs) > 0),
                   (sum_tq - sum_tqs) / (cnt_tq - cnt_tqs), np.nan)

z3_raw = np.where(is_branded_cig, z3b_raw, z3t_raw)
Z3 = demean_within(pd.concat([df2[mkt], pd.Series(z3_raw, index=df2.index, name="z3")], axis=1),
                   ["z3"], mkt)["z3"]

# combine and keep instruments that exist and vary
Z = pd.concat([Z1, Z2, Z3], axis=1)
goodZ = [c for c in Z.columns if Z[c].notna().any() and Z[c].std(skipna=True) > 0]
Z = Z[goodZ]

# align for IV (drop rows with NA in y/X/Z)
data_iv = pd.concat([y_tilde, X_tilde, Z], axis=1).dropna()
y_iv    = data_iv["ln_s"]
X_iv    = data_iv[use_cols]                # includes price & ln_s_within + controls (demeaned)
Z_iv    = data_iv[goodZ]
clusters_iv = pd.to_numeric(df2.loc[data_iv.index, "store"], errors="coerce").astype(int).to_numpy()

# =========================
# 4) OLS (market FE absorbed) — reference
# =========================
ols = sm.OLS(y_iv, sm.add_constant(X_iv)).fit(cov_type="cluster",
                                              cov_kwds={"groups": clusters_iv})
print("\n[OLS Nested Logit | Market FE absorbed]")
print(ols.summary().tables[1])

# =========================
# 5) IV 2SLS (market FE absorbed) — NO constant (demeaned data)
# =========================
# guard against collinearity: exog must have full column rank
exog = X_iv.drop(columns=["price"])
# drop any zero-variance exog columns (rare but safe)
exog = exog.loc[:, exog.apply(lambda s: np.nanstd(s.to_numpy()) > 0)]

iv = IV2SLS(
    dependent=y_iv,
    exog=exog,               # no constant: within transform removed it
    endog=X_iv[["price"]],
    instruments=Z_iv
).fit(cov_type="clustered", clusters=clusters_iv)
print("\n[IV Nested Logit | Market FE absorbed | multi-source instruments]")
print(iv.summary)

# First-stage for 'price' (if available)
try:
    print("\n[First Stage for price]")
    print(iv.first_stage["price"].summary)
except Exception:
    pass

# quick elasticity
sigma = iv.params.get("ln_s_within", np.nan)
alpha = iv.params.get("price", np.nan)
sbar   = df2.loc[data_iv.index, "prod_mkt_share"].mean()
sjgbar = (df2.loc[data_iv.index, "prod_mkt_share"] / df2.loc[data_iv.index, "sg"]).mean()
pbar   = df2.loc[data_iv.index, "price"].mean()
eps    = -alpha * pbar * (1 - sigma*(1 - sjgbar) - sbar)
print(f"\nσ (nesting): {sigma:.3f} | implied avg own-price elasticity ≈ {eps:.2f}")



[OLS Nested Logit | Market FE absorbed]
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0246      0.005      4.915      0.000       0.015       0.034
price           0.0129      0.003      3.943      0.000       0.007       0.019
ln_s_within     0.9982      0.000   2544.174      0.000       0.997       0.999
slim            0.0594      0.013      4.719      0.000       0.035       0.084
value           0.0036      0.003      1.204      0.229      -0.002       0.010
premium         0.0009      0.002      0.561      0.575      -0.002       0.004
flavored       -0.0046      0.002     -2.253      0.024      -0.009      -0.001
tar_mean        0.0181      0.004      4.624      0.000       0.010       0.026
nic_mean       -0.3636      0.078     -4.669      0.000      -0.516      -0.211
co_mean        -0.0117      0.003     -4.380      0.000      -0.017      -0.006