# US Innovation: Patent Event Studies and Panel Regressions

This notebook reproduces the analyses from the Stata scripts:

1. `event_study_all_patents.do`
2. `panel_regressions_6675.do`
3. `panel_regresssions_HHI_final .do`

It assumes the corresponding `.dta` files have been copied under the `../data/intermediate` directory tree (the exact paths are listed in the `RAW_DATA` dictionary in the Setup section). Sections below follow the original workflow order and document each transformation so the analysis is portfolio-ready.

## Contents
- [Setup](#Setup)
- [1. Event Study](#1-Event-Study)
- [2. Panel Regressions (6675)](#2-Panel-Regressions-6675)
- [3. Panel Regressions with HHI Splits](#3-Panel-Regressions-with-HHI-Splits)
- [Appendix](#Appendix)

## Setup

In [None]:
import warnings

# Silence all warnings for the rest of the session so cell output stays clean.
# NOTE(review): this blanket filter also hides any warnings emitted by the
# estimation libraries used below — consider narrowing it while debugging.
warnings.filterwarnings("ignore")

# IPython magic (not plain Python): ask the inline backend for "retina" figures.
%config InlineBackend.figure_format = "retina"


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

try:
    from linearmodels.iv import IV2SLS
except ImportError as exc:
    raise ImportError(
        "Install `linearmodels` via `pip install linearmodels` to run the IV specifications."
    ) from exc

# Plot styling shared by every figure in the notebook.
sns.set_theme(style="whitegrid", context="talk")

# Widen pandas' display limits and show floats with four decimals.
for _option, _value in (
    ("display.max_rows", 200),
    ("display.max_columns", 200),
    ("display.float_format", "{:,.4f}".format),
):
    pd.set_option(_option, _value)


In [None]:
# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
DATA_DIR = PROJECT_ROOT / "data"
FIGURE_DIR = PROJECT_ROOT / "figures"
TABLE_DIR = PROJECT_ROOT / "tables"

# Output folders are created on demand; existing folders are left untouched.
FIGURE_DIR.mkdir(parents=True, exist_ok=True)
TABLE_DIR.mkdir(parents=True, exist_ok=True)

# Input files, keyed by the short name used throughout the notebook.
_intermediate_dir = DATA_DIR / "intermediate"
RAW_DATA = {
    "panel": _intermediate_dir / "panel" / "built.dta",
    "defcon": _intermediate_dir / "defcon_patent_merge_county_year_final.dta",
    "panel5": _intermediate_dir / "panel" / "built5.dta",
    "hhi": _intermediate_dir / "hhi_aggregate_all1_pre80.dta",
}

# Report (but do not fail on) any input file that is not where we expect it.
missing = {}
for name, path in RAW_DATA.items():
    if not path.exists():
        missing[name] = path

if not missing:
    print("All expected source files found.")
else:
    print("⚠️ Update `RAW_DATA` if your directory structure differs. Missing files detected:")
    for name, path in missing.items():
        print(f" - {name}: {path}")


### Helper utilities

In [None]:
def build_fe_terms(fe_vars):
    """Wrap each fixed-effect column name in a categorical ``C(...)`` term.

    Args:
        fe_vars: Iterable of column names to treat as fixed effects.

    Returns:
        A list of formula terms, one per name, in the order given.
    """
    terms = []
    for name in fe_vars:
        terms.append(f"C({name})")
    return terms


def build_ols_formula(outcome, key_var, controls=None, fe=("county_fips", "fyear")):
    """Assemble an OLS formula string with controls and fixed-effect dummies.

    Args:
        outcome: Name of the dependent variable.
        key_var: Name of the regressor of interest (listed first on the RHS).
        controls: Optional list of additional regressor names.
        fe: Column names to include as ``C(...)`` fixed effects.

    Returns:
        A string of the form ``"outcome ~ key_var + controls... + C(fe)..."``.
    """
    if not controls:
        controls = []
    regressors = [key_var] + controls + build_fe_terms(fe)
    rhs = " + ".join(regressors)
    return f"{outcome} ~ {rhs}"


def build_iv_formula(
    outcome,
    endog,
    instruments,
    controls=None,
    fe=("county_fips", "fyear"),
):
    """Assemble a two-stage least squares formula in bracket notation.

    The result has the shape
    ``"y ~ 1 + controls... + C(fe)... + [endog... ~ instruments...]"``,
    i.e. exogenous regressors first and the endogenous block in brackets.

    Args:
        outcome: Name of the dependent variable.
        endog: One endogenous regressor name, or a list/tuple of names.
        instruments: One excluded instrument name, or a list/tuple of names.
        controls: Optional sequence of exogenous control names.
        fe: Column names to include as ``C(...)`` fixed effects.

    Returns:
        The formula string.
    """
    # Accept any sequence of controls (or None) without mutating the caller's.
    control_terms = list(controls or [])
    endog_terms = list(endog) if isinstance(endog, (list, tuple)) else [endog]
    instrument_terms = (
        list(instruments) if isinstance(instruments, (list, tuple)) else [instruments]
    )

    # "1" makes the intercept explicit in the exogenous block.
    exog_expr = " + ".join(["1", *control_terms, *build_fe_terms(fe)])
    endog_expr = " + ".join(endog_terms)
    instr_expr = " + ".join(instrument_terms)
    return f"{outcome} ~ {exog_expr} + [{endog_expr} ~ {instr_expr}]"


def tidy_result(result, term, model_label):
    """Extract one coefficient from a fitted model as a labelled row.

    Standard errors are read from ``result.bse`` when present, otherwise from
    ``result.std_errors``. The interval is ``coef ± 1.96 * std_err``.

    Args:
        result: Fitted result exposing ``params`` and a standard-error attribute.
        term: Name of the coefficient to extract.
        model_label: Label stored in the ``model`` field of the row.

    Returns:
        A ``pd.Series`` with ``model``, ``term``, ``coef``, ``std_err``,
        ``ci_low``, ``ci_high``, ``p_value`` and ``nobs``. ``p_value`` and
        ``nobs`` are NaN when the result does not expose them.

    Raises:
        ValueError: If ``result`` has no ``params`` attribute.
        KeyError: If ``term`` is not among the estimated parameters.
        AttributeError: If no standard-error attribute is found.
    """
    if not hasattr(result, "params"):
        raise ValueError("Result object does not expose parameters.")

    estimates = result.params
    if term not in estimates.index:
        raise KeyError(
            f"Term `{term}` not found in model {model_label}. Available terms: {list(estimates.index)}"
        )

    # Try each known standard-error attribute in order of preference.
    for se_attr in ("bse", "std_errors"):
        if hasattr(result, se_attr):
            std_err = getattr(result, se_attr)[term]
            break
    else:
        raise AttributeError("Cannot locate standard errors on result object.")

    p_value = result.pvalues[term] if hasattr(result, "pvalues") else np.nan

    point = estimates[term]
    half_width = 1.96 * std_err
    return pd.Series(
        dict(
            model=model_label,
            term=term,
            coef=point,
            std_err=std_err,
            ci_low=point - half_width,
            ci_high=point + half_width,
            p_value=p_value,
            nobs=getattr(result, "nobs", np.nan),
        )
    )


def summarize_models(rows):
    """Stack per-model rows into a table indexed by model label.

    Args:
        rows: Iterable of row-like records (e.g. the Series returned by
            ``tidy_result``) that include a ``model`` field.

    Returns:
        A DataFrame indexed by ``model`` with the coefficient, standard error,
        confidence bounds, p-value and observation count columns.
    """
    reported = ["coef", "std_err", "ci_low", "ci_high", "p_value", "nobs"]
    stacked = pd.DataFrame(rows)
    indexed = stacked.set_index("model")
    return indexed[reported]


## 1. Event Study

This section rebuilds the treatment definition from the spending surge, applies the semiconductor-intensive sample restriction, and replicates the difference-in-differences regressions together with the trend and event-time visuals.

In [None]:
# Build the event-study sample: county-year panel joined to the defence
# contract/patent file. The inner join keeps only county-years present in both.
event_df = (
    pd.read_stata(RAW_DATA["panel"])
    .merge(pd.read_stata(RAW_DATA["defcon"]), on=["county_fips", "fyear"], how="inner")
)

# Keep 1965-2003 (both endpoints included).
event_df = event_df.loc[event_df["fyear"].between(1965, 2003)].copy()
# Spread each county's maximum `semi_intens` to all of its rows, then keep
# counties whose maximum equals 1 (presumably a 0/1 flag — TODO confirm).
event_df["semi_intens"] = event_df.groupby("county_fips")["semi_intens"].transform("max")
event_df = event_df.loc[event_df["semi_intens"] == 1].copy()
# Consecutive integer id per county.
event_df["county_id"] = event_df.groupby("county_fips").ngroup()

# County-level average spending in the two comparison windows.
# NOTE(review): 1981 falls in both windows because `between` is inclusive —
# confirm this matches the Stata definition of the surge.
mean_spend1 = (
    event_df.loc[event_df["fyear"].between(1976, 1981)]
    .groupby("county_fips")["total_dollars"]
    .mean()
)
mean_spend2 = (
    event_df.loc[event_df["fyear"].between(1981, 1989)]
    .groupby("county_fips")["total_dollars"]
    .mean()
)

# Map the county averages back onto every row; counties with no rows in a
# window get NaN, and so does their surge.
event_df["mean_spend1"] = event_df["county_fips"].map(mean_spend1)
event_df["mean_spend2"] = event_df["county_fips"].map(mean_spend2)
event_df["surge"] = event_df["mean_spend2"] - event_df["mean_spend1"]

# One surge value per county (counties with a missing surge are excluded
# from the threshold statistics).
surge_stats = (
    event_df.drop_duplicates("county_fips")[["county_fips", "surge"]]
    .dropna()
    .set_index("county_fips")
)

# Candidate cutoffs; only the median is used to define treatment below.
surge_thresholds = surge_stats["surge"].agg(["mean", "std", "median"])
surge_thresholds["mean_plus_sd"] = surge_thresholds["mean"] + surge_thresholds["std"]
surge_thresholds["mean_plus_half_sd"] = surge_thresholds["mean"] + surge_thresholds["std"] / 2

surge_cutoff = surge_thresholds["median"]
# Treated = surge strictly above the cross-county median.
# NOTE(review): a NaN surge compares as False here, so such counties become
# controls; Stata's `>` treats missing as larger than any number — verify
# against the .do file.
event_df["treated"] = (event_df["surge"] > surge_cutoff).astype(int)
# Post period starts in 1982 (strictly after 1981).
event_df["after"] = (event_df["fyear"] > 1981).astype(int)
event_df["treatment"] = (event_df["treated"] * event_df["after"]).astype(int)

surge_thresholds


In [None]:
def _county_clustered(frame):
    """Fit keywords for county-clustered standard errors on ``frame``."""
    return {"cov_type": "cluster", "cov_kwds": {"groups": frame["county_fips"]}}


# Two-way fixed-effects DiD for raw patent counts and citation-weighted patents.
did_formula_patents = build_ols_formula("num_patents", "treatment")
did_formula_cites = build_ols_formula("w_cites_sub", "treatment")

did_patents = smf.ols(did_formula_patents, data=event_df).fit(**_county_clustered(event_df))
did_cites = smf.ols(did_formula_cites, data=event_df).fit(**_county_clustered(event_df))

# Collect the treatment coefficient from each model into one table.
did_results = summarize_models(
    [
        tidy_result(fitted, "treatment", label)
        for fitted, label in (
            (did_patents, "Patents (DiD)"),
            (did_cites, "Citation-weighted (DiD)"),
        )
    ]
)

did_results


In [None]:
# Yearly mean of the citation-weighted outcome, separately for each group.
_group_labels = {0: "Control", 1: "Treated"}
trend = event_df.groupby(["fyear", "treated"], as_index=False)["w_cites_sub"].mean()
trend = trend.rename(columns={"treated": "treated_group"})
trend["treated_group"] = trend["treated_group"].map(_group_labels)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=trend, x="fyear", y="w_cites_sub", hue="treated_group", ax=ax)
# Dashed vertical line marks 1981.
ax.axvline(1981, color="black", linestyle="--", linewidth=1)
ax.set_title("Average Citation-weighted Patents by Treatment Group")
ax.set_xlabel("Year")
ax.set_ylabel("Average w_cites_sub")
ax.legend(title="Group")
fig.tight_layout()

fig_path = FIGURE_DIR / "event_study_trend.png"
fig.savefig(fig_path, dpi=300)
fig_path


In [None]:
BASE_YEAR = 1981
WINDOW = range(-10, 11)


def _event_col(k):
    """Return a formula-safe indicator name for event time ``k``.

    A raw minus sign in a column name (``lead_lag_-3``) is read by the formula
    parser as the term-removal operator, so negative event times are spelled
    with an ``m`` prefix and positive ones with ``p``.
    """
    if k < 0:
        return f"lead_lag_m{-k}"
    if k > 0:
        return f"lead_lag_p{k}"
    return "lead_lag_0"


# Treated-by-event-time indicators for every year in the window.
event_df["event_time"] = event_df["fyear"] - BASE_YEAR
_is_treated = event_df["treated"] == 1
for k in WINDOW:
    event_df[_event_col(k)] = (_is_treated & (event_df["event_time"] == k)).astype(int)

# Event time 0 (1981) is the omitted category.
# NOTE(review): if the sample contains years more than 10 away from 1981,
# treated observations in those years also fall into the omitted category,
# so the coefficients are not purely "relative to 1981". Consider binning
# the endpoints or trimming the sample — confirm against the Stata script.
event_times = [k for k in WINDOW if k != 0]
event_terms = [_event_col(k) for k in event_times]
event_formula = "w_cites_sub ~ " + " + ".join(event_terms + ["C(county_fips)", "C(fyear)"])
event_model = smf.ols(event_formula, data=event_df).fit(
    cov_type="cluster", cov_kwds={"groups": event_df["county_fips"]}
)

# Coefficients and 95% intervals, one row per event time.
event_summary = pd.DataFrame(
    {
        "event_time": event_times,
        "coef": event_model.params[event_terms].to_numpy(),
        "std_err": event_model.bse[event_terms].to_numpy(),
    }
)
event_summary["ci_low"] = event_summary["coef"] - 1.96 * event_summary["std_err"]
event_summary["ci_high"] = event_summary["coef"] + 1.96 * event_summary["std_err"]

fig, ax = plt.subplots(figsize=(12, 6))
ax.axvline(-0.5, color="black", linestyle="--", linewidth=1)
ax.axhline(0, color="gray", linestyle=":", linewidth=1)
ax.fill_between(
    event_summary["event_time"],
    event_summary["ci_low"],
    event_summary["ci_high"],
    alpha=0.3,
    color="#1f77b4",
)
ax.plot(event_summary["event_time"], event_summary["coef"], marker="o", color="#1f77b4")
ax.set(
    title="Event-time Effects on Citation-weighted Patents",
    xlabel="Years relative to 1981",
    ylabel="Coefficient (vs. 1981 baseline)",
)
fig.tight_layout()
event_fig_path = FIGURE_DIR / "event_time_coefficients.png"
fig.savefig(event_fig_path, dpi=300)
event_summary.head()


## 2. Panel Regressions (6675)

We recreate the two-stage empirical strategy from the baseline Stata script: first diagnosing the instrument, then reporting the OLS and IV specifications with progressively richer fixed effects and controls.

In [None]:
# Load the baseline panel and build a state-by-year identifier such as
# "<state>_<year>" for the state-year fixed effects used below.
panel5 = pd.read_stata(RAW_DATA["panel5"]).copy()
_state_label = panel5["state"].astype(str)
_year_label = panel5["fyear"].astype(int).astype(str)
panel5["state_yr"] = _state_label + "_" + _year_label
panel5.head()


### First-stage diagnostics

In [None]:
# First-stage sample: semiconductor-intensive counties after 1975 with
# strictly positive log spending and log instrument.
first_stage = panel5.query(
    "semi_intens == 1 and fyear > 1975 and ltotal_dollars > 0 and lspending_6675_iv > 0"
).copy()

# Centre the instrument on its sample mean.
_iv_mean = first_stage["lspending_6675_iv"].mean()
first_stage["lspending_6675_iv_centered"] = first_stage["lspending_6675_iv"] - _iv_mean

# Partial out year effects from both the spending variable and the instrument.
resid_lt_model = smf.ols("ltotal_dollars ~ C(fyear)", data=first_stage).fit()
resid_iv_model = smf.ols("lspending_6675_iv_centered ~ C(fyear)", data=first_stage).fit()
first_stage["resid_lt"] = resid_lt_model.resid
first_stage["resid_iv"] = resid_iv_model.resid

_preview_cols = ["ltotal_dollars", "lspending_6675_iv_centered", "resid_lt", "resid_iv"]
first_stage[_preview_cols].head()


In [None]:
# One panel per view of the first stage: year-residualized on the left,
# raw logs on the right. Each panel is a scatter with a fitted line on top.
_panels = (
    {
        "x": "resid_iv",
        "y": "resid_lt",
        "alpha": 0.5,
        "title": "Residualized Relationship",
        "xlabel": "Residual IV (centered)",
        "ylabel": "Residual log spending",
    },
    {
        "x": "lspending_6675_iv",
        "y": "ltotal_dollars",
        "alpha": 0.3,
        "title": "Raw Relationship",
        "xlabel": "Log IV spending",
        "ylabel": "Log direct spending",
    },
)

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

for _axis, _spec in zip(ax, _panels):
    sns.scatterplot(
        data=first_stage, x=_spec["x"], y=_spec["y"], alpha=_spec["alpha"], ax=_axis
    )
    sns.regplot(
        data=first_stage,
        x=_spec["x"],
        y=_spec["y"],
        scatter=False,
        line_kws={"color": "black"},
        ax=_axis,
    )
    _axis.set(title=_spec["title"], xlabel=_spec["xlabel"], ylabel=_spec["ylabel"])

fig.tight_layout()
fig_path = FIGURE_DIR / "first_stage_checks.png"
fig.savefig(fig_path, dpi=300)
fig_path


### Intensive-margin regressions

In [None]:
# Estimation samples: post-1975 observations, with and without the
# semiconductor-intensity restriction.
semi_sample = panel5.query("semi_intens == 1 and fyear > 1975").copy()
all_sample = panel5.query("fyear > 1975").copy()

controls = ["avg_wages", "pop", "emp"]

# Excluded instrument for log spending. This is the column the first-stage
# diagnostics and the all-counties specifications use on the same panel.
# NOTE(review): the previous value here was "log_spending_6675_iv"; confirm
# against the columns of built5.dta that "lspending_6675_iv" is the intended one.
semi_instrument = "lspending_6675_iv"

semi_rows = []

# (1) OLS with county and year fixed effects only.
model = smf.ols(build_ols_formula("lw_cites_sub", "ltotal_dollars"), data=semi_sample).fit(
    cov_type="cluster", cov_kwds={"groups": semi_sample["county_fips"]}
)
semi_rows.append(tidy_result(model, "ltotal_dollars", "Semi counties – OLS (simple)"))

# (2) IV counterpart of (1).
iv_formula = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    semi_instrument,
    fe=("county_fips", "fyear"),
)
iv_model = IV2SLS.from_formula(iv_formula, data=semi_sample).fit(
    cov_type="clustered", clusters=semi_sample["county_fips"]
)
semi_rows.append(tidy_result(iv_model, "ltotal_dollars", "Semi counties – IV (simple)"))

# (3) OLS adding the county-level controls.
model_full = smf.ols(
    build_ols_formula("lw_cites_sub", "ltotal_dollars", controls),
    data=semi_sample,
).fit(cov_type="cluster", cov_kwds={"groups": semi_sample["county_fips"]})
semi_rows.append(tidy_result(model_full, "ltotal_dollars", "Semi counties – OLS (controls)"))

# (4) IV counterpart of (3).
iv_formula_full = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    semi_instrument,
    controls=controls,
)
iv_model_full = IV2SLS.from_formula(iv_formula_full, data=semi_sample).fit(
    cov_type="clustered", clusters=semi_sample["county_fips"]
)
semi_rows.append(tidy_result(iv_model_full, "ltotal_dollars", "Semi counties – IV (controls)"))

# (5) OLS adding state-by-year fixed effects.
# NOTE(review): state-year dummies span the year dummies, so the design is
# rank-deficient as written; confirm the IV variant below runs and that the
# estimator's handling of collinear columns matches Stata's.
model_state = smf.ols(
    build_ols_formula("lw_cites_sub", "ltotal_dollars", controls, fe=("county_fips", "fyear", "state_yr")),
    data=semi_sample,
).fit(cov_type="cluster", cov_kwds={"groups": semi_sample["county_fips"]})
semi_rows.append(tidy_result(model_state, "ltotal_dollars", "Semi counties – OLS (state-year FE)"))

# (6) IV counterpart of (5).
iv_formula_state = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    semi_instrument,
    controls=controls,
    fe=("county_fips", "fyear", "state_yr"),
)
iv_model_state = IV2SLS.from_formula(iv_formula_state, data=semi_sample).fit(
    cov_type="clustered", clusters=semi_sample["county_fips"]
)
semi_rows.append(tidy_result(iv_model_state, "ltotal_dollars", "Semi counties – IV (state-year FE)"))

semi_table = summarize_models(semi_rows)
semi_table


In [None]:
# Same six specifications as the semi-county table, estimated on all
# post-1975 counties (`all_sample`). Standard errors are clustered by county.
# Note: `model`, `model_full` and `model_state` are reassigned here.
all_rows = []

# (1) OLS with county and year fixed effects only.
model = smf.ols(build_ols_formula("lw_cites_sub", "ltotal_dollars"), data=all_sample).fit(
    cov_type="cluster", cov_kwds={"groups": all_sample["county_fips"]}
)
all_rows.append(tidy_result(model, "ltotal_dollars", "All counties – OLS (simple)"))

# (2) IV counterpart of (1), instrumenting with `lspending_6675_iv`.
a_iv_formula = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    "lspending_6675_iv",
)
a_iv_model = IV2SLS.from_formula(a_iv_formula, data=all_sample).fit(
    cov_type="clustered", clusters=all_sample["county_fips"]
)
all_rows.append(tidy_result(a_iv_model, "ltotal_dollars", "All counties – IV (simple)"))

# (3) OLS adding the controls list.
model_full = smf.ols(
    build_ols_formula("lw_cites_sub", "ltotal_dollars", controls),
    data=all_sample,
).fit(cov_type="cluster", cov_kwds={"groups": all_sample["county_fips"]})
all_rows.append(tidy_result(model_full, "ltotal_dollars", "All counties – OLS (controls)"))

# (4) IV counterpart of (3).
a_iv_formula_full = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    "lspending_6675_iv",
    controls=controls,
)
a_iv_model_full = IV2SLS.from_formula(a_iv_formula_full, data=all_sample).fit(
    cov_type="clustered", clusters=all_sample["county_fips"]
)
all_rows.append(tidy_result(a_iv_model_full, "ltotal_dollars", "All counties – IV (controls)"))

# (5) OLS adding state-by-year fixed effects.
# NOTE(review): state-year dummies span the year dummies, so this design is
# rank-deficient as written — confirm both estimators handle it as intended.
model_state = smf.ols(
    build_ols_formula("lw_cites_sub", "ltotal_dollars", controls, fe=("county_fips", "fyear", "state_yr")),
    data=all_sample,
).fit(cov_type="cluster", cov_kwds={"groups": all_sample["county_fips"]})
all_rows.append(tidy_result(model_state, "ltotal_dollars", "All counties – OLS (state-year FE)"))

# (6) IV counterpart of (5).
a_iv_formula_state = build_iv_formula(
    "lw_cites_sub",
    "ltotal_dollars",
    "lspending_6675_iv",
    controls=controls,
    fe=("county_fips", "fyear", "state_yr"),
)
a_iv_model_state = IV2SLS.from_formula(a_iv_formula_state, data=all_sample).fit(
    cov_type="clustered", clusters=all_sample["county_fips"]
)
all_rows.append(tidy_result(a_iv_model_state, "ltotal_dollars", "All counties – IV (state-year FE)"))

# One row per specification, indexed by its label.
all_table = summarize_models(all_rows)
all_table


## 3. Panel Regressions with HHI Splits

The final section merges the HHI aggregates, constructs the quartile-by-instrument interactions, and re-estimates the heterogeneous IV effects alongside balance checks across concentration groups.

In [None]:
# Attach the HHI aggregates (matched on county only) and keep the
# semiconductor-intensive observations.
hhi_data = (
    panel5.merge(pd.read_stata(RAW_DATA["hhi"]), on="county_fips", how="inner")
    .query("semi_intens == 1")
    .sort_values("comp_wtd", kind="stable")
    .reset_index(drop=True)
)

# Quartile of `comp_wtd`, computed as ceil(4 * rank / N).
# Ranking with method="min" gives rows with equal `comp_wtd` the same rank,
# so tied rows always land in the same group. Assigning by row position after
# the sort would split ties at a quartile boundary in arbitrary order.
# Rows with missing `comp_wtd` sort last; they are placed in the top group.
# NOTE(review): group sizes are no longer exactly equal when ties straddle a
# boundary — confirm this matches the grouping used in the Stata script.
rank = hhi_data["comp_wtd"].rank(method="min").fillna(len(hhi_data))
hhi_data["group"] = np.ceil(4 * rank / len(hhi_data)).clip(lower=1, upper=4).astype(int)

# Group dummies and their interactions with log spending (xg*) and with the
# instrument (zg*), one set per quartile.
for i in range(1, 5):
    in_group = (hhi_data["group"] == i).astype(int)
    hhi_data[f"g{i}"] = in_group
    hhi_data[f"xg{i}"] = hhi_data["ltotal_dollars"] * in_group
    hhi_data[f"zg{i}"] = hhi_data["log_spending_iv5"] * in_group

hhi_data.head()


In [None]:
def _with_ci(frame):
    """Add 95% bounds (``coef ± 1.96 * std_err``) to ``frame`` and return it."""
    frame["ci_low"] = frame["coef"] - 1.96 * frame["std_err"]
    frame["ci_high"] = frame["coef"] + 1.96 * frame["std_err"]
    return frame


# IV with one endogenous spending term and one instrument per quartile.
_quartiles = range(1, 5)
hhi_formula = build_iv_formula(
    outcome="lw_cites_sub",
    endog=[f"xg{q}" for q in _quartiles],
    instruments=[f"zg{q}" for q in _quartiles],
    controls=["avg_wages", "pop", "emp"],
    fe=("county_fips", "fyear", "state_yr"),
)
hhi_model = IV2SLS.from_formula(hhi_formula, data=hhi_data).fit(
    cov_type="clustered", clusters=hhi_data["county_fips"]
)

# Quartile-specific coefficients with their standard errors.
coef_terms = [f"xg{q}" for q in _quartiles]
hhi_results = _with_ci(
    pd.DataFrame(
        {
            "group": [f"Q{q}" for q in _quartiles],
            "coef": hhi_model.params.loc[coef_terms].to_numpy(),
            "std_err": hhi_model.std_errors.loc[coef_terms].to_numpy(),
        }
    )
)

# Pooled estimate entered by hand for comparison in the figure.
baseline = _with_ci(
    pd.DataFrame(
        {
            "group": ["Baseline (pooled)"],
            "coef": [0.1122621],
            "std_err": [0.0369543],
        }
    )
)

hhi_plot_data = pd.concat([hhi_results, baseline], ignore_index=True)

# Point estimates with 95% error bars, one tick per group.
positions = np.arange(len(hhi_plot_data))
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(
    positions,
    hhi_plot_data["coef"],
    yerr=1.96 * hhi_plot_data["std_err"],
    fmt="o",
    color="#1f77b4",
    capsize=5,
)
ax.axhline(0, color="gray", linestyle="--", linewidth=1)
ax.set_xticks(positions)
ax.set_xticklabels(hhi_plot_data["group"])
ax.set_title("Treatment Effects by Concentration Quartile")
ax.set_xlabel("Group")
ax.set_ylabel("Estimated treatment effect")
fig.tight_layout()

fig_path = FIGURE_DIR / "hhi_group_effects.png"
fig.savefig(fig_path, dpi=300)
hhi_results


In [None]:
# Balance check: group means of selected covariates plus group sizes.
balance_vars = ["inventor_share", "tech_emp_share", "emp_share", "pop"]

_by_group = hhi_data.groupby("group")
group_means = _by_group[balance_vars].mean().round(4)
group_counts = _by_group.size().rename("n_obs")

balance_table = group_means.join(group_counts)
balance_table


## Appendix

- Figures are saved to `../figures/` and tables can be exported from the DataFrames displayed above.
- Cluster-robust standard errors follow the Stata specifications (clustered at the county level).
- Adjust the `RAW_DATA` dictionary if your folder structure differs or if you add alternative data versions.
