# 11 Non-Compliance and Instruments


In [None]:
from toolz.curried import *

import pandas as pd
import numpy as np
from scipy.special import expit

import statsmodels.formula.api as smf

import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib


from cycler import cycler

# Shared plot styling. Each entry of `color` is a float written as a string,
# which matplotlib reads as a shade of gray.
color = ["0.0", "0.4", "0.8"]
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]

# Install the gray palette as the default color cycle for all axes.
default_cycler = cycler(color=color)
plt.rc("axes", prop_cycle=default_cycler)


## Non-Compliance


In [None]:
from graphviz import Digraph

# Left-to-right causal graph: U points into both T and Y, Z points only
# into T, and T points into Y.
gr = Digraph(format="png", graph_attr={"rankdir": "LR"})

for src, dst in [("U", "T"), ("U", "Y"), ("Z", "T"), ("T", "Y")]:
    gr.edge(src, dst)

gr

In [None]:
import pandas as pd
import numpy as np

# Load the prime-card dataset; later cells read the columns pv, prime_card,
# prime_elegible, tau, categ, credit_score, income and age from it.
df = pd.read_csv("./data/prime_card.csv")

# Preview the first rows.
df.head()

## Extending Potential Outcomes


In [None]:
# Regress `pv` on `prime_elegible` and show the coefficient table.
m = smf.ols("pv~prime_elegible", data=df).fit()
m.summary().tables[1]

In [None]:
# Sample average of the `tau` column.
# NOTE(review): presumably the simulated individual treatment effect — confirm
# against the dataset description.
df["tau"].mean()

In [None]:
# Regress `pv` directly on `prime_card` and show the coefficient table.
m = smf.ols("pv~prime_card", data=df).fit()
m.summary().tables[1]

## Instrument Identification Assumptions


## First Stage


In [None]:
# First stage: regress `prime_card` on `prime_elegible`. The fitted model is
# kept in `first_stage` because later cells reuse its params and fitted values.
first_stage = smf.ols("prime_card ~ prime_elegible", data=df).fit()
first_stage.summary().tables[1]

In [None]:
# Fraction of rows that fall in each `categ` group.
df.groupby("categ").size()/len(df)

## Reduced Form


In [None]:
# Reduced form: regress `pv` on `prime_elegible`. Kept in `red_form` for the
# ratio computed in the next cell.
red_form = smf.ols("pv ~ prime_elegible", data=df).fit()
red_form.summary().tables[1]

In [None]:
# Ratio of the reduced-form coefficient on `prime_elegible` to the
# first-stage coefficient on the same variable.
late = (red_form.params["prime_elegible"] /
        first_stage.params["prime_elegible"])
late

In [None]:
# Mean of `tau` within each `categ` group, for comparison with `late`.
df.groupby("categ")["tau"].mean()

## Two Stage Least Squares


In [None]:
# Same graph layout, but with no edge from T to Y: only U -> T, U -> Y
# and Z -> T are drawn.
gr = Digraph(format="png", graph_attr={"rankdir": "LR"})

for src, dst in [("U", "T"), ("U", "Y"), ("Z", "T")]:
    gr.edge(src, dst)

gr

In [None]:
# Second stage: swap the observed `prime_card` column for the first-stage
# fitted values, then regress `pv` on it.
second_stage_data = df.assign(prime_card=first_stage.fittedvalues)
iv_regr = smf.ols("pv ~ prime_card", data=second_stage_data).fit()

iv_regr.summary().tables[1]

## Standard Error


In [None]:
# Sample size, instrument and treatment; later cells read Z, n and e_iv.
n = len(df)
Z = df["prime_elegible"]
T = df["prime_card"]

# Residuals are taken against predictions made from the observed treatment
# in `df`. That is not iv_regr.resid, because iv_regr was fit on the
# first-stage fitted values.
e_iv = df["pv"] - iv_regr.predict(df)

# Cov(T, Z) / Var(Z)
compliance = np.cov(T, Z)[0, 1] / Z.var()

se = np.std(e_iv) / (compliance * np.std(Z) * np.sqrt(n))
ci_half_width = 2*se

print("SE IV:", se)
print("95% CI:", [late - ci_half_width, late + ci_half_width])

In [None]:
from linearmodels import IV2SLS

# linearmodels formula syntax: the bracketed term reads
# [endogenous ~ instruments]; `1` adds the intercept.
formula = 'pv ~ 1 + [prime_card ~ prime_elegible]'
# cov_type="unadjusted" requests the homoskedastic covariance estimator.
iv_model = IV2SLS.from_formula(formula, df).fit(cov_type="unadjusted")

iv_model.summary.tables[1]

In [None]:
def se_formula_iv(compliance):
    # Same expression as the analytic IV standard error, with the compliance
    # rate left free so it can be evaluated over a grid.
    return np.std(e_iv)/(compliance*np.std(Z)*np.sqrt(n))


effect = iv_regr.params["prime_card"]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: effect +/- 2 SE as compliance runs from 0.01 to 1.
x = np.linspace(0.01, 1, 50)
two_se = se_formula_iv(x)*2
ax1.plot(x, effect-two_se, label="SE($\\beta_{IV}$) x2", ls=":", color="0")
ax1.plot(x, effect+two_se, ls=":", color="0")
ax1.hlines(effect, 0, 1, ls="-.", label="LATE")
ax1.hlines(0, 0, 1)
ax1.set_xlabel("Compliance")
ax1.set_ylim(-(effect+100), (effect+100)*2)
ax1.legend(loc="lower right")
ax1.set_title("95% CI around LATE")

# Right panel: 1/compliance^2 with horizontal reference lines at 10 and 4.
x = np.linspace(0.2, 1, 50)
ax2.plot(x, 1/(x**2))
for level, style in [(10, ":"), (4, "-.")]:
    ax2.hlines(level, 0.2, 1, ls=style, label=str(level))
ax2.set_xlabel("Compliance")
ax2.set_ylabel("$N_{IV}$/N")
ax2.set_title("Required Sample Size Ratio")
ax2.legend()


## Additional Controls and Instruments


In [None]:
# Graph with extra variables: Income points into Y, Age into both T and Y,
# and Score only into T, alongside the U / Z / T / Y edges.
gr = Digraph(format="png", graph_attr={"rankdir": "LR"})

dag_edges = [
    ("U", "T"),
    ("U", "Y"),
    ("Z", "T"),
    ("T", "Y"),
    ("Income", "Y"),
    ("Age", "T"),
    ("Age", "Y"),
    ("Score", "T"),
]
for src, dst in dag_edges:
    gr.edge(src, dst)

gr

In [None]:
# Two instruments for `prime_card`: `prime_elegible` and `credit_score`.
formula = 'pv ~ 1 + [prime_card ~ prime_elegible + credit_score]'

# Use the same homoskedastic ("unadjusted") covariance as the other IV2SLS
# fits in this notebook. The library default is a robust covariance, which
# would make this standard error not comparable with theirs.
iv_model = IV2SLS.from_formula(formula, df).fit(cov_type="unadjusted")

iv_model.summary.tables[1]

In [None]:
# Same two instruments as before, with `income` and `age` added as exogenous
# regressors outside the bracketed [endogenous ~ instruments] term.
formula = '''pv ~ 1 
+ [prime_card ~ prime_elegible + credit_score]
+ income + age'''

# cov_type="unadjusted" requests the homoskedastic covariance estimator.
iv_model = IV2SLS.from_formula(formula, df).fit(cov_type="unadjusted")

iv_model.summary.tables[1]

### 2SLS by Hand


In [None]:
# First stage: the treatment on both instruments plus the controls.
# Note that this rebinds `first_stage`.
formula_1st = "prime_card ~ prime_elegible + credit_score + income+age"
first_stage = smf.ols(formula_1st, data=df).fit()

# Second stage: the outcome on the fitted treatment and the same controls.
fitted_treatment_df = df.assign(prime_card=first_stage.fittedvalues)
iv_model = smf.ols("pv ~ prime_card + income + age",
                   data=fitted_treatment_df).fit()

iv_model.summary().tables[1]

### Matrix Implementation


In [None]:
# Plain NumPy matrices for the matrix form of 2SLS.
# Z: instruments plus controls; X: treatment plus controls; Y: outcome,
# kept 2-D (one column) by the double-bracket selection.
# These rebind the Z defined in the earlier standard-error cell.
Z = df[["prime_elegible", "credit_score", "income", "age"]].values
X = df[["prime_card", "income", "age"]].values
Y = df[["pv"]].values

def add_intercept(x):
    """Return the 2-D array `x` with a column of ones prepended."""
    ones_col = np.ones((x.shape[0], 1))
    return np.hstack([ones_col, x])

# Design matrices with a leading constant column.
X_ = add_intercept(X)
Z_ = add_intercept(Z)

# Fitted values of X_ from a regression on Z_. The small coefficient matrix
# (Z'Z)^-1 Z'X is formed first and Z_ multiplies it last, so the N x N
# projection matrix is never materialized.
ZtZ_inv = np.linalg.inv(Z_.T.dot(Z_))
X_hat = Z_.dot(ZtZ_inv.dot(Z_.T).dot(X_))

# OLS of Y on the fitted values.
b_iv = np.linalg.inv(X_hat.T.dot(X_hat)).dot(X_hat.T).dot(Y)

# Index 1 is the first column of X, right after the intercept.
b_iv[1]

In [None]:
# Residuals use the original regressors X_, not the fitted X_hat.
e_hat_iv = Y - X_.dot(b_iv)

# Residual variance times the diagonal of (X_hat' X_hat)^-1.
resid_variance = e_hat_iv.var()
xtx_inv_diag = np.diag(np.linalg.inv(X_hat.T.dot(X_hat)))
var = resid_variance*xtx_inv_diag

# Standard error for the coefficient at index 1.
np.sqrt(var[1])

In [None]:
# Treatment with the controls partialled out: residuals of `prime_card`
# regressed on `income` and `age`.
t_tilde = smf.ols("prime_card ~ income + age", data=df).fit().resid

# Closed-form standard error. `first_stage` is whichever model was bound to
# that name most recently (the "2SLS by Hand" cell refits it with controls),
# and `n` comes from the earlier standard-error cell.
e_hat_iv.std()/(t_tilde.std()*np.sqrt(n*first_stage.rsquared))

## Discontinuity Design


### Discontinuity Design Assumptions


### Intention to Treat Effect


In [None]:
# Load the discontinuity dataset; later cells read the columns balance, pv,
# prime_card, categ and tau from it.
df_dd = pd.read_csv("./data/prime_card_discontinuity.csv")
df_dd.head()

In [None]:
# Shift `balance` so the 5000 threshold sits at zero, then fit a line on each
# side of it. The I(balance>0) term lets the intercept jump at the threshold
# and the interaction lets the slope differ.
centered_dd = df_dd.assign(balance=lambda d: d["balance"] - 5000)
m = smf.ols("pv~balance*I(balance>0)", centered_dd).fit()
m.summary().tables[1]

In [None]:
# Bin balance to the nearest hundred; keep the mean pv and row count per bin.
plt_df = (
    df_dd
    .round({"balance": -2})
    .assign(size=1)
    .groupby("balance")
    .agg({"pv": "mean", "size": "sum"})
    .reset_index()
)

plt.figure(figsize=(8, 4))
sns.scatterplot(data=plt_df, y="pv", x="balance", size="size", color="C5")

# Draw the fitted line separately on each side of the threshold; only the
# first segment carries a legend label. Predictions use the centered balance
# that `m` was fit on, while the x axis keeps the original scale.
for side, extra_kwargs in [("balance<5000", {"label": "regression"}),
                           ("balance>5000", {})]:
    segment = plt_df.query(side)
    fitted = m.predict(segment.assign(balance=lambda d: d["balance"] - 5000))
    plt.plot(segment["balance"], fitted, color="C0", lw=2, **extra_kwargs)

plt.legend(fontsize=14)


### The IV Estimate

Erratum: The book does not center the running variable at the discontinuity. As a result, the coefficient on the threshold dummy in the book's ITTE regression cannot be interpreted as the ITTE. Here, I'm centering the running variable so that the discontinuity sits at zero.

In [None]:
def rdd_iv(data, y, t, r, cutoff):
    """Ratio of the jump in `y` to the jump in `t` at the cutoff of `r`.

    `data` is a DataFrame; `y`, `t` and `r` are column names and `cutoff`
    is subtracted from column `r` so the threshold sits at zero. Two OLS
    models with a separate line on each side of zero are fit, one for `t`
    and one for `y`, and the ratio of their threshold-dummy coefficients
    (y over t) is returned.
    """
    shifted = data.assign(**{r: data[r] - cutoff})

    # Name under which the fitted models report the threshold dummy.
    jump = f"I({r} > 0)[T.True]"

    treatment_fit = smf.ols(f"{t}~{r}*I({r}>0)", shifted).fit()
    outcome_fit = smf.ols(f"{y}~{r}*I({r}>0)", shifted).fit()

    return outcome_fit.params[jump] / treatment_fit.params[jump]


# IV estimate at the 5000 balance threshold, with `prime_card` as treatment.
rdd_iv(df_dd, y="pv", t="prime_card", r="balance", cutoff=5000)

In [None]:
# Mean `tau` among rows labelled 'complier' whose rounded balance is exactly
# 5000, for comparison with the estimate above.
(df_dd
 .round({"balance":-2}) # round to nearest hundred
 .query("balance==5000 & categ=='complier'")["tau"].mean())

In [None]:
from joblib import Parallel, delayed
from toolz import partial

def bootstrap(data, est_fn, rounds=200, seed=123, pcts=(2.5, 97.5), n_jobs=4):
    """Percentile bootstrap interval for the statistic computed by `est_fn`.

    Parameters
    ----------
    data : DataFrame
        Resampled with replacement at full size on every round.
    est_fn : callable
        Called with one resampled frame; its return value is the statistic.
    rounds : int
        Number of bootstrap resamples.
    seed : int
        Seed for NumPy's global random generator.
    pcts : sequence of float
        Percentiles to report. The default is an immutable tuple rather than
        a list so the default object cannot be mutated between calls.
    n_jobs : int
        Number of joblib workers; defaults to 4.

    Returns
    -------
    numpy.ndarray
        The requested percentiles of the bootstrap statistics.
    """
    # `data.sample` is given no random_state, so it draws from NumPy's global
    # generator; seeding it here makes the resamples reproducible.
    np.random.seed(seed)

    stats = Parallel(n_jobs=n_jobs)(
        delayed(est_fn)(data.sample(frac=1, replace=True))
        for _ in range(rounds)
    )

    return np.percentile(stats, pcts)


# Bootstrap interval for the RDD IV estimate, using the default rounds, seed
# and percentiles. `partial` fixes every argument of rdd_iv except the data.
bootstrap(df_dd,
          partial(rdd_iv, y="pv", t="prime_card", r="balance", cutoff=5000))

### Bunching


In [None]:
# Number of rows per balance bin (rounded to the nearest thousand).
balance_counts = (
    df_dd
    .round({"balance": -3})
    .groupby("balance")
    .size()
)

plt.figure(figsize=(10, 4))
balance_counts.plot.bar()
plt.ylabel("Sample Size.")
plt.xlabel("Balance (rounded to nearest 1000)")

## Key Ideas


In [None]:
# Graph with the edges U -> School, U -> Income, Quarter -> School and
# School -> Income, drawn left to right.
gr = Digraph(format="png", graph_attr={"rankdir": "LR"})

for src, dst in [("U", "School"), ("U", "Income"),
                 ("Quarter", "School"), ("School", "Income")]:
    gr.edge(src, dst)

gr