In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os


In [2]:
RANDOM_SEED = 42

In [3]:
wf = pd.read_csv("data/wf.csv")
city_yb = pd.read_csv("data/city_yb.csv")

In [4]:
wf = pd.read_csv("data/wf.csv")
city_yb = pd.read_csv("data/city_yb.csv")

wf["temp2"] = wf["temp"] ** 2
wf["l_aqi"] = np.log(1 + wf["aqi"])
wf["l_pm"] = np.log(1 + wf["pm"])
wf2020 = wf[(wf["daynum"] >= 8401) & (wf["daynum"]<= 8461)].dropna(
    subset = ['aqi', 'pm']
)
wf2020['cities'] = wf2020['city_code'].astype('category')
wf2020['days'] = wf2020['daynum'].astype('category')
wf2020 = pd.get_dummies(wf2020, drop_first=True)

fixed = ['treat']
for col in wf2020.columns:
    if 'cities' in col or 'days' in col:
        fixed.append(col)
        
weather = ['prec', 'snow', 'temp', 'temp2']
out = ["aqi", "l_aqi", "pm", "l_pm"]

## DOUBLE ML

In [5]:
treated = wf2020[wf2020['treat'] == 1]
treated = treated[['daynum', 'city_code']].groupby('city_code')
first = treated.apply(lambda x: x.sort_values(by = 'daynum', ascending=True).head(1))

day, count = np.unique(first.daynum, return_counts = True)
treat_day = day[count == max(count)][0]

num_cities = {d:c for d,c in zip(day, count)}
first = {city:day for day, city in first.values}

In [6]:
wf2020 = wf2020.assign(first = [first.get(city, 0) for city in wf2020['city_code']])
wf2020['A'] = (wf2020['daynum'] == wf2020['first']).astype('int64')
#wf2020 = pd.get_dummies(wf2020, drop_first=True)

In [7]:
wf2020['diff'] = wf2020.sort_values(by = 'daynum')['aqi'] \
            - wf2020.sort_values(by = 'daynum').groupby('city_code')['aqi'].shift(1)
wf2020 = wf2020.dropna(subset= ['diff'])

In [8]:
outcome = wf2020['diff']
treatment = wf2020['A']
confounders = wf2020[['first'] + weather]

In [9]:
# specify a model for the conditional expected outcome

# TODO(victorveitch) the covariates have basically no predictive power, replace this example with something better

# make a function that returns a sklearn model for later use in k-folding
def make_Q_model():
    #return LinearRegression()
    return RandomForestRegressor(random_state=RANDOM_SEED, n_estimators = 100, max_depth=10)
Q_model = make_Q_model()

# Sanity check that chosen model actually improves test error
# A real analysis should give substantial attention to model selection and validation 

X_w_treatment = confounders.copy()
X_w_treatment["treatment"] = treatment

X_train, X_test, y_train, y_test = train_test_split(X_w_treatment, outcome, test_size=0.2)
Q_model.fit(X_train, y_train)
y_pred = Q_model.predict(X_test)

test_mse=mean_squared_error(y_pred, y_test)
print(f"Test MSE of fit model {test_mse}") 
baseline_mse=mean_squared_error(y_train.mean()*np.ones_like(y_test), y_test)
print(f"Test MSE of no-covariate model {baseline_mse}")

Test MSE of fit model 1064.9597086174613
Test MSE of no-covariate model 1141.3827235666568


In [17]:
# specify a model for the propensity score

def make_g_model():
    #return LogisticRegression(max_iter=1000)
    return RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=100, max_depth=3)

g_model = make_g_model()
# Sanity check that chosen model actually improves test error
# A real analysis should give substantial attention to model selection and validation 

X_train, X_test, a_train, a_test = train_test_split(confounders, treatment, test_size=0.2)
g_model.fit(X_train, a_train)
a_pred = g_model.predict_proba(X_test)[:,1]

test_ce=log_loss(a_test, a_pred)
print(f"Test CE of fit model {test_ce}") 
baseline_ce=log_loss(a_test, a_train.mean()*np.ones_like(a_test))
print(f"Test CE of no-covariate model {baseline_ce}")

Test CE of fit model 0.02818858527482042
Test CE of no-covariate model 0.03647026313211039


In [18]:
a_pred.min()

0.0005570707343122783

In [19]:
def att_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    """
    # Double ML estimator for the ATT
    This uses the ATT specific scores, see equation 3.9 of https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf
    """

    if prob_t is None:
        prob_t = A.mean() # estimate marginal probability of treatment

    tau_hat = (A*(Y-Q0) - (1-A)*(g/(1-g))*(Y-Q0)).mean()/ prob_t
  
    scores = (A*(Y-Q0) - (1-A)*(g/(1-g))*(Y-Q0) - tau_hat*A) / prob_t
    n = Y.shape[0] # number of observations
    std_hat = np.std(scores) / np.sqrt(n)

    return tau_hat, std_hat


In [20]:
# for comparison, the point estimate without any covariate correction
outcome[treatment==1].mean()-outcome[treatment==0].mean()

-11.684548868980684

In [21]:
res = dict()
Q_model.fit(X_w_treatment, outcome)
g_model.fit(confounders, treatment)

RandomForestClassifier(max_depth=3, random_state=42)

In [22]:
for day in np.unique(wf2020['daynum']):
    df = wf2020[wf2020['daynum'] == day]
    outcome_t = df['diff']
    treatment_t = df['A']
    confounders_t = df[['first'] + weather]
    
    if df['A'].sum() == 0 or num_cities[day] < 2:
        continue
    
    X1 = confounders_t.copy()
    X0 = confounders_t.copy()
    X1["treatment"] = 1
    X0["treatment"] = 0
    
    Q0 = Q_model.predict(X0)
    Q1 = Q_model.predict(X1)
    g = g_model.predict_proba(confounders_t)[:,1]
    
    est, sd = att_aiptw(Q0, Q1, g, treatment_t, outcome_t)
    res[day] = (est, sd)
    
inv_var = np.array([1/v**2 for p,v in res.values()])
point = np.array([p for p,v in res.values()])    

tau_hat = (point * inv_var).sum()/inv_var.sum()
std_hat = np.sqrt(1/inv_var.sum())

print('%0.3f pm %0.3f' % (tau_hat, std_hat))

-4.916 pm 1.542
