# Credibility/Sensitivity

In [32]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import statsmodels.api as sm
import os
import pathlib


from austen_plots.AustenPlot import AustenPlot

In [33]:
RANDOM_SEED = 42

def make_Q_model():
    """Return a fresh, unfitted regressor used as the outcome model.

    A new estimator is created on every call so that each cross-fitting
    fold can train its own independent instance.
    """
    # Alternative model. It used to sit as an unreachable second `return`
    # below the active one; swap the two lines to try it:
    # return RandomForestRegressor(random_state=RANDOM_SEED, n_estimators=100, max_depth=5)
    return LinearRegression()

def make_g_model():
    """Return a fresh, unfitted classifier used as the treatment (propensity) model.

    A new estimator is created on every call so that each cross-fitting
    fold can train its own independent instance.
    """
    # Alternative model. It used to sit as an unreachable second `return`
    # below the active one; swap the two lines to try it:
    # return RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=100, max_depth=5)
    return LogisticRegression(max_iter=1000)

# helper functions to implement the cross fitting

def treatment_k_fold_fit_and_predict(make_model, X:pd.DataFrame, A:np.array, n_splits:int):
    """
    Implements K fold cross-fitting for the model predicting the treatment A.
    That is,
    1. Split data into K folds (stratified on A)
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns an array containing the out-of-fold predicted probabilities of treatment

    Rows of X and A are matched by position, so X may carry any index and A
    may be either a pandas Series or a numpy array.

    Args:
    make_model: function that returns sklearn model (which implements fit and predict_proba)
    X: dataframe of variables to adjust for
    A: array of treatments
    n_splits: number of splits to use
    """
    A_values = np.asarray(A)
    predictions = np.full_like(A_values, np.nan, dtype=float)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    for train_index, test_index in kf.split(X, A_values):
        # kf.split yields integer positions, not index labels, so rows must be
        # selected positionally (.iloc / array indexing) rather than with .loc.
        g = make_model()
        g.fit(X.iloc[train_index], A_values[train_index])

        # column 1 of predict_proba is the probability of the second class
        predictions[test_index] = g.predict_proba(X.iloc[test_index])[:, 1]

    # every row belongs to exactly one test fold, so nothing may be left unfilled
    assert np.isnan(predictions).sum() == 0
    return predictions


def outcome_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, A:np.array, n_splits:int, output_type:str):
    """
    Implements K fold cross-fitting for the model predicting the outcome Y.
    That is,
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated

    Rows of X, y and A are matched by position, so X may carry any index and
    y / A may be either pandas Series or numpy arrays.

    Args:
    make_model: function that returns sklearn model (that implements fit and either predict_proba or predict)
    X: dataframe of variables to adjust for
    y: array of outcomes
    A: array of treatments
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"

    Raises:
    ValueError: if output_type is neither "binary" nor "continuous"
    """
    if output_type not in ('binary', 'continuous'):
        # Fail early with a clear message instead of hitting an unbound `kf` later.
        raise ValueError(
            f'output_type must be "binary" or "continuous", got {output_type!r}'
        )

    y_values = np.asarray(y)
    A_values = np.asarray(A)
    predictions0 = np.full_like(A_values, np.nan, dtype=float)
    predictions1 = np.full_like(y_values, np.nan, dtype=float)

    if output_type == 'binary':
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    else:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    # include the treatment as input feature
    X_w_treatment = X.copy()
    X_w_treatment["A"] = A_values

    # for predicting effect under treatment / control status for each data point
    X0 = X_w_treatment.copy()
    X0["A"] = 0
    X1 = X_w_treatment.copy()
    X1["A"] = 1

    for train_index, test_index in kf.split(X_w_treatment, y_values):
        # kf.split yields integer positions, not index labels, so rows must be
        # selected positionally (.iloc / array indexing) rather than with .loc.
        q = make_model()
        q.fit(X_w_treatment.iloc[train_index], y_values[train_index])

        if output_type == 'binary':
            predictions0[test_index] = q.predict_proba(X0.iloc[test_index])[:, 1]
            predictions1[test_index] = q.predict_proba(X1.iloc[test_index])[:, 1]
        else:
            predictions0[test_index] = q.predict(X0.iloc[test_index])
            predictions1[test_index] = q.predict(X1.iloc[test_index])

    # every row belongs to exactly one test fold, so nothing may be left unfilled
    assert np.isnan(predictions0).sum() == 0
    assert np.isnan(predictions1).sum() == 0
    return predictions0, predictions1


def att_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    """
    Double ML (augmented IPTW) estimate of the ATT and its standard error.

    Uses the ATT specific scores, see equation 3.9 of
    https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf

    Args:
    Q0: predicted outcome for each unit under no treatment
    Q1: predicted outcome under treatment (accepted for a uniform call signature; not used by the ATT score)
    g: predicted probability of treatment for each unit
    A: observed treatments (0/1)
    Y: observed outcomes
    prob_t: marginal probability of treatment; estimated as the mean of A when omitted

    Returns:
    (tau_hat, std_hat): the point estimate and its estimated standard error
    """
    if prob_t is None:
        # estimate marginal probability of treatment
        prob_t = A.mean()

    residual = Y - Q0
    odds = g / (1 - g)

    # treated residuals minus odds-weighted control residuals
    unscaled = A * residual - (1 - A) * odds * residual
    tau_hat = unscaled.mean() / prob_t

    # per-unit influence scores; their spread gives the standard error
    scores = (unscaled - tau_hat * A) / prob_t
    n_obs = Y.shape[0]
    std_hat = np.std(scores) / np.sqrt(n_obs)

    return tau_hat, std_hat

### Single time period

In [34]:
# Load the panel and the city-level table.
wf = pd.read_csv("data/wf.csv")
city_yb = pd.read_csv("data/city_yb.csv")

# Disabled merge of the city-level table, kept for reference:
# wf.drop('Unnamed: 0', axis = 1)
# city_yb.drop('Unnamed: 0', axis = 1)
# city_yb = city_yb.dropna()
# print(len(wf))
# wf = wf.merge(city_yb, on='city_code').dropna(
#     subset = ['sec_city', 'gdp_city', 'pgdp_city', 
#               'firm_city', 'gonglu', 'emit_ww', 'emit_so1', 'emi_dust1',
#               'aqi', 'pm']
# )
# print(len(wf))

# Derived columns: squared temperature and log(1 + x) transforms of the outcomes.
wf["temp2"] = wf["temp"].pow(2)
wf["l_aqi"] = np.log(1 + wf["aqi"])
wf["l_pm"] = np.log(1 + wf["pm"])

# Keep the daynum window 8401..8461 (both ends included) and rows with both outcomes present.
in_window = wf["daynum"].between(8401, 8461)
wf2020 = wf[in_window].dropna(subset=['aqi', 'pm'])

# Categorical copies of city and day, expanded into dummy columns
# (the first level of each is dropped).
wf2020 = wf2020.assign(
    cities=wf2020['city_code'].astype('category'),
    days=wf2020['daynum'].astype('category'),
)
wf2020 = pd.get_dummies(wf2020, drop_first=True)

# Collect the generated dummy column names by substring match.
city_fixed = [col for col in wf2020.columns if 'cities' in col]
time_fixed = [col for col in wf2020.columns if 'days' in col]
fixed = ['treat'] + city_fixed + time_fixed

# Covariate groups and outcome names used by the later cells.
weather = ['prec', 'snow', 'temp', 'temp2']
city_economic = ['pop_city', 'sec_city', 'gdp_city' , 'pgdp_city', 'firm_city']
city_environmental = ['gonglu', 'emit_ww', 'emit_so1', 'emi_dust1']
out = ["aqi", "l_aqi", "pm", "l_pm"]

# Rows observed under treatment, grouped by city.
treated = wf2020[wf2020['treat'] == 1]
treated = treated[['daynum', 'city_code']].groupby('city_code')
# One row per treated city: the row with its smallest daynum, i.e. its first treated day.
first = treated.apply(lambda x: x.sort_values(by = 'daynum', ascending=True).head(1))

# Distinct first-treatment days (sorted) and how many cities start on each.
day, count = np.unique(first.daynum, return_counts = True)
# The start day shared by the most cities; on a tie the earliest day wins.
treat_day = day[count == max(count)][0]

# day -> number of cities first treated on that day
num_cities = {d:c for d,c in zip(day, count)}
# city_code -> first treated day; relies on the rows of `first` being (daynum, city_code) pairs.
# NOTE(review): whether groupby.apply keeps the grouping column depends on the pandas version — confirm.
first = {city:day for day, city in first.values}

# Attach each city's first treated day to every row; 0 marks cities that are never treated.
wf2020 = wf2020.assign(first = [first.get(city, 0) for city in wf2020['city_code']])
# Comparison sample: cities first treated on treat_day plus never-treated cities.
group = wf2020[(wf2020['first'] == treat_day) | (wf2020['first'] == 0)]
# Turn the first-treatment day into dummy columns (first level dropped).
wf2020['first'] = wf2020['first'].astype('category')
wf2020 = pd.get_dummies(wf2020, drop_first=True)

# Flag the days before the common treatment date. assign() builds a new frame,
# so nothing is written into a slice of wf2020 (this is what used to trigger
# pandas' SettingWithCopyWarning).
group = group.assign(pre=group['daynum'] < treat_day)
# Collapse to one row of column means per (city, pre/post) pair, indexed by city_code.
group = group.groupby(['city_code', 'pre']).mean().reset_index('pre')

# The difference below pairs post and pre rows purely by position, so both
# halves must list exactly the same cities in the same order.
is_pre = group['pre'].values
if not group.index[is_pre].equals(group.index[~is_pre]):
    raise ValueError(
        "every city needs both a pre-treatment and a post-treatment row "
        "to form the Y1-Y0 difference"
    )

# Post-period rows carry the covariates; the outcome is the post-minus-pre change in mean AQI.
compact = group[~group['pre']].copy()
aqi = group['aqi'].values
compact['Y1-Y0'] = aqi[~group['pre']] - aqi[group['pre']]


# A default integer index is restored before the frames go into the cross-fitting helpers.
compact = compact.reset_index()
outcome = compact['Y1-Y0']
treatment = compact['treat']
confounders = compact[city_fixed + time_fixed + weather]

g = treatment_k_fold_fit_and_predict(make_g_model, X=confounders, A=treatment, n_splits=10)
Q0,Q1=outcome_k_fold_fit_and_predict(make_Q_model, X=confounders, y=outcome, A=treatment, n_splits=10, output_type="continuous")

# Column names match att_aiptw's parameter names, so the frame can be unpacked with **.
data_and_nuisance_estimates = pd.DataFrame({'g': g, 'Q0': Q0, 'Q1': Q1, 'A': treatment, 'Y': outcome})

tau_hat, std_hat = att_aiptw(**data_and_nuisance_estimates)
print(f"The estimate is {tau_hat} pm {1.96*std_hat}")

# for comparison, the point estimate without any covariate correction
outcome[treatment==1].mean()-outcome[treatment==0].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The estimate is -19.861800850414966 pm 11.89715227874221


-18.886275852614084

In [35]:
compact

Unnamed: 0.1,city_code,pre,Unnamed: 0,date,prec,snow,temp,aqi,co,no2,...,days_8454,days_8455,days_8456,days_8457,days_8458,days_8459,days_8460,days_8461,first,Y1-Y0
0,3547,False,17973.5,2.020022e+07,12.066931,82.475749,-3.213173,63.077457,1.043483,29.020299,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,8436.0,-47.483196
1,4870,False,41679.5,2.020022e+07,19.911129,60.060211,8.265350,52.420406,0.755353,24.584402,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-38.480972
2,4956,False,45630.5,2.020022e+07,17.183897,55.427658,7.256715,74.777778,0.599033,15.702991,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-67.323789
3,5996,False,42996.5,2.020022e+07,16.372821,60.033804,8.216290,53.088675,0.664156,20.947650,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-37.060072
4,6145,False,4364.5,2.020022e+07,39.948305,58.197934,-1.215675,64.752137,1.073328,28.256410,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-1.856384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,102298,False,118504.5,2.020022e+07,8.359323,48.622150,9.711950,35.453526,0.627885,6.076389,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,6.763426
251,102730,False,124211.5,2.020022e+07,7.468586,47.417817,6.140083,106.738782,1.157516,27.941773,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-47.700253
252,105012,False,104456.5,2.020022e+07,3.043103,48.781714,11.076449,78.638355,0.696608,19.128739,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,3.184062
253,105016,False,131235.5,2.020022e+07,4.517875,49.336072,0.334428,71.642094,0.685550,16.830662,...,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.038462,0.0,-7.957029


### Multiple time period

In [36]:
# Re-attach each city's first treated day (0 = never treated) and mark,
# per row, whether that row is the city's first treated day.
first_day_per_row = wf2020['city_code'].map(lambda city: first.get(city, 0))
wf2020 = wf2020.assign(first=first_day_per_row)
wf2020['A'] = wf2020['daynum'].eq(wf2020['first']).astype('int64')

# Day-over-day change in AQI within each city. The result is computed on the
# day-sorted frame and aligned back to wf2020 by index label.
by_day = wf2020.sort_values(by='daynum')
wf2020['diff'] = by_day.groupby('city_code')['aqi'].diff()
# A city's earliest row has no previous day, hence no difference.
wf2020 = wf2020.dropna(subset=['diff'])

outcome = wf2020['diff']
treatment = wf2020['A']
confounders = wf2020[['daynum'] + weather + city_fixed + time_fixed]

# specify a model for the conditional expected outcome

# TODO(victorveitch) the covariates have basically no predictive power, replace this example with something better

# make a function that returns a sklearn model for later use in k-folding
def make_Q_model():
    """Return a fresh, unfitted random-forest regressor used as the outcome model."""
    # Alternative: return LinearRegression()
    forest_settings = dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10)
    return RandomForestRegressor(**forest_settings)
Q_model = make_Q_model()

# Sanity check that chosen model actually improves test error
# A real analysis should give substantial attention to model selection and validation 

X_w_treatment = confounders.copy()
X_w_treatment["treatment"] = treatment

# Seed the split so the check is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_treatment, outcome, test_size=0.2, random_state=RANDOM_SEED
)
Q_model.fit(X_train, y_train)
y_pred = Q_model.predict(X_test)

# Compare against always predicting the training mean; without this
# comparison the held-out predictions were computed but never looked at.
test_mse = mean_squared_error(y_test, y_pred)
baseline_mse = mean_squared_error(y_test, np.full(len(y_test), y_train.mean()))
print(f"Test MSE of fit model {test_mse}")
print(f"Test MSE of no-covariate model {baseline_mse}")


# specify a model for the propensity score

def make_g_model():
    """Return a fresh, unfitted random-forest classifier used as the propensity model."""
    # Alternative: return LogisticRegression(max_iter=1000)
    forest_settings = dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=3)
    return RandomForestClassifier(**forest_settings)

g_model = make_g_model()
# Sanity check that chosen model actually improves test error
# A real analysis should give substantial attention to model selection and validation 

# Seed the split so the check is reproducible from run to run.
X_train, X_test, a_train, a_test = train_test_split(
    confounders, treatment, test_size=0.2, random_state=RANDOM_SEED
)
g_model.fit(X_train, a_train)
a_pred = g_model.predict_proba(X_test)[:,1]

# Compare against always predicting the training treatment rate; without this
# comparison the held-out predictions were computed but never looked at.
# `labels` is passed explicitly so the loss is still defined when the test
# split happens to contain a single treatment value.
test_ce = log_loss(a_test, a_pred, labels=[0, 1])
baseline_ce = log_loss(a_test, np.full(len(a_test), a_train.mean()), labels=[0, 1])
print(f"Test CE of fit model {test_ce}")
print(f"Test CE of no-covariate model {baseline_ce}")

# for comparison, the point estimate without any covariate correction
print(outcome[treatment==1].mean()-outcome[treatment==0].mean())

# Fit both nuisance models once on the full sample; the loop below only predicts.
res = dict()
Q_model.fit(X_w_treatment, outcome)
g_model.fit(confounders, treatment)

# The loop variable is deliberately NOT called `day`: that name holds the
# array of distinct first-treatment days, which other cells index into.
for d in np.unique(wf2020['daynum']):
    df = wf2020[wf2020['daynum'] == d]

    # Skip days on which no city starts treatment, or fewer than two do,
    # before doing any further work for that day.
    if df['A'].sum() == 0 or num_cities[d] < 2:
        continue

    outcome_t = df['diff']
    treatment_t = df['A']
    confounders_t = df[['daynum'] + weather + city_fixed + time_fixed]

    # Same covariates with the treatment indicator forced to 1 / 0.
    X1 = confounders_t.copy()
    X0 = confounders_t.copy()
    X1["treatment"] = 1
    X0["treatment"] = 0

    Q0 = Q_model.predict(X0)
    Q1 = Q_model.predict(X1)
    g = g_model.predict_proba(confounders_t)[:,1]

    est, sd = att_aiptw(Q0, Q1, g, treatment_t, outcome_t)
    res[d] = (est, sd)

# Combine the per-day estimates with inverse-variance weights.
inv_var = np.array([1/v**2 for p,v in res.values()])
point = np.array([p for p,v in res.values()])

tau_hat = (point * inv_var).sum()/inv_var.sum()
std_hat = np.sqrt(1/inv_var.sum())

# NOTE(review): this prints the standard error itself after "pm", whereas the
# single-period cell prints 1.96 * std_hat — confirm which is intended.
print('%0.3f pm %0.3f' % (tau_hat, std_hat))

-11.684548868980686
-11.746 pm 1.545


## Parallel trends

Parametric multiple-time-period model

In [37]:
# create week coefficient 
# First treated day per city, recomputed from the current wf2020.
treated = wf2020[wf2020['treat'] == 1]
treated = treated[['daynum', 'city_code']].groupby('city_code')
first = treated.apply(lambda x: x.sort_values(by = 'daynum', ascending=True).head(1))
day, count = np.unique(first.daynum, return_counts = True)
treat_day = day[count == max(count)][0]
first = {city:day for day, city in first.values}
wf2020 = wf2020.assign(first = [first.get(city, 0) for city in wf2020['city_code']])

# Whole weeks elapsed since the city's first treated day (negative = leads).
weeks_since_first = np.floor((wf2020["daynum"] - wf2020["first"])/7).astype(int)

# set -1 lead and untreated to NaN so they don't get week0 dummy
# Both exclusions are applied in one mask. The previous chained assignment
# (wf2020["week_coef"][mask] = ...) raised SettingWithCopyWarning and has no
# effect once pandas copy-on-write is enabled; np.NaN is also gone in NumPy 2.
no_dummy = (weeks_since_first == -1) | (wf2020["first"] == 0)
wf2020["week_coef"] = np.where(no_dummy, np.nan, weeks_since_first)
wf2020["week_coef"] = wf2020["week_coef"].astype('category')
wf2020 = pd.get_dummies(wf2020)

# Names of the generated event-time dummy columns.
week_coef = [col for col in wf2020.columns if 'week_coef' in col]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [38]:
# One OLS fit per outcome: regress on the fixed effects, weather and the
# event-time dummies, then list each dummy's coefficient with 2 * its standard error.
for Yname in out:
    Y = wf2020[Yname]
    X = wf2020[fixed + weather + week_coef]
    fit = sm.OLS(Y, sm.add_constant(X)).fit()

    week_terms = [term for term in fit.params.index if 'week_coef' in term]
    rows = list(zip(week_terms, fit.params[week_terms], 2 * fit.bse[week_terms]))

    print(Yname)
    print(*rows, sep="\n")

aqi
('week_coef_-8.0', -19.67750644058875, 34.78439824125761)
('week_coef_-7.0', -7.5558915665224, 30.006947920031873)
('week_coef_-6.0', -5.2049938273005125, 17.09890057195802)
('week_coef_-5.0', 0.671156619812766, 5.340276359861647)
('week_coef_-4.0', -6.409588715384092, 4.698786186081183)
('week_coef_-3.0', -2.9318971044050044, 4.299834886919208)
('week_coef_-2.0', 2.587539405094274, 4.244627961084665)
('week_coef_0.0', 11.059228639271826, 3.3256560165105364)
('week_coef_1.0', 6.649594635204274, 3.3192584528342475)
('week_coef_2.0', -6.443418794640473, 3.271662615242559)
('week_coef_3.0', -8.533294552083818, 3.425156780194195)
('week_coef_4.0', -9.942218345641072, 4.878733474988247)
('week_coef_5.0', -15.482991974087353, 9.309041106773398)
l_aqi
('week_coef_-8.0', -0.17094816949989425, 0.3640543449416856)
('week_coef_-7.0', -0.09276540879467923, 0.31405343547870795)
('week_coef_-6.0', -0.007958147742662293, 0.1789575028371148)
('week_coef_-5.0', 0.0003076119666239954, 0.055891460260

Check parallel trends on the two-period subsets (23 and 16)

In [87]:
# E[Y_{t+1} - Y_{t} | A=1, X] - E[Y_{t+1} - Y_{t} | A=0, X] = 0 in all pre-treatment periods t.
#  if you've fit a model for Q(a,x) = E[Y_{t+1} - Y_{t} | A=a, x] then you can plot diff(t) = 1/n \sum_i Q(1,x_i) - Q(0,x_i)



# First-treatment day shared by the most cities, and the sample made of the
# cities that start on that day plus the never-treated ones (first == 0).
day_23 = day[count == count.max()][0]
wf23 = wf2020[wf2020['first'].isin([day_23, 0])]

# estimate for each day
# For every day, average the fitted outcome model's prediction gap between
# treatment = 1 and treatment = 0 over that day's rows.
adjustment_cols = ['daynum'] + weather + city_fixed + time_fixed
day_diffs = []
day_list = np.unique(wf23['daynum'])  # np.unique already returns sorted values
for d in day_list:
    df = wf23[wf23['daynum'] == d]
    confounders_t = df[adjustment_cols]

    X1 = confounders_t.assign(treatment=1)
    X0 = confounders_t.assign(treatment=0)

    Q0 = Q_model.predict(X0)
    Q1 = Q_model.predict(X1)

    day_diffs.append((Q1 - Q0).mean())

list(zip(day_list, day_diffs))

[(8402, 0.0),
 (8403, 0.0),
 (8404, 0.0),
 (8405, 0.0),
 (8406, 0.0),
 (8407, 0.0),
 (8408, 0.0),
 (8409, 0.0),
 (8410, 0.0),
 (8411, 0.0),
 (8412, 0.0),
 (8413, 0.0),
 (8414, 0.0),
 (8415, 0.0),
 (8416, 0.0),
 (8417, 0.0),
 (8418, 0.0),
 (8419, 0.0),
 (8420, 0.0),
 (8421, 0.0),
 (8422, 0.0),
 (8423, 0.0),
 (8424, -0.38133389606799645),
 (8425, -0.126985867358723),
 (8426, 1.790454412458783),
 (8427, 0.0),
 (8428, 0.0),
 (8429, 0.0),
 (8430, 0.0),
 (8431, 0.0),
 (8432, 0.0),
 (8433, 0.0),
 (8434, 0.0),
 (8435, 0.0),
 (8436, 0.0),
 (8437, 0.0),
 (8438, 0.0),
 (8439, 0.0),
 (8440, 0.0),
 (8441, 0.0),
 (8442, 0.0),
 (8443, 0.0),
 (8444, 0.0),
 (8445, 0.0),
 (8446, 0.0),
 (8447, 0.0),
 (8448, 0.0),
 (8449, 0.0),
 (8450, 0.0),
 (8451, 0.0),
 (8452, 0.0),
 (8453, 0.0),
 (8454, 0.0),
 (8455, 0.0),
 (8456, 0.0),
 (8457, 0.0),
 (8458, 0.0),
 (8459, 0.0),
 (8460, 0.0),
 (8461, 0.0)]

In [91]:
X1

Unnamed: 0,daynum,prec,snow,temp,temp2,cities_3547,cities_4870,cities_4956,cities_5993,cities_5996,...,days_8453,days_8454,days_8455,days_8456,days_8457,days_8458,days_8459,days_8460,days_8461,treatment
16225,8457,0.093731,123.399300,-1.931199,3.729528,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
16226,8458,0.417873,118.744040,-1.312069,1.721525,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
16227,8459,0.769849,63.414462,0.400970,0.160777,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
16228,8460,0.335493,41.602661,0.971140,0.943113,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
16229,8461,0.146075,103.232866,-0.591814,0.350244,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113244,8457,0.050583,63.074559,13.997334,195.925373,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
113245,8458,73.515920,12.815312,14.244464,202.904752,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
113246,8459,83.192755,39.559249,14.188429,201.311514,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
113247,8460,0.097458,35.937346,13.350782,178.243391,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [88]:
# estimate by week
# Same prediction-gap check as the per-day version, but pooled over the rows
# that fall in each event-time week (one dummy column per week).
weeks_list = []
week_diffs = []
for w in week_coef:
    wf23_week = wf23[wf23[w] == 1]
    if wf23_week.empty:
        # no row of this sample falls in that week
        continue
    confounders_t = wf23_week[['daynum'] + weather + city_fixed + time_fixed]

    X1 = confounders_t.assign(treatment=1)
    X0 = confounders_t.assign(treatment=0)

    Q0 = Q_model.predict(X0)
    Q1 = Q_model.predict(X1)

    weeks_list.append(w)
    week_diffs.append((Q1 - Q0).mean())

list(zip(weeks_list, week_diffs))

[('week_coef_-5.0', 0.0),
 ('week_coef_-4.0', 0.0),
 ('week_coef_-3.0', 0.0),
 ('week_coef_-2.0', -0.05448249179145666),
 ('week_coef_0.0', 0.0),
 ('week_coef_1.0', 0.0),
 ('week_coef_2.0', 0.0),
 ('week_coef_3.0', 0.0)]

In [89]:
# Fourth entry of `day`, the sorted array of distinct first-treatment days.
# NOTE(review): presumably the start day of the 16-city group named in the heading above;
# the position is hard-coded, so confirm it still points at that group if the data changes.
day_16 = day[3]

### Model fit

In [24]:
Q_model

RandomForestRegressor(max_depth=10, random_state=42)

## Covariate Influence Strength

#### Single time period

In [None]:
def _convert_to_austen_format(nuisance_estimate_df: pd.DataFrame):
  austen_df = pd.DataFrame()
  austen_df['y']=nuisance_estimate_df['Y']
  austen_df['t']=nuisance_estimate_df['A']
  austen_df['g']=nuisance_estimate_df['g']
  A = nuisance_estimate_df['A']
  austen_df['Q']=A*nuisance_estimate_df['Q1'] + (1-A)*nuisance_estimate_df['Q0'] # use Q1 when A=1, and Q0 when A=0

  return austen_df

In [None]:
# Named groups of adjustment columns; each group is left out in turn when the
# nuisance models are refit.
covariate_groups = dict(
    city_fixed=city_fixed,
    time_fixed=time_fixed,
    weather=weather,
)

In [None]:
# For each covariate group, refit the models without using that group
nuisance_estimates = {}
# `group_name` rather than `group`, so the loop does not overwrite the
# `group` data frame built for the single-period analysis.
for group_name, covs in covariate_groups.items():
    remaining_confounders = confounders.drop(columns=covs)
    print(remaining_confounders)

    g = treatment_k_fold_fit_and_predict(make_g_model, X=remaining_confounders, A=treatment, n_splits=10)
    Q0, Q1 = outcome_k_fold_fit_and_predict(make_Q_model, X=remaining_confounders, y=outcome, A=treatment, n_splits=10, output_type="continuous")

    # Keep the reduced-model results under their own name. Assigning them to
    # `data_and_nuisance_estimates` replaced the full-covariate estimates, so
    # the file exported as the full model was really the last reduced model.
    reduced_estimates = pd.DataFrame({'g': g, 'Q0': Q0, 'Q1': Q1, 'A': treatment, 'Y': outcome})
    nuisance_estimates[group_name] = reduced_estimates

In [None]:
# CSV file that receives the converted `data_and_nuisance_estimates` frame.
data_and_nuisance_path = 'data_and_nuisance_estimates.csv'
# Directory that receives one CSV per covariate group (named "<group>.csv").
covariate_dir_path = 'covariates/'

In [None]:
def _convert_to_austen_format(nuisance_estimate_df: pd.DataFrame):
  austen_df = pd.DataFrame()
  austen_df['y']=nuisance_estimate_df['Y']
  austen_df['t']=nuisance_estimate_df['A']
  austen_df['g']=nuisance_estimate_df['g']
  A = nuisance_estimate_df['A']
  austen_df['Q']=A*nuisance_estimate_df['Q1'] + (1-A)*nuisance_estimate_df['Q0'] # use Q1 when A=1, and Q0 when A=0

  return austen_df

In [None]:
# Write the converted `data_and_nuisance_estimates` frame to its CSV file.
austen_data_and_nuisance = _convert_to_austen_format(data_and_nuisance_estimates)
austen_data_and_nuisance.to_csv(data_and_nuisance_path, index=False)

# Write one "<group>.csv" per covariate group into the covariate directory.
covariate_dir = pathlib.Path(covariate_dir_path)
covariate_dir.mkdir(exist_ok=True)
for group, nuisance_estimate in nuisance_estimates.items():
    austen_nuisance_estimate = _convert_to_austen_format(nuisance_estimate)
    austen_nuisance_estimate.to_csv(covariate_dir / f"{group}.csv", index=False)

In [None]:
# Bias level passed to the sensitivity plot.
target_bias = 8.00 # note: bias is specified as an absolute number
# Reads the CSV file and the per-group directory written by the export cell.
ap = AustenPlot(data_and_nuisance_path, covariate_dir_path)
# NOTE(review): presumably `p` is the figure and the other two values are its underlying coordinates — confirm against the austen_plots documentation.
p, plot_coords, variable_coords = ap.fit(bias=target_bias)
# Trailing expression so the notebook displays the plot.
p

### Multiple time period

In [None]:
nuisance_estimates

In [None]:
nuisance_estimates

In [None]:
def _pooled_nuisance_estimates(dropped_covariates):
    """Fit fresh nuisance models without `dropped_covariates` and pool their predictions.

    The models are fit on the full multi-period sample. Predictions are then
    collected for every day on which at least two cities start treatment.

    Returns a DataFrame with columns g, Q0, Q1, A and Y (one row per pooled
    observation, default integer index).
    """
    # Local models: the global Q_model / g_model (fit on all covariates) stay untouched.
    Q_fit = make_Q_model()
    g_fit = make_g_model()

    remaining_confounders = confounders.drop(columns=dropped_covariates)
    X_w_treatment_rem = remaining_confounders.copy()
    X_w_treatment_rem["treatment"] = treatment
    Q_fit.fit(X_w_treatment_rem, outcome)
    g_fit.fit(remaining_confounders, treatment)

    g_list, Q0_list, Q1_list, t_list, out_list = [], [], [], [], []
    # The loop variable is not called `day`: that global holds the array of
    # distinct first-treatment days used by other cells.
    for d in np.unique(wf2020['daynum']):
        df_t = wf2020[wf2020['daynum'] == d]
        if df_t['A'].sum() == 0 or num_cities[d] < 2:
            continue

        confounders_t = df_t[['daynum'] + weather + city_fixed + time_fixed]
        remaining_confounders_t = confounders_t.drop(columns=dropped_covariates)

        # Same covariates with the treatment indicator forced to 1 / 0.
        X1_t = remaining_confounders_t.copy()
        X0_t = remaining_confounders_t.copy()
        X1_t["treatment"] = 1
        X0_t["treatment"] = 0

        g_list.extend(g_fit.predict_proba(remaining_confounders_t)[:, 1])
        Q0_list.extend(Q_fit.predict(X0_t))
        Q1_list.extend(Q_fit.predict(X1_t))
        t_list.extend(df_t['A'])
        out_list.extend(df_t['diff'])

    return pd.DataFrame({'g': pd.Series(g_list),
                         'Q0': pd.Series(Q0_list), 'Q1': pd.Series(Q1_list),
                         'A': pd.Series(t_list), 'Y': pd.Series(out_list)})


# For each covariate group, refit the models without using that group
nuisance_estimates = {
    group_name: _pooled_nuisance_estimates(covs)
    for group_name, covs in covariate_groups.items()
}

# Reference estimates from the model that uses every covariate. The export
# below writes this frame as the full model; it used to be left holding the
# estimates of whichever covariate group was processed last.
data_and_nuisance_estimates = _pooled_nuisance_estimates([])

# Convert and save the `data_and_nuisance_estimates` frame.
austen_data_and_nuisance = _convert_to_austen_format(data_and_nuisance_estimates)
austen_data_and_nuisance.to_csv(data_and_nuisance_path, index=False)

# Save each covariate group's converted estimates as "<group>.csv".
pathlib.Path(covariate_dir_path).mkdir(exist_ok=True)
for group, nuisance_estimate in nuisance_estimates.items():
    group_csv_path = os.path.join(covariate_dir_path, f"{group}.csv")
    austen_nuisance_estimate = _convert_to_austen_format(nuisance_estimate)
    austen_nuisance_estimate.to_csv(group_csv_path, index=False)

In [None]:
austen_data_and_nuisance

In [None]:
# Bias level passed to the sensitivity plot.
target_bias = 5.00 # note: bias is specified as an absolute number
# Reads the CSV file and the per-group directory written by the export step above.
ap = AustenPlot(data_and_nuisance_path, covariate_dir_path)
# NOTE(review): presumably `p` is the figure and the other two values are its underlying coordinates — confirm against the austen_plots documentation.
p, plot_coords, variable_coords = ap.fit(bias=target_bias)
# Trailing expression so the notebook displays the plot.
p