# Credibility/Sensitivity

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import statsmodels.api as sm
import os
import pathlib

from xgboost import XGBClassifier, XGBRegressor

from austen_plots.AustenPlot import AustenPlot

from utils.double_ml import *
from utils.analysis import *

In [2]:
# Single seed reused as `random_state` by the random-forest entries in the model grids below.
RANDOM_SEED: int = 42

## Parallel trends

Parametric check across multiple time periods (week-relative-to-treatment coefficients)

In [3]:
# NOTE(review): every line in this cell is commented out, so it is a no-op as saved.
# If re-enabled it has to run after the cell that builds `wf2020` via make_wf2020(),
# since it reads that frame and reassigns it (adds `first`, `week_coef`, then get_dummies).
# create week coefficient 
# treated = wf2020[wf2020['treat'] == 1]
# treated = treated[['daynum', 'city_code']].groupby('city_code')
# first = treated.apply(lambda x: x.sort_values(by = 'daynum', ascending=True).head(1))
# day, count = np.unique(first.daynum, return_counts = True)
# treat_day = day[count == max(count)][0]
# (next two lines: map city -> its earliest treated daynum; cities without an entry get 0)
# first = {city:day for day, city in first.values}
# wf2020 = wf2020.assign(first = [first.get(city, 0) for city in wf2020['city_code']])
# (whole weeks elapsed relative to the city's first treated day)
# wf2020["week_coef"] = np.floor((wf2020["daynum"] - wf2020["first"])/7).astype(int)

# # set -1 lead and untreated to NaN so they don't get week0 dummy
# NOTE(review): `np.NaN` was removed in NumPy 2.0; use `np.nan` if this cell is revived.
# wf2020["week_coef"] = np.where((wf2020["week_coef"] == -1), np.NaN, wf2020["week_coef"])
# wf2020["week_coef"][wf2020["first"] == 0] = np.NaN
# wf2020["week_coef"] = wf2020["week_coef"].astype('category')
# wf2020 = pd.get_dummies(wf2020)

# (collect the names of the generated week dummy columns)
# week_coef = []
# for col in wf2020.columns:
#     if 'week_coef' in col:
#         week_coef.append(col)


In [4]:
# NOTE(review): commented-out cell (no-op as saved). For each outcome name in `out` it fits an
# OLS of the outcome on `fixed + weather + week_coef` (plus a constant) and prints, for every
# parameter whose name contains 'week_coef', the tuple (name, coefficient, 2 * standard error).
# `out` is not defined in any visible cell (the active cells use `out_vars`), and `week_coef`
# only exists if the commented-out week-coefficient cell is run — confirm before re-enabling.
# for Yname in out:
#     Y = wf2020[Yname]
#     X = wf2020[fixed + weather + week_coef]
#     fit = sm.OLS(Y, sm.add_constant(X)).fit()
#     print(Yname)
#     print(*list(zip([index for index in fit.params.index if 'week_coef' in index],
#                     fit.params[[index for index in fit.params.index if 'week_coef' in index]], 
#                    2*fit.bse[[index for index in fit.params.index if 'week_coef' in index]])), sep="\n")

Check parallel trends on the two-period subsets (the `day_23` and `day_16` cohorts).

In [5]:
# Parallel trends: E[Y_{t+1} - Y_{t} | A=1, X] - E[Y_{t+1} - Y_{t} | A=0, X] = 0 in all pre-treatment periods t.
# If you have fit a model Q(a, x) = E[Y_{t+1} - Y_{t} | A=a, X=x], you can plot diff(t) = (1/n) \sum_i [Q(1, x_i) - Q(0, x_i)].



# NOTE(review): commented-out cell (no-op as saved). It keeps rows whose `first` equals `day_23`
# (the most frequent first-treated day) or 0, then for each distinct `daynum` predicts with
# treatment forced to 1 and to 0 and stores the mean difference Q1 - Q0.
# `day` / `count` come from the (also commented-out) week-coefficient cell, and `Q_model` is not
# defined in any visible cell — confirm where it should come from before re-enabling.
# day_23 = day[count == max(count)][0]
# wf23 = wf2020[(wf2020['first'] == day_23) | (wf2020['first'] == 0)]

# # estimate for each day
# day_diffs = []
# day_list = np.sort(np.unique(wf23['daynum']))
# for d in day_list:
#     df = wf23[wf23['daynum'] == d]
#     confounders_t = df[['daynum'] + weather + city_fixed + time_fixed]
    
# (two copies of the same covariates that differ only in the `treatment` column)
#     X1 = confounders_t.copy()
#     X0 = confounders_t.copy()
#     X1["treatment"] = 1
#     X0["treatment"] = 0
    
#     Q0 = Q_model.predict(X0)
#     Q1 = Q_model.predict(X1)
    
#     day_diffs.append(np.mean(Q1 - Q0))

# list(zip(day_list, day_diffs))

In [6]:
# X1

In [7]:
# NOTE(review): commented-out cell (no-op as saved). For each week dummy column in `week_coef`
# it takes the `wf23` rows where that dummy equals 1 (skipping empty subsets), predicts with
# treatment forced to 1 and to 0, and stores the mean difference Q1 - Q0 for that week.
# Depends on `wf23`, `week_coef` and `Q_model`, none of which is defined by an active cell in
# view — confirm before re-enabling.
# # estimate by week
# weeks_list = []
# week_diffs = []
# for w in week_coef:
#     wf23_week = wf23[wf23[w] == 1]
#     if len(wf23_week) == 0:
#         continue
#     confounders_t = wf23_week[['daynum'] + weather + city_fixed + time_fixed]
    
# (two copies of the same covariates that differ only in the `treatment` column)
#     X1 = confounders_t.copy()
#     X0 = confounders_t.copy()
#     X1["treatment"] = 1
#     X0["treatment"] = 0
    
#     Q0 = Q_model.predict(X0)
#     Q1 = Q_model.predict(X1)
#     weeks_list.append(w)
#     week_diffs.append(np.mean(Q1 - Q0))
    
# list(zip(weeks_list, week_diffs))

In [8]:
# day_16 = day[3]

In [9]:
# day_16

## Model fit

#### Single period

In [10]:
# Day numbers of interest. The evaluation cells below pass 8436 as a literal rather than
# reading it from this list.
days = [8436, 8426]

# Parallel lists: entry i of each list belongs to the same candidate configuration.
# Q_* entries are handed to the helpers as Q_model_class / Q_model_params,
# g_* entries as g_model_class / g_model_params.
Q_models = [
    LinearRegression,
    RandomForestRegressor,  # max_depth=10 variant
    RandomForestRegressor,  # max_depth=3 variant
    XGBRegressor,
]
Q_params = [
    dict(),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=3),
    dict(n_jobs=1, objective='reg:squarederror'),
]

g_models = [
    LogisticRegression,
    RandomForestClassifier,  # max_depth=10 variant
    RandomForestClassifier,  # max_depth=3 variant
    XGBClassifier,
]
g_params = [
    dict(max_iter=1000),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=3),
    dict(
        use_label_encoder=False,
        n_jobs=1,
        objective='binary:logistic',
        eval_metric='logloss',
    ),
]

In [11]:
# Build the working frame. make_wf2020 is not defined in this notebook; presumably it is
# provided by one of the `utils` star-imports — confirm.
wf2020 = make_wf2020()

# Column groups are selected purely by substring match on the column name
# (presumably the city and day fixed-effect dummies — confirm against make_wf2020).
city_fixed = [name for name in wf2020.columns if 'cities' in name]
time_fixed = [name for name in wf2020.columns if 'days' in name]

# 'treat' first, followed by the city columns and then the time columns.
fixed = ['treat'] + city_fixed + time_fixed

# Weather covariates used as confounders in the evaluation cells.
weather = ['prec', 'snow', 'temp', 'temp2']

In [15]:
# Outcome columns to evaluate; also reused by the multiple-period cell.
out_vars = ['aqi', 'l_aqi', 'pm', 'l_pm']

# Silence pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'

# Single-period check: for every outcome, run each (Q, g) configuration with
# treat_day=8436 and weather-only confounders, then print the test metrics
# returned by test_single_models next to their baselines.
single_period_grid = list(zip(Q_models, Q_params, g_models, g_params))
for out_var in out_vars:
    for Q_cls, Q_kwargs, g_cls, g_kwargs in single_period_grid:
        print("======= ", out_var, "=======")
        print("Day: %d \n Q: %s \n g:%s" % (8436, Q_cls, g_cls))
        Q_mse, g_ce, base_mse, base_ce = test_single_models(
            wf2020,
            treat_day=8436,
            outcome_var=out_var,
            confounder_list=weather,
            Q_model_class=Q_cls,
            Q_model_params=Q_kwargs,
            g_model_class=g_cls,
            g_model_params=g_kwargs,
        )
        print(f"Test MSE of Q model {Q_mse} (baseline: {base_mse})")
        print(f"Test CE of g model {g_ce} (baseline: {base_ce})")

Day: 8436 
 Q: <class 'sklearn.linear_model._base.LinearRegression'> 
 g:<class 'sklearn.linear_model._logistic.LogisticRegression'>
Test MSE of Q model 928.4545363093672 (baseline: 991.8240212613339)
Test CE of g model 0.3006517506211365 (baseline: 0.3033059674459485)
Day: 8436 
 Q: <class 'sklearn.ensemble._forest.RandomForestRegressor'> 
 g:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test MSE of Q model 666.0205329913273 (baseline: 991.8240212613339)
Test CE of g model 0.4548427116822693 (baseline: 0.3033059674459485)
Day: 8436 
 Q: <class 'sklearn.ensemble._forest.RandomForestRegressor'> 
 g:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test MSE of Q model 717.1066678677829 (baseline: 991.8240212613339)
Test CE of g model 0.23440256243323235 (baseline: 0.3033059674459485)
Day: 8436 
 Q: <class 'xgboost.sklearn.XGBRegressor'> 
 g:<class 'xgboost.sklearn.XGBClassifier'>
Test MSE of Q model 894.9422331988848 (baseline: 991.8240212613339)
Test CE of g model 0.

#### Multiple period

In [13]:
# Multiple-period check: same (Q, g) grid as the single-period cell, but the confounders
# are daynum + weather + the city and time column groups, and no treat_day is passed.
# NOTE(review): the "Day: 8436" label printed below does not correspond to any argument of
# test_multi_models — it looks carried over from the single-period cell; confirm or drop it.
multi_confounders = ['daynum'] + weather + city_fixed + time_fixed
multi_period_grid = list(zip(Q_models, Q_params, g_models, g_params))
for out_var in out_vars:
    for Q_cls, Q_kwargs, g_cls, g_kwargs in multi_period_grid:
        print("======= ", out_var, "=======")
        print("Day: %d \n Q: %s \n g:%s" % (8436, Q_cls, g_cls))
        Q_mse, g_ce, base_mse, base_ce = test_multi_models(
            wf2020,
            outcome_var=out_var,
            # fresh list per call, as in the inline expression it replaces
            confounder_list=list(multi_confounders),
            Q_model_class=Q_cls,
            Q_model_params=Q_kwargs,
            g_model_class=g_cls,
            g_model_params=g_kwargs,
        )
        print(f"Test MSE of Q model {Q_mse} (baseline: {base_mse})")
        print(f"Test CE of g model {g_ce} (baseline: {base_ce})")

Day: 8436 
 Q: <class 'sklearn.linear_model._base.LinearRegression'> 
 g:<class 'sklearn.linear_model._logistic.LogisticRegression'>
Test MSE of Q model 1126.6455008720734 (baseline: 1155.8062584179363)
Test CE of g model 0.023324756371169552 (baseline: 0.030056788589195416)
Day: 8436 
 Q: <class 'sklearn.ensemble._forest.RandomForestRegressor'> 
 g:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test MSE of Q model 937.1832603330149 (baseline: 1155.8062584179363)
Test CE of g model 0.02350721939805143 (baseline: 0.030056788589195416)
Day: 8436 
 Q: <class 'sklearn.ensemble._forest.RandomForestRegressor'> 
 g:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test MSE of Q model 1066.4395159647668 (baseline: 1155.8062584179363)
Test CE of g model 0.027135442661260157 (baseline: 0.030056788589195416)
Day: 8436 
 Q: <class 'xgboost.sklearn.XGBRegressor'> 
 g:<class 'xgboost.sklearn.XGBClassifier'>
Test MSE of Q model 885.8037026742313 (baseline: 1155.8062584179363)
Test 