In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os

from xgboost import XGBClassifier, XGBRegressor

from utils.double_ml import *
from utils.analysis import *

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Seed passed as `random_state` to the random-forest and XGBoost learners configured
# in this notebook, so that their fits are repeatable across runs.
RANDOM_SEED: int = 42

### Import data

In [3]:
# Build the analysis frame via the project helper; city_var=True is passed through as-is.
wf2020 = make_wf2020(city_var=True)

# Indicator columns are selected purely by substring match on the column name:
# anything containing 'cities' goes to city_fixed, anything containing 'days' to time_fixed.
city_fixed = [name for name in wf2020.columns if 'cities' in name]
time_fixed = [name for name in wf2020.columns if 'days' in name]
fixed = ['treat', *city_fixed, *time_fixed]

# Named covariate groups; the estimation cells concatenate them into confounder lists.
weather = ['prec', 'snow', 'temp', 'temp2']
city_economic = ['pop_city', 'gdp_city', 'firm_city']
city_environmental = ['gonglu', 'emit_ww', 'emit_so1', 'emi_dust1']

# Outcome columns that every estimation loop iterates over.
out = ["aqi", "pm"]

day, count, num_cities = get_day_count(wf2020)
# Keep the first entry of `day` whose count equals the largest count.
treat_day = day[count == max(count)][0]

### Single time period

In [4]:
# Learner classes and constructor kwargs, paired by position: entry i of each *_params
# list is handed to the estimators together with entry i of the matching *_models list.
# Q_* are regressors, g_* are classifiers.
# NOTE(review): presumably Q is the outcome model and g the treatment/propensity model;
# confirm against utils.double_ml.
Q_models = [LinearRegression, RandomForestRegressor, XGBRegressor]
Q_params = [
    dict(),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(n_jobs=1, objective='reg:squarederror', random_state=RANDOM_SEED),
]

g_models = [LogisticRegression, RandomForestClassifier, XGBClassifier]
g_params = [
    dict(max_iter=1000),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(
        use_label_encoder=False,
        n_jobs=1,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=RANDOM_SEED,
    ),
]

In [5]:
# Single-period estimates at `treat_day`: one run per (outcome, learner pair).
# Results are accumulated in parallel lists (same index = same run) that the summary
# table cell reads afterwards.
tau_list_single = []
std_list_single = []
Q_model_list_single = []
g_model_list_single = []
out_var_list_single = []

for out_var in out:
    for i, Q_model in enumerate(Q_models):
        # Build the display labels once and reuse them for logging and bookkeeping.
        Q_label = get_model_string(Q_model, Q_params[i])
        g_label = get_model_string(g_models[i], g_params[i])

        print("======= ", out_var, "=======")
        # Log the day that is actually passed to the estimator below; the previous
        # hard-coded literal (8436) would go stale as soon as the data changed.
        print("Day: %d \n Q: %s \n g:%s" % (treat_day, Q_label, g_label))

        tau_hat, std_hat, _, _ = single_period_estimate(
            wf2020,
            treat_day=treat_day,
            outcome_var=out_var,
            confounder_list=weather + city_economic + city_environmental,
            Q_model_class=Q_model, Q_model_params=Q_params[i],
            g_model_class=g_models[i], g_model_params=g_params[i],
        )

        tau_list_single.append(tau_hat)
        std_list_single.append(std_hat)
        Q_model_list_single.append(Q_label)
        g_model_list_single.append(g_label)
        out_var_list_single.append(get_var_string(out_var))
        # Printed half-width is 1.96 * std_hat.
        print(f"The estimate is {tau_hat} pm {1.96*std_hat}")
    

Day: 8436 
 Q: Lin./Log. Reg 
 g:Lin./Log. Reg
The estimate is -9.551428876892755 pm 15.032528485892538
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is 0.6659278709882487 pm 10.229994668025448
Day: 8436 
 Q: XGBoost 
 g:XGBoost
The estimate is -3.9413818567367227 pm 34.2995351776955
Day: 8436 
 Q: Lin./Log. Reg 
 g:Lin./Log. Reg
The estimate is -7.351327742277995 pm 12.235188471643678
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is 0.7626146302975149 pm 8.831497300865946
Day: 8436 
 Q: XGBoost 
 g:XGBoost
The estimate is -10.860707863219202 pm 38.14069448936063


In [6]:
# Summary of the single-period runs: one row per (outcome, learner), rounded to 2 decimals.
# 'p/m' is the 1.96 * std half-width of each estimate.
df = pd.DataFrame({
    'outcome': out_var_list_single,
    'model': Q_model_list_single,
    'estimate': np.asarray(tau_list_single).round(2),
    'p/m': np.multiply(1.96, std_list_single).round(2),
})
df

Unnamed: 0,outcome,model,estimate,p/m
0,AQI,Lin./Log. Reg,-9.55,15.03
1,AQI,RF (depth 10),0.67,10.23
2,AQI,XGBoost,-3.94,34.3
3,PM,Lin./Log. Reg,-7.35,12.24
4,PM,RF (depth 10),0.76,8.83
5,PM,XGBoost,-10.86,38.14


### Multiple time period estimates

In [7]:
# Multi-period estimates: one run per (outcome, learner pair), with 'daynum' added to the
# confounders. Parallel lists share an index per run; res_dict_list keeps the fifth
# return value of each run in the same order (outcomes outer, learners inner).
tau_list_multi, std_list_multi = [], []
Q_model_list_multi, g_model_list_multi = [], []
out_var_list_multi, res_dict_list = [], []

for out_var in out:
    for idx, Q_cls in enumerate(Q_models):
        Q_label = get_model_string(Q_cls, Q_params[idx])
        g_label = get_model_string(g_models[idx], g_params[idx])

        print("======= ", out_var, "=======")
        print("Q: %s \n g:%s" % (Q_label, g_label))

        tau_hat, std_hat, _, _, res = multi_period_estimate(
            wf2020,
            outcome_var=out_var,
            confounder_list=['daynum'] + weather + city_economic + city_environmental,
            Q_model_class=Q_cls, Q_model_params=Q_params[idx],
            g_model_class=g_models[idx], g_model_params=g_params[idx],
        )

        res_dict_list.append(res)
        tau_list_multi.append(tau_hat)
        std_list_multi.append(std_hat)
        Q_model_list_multi.append(Q_label)
        g_model_list_multi.append(g_label)
        out_var_list_multi.append(get_var_string(out_var))
        print(f"The estimate is {tau_hat} pm {1.96*std_hat}")

Q: Lin./Log. Reg 
 g:Lin./Log. Reg
The estimate is -7.8539821945538755 pm 3.6361586265237147
Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is -8.075933116652816 pm 2.846082321263806
Q: XGBoost 
 g:XGBoost
The estimate is -13.66935321243369 pm 1.2141174783204323
Q: Lin./Log. Reg 
 g:Lin./Log. Reg
The estimate is -5.475960420567455 pm 1.570108021929218
Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is -5.617933599056244 pm 2.0832133254982916
Q: XGBoost 
 g:XGBoost
The estimate is -6.37298426712131 pm 1.8764708401970265


In [8]:
# Summary of the multi-period runs: one row per (outcome, learner), rounded to 2 decimals.
# 'p/m' is the 1.96 * std half-width of each estimate.
df2 = pd.DataFrame({
    'outcome': out_var_list_multi,
    'model': Q_model_list_multi,
    'estimate': np.asarray(tau_list_multi).round(2),
    'p/m': np.multiply(1.96, std_list_multi).round(2),
})
df2

Unnamed: 0,outcome,model,estimate,p/m
0,AQI,Lin./Log. Reg,-7.85,3.64
1,AQI,RF (depth 10),-8.08,2.85
2,AQI,XGBoost,-13.67,1.21
3,PM,Lin./Log. Reg,-5.48,1.57
4,PM,RF (depth 10),-5.62,2.08
5,PM,XGBoost,-6.37,1.88


#### ATT weights for each day

In [44]:
# res_dict_list has one entry per multi-period run, appended with outcomes in the outer
# loop and learners in the inner loop. Select the random-forest runs by computing their
# flat position instead of hard-coding 1 and 4, so the selection stays correct if `out`
# or `Q_models` is reordered or extended.
_n_learners = len(Q_models)
_rf_pos = Q_models.index(RandomForestRegressor)
res_aqi = res_dict_list[out.index("aqi") * _n_learners + _rf_pos]
res_pm = res_dict_list[out.index("pm") * _n_learners + _rf_pos]

In [50]:
# Map each day number to its calendar date directly from the rows of wf2020.
# The previous approach flattened both columns through np.unique and split the sorted
# values in half, which only pairs them correctly when every daynum is smaller than every
# date and both columns have the same number of distinct values.
daynum_to_date = (
    wf2020[['daynum', 'date']]
    .drop_duplicates()
    .set_index('daynum')['date']
    .to_dict()
)

# res_aqi is keyed by day number; element 0 of each value is the estimate, element 1 its std.
days = [daynum_to_date[d] for d in res_aqi]
estimates = [res_aqi[d][0] for d in res_aqi]
stds = [res_aqi[d][1] for d in res_aqi]

# Per-day AQI table; 'p/m' is the 1.96 * std half-width.
df = pd.DataFrame({'day': days, 'estimate': estimates, 'p/m': [1.96*std for std in stds]})
df

Unnamed: 0,day,estimate,p/m
0,20200124,-28.711168,9.444318
1,20200125,42.160968,37.211422
2,20200126,15.619417,20.002507
3,20200131,0.376498,12.608754
4,20200202,-29.45821,7.067823
5,20200203,-54.786224,65.860538
6,20200204,-2.472994,8.172015
7,20200205,-3.228286,5.156871
8,20200206,0.461201,5.782714


In [51]:
# Map each day number to its calendar date directly from the rows of wf2020.
# The previous approach flattened both columns through np.unique and split the sorted
# values in half, which only pairs them correctly when every daynum is smaller than every
# date and both columns have the same number of distinct values.
daynum_to_date = (
    wf2020[['daynum', 'date']]
    .drop_duplicates()
    .set_index('daynum')['date']
    .to_dict()
)

# res_pm is keyed by day number; element 0 of each value is the estimate, element 1 its std.
days = [daynum_to_date[d] for d in res_pm]
estimates = [res_pm[d][0] for d in res_pm]
stds = [res_pm[d][1] for d in res_pm]

# Per-day PM table; 'p/m' is the 1.96 * std half-width.
df = pd.DataFrame({'day': days, 'estimate': estimates, 'p/m': [1.96*std for std in stds]})
df

Unnamed: 0,day,estimate,p/m
0,20200124,-11.136334,6.308814
1,20200125,-19.361203,41.91815
2,20200126,-1.612799,12.658705
3,20200131,8.103762,12.386518
4,20200202,-32.348619,11.684365
5,20200203,-16.059811,25.699202
6,20200204,-5.054692,11.76971
7,20200205,-8.580284,3.208517
8,20200206,1.076037,3.562849
