In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os

from xgboost import XGBClassifier, XGBRegressor

from utils.double_ml import *
from utils.analysis import *

# Turn off pandas' chained-assignment check for the whole notebook
# (setting the option to None disables the warning entirely).
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Seed handed to the random-forest models as `random_state` further down the notebook.
RANDOM_SEED = 42

### Import data

In [3]:
# Build the working dataset through the project helper (with `city_var=True`).
wf2020 = make_wf2020(city_var=True)

# Group columns by substring match on their names: anything containing
# 'cities' goes to `city_fixed`, anything containing 'days' to `time_fixed`.
city_fixed = [name for name in wf2020.columns if 'cities' in name]
time_fixed = [name for name in wf2020.columns if 'days' in name]
fixed = ['treat', *city_fixed, *time_fixed]

# Column groups used as confounders in the estimates below.
weather = ['prec', 'snow', 'temp', 'temp2']
city_economic = ['pop_city', 'gdp_city', 'firm_city']
city_environmental = ['gonglu', 'emit_ww', 'emit_so1', 'emi_dust1']

# Outcome columns that the estimation loop iterates over.
out = ["aqi", "l_aqi", "pm", "l_pm"]

# Pick the treatment day: the first entry of `day` whose `count` equals the
# largest count.
day, count, num_cities = get_day_count(wf2020)
peak_count = max(count)
treat_day = day[count == peak_count][0]

### Single time period

In [4]:
# Baseline single-period estimate for 'aqi' on the selected treatment day,
# using linear regression as the outcome (Q) model and logistic regression
# as the treatment (g) model.
tau_hat, std_hat = single_period_estimate(
    wf2020,
    treat_day=treat_day,
    outcome_var='aqi',
    confounder_list=weather + city_economic + city_environmental,
    Q_model_class=LinearRegression,
    Q_model_params={},
    g_model_class=LogisticRegression,
    g_model_params={'max_iter': 1000},
)

# Report the estimate together with 1.96 * std_hat as the +/- half-width.
print(f"The estimate is {tau_hat} pm {1.96*std_hat}")

The estimate is -9.551428876892755 pm 15.032528485892538


In [5]:
# Candidate outcome (Q) and treatment (g) model classes, each with the
# parameter dict that is passed alongside it. Entries are paired by position:
# index i of Q_models/Q_params is used together with index i of
# g_models/g_params.
Q_models = [LinearRegression, RandomForestRegressor, XGBRegressor]
Q_params = [
    dict(),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(n_jobs=1, objective='reg:squarederror'),
]

g_models = [LogisticRegression, RandomForestClassifier, XGBClassifier]
g_params = [
    dict(max_iter=1000),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(
        use_label_encoder=False,
        n_jobs=1,
        objective='binary:logistic',
        eval_metric='logloss',
    ),
]

In [6]:
# Display the treatment day selected in the data-loading cell.
treat_day

8436

In [7]:
# Run the single-period estimator for every outcome in `out` under each
# (Q model, g model) pairing. Results are accumulated in parallel lists,
# one entry per run, which the summary table cell reads afterwards.
tau_list_single = []
std_list_single = []
Q_model_list_single = []
g_model_list_single = []
out_var_list_single = []

# The confounder set is the same for every run, so build it once.
single_period_confounders = weather + city_economic + city_environmental

for out_var in out:
    # Models are paired by position across the four configuration lists.
    for Q_model, Q_model_kwargs, g_model, g_model_kwargs in zip(Q_models, Q_params, g_models, g_params):
        # Human-readable labels, computed once and reused for logging and the table.
        Q_label = get_model_string(Q_model, Q_model_kwargs)
        g_label = get_model_string(g_model, g_model_kwargs)

        print("======= ", out_var, "=======")
        # Log the day that is actually passed to the estimator; a hard-coded
        # literal here would silently go stale if `treat_day` changes.
        print("Day: %d \n Q: %s \n g:%s" % (treat_day, Q_label, g_label))

        tau_hat, std_hat = single_period_estimate(
            wf2020,
            treat_day=treat_day,
            outcome_var=out_var,
            confounder_list=single_period_confounders,
            Q_model_class=Q_model,
            Q_model_params=Q_model_kwargs,
            g_model_class=g_model,
            g_model_params=g_model_kwargs,
        )

        tau_list_single.append(tau_hat)
        std_list_single.append(std_hat)
        Q_model_list_single.append(Q_label)
        g_model_list_single.append(g_label)
        out_var_list_single.append(get_var_string(out_var))
        # Point estimate with 1.96 * std_hat as the +/- half-width.
        print(f"The estimate is {tau_hat} pm {1.96*std_hat}")
    

Day: 8436 
 Q: Lin. Reg 
 g:Log. Reg
The estimate is -9.551428876892755 pm 15.032528485892538
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is 0.6659278709882487 pm 10.229994668025448
Day: 8436 
 Q: XGB 
 g:XGB
The estimate is -3.9413818567367227 pm 34.2995351776955
Day: 8436 
 Q: Lin. Reg 
 g:Log. Reg
The estimate is -0.09543847633702815 pm 0.14881548370126665
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is 0.020664216765638545 pm 0.07984004457299317
Day: 8436 
 Q: XGB 
 g:XGB
The estimate is -0.0017200317839799043 pm 0.33411013446082827
Day: 8436 
 Q: Lin. Reg 
 g:Log. Reg
The estimate is -7.351327742277995 pm 12.235188471643678
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The estimate is 0.7626146302975149 pm 8.831497300865946
Day: 8436 
 Q: XGB 
 g:XGB
The estimate is -10.860707863219202 pm 38.14069448936063
Day: 8436 
 Q: Lin. Reg 
 g:Log. Reg
The estimate is -0.11246352126584046 pm 0.16615777393608616
Day: 8436 
 Q: RF (depth 10) 
 g:RF(depth 10)
The e

In [8]:
# Summary table with one row per run of the single-period loop: the point
# estimate and the 1.96 * std half-width, both rounded to two decimals.
estimates = np.round(tau_list_single, 2)
half_widths = np.round([1.96 * s for s in std_list_single], 2)

df = pd.DataFrame({
    'outcome': out_var_list_single,
    'outcome model': Q_model_list_single,
    'treatment model': g_model_list_single,
    'estimate': estimates,
    'p/m': half_widths,
})
df

Unnamed: 0,outcome,outcome model,treatment model,estimate,p/m
0,AQI,Lin. Reg,Log. Reg,-9.55,15.03
1,AQI,RF (depth 10),RF(depth 10),0.67,10.23
2,AQI,XGB,XGB,-3.94,34.3
3,log AQI,Lin. Reg,Log. Reg,-0.1,0.15
4,log AQI,RF (depth 10),RF(depth 10),0.02,0.08
5,log AQI,XGB,XGB,-0.0,0.33
6,PM,Lin. Reg,Log. Reg,-7.35,12.24
7,PM,RF (depth 10),RF(depth 10),0.76,8.83
8,PM,XGB,XGB,-10.86,38.14
9,log PM,Lin. Reg,Log. Reg,-0.11,0.17


### Multiple time periods

In [None]:
# NOTE: this rebinds `wf2020`, which the data-loading cell built with
# `city_var=True`, to the helper's default output. Re-running earlier cells
# after this one without reloading will use this version of the data.
wf2020 = make_wf2020()

# Both models use the same random-forest settings; each call receives
# its own copy of the dict.
forest_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': 100,
    'max_depth': 10,
}

tau_hat, std_hat = multi_period_estimate(
    wf2020,
    outcome_var='aqi',
    confounder_list=['daynum'] + weather + city_economic + city_environmental,
    Q_model_class=RandomForestRegressor,
    Q_model_params=dict(forest_params),
    g_model_class=RandomForestClassifier,
    g_model_params=dict(forest_params),
)

# Report 1.96 * std_hat after "pm", matching how every single-period result
# in this notebook is reported (previously the raw std_hat was printed here).
# NOTE(review): assumes std_hat is a standard error, as in the single-period case — confirm.
print('%0.3f pm %0.3f' % (tau_hat, 1.96 * std_hat))