In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os

from xgboost import XGBClassifier, XGBRegressor

from utils.double_ml import *
from utils.analysis import *

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
RANDOM_SEED = 42

### Import data

In [3]:
wf2020 = make_wf2020()

fixed = ['treat']
city_fixed = []
time_fixed = []
for col in wf2020.columns:
    if 'cities' in col:
        city_fixed.append(col)
    if 'days' in col:
        time_fixed.append(col)
fixed = fixed + city_fixed + time_fixed
        
weather = ['prec', 'snow', 'temp', 'temp2']
city_economic = ['pop_city', 'sec_city', 'gdp_city' , 'pgdp_city', 'firm_city']
city_environmental = ['gonglu', 'emit_ww', 'emit_so1', 'emi_dust1']
out = ["aqi", "l_aqi", "pm", "l_pm"]

In [4]:
day, count, num_cities = get_day_count(wf2020)
treat_day = day[count == max(count)][0]

### Single time period

In [6]:
tau_hat, std_hat = single_period_estimate(wf2020, treat_day=treat_day, outcome_var='aqi', 
                                          confounder_list=weather + city_fixed + time_fixed,
                                          Q_model_class=LinearRegression, Q_model_params={},
                                          g_model_class=LogisticRegression, g_model_params={'max_iter':1000})

print(f"The estimate is {tau_hat} pm {1.96*std_hat}")

The estimate is -19.861195037038506 pm 11.896965012736917


In [7]:
Q_models = [LinearRegression, RandomForestRegressor, XGBRegressor]
Q_params = [{}, 
            {'random_state': RANDOM_SEED,
                            'n_estimators': 100,
                            'max_depth': 10},
            {'n_jobs': 1,
             'objective': 'reg:squarederror'},
           ]
g_models = [LogisticRegression, RandomForestClassifier, XGBClassifier]
g_params = [{'max_iter':1000}, 
           {'random_state': RANDOM_SEED,
                            'n_estimators': 100,
                            'max_depth': 10},
           {'use_label_encoder': False, 
            'n_jobs': 1, 
            'objective': 'binary:logistic', 
            'eval_metric': 'logloss'}]

In [10]:
tau_list_single = []
std_list_single = []
Q_model_list_single = []
g_model_list_single = []
out_var_list_single = []

for out_var in out:
    for i in range(0, len(Q_models)):
        print("======= ", out_var, "=======")
        print("Day: %d \n Q: %s \n g:%s" % (8436, get_model_string(Q_models[i], Q_params[i]), 
                                            get_model_string(g_models[i], g_params[i])))
        tau_hat, std_hat = single_period_estimate(wf2020, treat_day=treat_day, outcome_var=out_var, 
                                              confounder_list=weather + city_fixed + time_fixed,
                                              Q_model_class=Q_models[i], Q_model_params=Q_params[i],
                                              g_model_class=g_models[i], g_model_params=g_params[i])
        tau_list_single.append(tau_hat)
        std_list_single.append(std_hat)
        Q_model_list_single.append(get_model_string(Q_models[i], Q_params[i]))
        g_model_list_single.append(get_model_string(g_models[i], g_params[i]))
        out_var_list_single.append(get_var_string(out_var))
        print(f"The estimate is {tau_hat} pm {1.96*std_hat}")
    

Day: 8436 
 Q: Linear Regression 
 g:Logistic Regression
The estimate is -19.861666562977337 pm 11.897241514686366
Day: 8436 
 Q: Random Forest Regressor (depth 10) 
 g:Random Forest Classifier (depth 10)
The estimate is -5.865090357355205 pm 7.712447687592622
Day: 8436 
 Q: XGBoost Regressor 
 g:XGBoost Classifier
The estimate is -247.48243236318115 pm 479.468065777151
Day: 8436 
 Q: Linear Regression 
 g:Logistic Regression
The estimate is -0.2980896249543357 pm 0.3213418520324933
Day: 8436 
 Q: Random Forest Regressor (depth 10) 
 g:Random Forest Classifier (depth 10)
The estimate is -0.023244578186134804 pm 0.062019886152666084
Day: 8436 
 Q: XGBoost Regressor 
 g:XGBoost Classifier
The estimate is -2.3164349051622914 pm 4.489472695727653
Day: 8436 
 Q: Linear Regression 
 g:Logistic Regression
The estimate is -19.05789212087119 pm 17.856075594549864
Day: 8436 
 Q: Random Forest Regressor (depth 10) 
 g:Random Forest Classifier (depth 10)
The estimate is -2.6808465270370956 pm 5.59

In [11]:
df = pd.DataFrame({'outcome': out_var_list_single, 'outcome model': Q_model_list_single, 'treatment model': g_model_list_single,
             'estimate': np.round(tau_list_single, 2), 'p/m': np.round(1.96*std_list_single, 2)})
df

TypeError: can't multiply sequence by non-int of type 'float'

In [20]:
std_list_single*1.96

TypeError: can't multiply sequence by non-int of type 'float'

### Multiple time periods

In [None]:
wf2020 = make_wf2020()
tau_hat, std_hat = multi_period_estimate(wf2020, outcome_var='aqi', 
                                         confounder_list= ['daynum'] + weather + city_fixed + time_fixed,
                                         Q_model_class=RandomForestRegressor, Q_model_params={
                                             'random_state': RANDOM_SEED,
                                             'n_estimators': 100,
                                             'max_depth': 10
                                         },
                                         g_model_class=RandomForestClassifier, g_model_params={
                                             'random_state': RANDOM_SEED,
                                             'n_estimators': 100,
                                             'max_depth': 3
                                         }
                                         )
print('%0.3f pm %0.3f' % (tau_hat, std_hat))