In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import statsmodels.api as sm
import os
import pathlib

from xgboost import XGBClassifier, XGBRegressor

from austen_plots.AustenPlot import AustenPlot

from utils.double_ml import *
from utils.analysis import *

# Turn off pandas' chained-assignment check for the whole notebook
# (the option's default value is 'warn').
pd.set_option('mode.chained_assignment', None)

In [2]:
# Shared seed: passed as `random_state` in the random-forest and XGBoost
# parameter dicts defined in the model-configuration cell below.
RANDOM_SEED = 42

In [3]:
# import data

wf2020 = make_wf2020(city_var=True)

# Columns are grouped by substring match on their names: anything containing
# 'cities' or 'days' is collected (presumably city / day fixed-effect
# dummies -- confirm against make_wf2020).
city_fixed = [name for name in wf2020.columns if 'cities' in name]
time_fixed = [name for name in wf2020.columns if 'days' in name]

# 'treat' first, then the city columns, then the day columns.
fixed = ['treat', *city_fixed, *time_fixed]

# Covariate groups; the evaluation cells concatenate these into the
# confounder list handed to the model tests.
weather = ['prec', 'snow', 'temp', 'temp2']
city_economic = ['pop_city', 'gdp_city', 'firm_city']
city_environmental = ['gonglu', 'emit_ww', 'emit_so1', 'emi_dust1']

# Outcome columns that every model pair is evaluated on.
out_vars = ['aqi', 'pm']

In [4]:
# define models to test
#
# The four lists are parallel: position i of Q_models/Q_params is always used
# together with position i of g_models/g_params by the evaluation loops, so
# keep them the same length and in the same order.

# Outcome (Q) models: regressors.
Q_models = [
    LinearRegression,
    RandomForestRegressor,
    RandomForestRegressor,
    XGBRegressor,
]
Q_params = [
    dict(),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=3),
    dict(n_jobs=1, objective='reg:squarederror', random_state=RANDOM_SEED),
]

# Treatment (g) models: classifiers, mirroring the Q models entry by entry.
g_models = [
    LogisticRegression,
    RandomForestClassifier,
    RandomForestClassifier,
    XGBClassifier,
]
g_params = [
    dict(max_iter=1000),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=10),
    dict(random_state=RANDOM_SEED, n_estimators=100, max_depth=3),
    dict(
        use_label_encoder=False,
        n_jobs=1,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=RANDOM_SEED,
    ),
]

### Single time period

In [5]:
# Result accumulators for the single-period runs. They are parallel lists:
# one entry is appended to each per (outcome, model pair) combination.
out_var_list, Q_model_types, g_model_types = [], [], []
Q_mses, g_ces = [], []
mse_baselines, ce_baselines = [], []

for out_var in out_vars:
    print("Running on ", out_var, "...")
    # Walk the Q/g model configurations in lockstep (same position in each list).
    for Q_cls, Q_kwargs, g_cls, g_kwargs in zip(Q_models, Q_params, g_models, g_params):
        # NOTE(review): treat_day=8436 is a hard-coded day number -- presumably
        # the treatment date in the dataset's day encoding; confirm in utils.
        Q_mse, g_ce, base_mse, base_ce = test_single_models(
            wf2020,
            treat_day=8436,
            outcome_var=out_var,
            confounder_list=weather + city_economic + city_environmental,
            Q_model_class=Q_cls,
            Q_model_params=Q_kwargs,
            g_model_class=g_cls,
            g_model_params=g_kwargs,
            only_treated=True,
        )

        # Human-readable labels for the summary table.
        out_var_list.append(get_var_string(out_var))
        Q_model_types.append(get_model_string(Q_cls, Q_kwargs))
        g_model_types.append(get_model_string(g_cls, g_kwargs))

        # Scores for the fitted models and their baselines, in the order the
        # helper returned them.
        Q_mses.append(Q_mse)
        g_ces.append(g_ce)
        mse_baselines.append(base_mse)
        ce_baselines.append(base_ce)

Running on  aqi ...
Running on  pm ...


In [6]:
# Summary table for the single-period runs. Metric columns are rounded to two
# decimals; the 'model' column is labelled from Q_model_types only
# (g_model_types is collected by the loop above but not shown here).
df = pd.DataFrame({
    'outcome': out_var_list,
    'model': Q_model_types,
    **{
        column: np.round(values, 2)
        for column, values in (
            ('Q mse', Q_mses),
            ('Q baseline', mse_baselines),
            ('g ce', g_ces),
            ('g baseline', ce_baselines),
        )
    },
})
df

Unnamed: 0,outcome,model,Q mse,Q baseline,g ce,g baseline
0,AQI,Lin./Log. Reg,405.86,481.31,0.45,0.36
1,AQI,RF (depth 10),218.25,481.31,0.23,0.36
2,AQI,RF (depth 3),230.94,481.31,0.26,0.36
3,AQI,XGBoost,291.48,481.31,0.26,0.36
4,PM,Lin./Log. Reg,274.01,359.38,0.45,0.36
5,PM,RF (depth 10),213.45,359.38,0.23,0.36
6,PM,RF (depth 3),207.08,359.38,0.26,0.36
7,PM,XGBoost,247.15,359.38,0.26,0.36


### Multiple time periods

#### All samples

In [7]:
# Result accumulators for the multi-period runs on all samples
# (only_treated=False). Parallel lists: one entry each per
# (outcome, model pair) combination.
out_var_list_multi, Q_model_types_multi, g_model_types_multi = [], [], []
Q_mses_multi, g_ces_multi = [], []
mse_baselines_multi, ce_baselines_multi = [], []

for out_var in out_vars:
    print("Running on ", out_var, "...")
    # Walk the Q/g model configurations in lockstep (same position in each list).
    for Q_cls, Q_kwargs, g_cls, g_kwargs in zip(Q_models, Q_params, g_models, g_params):
        # Unlike the single-period run, 'daynum' is added to the confounders.
        Q_mse, g_ce, base_mse, base_ce = test_multi_models(
            wf2020,
            outcome_var=out_var,
            confounder_list=['daynum'] + weather + city_economic + city_environmental,
            Q_model_class=Q_cls,
            Q_model_params=Q_kwargs,
            g_model_class=g_cls,
            g_model_params=g_kwargs,
            only_treated=False,
        )

        # Human-readable labels for the summary table.
        out_var_list_multi.append(get_var_string(out_var))
        Q_model_types_multi.append(get_model_string(Q_cls, Q_kwargs))
        g_model_types_multi.append(get_model_string(g_cls, g_kwargs))

        # Scores for the fitted models and their baselines, in the order the
        # helper returned them.
        Q_mses_multi.append(Q_mse)
        g_ces_multi.append(g_ce)
        mse_baselines_multi.append(base_mse)
        ce_baselines_multi.append(base_ce)

Running on  aqi ...
Running on  pm ...


In [8]:
# Summary table for the multi-period runs collected in the `*_multi` lists.
# Metric columns are rounded to two decimals; the 'model' column is labelled
# from Q_model_types_multi only.
df_multi = pd.DataFrame({
    'outcome': out_var_list_multi,
    'model': Q_model_types_multi,
    **{
        column: np.round(values, 2)
        for column, values in (
            ('Q mse', Q_mses_multi),
            ('Q baseline', mse_baselines_multi),
            ('g ce', g_ces_multi),
            ('g baseline', ce_baselines_multi),
        )
    },
})
df_multi

Unnamed: 0,outcome,model,Q mse,Q baseline,g ce,g baseline
0,AQI,Lin./Log. Reg,1246.1,1267.46,0.04,0.03
1,AQI,RF (depth 10),990.59,1267.46,0.03,0.03
2,AQI,RF (depth 3),1177.31,1267.46,0.03,0.03
3,AQI,XGBoost,1049.65,1267.46,0.03,0.03
4,PM,Lin./Log. Reg,690.8,702.45,0.04,0.03
5,PM,RF (depth 10),517.46,702.45,0.03,0.03
6,PM,RF (depth 3),650.15,702.45,0.03,0.03
7,PM,XGBoost,533.26,702.45,0.03,0.03


#### Only treated

In [9]:
# Result accumulators for the multi-period runs restricted via
# only_treated=True.
# NOTE(review): these `*_multi` names are the same ones the all-samples cell
# fills, so running this cell discards that run's collected results.
out_var_list_multi, Q_model_types_multi, g_model_types_multi = [], [], []
Q_mses_multi, g_ces_multi = [], []
mse_baselines_multi, ce_baselines_multi = [], []

for out_var in out_vars:
    print("Running on ", out_var, "...")
    # Walk the Q/g model configurations in lockstep (same position in each list).
    for Q_cls, Q_kwargs, g_cls, g_kwargs in zip(Q_models, Q_params, g_models, g_params):
        # Same confounders as the all-samples run; only `only_treated` differs.
        Q_mse, g_ce, base_mse, base_ce = test_multi_models(
            wf2020,
            outcome_var=out_var,
            confounder_list=['daynum'] + weather + city_economic + city_environmental,
            Q_model_class=Q_cls,
            Q_model_params=Q_kwargs,
            g_model_class=g_cls,
            g_model_params=g_kwargs,
            only_treated=True,
        )

        # Human-readable labels for the summary table.
        out_var_list_multi.append(get_var_string(out_var))
        Q_model_types_multi.append(get_model_string(Q_cls, Q_kwargs))
        g_model_types_multi.append(get_model_string(g_cls, g_kwargs))

        # Scores for the fitted models and their baselines, in the order the
        # helper returned them.
        Q_mses_multi.append(Q_mse)
        g_ces_multi.append(g_ce)
        mse_baselines_multi.append(base_mse)
        ce_baselines_multi.append(base_ce)

Running on  aqi ...
Running on  pm ...


In [10]:
# Summary table for whatever the `*_multi` lists currently hold (the
# only-treated run when cells are executed top to bottom).
# NOTE(review): this rebinds `df_multi`, replacing the all-samples table
# created earlier in the notebook.
df_multi = pd.DataFrame({
    'outcome': out_var_list_multi,
    'model': Q_model_types_multi,
    **{
        column: np.round(values, 2)
        for column, values in (
            ('Q mse', Q_mses_multi),
            ('Q baseline', mse_baselines_multi),
            ('g ce', g_ces_multi),
            ('g baseline', ce_baselines_multi),
        )
    },
})
df_multi

Unnamed: 0,outcome,model,Q mse,Q baseline,g ce,g baseline
0,AQI,Lin./Log. Reg,1323.91,1327.08,0.04,0.03
1,AQI,RF (depth 10),1246.48,1327.08,0.03,0.03
2,AQI,RF (depth 3),1320.27,1327.08,0.03,0.03
3,AQI,XGBoost,1469.49,1327.08,0.03,0.03
4,PM,Lin./Log. Reg,494.12,491.21,0.04,0.03
5,PM,RF (depth 10),555.82,491.21,0.03,0.03
6,PM,RF (depth 3),526.51,491.21,0.03,0.03
7,PM,XGBoost,646.77,491.21,0.03,0.03
