In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf
# from linearmodels.panel import PanelOLS

In [2]:
wf = pd.read_csv("data/wf.csv")
city_yb = pd.read_csv("data/city_yb.csv")

## 2020 treatment vs control

In [64]:
wf = pd.read_csv("data/wf.csv")
city_yb = pd.read_csv("data/city_yb.csv")

wf["temp2"] = wf["temp"] ** 2
wf["l_aqi"] = np.log(1 + wf["aqi"])
wf["l_pm"] = np.log(1 + wf["pm"])
wf2020 = wf[(wf["daynum"] >= 8401) & (wf["daynum"]<= 8461)].dropna(
    subset = ['aqi', 'pm']
)
wf2020['cities'] = wf2020['city_code'].astype('category')
wf2020['days'] = wf2020['daynum'].astype('category')
wf2020["t_sum"] = wf2020.groupby("city_code")["treat"].transform('sum')
wf2020 = pd.get_dummies(wf2020, drop_first=True)

In [65]:
fixed = ['treat']
for col in wf2020.columns:
    if 'cities' in col or 'days' in col:
        fixed.append(col)
        
weather = ['prec', 'snow', 'temp', 'temp2']

In [66]:
out = ["aqi", "l_aqi", "pm", "l_pm"]


for Yname in out:
    Y = wf2020[Yname]
    X = wf2020[fixed + weather]
    lr = LinearRegression()
    lr.fit(X, Y)
    print(lr.coef_[0])

-19.844075932282255
-0.1741297114238865
-14.069969671861704
-0.1678396393548723


Parallel trends

In [67]:
# get first (from dML nb)
treated = wf2020[wf2020['treat'] == 1]
treated = treated[['daynum', 'city_code']].groupby('city_code')
first = treated.apply(lambda x: x.sort_values(by = 'daynum', ascending=True).head(1))
day, count = np.unique(first.daynum, return_counts = True)
treat_day = day[count == max(count)][0]
first = {city:day for day, city in first.values}
wf2020 = wf2020.assign(first = [first.get(city, 0) for city in wf2020['city_code']])

# create week coefficient 
wf2020["week_coef"] = np.floor((wf2020["daynum"] - wf2020["first"])/7).astype(int)
# set -1 lead and untreated to NaN so they don't get week0 dummy
wf2020["week_coef"] = np.where((wf2020["week_coef"] == -1), np.NaN, wf2020["week_coef"])
wf2020["week_coef"][wf2020["first"] == 0] = np.NaN
wf2020["week_coef"] = wf2020["week_coef"].astype('category')
wf2020 = pd.get_dummies(wf2020)

week_coef = []
for col in wf2020.columns:
    if 'week_coef' in col:
        week_coef.append(col)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [73]:
for Yname in out:
    Y = wf2020[Yname]
    X = wf2020[fixed + weather + week_coef]
    fit = sm.OLS(Y, sm.add_constant(X)).fit()
    print(Yname)
    print(*list(zip([index for index in fit.params.index if 'week_coef' in index],
                    fit.params[[index for index in fit.params.index if 'week_coef' in index]], 
                   2*fit.bse[[index for index in fit.params.index if 'week_coef' in index]])), sep="\n")

aqi
('week_coef_-8.0', -18.00627161327473, 31.936753228023296)
('week_coef_-7.0', -8.324787978437236, 29.854750815608643)
('week_coef_-6.0', -3.7125282430360578, 12.536586789189553)
('week_coef_-5.0', -2.0338783923834565, 5.125846933467938)
('week_coef_-4.0', -8.130041800167035, 4.597603077051213)
('week_coef_-3.0', -3.0329251380830984, 4.284752755667526)
('week_coef_-2.0', 2.5201194790414254, 4.230233635802477)
('week_coef_0.0', 10.87644562178328, 3.313219405271911)
('week_coef_1.0', 6.424389979049763, 3.306806586103549)
('week_coef_2.0', -6.614663658277415, 3.259649912037807)
('week_coef_3.0', -8.566434277435519, 3.4133199746308076)
('week_coef_4.0', -9.553382655955549, 4.859642280452933)
('week_coef_5.0', -15.058933532716313, 9.274440038108484)
l_aqi
('week_coef_-8.0', -0.15519707161095148, 0.3344370672631494)
('week_coef_-7.0', -0.09956446639052036, 0.3126346386984391)
('week_coef_-6.0', 0.005779541005906941, 0.13128132623035588)
('week_coef_-5.0', -0.023801479685331486, 0.05367712

In [78]:
np.unique(np.floor((wf2020["daynum"] - wf2020["first"])/7).astype(int))
np.unique(wf2020["daynum"] - wf2020["first"])

array([ -55,  -54,  -53,  -52,  -51,  -50,  -49,  -48,  -47,  -46,  -45,
        -44,  -43,  -42,  -41,  -40,  -39,  -38,  -37,  -36,  -35,  -34,
        -33,  -32,  -31,  -30,  -29,  -28,  -27,  -26,  -25,  -24,  -23,
        -22,  -21,  -20,  -19,  -18,  -17,  -16,  -15,  -14,  -13,  -12,
        -11,  -10,   -9,   -8,   -7,   -6,   -5,   -4,   -3,   -2,   -1,
          0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
         33,   34,   35,   36,   37,   38, 8401, 8402, 8403, 8404, 8405,
       8406, 8407, 8408, 8409, 8410, 8411, 8412, 8413, 8414, 8415, 8416,
       8417, 8418, 8419, 8420, 8421, 8422, 8423, 8424, 8425, 8426, 8427,
       8428, 8429, 8430, 8431, 8432, 8433, 8434, 8435, 8436, 8437, 8438,
       8439, 8440, 8441, 8442, 8443, 8444, 8445, 8446, 8447, 8448, 8449,
       8450, 8451, 8452, 8453, 8454, 8455, 8456, 84

In [None]:
# check E[Y_{t+1} - Y_{t} | A=1, X] - E[Y_{t+1} - Y_{t} | A=0, X] = 0 



## Spring festival 2020 vs 2019

In [75]:
wf_spring = wf[
    ((wf["daynum"] >= 8401) & (wf["daynum"]<= 8461)) |
    ((wf["daynum"] >= 8047) & (wf["daynum"]<= 8107))].dropna(
    subset = ['aqi', 'pm']
)
wf_spring["daynum_i"] = wf_spring.apply(lambda x : x["daynum"] + 354 if x["year"] == 2019 else x["daynum"], axis=1)
wf_spring["treat_SF1"] = ((wf_spring["daynum_i"] >= 8425) & (wf_spring["year"] == 2020)).astype("int")

wf_spring['cities'] = wf_spring['city_code'].astype('category')
wf_spring['days_i'] = wf_spring['daynum_i'].astype('category')
wf_spring['year_ind'] = wf_spring['year'].astype('category')
wf_spring['t_group'] = pd.notna(wf_spring["t_asign"])
wf_spring = pd.get_dummies(wf_spring, drop_first=True)


fixed = ['treat_SF1']
for col in wf_spring.columns:
    if 'cities' in col or 'days_i' in col or 'year_ind' in col:
        fixed.append(col)

In [76]:
for Yname in out:
    Y = wf_spring[wf_spring["t_group"] == 0][Yname]
    X = wf_spring[wf_spring["t_group"] == 0][fixed + weather]
    lr = LinearRegression()
    lr.fit(X, Y)
    print(lr.coef_[0])

-6.369863682777258
-0.0484466255348826
-7.090465831937396
-0.07072380133669583
