# 14.32 final paper - step 3, regressions

Start with simple DD, then add complexity.

In [1]:
import pandas as pd
import numpy as np
import microdf as mdf
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from statsmodels.stats.weightstats import DescrStatsW
from stargazer.stargazer import Stargazer

## Data

In [2]:
# Load the CPS extract from its CSV file.
CPS_PATH = 'data/cps.csv'
cps = pd.read_csv(CPS_PATH)

Change bools to numbers for regressions
(do here to save space when storing between notebooks).

In [3]:
# Recode False as 0 so indicator columns can be used as regressors.
# NOTE(review): only False is replaced here; True values are left as they
# are -- confirm that is intended.
cps = cps.replace(False, 0)

In [4]:
DATE_COLS = [
    'cps_start_date',
    'cps_end_date',
    'school_closure_date',
]
# Drop the date columns (if they are needed, convert them to dates instead).
cps = cps.drop(columns=DATE_COLS)

In [5]:
# Prepend an intercept column named 'const' (skipped if one already exists).
cps = sm.add_constant(cps, prepend=True, has_constant='skip')

## Common

In [6]:
# Display the column names of cps, including the 'const' column added above.
cps.columns

Index(['const', 'YEAR', 'MONTH', 'COUNTY', 'w', 'age', 'age2', 'female',
       'married', 'hours', 'POPESTIMATE2019', 'unemp', 'emp', 'lf',
       'emp_of_lf', 'has_k6', 'days_sc', 'post', 'post_has_k6', 'lwe',
       'days_since_2000', 'apr2020', 'apr2020_has_k6', 'days_sc_has_k6',
       'apr2020_has_k6_female', 'apr2020_female', 'has_k6_female',
       'days_sc_has_k6_female', 'days_sc_female'],
      dtype='object')

In [7]:
# Dependent variables; reg_all runs one regression for each of them.
OUTCOMES = [
    'emp',
    'hours',
    'lwe',
]

In [8]:
def reg(df, y, x, cluster=False):
    """Fit a weighted least squares model and return its summary.

    Rows with a missing value in any column the model uses are dropped
    before fitting.

    Args:
        df: DataFrame holding the outcome, the regressors, the weight column
            'w' and the intercept column 'const' (plus 'COUNTY' when
            clustering).
        y: Name of the outcome column.
        x: List of regressor column names; 'const' is appended to them.
        cluster: Whether to compute standard errors clustered on 'COUNTY'.
            Defaults to False.

    Returns:
        The summary of the fitted WLS results.
    """
    regressors = x + ['const']
    needed = x + [y, 'w', 'const']
    if cluster:
        needed = needed + ['COUNTY']
    complete = df[needed].dropna()

    # Plain fit by default; clustered covariance only when requested.
    fit_kwargs = {}
    if cluster:
        fit_kwargs = {
            'cov_type': 'cluster',
            'cov_kwds': {'groups': complete['COUNTY']},
        }

    model = sm.WLS(
        complete[y],
        complete[regressors],
        weights=complete['w'],
        missing='drop',
    )
    return model.fit(**fit_kwargs).summary()

In [9]:
def reg_all(df, x, cluster=False):
    """Print the regression summary for every outcome in OUTCOMES.

    Args:
        df: DataFrame passed through to reg.
        x: List of regressor column names passed through to reg.
        cluster: Passed through to reg. Defaults to False.
    """
    # Lazy generator: each model is fitted right before its summary prints.
    summaries = (reg(df, outcome, x, cluster) for outcome in OUTCOMES)
    for summary in summaries:
        print(summary)

Columns for use in different regressions.

The beta of interest is the first element of `DD_COLS`, `DD_REG_COLS`,
`DDD_COLS`, and `DDD_REG_COLS`.

In [10]:
# Each list starts with its interaction term, followed by the lower-order
# terms that make it up.
DD_COLS = [
    'apr2020_has_k6',
    'apr2020',
    'has_k6',
]
DD_REG_COLS = [
    'days_sc_has_k6',
    'days_sc',
    'has_k6',
]
# The triple-difference lists extend the matching DD list with the
# female interactions.
DDD_COLS = [
    'apr2020_has_k6_female',
    'apr2020_female',
    'has_k6_female',
    *DD_COLS,
]
DDD_REG_COLS = [
    'days_sc_has_k6_female',
    'days_sc_female',
    'has_k6_female',
    *DD_REG_COLS,
]
# Covariates appended to the specifications that use controls.
CONTROLS = [
    'days_since_2000',
    'female',
    'age',
    'age2',
    'married',
]

## DD

First version: compare only February and April 2020.

In [11]:
# Keep only the February and April 2020 rows and the columns the simple DD
# uses: the outcomes, the DD terms, the weight and the constant.
feb_or_apr_2020 = (cps.YEAR == 2020) & cps.MONTH.isin([2, 4])
dd0_cols = OUTCOMES + ['apr2020', 'has_k6', 'apr2020_has_k6', 'w', 'const']
dd0 = cps.loc[feb_or_apr_2020, dd0_cols]

In [12]:
# Weighted standard error of the mean of lwe, over rows where lwe is present.
# NOTE(review): std_mean presumably treats the total weight as the sample
# size -- confirm this is the intended standard error for survey weights.
has_lwe = dd0.lwe.notna()
weighted_stats = DescrStatsW(dd0.lwe[has_lwe], weights=dd0.w[has_lwe], ddof=0)
weighted_stats.std_mean

0.00011043335582375969

In [13]:
# Simple DD on the February/April 2020 sample, without clustering.
reg_all(dd0, DD_COLS, cluster=False)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.015
Model:                            WLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     420.4
Date:                Sat, 30 May 2020   Prob (F-statistic):          4.89e-271
Time:                        22:20:01   Log-Likelihood:                -56582.
No. Observations:               81467   AIC:                         1.132e+05
Df Residuals:                   81463   BIC:                         1.132e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
apr2020_has_k6     0.0112      0.006      1.

### Simple with more history

Can do the same with `cps` instead of `dd0`, after dropping March 2020,
since treatment wasn't the same then (requires clustering).

In [14]:
# Flag the March 2020 rows, then keep everything else.
is_mar2020 = (cps.YEAR == 2020) & (cps.MONTH == 3)
cps_no_mar2020 = cps[~is_mar2020]

In [15]:
# Same DD terms on the sample that excludes March 2020, without clustering.
reg_all(cps_no_mar2020, DD_COLS, cluster=False)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.003
Model:                            WLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     980.1
Date:                Sat, 30 May 2020   Prob (F-statistic):               0.00
Time:                        22:20:02   Log-Likelihood:            -6.4094e+05
No. Observations:             1031956   AIC:                         1.282e+06
Df Residuals:                 1031952   BIC:                         1.282e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
apr2020_has_k6     0.0084      0.004      2.

### History plus more controls

In [16]:
# DD terms plus the control variables, without clustering.
reg_all(cps_no_mar2020, DD_COLS + CONTROLS, cluster=False)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.030
Model:                            WLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     3994.
Date:                Sat, 30 May 2020   Prob (F-statistic):               0.00
Time:                        22:20:03   Log-Likelihood:            -6.2667e+05
No. Observations:             1031956   AIC:                         1.253e+06
Df Residuals:                 1031947   BIC:                         1.253e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
apr2020_has_k6      0.0071      0.004     

## Intensity DD with fixed effects and clustering

Only a few observations from the March CPS were in counties with closed schools. Any signal they might contribute would likely be offset by the need to cluster by county.

Instead, apply a different signal: days since school closure. This is effectively a DD where treatment varies in intensity. Pair this with county fixed effects and standard error clustering at the county level. We can now include March 2020 data. **TODO:** the regression below clusters standard errors by county but does not yet include county fixed effects.

### Intensity DD

In [17]:
# Intensity DD with controls on the full sample, clustered by county.
reg_all(cps, x=DD_REG_COLS + CONTROLS, cluster=True)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.029
Model:                            WLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     213.5
Date:                Sat, 30 May 2020   Prob (F-statistic):           1.76e-84
Time:                        22:20:04   Log-Likelihood:            -2.0815e+05
No. Observations:              363768   AIC:                         4.163e+05
Df Residuals:                  363759   BIC:                         4.164e+05
Df Model:                           8                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
days_sc_has_k6      0.0003      0.000     

## Diff-in-diff-in-diff

Add interactions by female and SES.

### Simple DDD

In [18]:
# Triple-difference terms plus controls, excluding March 2020, no clustering.
reg_all(cps_no_mar2020, DDD_COLS + CONTROLS, cluster=False)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.035
Model:                            WLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     3405.
Date:                Sat, 30 May 2020   Prob (F-statistic):               0.00
Time:                        22:20:05   Log-Likelihood:            -6.2402e+05
No. Observations:             1031956   AIC:                         1.248e+06
Df Residuals:                 1031944   BIC:                         1.248e+06
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
apr2020_has_k6_female    -0.01

### Intensity DDD

In [19]:
# Intensity triple-difference with controls, clustered by county.
reg_all(cps, DDD_REG_COLS + CONTROLS, cluster=True)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.034
Model:                            WLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     178.6
Date:                Sat, 30 May 2020   Prob (F-statistic):           2.92e-87
Time:                        22:20:07   Log-Likelihood:            -2.0713e+05
No. Observations:              363768   AIC:                         4.143e+05
Df Residuals:                  363756   BIC:                         4.144e+05
Df Model:                          11                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
days_sc_has_k6_female    -0.00