# 14.32 final paper - step 3, regressions

Start with simple DD, then add complexity.

In [1]:
# For multiple dependent variables.
# !pip uninstall -y stargazer
# !pip install git+https://github.com/toobaz/stargazer.git@fix_show_dep_var

In [2]:
import pandas as pd
import numpy as np
import microdf as mdf
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from statsmodels.stats.weightstats import DescrStatsW
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML  # For examining Stargazer output.

## Data

In [3]:
# Gzipped CPS extract; pandas infers the compression from the file extension.
CPS_PATH = 'data/cps.csv.gz'
cps = pd.read_csv(CPS_PATH)

Change bools to numbers for regressions
(do here to save space when storing between notebooks).

In [4]:
# Swap every False for 0, modifying cps in place (True values are not touched).
cps.replace(to_replace=False, value=0, inplace=True)

Scale the employment rate to be out of 100 so that regression results are easier to read.

In [5]:
# Rescale emp by 100. Not idempotent: re-running this cell scales it again.
cps['emp'] = cps['emp'] * 100

In [6]:
# Date columns are dropped rather than parsed (if needed, change to dates).
DATE_COLS = [
    'cps_start_date',
    'cps_end_date',
    'school_closure_date',
]
cps.drop(columns=DATE_COLS, inplace=True)

In [7]:
# Add the 'const' column that reg() uses as the intercept.
# prepend/has_constant are spelled out at their statsmodels defaults.
cps = sm.add_constant(cps, prepend=True, has_constant='skip')

## Common

In [8]:
# List the columns available to the regressions below.
cps.columns

Index(['const', 'YEAR', 'MONTH', 'COUNTY', 'w', 'age', 'age2', 'female',
       'married', 'hours', 'POPESTIMATE2019', 'unemp', 'emp', 'lf',
       'emp_of_lf', 'has_k5', 'days_sc', 'post', 'post_has_k5', 'lwe',
       'days_since_2000', 'apr2020', 'apr2020_has_k5', 'days_sc_has_k5',
       'apr2020_has_k5_female', 'apr2020_female', 'has_k5_female',
       'days_sc_has_k5_female', 'days_sc_female'],
      dtype='object')

In [9]:
# Dependent variables; reg_all() fits one regression per entry, in this order.
OUTCOMES = [
    'emp',    # Employment, scaled to be out of 100 above.
    'hours',
    'lwe',    # Labelled "Log earnings" in the output tables.
]

In [10]:
def reg(df, y, x, cluster=False):
    """ Fit a weighted least squares regression of y on x plus a constant.

    Rows with a missing value in any column used (x, y, the weight 'w',
    'const', and 'COUNTY' when clustering) are dropped before fitting.

    Args:
        df: DataFrame holding the columns named in x and y, plus 'w' and
            'const' (and 'COUNTY' when cluster is truthy).
        y: Name of y column.
        x: List of x column names ('const' is appended as a regressor).
        cluster: Whether to cluster standard errors by county.
            Defaults to False.

    Returns:
        The fitted statsmodels results object.
    """
    regressors = x + ['const']
    needed = x + [y, 'w', 'const']
    if cluster:
        needed.append('COUNTY')
    # Complete cases only, so the weights and cluster groups stay aligned
    # with the rows that enter the model.
    complete = df[needed].dropna()
    model = sm.WLS(complete[y], complete[regressors],
                   weights=complete['w'], missing='drop')
    fit_kwargs = {}
    if cluster:
        fit_kwargs = {'cov_type': 'cluster',
                      'cov_kwds': {'groups': complete['COUNTY']}}
    return model.fit(**fit_kwargs)

In [11]:
def reg_all(df, x, cluster=False, quiet=False):
    """ Run one regression per outcome in OUTCOMES and optionally print each.

    Args:
        df: DataFrame passed through to reg().
        x: List of x column names passed through to reg().
        cluster: Whether to cluster by county (passed through to reg()).
            Defaults to False.
        quiet: If True, do not print the regression summaries.
            Defaults to False.

    Returns:
        dict where each fitted model is identified by its outcome name.
    """
    res = {}
    for outcome in OUTCOMES:
        m = reg(df, outcome, x, cluster)
        res[outcome] = m
        # Logical `not`, not bitwise `~`: ~False is -1 and ~True is -2, both
        # truthy, so `if ~quiet` printed even when quiet=True.
        if not quiet:
            print(m.summary())
    return res

Columns for use in different regressions.

Beta of interest is the first element of `DD_COLS`, `DD_REG_COLS`, and
`DDD_COLS`.

In [12]:
# Regressor lists. The coefficient of interest is always listed first.
DD_COLS = [
    'apr2020_has_k5',
    'apr2020',
    'has_k5',
]
DD_REG_COLS = [
    'days_sc_has_k5',
    'days_sc',
    'has_k5',
]
# Triple-difference lists: the female interactions followed by the DD terms.
DDD_COLS = [
    'apr2020_has_k5_female',
    'apr2020_female',
    'has_k5_female',
    *DD_COLS,
]
DDD_REG_COLS = [
    'days_sc_has_k5_female',
    'days_sc_female',
    'has_k5_female',
    *DD_REG_COLS,
]
CONTROLS = [
    'days_since_2000',
    'female',
    'age',
    'age2',
    'married',
]

## DD

First version: only compare February and April 2020.

In [13]:
# Two-period sample: rows from February and April 2020, restricted to the
# columns the simple DD needs.
is_feb_or_apr_2020 = (cps.YEAR == 2020) & cps.MONTH.isin([2, 4])
dd0_cols = OUTCOMES + ['apr2020', 'has_k5', 'apr2020_has_k5', 'w', 'const']
dd0 = cps.loc[is_feb_or_apr_2020, dd0_cols]

In [14]:
# Weighted standard error of the mean of lwe, over rows where lwe is present.
has_lwe = dd0.lwe.notna()
weighted_stats = DescrStatsW(dd0.lwe[has_lwe], weights=dd0.w[has_lwe], ddof=0)
weighted_stats.std_mean

0.00011043335582375969

In [15]:
# Simple two-period DD: one model per outcome, default (unclustered) errors.
m_dd0 = reg_all(df=dd0, x=DD_COLS)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.015
Model:                            WLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     420.4
Date:                Sun, 31 May 2020   Prob (F-statistic):          4.89e-271
Time:                        13:28:31   Log-Likelihood:            -4.3175e+05
No. Observations:               81467   AIC:                         8.635e+05
Df Residuals:                   81463   BIC:                         8.635e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
apr2020_has_k5     1.1179      0.606      1.

### Simple with more history

Can do the same with `cps` instead of `dd0`, after dropping March 2020,
since treatment wasn't the same then (requires clustering).

In [16]:
# Full history with the March 2020 rows removed.
is_mar2020 = (cps.YEAR == 2020) & (cps.MONTH == 3)
cps_no_mar2020 = cps[~is_mar2020]

In [17]:
# Same DD specification on the longer history.
# NOTE(review): the text above says this "requires clustering", but cluster is
# left at its default (False) here - confirm which is intended.
m_no_mar2020 = reg_all(df=cps_no_mar2020, x=DD_COLS)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.003
Model:                            WLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     980.1
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        13:28:32   Log-Likelihood:            -5.3933e+06
No. Observations:             1031956   AIC:                         1.079e+07
Df Residuals:                 1031952   BIC:                         1.079e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
apr2020_has_k5     0.8440      0.391      2.

### History plus more controls

In [18]:
# DD terms plus the demographic/time controls, still unclustered.
m_no_mar2020_controls = reg_all(df=cps_no_mar2020, x=DD_COLS + CONTROLS)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.030
Model:                            WLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     3994.
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        13:28:33   Log-Likelihood:            -5.3790e+06
No. Observations:             1031956   AIC:                         1.076e+07
Df Residuals:                 1031947   BIC:                         1.076e+07
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
apr2020_has_k5      0.7059      0.385     

## Intensity DD with fixed effects and clustering

Only a few observations from the March CPS were in counties with closed schools. Any signal they might contribute would likely be offset by the need to cluster by county.

Instead, apply a different signal: days since school closure. This is effectively a DD where treatment varies in intensity. Pair this with county fixed effects and standard error clustering at the county level. We can now include March 2020 data.

### Intensity DD

In [19]:
# Intensity DD on the full sample, standard errors clustered by COUNTY.
# NOTE(review): the text above mentions county fixed effects, but no county
# terms are among the regressors passed here - confirm.
m_intensity = reg_all(df=cps, x=DD_REG_COLS + CONTROLS, cluster=True)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.029
Model:                            WLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     213.5
Date:                Sun, 31 May 2020   Prob (F-statistic):           1.76e-84
Time:                        13:28:34   Log-Likelihood:            -1.8834e+06
No. Observations:              363768   AIC:                         3.767e+06
Df Residuals:                  363759   BIC:                         3.767e+06
Df Model:                           8                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
days_sc_has_k5      0.0309      0.035     

## Diff-in-diff-in-diff

Add interactions by female.

TODO: SES.

### Simple DDD

In [20]:
# Triple difference (adds the female interactions), unclustered.
m_ddd = reg_all(df=cps_no_mar2020, x=DDD_COLS + CONTROLS)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.035
Model:                            WLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     3405.
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        13:28:36   Log-Likelihood:            -5.3763e+06
No. Observations:             1031956   AIC:                         1.075e+07
Df Residuals:                 1031944   BIC:                         1.075e+07
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
apr2020_has_k5_female    -1.93

### Intensity DDD

In [21]:
# Intensity triple difference, standard errors clustered by COUNTY.
m_intensity_ddd = reg_all(df=cps, x=DDD_REG_COLS + CONTROLS, cluster=True)

                            WLS Regression Results                            
Dep. Variable:                    emp   R-squared:                       0.034
Model:                            WLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     178.6
Date:                Sun, 31 May 2020   Prob (F-statistic):           2.92e-87
Time:                        13:28:38   Log-Likelihood:            -1.8823e+06
No. Observations:              363768   AIC:                         3.765e+06
Df Residuals:                  363756   BIC:                         3.765e+06
Df Model:                          11                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
days_sc_has_k5_female    -0.06

## Create TeX tables

In [22]:
# Show the control columns, for reference while naming covariates below.
CONTROLS

['days_since_2000', 'female', 'age', 'age2', 'married']

In [23]:
# Display names for covariates in the exported tables, keyed by column name.
COV_NAMES = {
    'apr2020_has_k5': 'April 2020 * Children age 5+',
    'apr2020': 'April 2020',
    'has_k5': 'Children age 5+',
    'const': 'Constant',
    'apr2020_has_k5_female': 'April 2020 * Children age 5+ * Female',
    'apr2020_female': 'April 2020 * Female',
    'has_k5_female': 'Children age 5+ * Female',
    'days_sc_has_k5_female':
        'Days since school closure * Children age 5+ * Female',
    'days_sc_has_k5': 'Days since school closure * Children age 5+',
    # This entry was a second copy of 'days_sc_has_k5', which left
    # 'days_sc_female' (used in DDD_REG_COLS) without a display name.
    'days_sc_female': 'Days since school closure * Female',
    'days_sc': 'Days since school closure',
    'days_since_2000': 'Time (days since 2000-01-01)',
    'female': 'Female',
    'age': 'Age',
    'age2': 'Age^2',
    'married': 'Married'
}

In [29]:
# Column headers for three-outcome tables, in the order the models are passed
# to Stargazer (emp, hours, lwe).
OUTCOME_NAMES = [
    'Employment',
    'Hours',
    'Log earnings',
]

In [24]:
def sg(models, covariate_order, column_names, tex_out, quiet=False):
    """ Builds a Stargazer table from fitted models, formats it, exports it.

    Applies the shared formatting in sg_common():
        - Renames covariates
        - Removes stars
        - Removes degrees of freedom

    Args:
        models: List of models.
        covariate_order: List passed to Stargazer.covariate_order.
        column_names: List passed to Stargazer.custom_columns, one label per
            model. If empty or None, no custom column labels are set.
        tex_out: File to export tex to, in tex/ folder.
        quiet: Whether to suppress displaying the HTML rendering of the
            table. Defaults to False.

    Returns:
        The formatted Stargazer object.
    """
    # Local imports keep this cell self-contained.
    import os
    from IPython.display import display

    table = Stargazer(models)
    table.covariate_order(covariate_order)
    sg_common(table)
    if column_names:
        # Each label spans exactly one model column.
        table.custom_columns(column_names, [1] * len(column_names))
    os.makedirs('tex', exist_ok=True)
    with open(os.path.join('tex', tex_out), 'w') as f:
        f.write(table.render_latex())
    if not quiet:
        display(HTML(table.render_html()))
    return table


def sg_common(table, three_outcomes=False):
    """ Applies the formatting shared by all tables to a Stargazer object.

    Modifies the object in place:
        - Renames covariates using COV_NAMES
        - Removes stars
        - Removes the notes and degrees of freedom

    Args:
        table: Stargazer object to format.
        three_outcomes: If True, label the three columns with OUTCOME_NAMES.
            Defaults to False.
    """
    table.rename_covariates(COV_NAMES)
    # The literal 1e-999 underflows to 0.0, so no p-value is ever below the
    # thresholds and no significance stars are printed.
    table.significance_levels([1e-999, 1e-999, 1e-999])
    table.append_notes(False)
    table.show_degrees_of_freedom(False)
    if three_outcomes:
        table.custom_columns(OUTCOME_NAMES, [1, 1, 1])

In [25]:
# Two-period DD table: one column per outcome, in OUTCOMES order.
dd0_models = [m_dd0[outcome] for outcome in OUTCOMES]
dd0_row_order = DD_COLS + ['const']
s_dd0 = Stargazer(dd0_models)
s_dd0.covariate_order(dd0_row_order)
sg_common(s_dd0, True)
HTML(s_dd0.render_html())

In [33]:
# Export the two-period DD table as LaTeX. The with-block closes the file even
# if rendering or writing raises.
with open("dd0.tex", "w") as f:
    f.write(s_dd0.render_latex())

In [32]:
# Inspect the raw LaTeX string produced for the two-period DD table.
s_dd0.render_latex()

'\\begin{table}[!htbp] \\centering\n  \\label{}\n\\begin{tabular}{@{\\extracolsep{5pt}}lcc}\n\\\\[-1.8ex]\\hline\n\\hline \\\\[-1.8ex]\n\\\\[-1.8ex] & \\multicolumn{1}{c}{Employment} & \\multicolumn{1}{c}{Hours} & \\multicolumn{1}{c}{Log earnings}  \\\\\n\\\\[-1.8ex] & (1) & (2) & (3) \\\\\n\\hline \\\\[-1.8ex]\n April 2020 * Children age 5+ & 1.118$^{}$ & 0.095$^{}$ & -0.021$^{}$ \\\\\n  & (0.606) & (0.198) & (0.024) \\\\\n  & & & \\\\\n April 2020 & -11.173$^{}$ & -1.643$^{}$ & 0.055$^{}$ \\\\\n  & (0.408) & (0.134) & (0.016) \\\\\n  & & & \\\\\n Children age 5+ & 0.277$^{}$ & -0.192$^{}$ & 0.064$^{}$ \\\\\n  & (0.429) & (0.133) & (0.016) \\\\\n  & & & \\\\\n Constant & 80.095$^{}$ & 40.426$^{}$ & 6.783$^{}$ \\\\\n  & (0.288) & (0.089) & (0.011) \\\\\n  & & & \\\\\n\\hline \\\\[-1.8ex]\n Observations & 81467.0 & 59107.0 & 14642.0 \\\\\n R${2}$ & 0.015 & 0.004 & 0.002 \\\\\n Adjusted R${2}$ & 0.015 & 0.004 & 0.002 \\\\\n Residual Std. Error & 2393.309 & 655.855 & 39.232  \\\\\n F Stat

In [27]:
# Employment across the three DD specifications, one column each.
emp_models = [
    m_dd0['emp'],
    m_no_mar2020['emp'],
    m_no_mar2020_controls['emp'],
]
emp_column_labels = [
    'Two-period',
    'Full series ex Mar2020',
    'Full series ex Mar2020 + controls',
]
s_emp = Stargazer(emp_models)
s_emp.custom_columns(emp_column_labels, [1, 1, 1])
# Only the DD terms and the constant are listed as rows.
s_emp.covariate_order(DD_COLS + ['const'])
# s_emp.dependent_variable_name('Employment')  # Broken.
sg_common(s_emp)
HTML(s_emp.render_html())

0,1,2,3
,,,
,Dependent variable:emp,Dependent variable:emp,Dependent variable:emp
,,,
,Two-period,Full series ex Mar2020,Full series ex Mar2020 + controls
,(1),(2),(3)
,,,
April 2020 * Children age 5+,1.118,0.844,0.706
,(0.606),(0.391),(0.385)
April 2020,-11.173,-10.83,-11.438
,(0.408),(0.264),(0.272)


In [28]:
# Intensity DD table: one column per outcome, in OUTCOMES order.
intensity_models = [m_intensity[outcome] for outcome in OUTCOMES]
intensity_row_order = DD_REG_COLS + ['const']
s_intensity = Stargazer(intensity_models)
s_intensity.covariate_order(intensity_row_order)
sg_common(s_intensity, True)
HTML(s_intensity.render_html())

0,1,2,3
,,,
,,,
,Employment,Hours,Log earnings
,(1),(2),(3)
,,,
Days since school closure * Children age 5+,0.031,-0.009,0.0
,(0.035),(0.009),(0.001)
Days since school closure,-0.397,-0.048,0.001
,(0.031),(0.007),(0.001)
Children age 5+,0.265,-0.589,-0.134
