In [1]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from patsy import dmatrices
from openpyxl import load_workbook

# Project directories. The defaults are the original author's local checkout;
# set DOFIS_DATA_PATH / DOFIS_TABLE_PATH in the environment to run the notebook
# on another machine without editing this cell.
# Keep the trailing slash on table_path: code that builds file names by plain
# string concatenation depends on it.
data_path = os.environ.get('DOFIS_DATA_PATH',
                           '/Users/kylieleblancKylie/domino/dofis/data/')
table_path = os.environ.get('DOFIS_TABLE_PATH',
                            '/Users/kylieleblancKylie/domino/dofis/results/impact/')

# Analysis file used by every model below; low_memory=False makes pandas read
# the file in one pass so column dtypes are inferred from the whole column.
data = pd.read_csv(os.path.join(data_path, 'clean', 'cits_match.csv'),
                   sep=",", low_memory=False)

In [2]:
def coef_with_stars(coef, pvalue):
    """Format a coefficient as text with significance stars appended.

    One star is added for each threshold the p-value meets:
    ``*`` for p <= .05, ``**`` for p <= .01, ``***`` for p <= .001.

    Parameters
    ----------
    coef : number
        Coefficient to display; it is rendered with ``str`` (the caller is
        responsible for rounding).
    pvalue : number
        P-value associated with ``coef``.

    Returns
    -------
    str
        ``str(coef)`` followed by zero to three ``*`` characters. A NaN
        p-value meets no threshold and therefore yields no stars (the result
        is still a string, so table columns keep a consistent type).
    """
    # Comparisons with NaN are always False, so NaN naturally counts zero stars.
    n_stars = sum(1 for cutoff in (.05, .01, .001) if pvalue <= cutoff)
    return str(coef) + '*' * n_stars
test = coef_with_stars(.1, .005)

# No Matching

In [3]:
# Model 1: unweighted OLS of avescores on the treatment/time terms, with
# standard errors clustered on distname.
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1',
                 data=data, return_type='dataframe', NA_action="drop")
# patsy drops every row that has a missing value in ANY formula variable, not
# just avescores. Pick the cluster labels through y.index so they always line
# up one-to-one with the rows that actually entered the regression.
cluster = data.loc[y.index, 'distname']
mod = smf.OLS(y, X)    # Describe model
res = mod.fit(cov_type='cluster', cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model

# Table column for this model: for each term, the starred coefficient followed
# by its standard error in parentheses (both rounded to 2 decimals).
row_1 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_1.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_1.append('(' + str(res.bse[coef].round(2)) + ')')
# Two marker cells, left blank for this model (the later models put 'X' here;
# presumably they flag matching and pre-test controls -- confirm against the
# Excel template).
row_1.append('')
row_1.append('')
row_1.append(res.rsquared_adj.round(2))
row_1

                            OLS Regression Results                            
Dep. Variable:              avescores   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     157.5
Date:                Mon, 18 Mar 2019   Prob (F-statistic):          6.93e-118
Time:                        09:55:16   Log-Likelihood:                -6507.1
No. Observations:                6302   AIC:                         1.303e+04
Df Residuals:                    6296   BIC:                         1.307e+04
Df Model:                           5                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1646      0.048      3.

['0.23**',
 '(0.08)',
 '0.11***',
 '(0.01)',
 '-0.02*',
 '(0.01)',
 '-0.12***',
 '(0.01)',
 '-0.0',
 '(0.01)',
 '',
 '',
 0.08]

# Matching without Pre-tests


In [5]:
# Model 2: same specification as the unweighted model, but estimated by WLS
# using psweight1 as observation weights (presumably propensity-score weights
# from the matching step -- confirm), with errors clustered on distname.
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1',
                 data=data, return_type='dataframe', NA_action="drop")
# patsy drops rows with a missing value in ANY formula variable. Select the
# cluster labels and weights through y.index so both stay aligned with the
# estimation sample instead of only filtering on avescores.
cluster = data.loc[y.index, 'distname']
psweights = data.loc[y.index, 'psweight1']
mod = smf.WLS(y, X, weights=psweights)    # Describe model
res = mod.fit(cov_type='cluster', cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model

# Table column for this model: starred coefficient, then "(std. error)" for
# each term, both rounded to 2 decimals.
row_2 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_2.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_2.append('(' + str(res.bse[coef].round(2)) + ')')
# Marker cells: first one ticked, second blank (presumably "matching" yes,
# "pre-test controls" no -- confirm against the Excel template).
row_2.append('X')
row_2.append('')
row_2.append(res.rsquared_adj.round(2))
row_2

                            WLS Regression Results                            
Dep. Variable:              avescores   R-squared:                       0.063
Model:                            WLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     128.5
Date:                Mon, 18 Mar 2019   Prob (F-statistic):          5.82e-101
Time:                        09:55:30   Log-Likelihood:                -7255.9
No. Observations:                6302   AIC:                         1.452e+04
Df Residuals:                    6296   BIC:                         1.456e+04
Df Model:                           5                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.2787      0.054      5.

['0.09',
 '(0.09)',
 '0.1***',
 '(0.01)',
 '-0.01',
 '(0.01)',
 '-0.13***',
 '(0.02)',
 '0.0',
 '(0.01)',
 'X',
 '',
 0.06]

# Matching with Pre-tests

In [9]:
# Model 3: WLS with psweight2 weights, adding the scores12..scores15 variables
# as controls; errors clustered on distname.
#
# Build the complete-case sample under its own name rather than overwriting
# `data`: the earlier model cells read `data`, and rebinding it here would make
# them silently use this smaller sample if they are re-run afterwards.
data_pretest = data.dropna(
    subset=['avescores', 'psweight2', 'scores15', 'scores14', 'scores13', 'scores12'],
    how='any')
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1 + scores12 + scores13 + scores14 + scores15',
                 data=data_pretest, return_type='dataframe', NA_action="drop")
# Select clusters and weights through y.index so they match the rows patsy
# kept, whichever formula variable caused a row to be dropped.
cluster = data_pretest.loc[y.index, 'distname']
psweights = data_pretest.loc[y.index, 'psweight2']
mod = smf.WLS(y, X, weights=psweights)    # Describe model
res = mod.fit(cov_type='cluster', cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model

# Table column for this model. Only the five treatment/time terms are
# reported; the scores12..scores15 coefficients are left out of the table.
row_3 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_3.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_3.append('(' + str(res.bse[coef].round(2)) + ')')
# Both marker cells ticked (presumably "matching" and "pre-test controls" --
# confirm against the Excel template).
row_3.append('X')
row_3.append('X')
row_3.append(res.rsquared_adj.round(2))
row_3

                            WLS Regression Results                            
Dep. Variable:              avescores   R-squared:                       0.872
Model:                            WLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     2214.
Date:                Mon, 18 Mar 2019   Prob (F-statistic):               0.00
Time:                        10:00:10   Log-Likelihood:                -1121.6
No. Observations:                6293   AIC:                             2263.
Df Residuals:                    6283   BIC:                             2331.
Df Model:                           9                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1696      0.030      5.

['0.09**',
 '(0.03)',
 '0.09***',
 '(0.01)',
 '-0.0',
 '(0.01)',
 '-0.13***',
 '(0.03)',
 '0.0',
 '(0.02)',
 'X',
 'X',
 0.87]

In [10]:
# Assemble the results table: one column per model, rows in the order the
# per-model lists were built (coefficient / standard-error pairs, then the two
# marker cells, then adjusted R-squared).
model_labels = ['Model 1', 'Model 2', 'Model 3']
model_columns = [row_1, row_2, row_3]
df = pd.DataFrame(dict(zip(model_labels, model_columns)))
df

Unnamed: 0,Model 1,Model 2,Model 3
0,0.23**,0.09,0.09**
1,(0.08),(0.09),(0.03)
2,0.11***,0.1***,0.09***
3,(0.01),(0.01),(0.01)
4,-0.02*,-0.01,-0.0
5,(0.01),(0.01),(0.01)
6,-0.12***,-0.13***,-0.13***
7,(0.01),(0.02),(0.03)
8,-0.0,0.0,0.0
9,(0.01),(0.01),(0.02)


In [11]:
# Write the table into the existing results workbook: each model's values go
# down one column of the active sheet (columns 2-4), starting at row 3. The
# workbook must already exist; it is loaded, updated in place and saved back.
file = '{}results_main.xlsx'.format(table_path)
wb = load_workbook(file)
ws = wb.active
for col, model in enumerate(['Model 1', 'Model 2', 'Model 3'], start=2):
    for row_n, ob in enumerate(df[model], start=3):
        ws.cell(row=row_n, column=col).value = ob
wb.save(file)