In [53]:
import sys
sys.path.append("../")


import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from patsy import dmatrices
from openpyxl import load_workbook
import engarde.decorators as ed

from library import test

@ed.verify(test.onlyeligible)
def load(data):
    return data

In [54]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
table_path = '/Users/kylieleblancKylie/domino/dofis/results/impact/'
data = pd.read_csv(os.path.join(data_path, 'clean', 'cits_match.csv'),
                  sep=",", low_memory=False)
load(data)
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,district,distname,year,cntyname,distischarter,rating_academic,rating_financial,eligible,...,scores12,scores13,scores14,scores15,ps1,psweight1,pswtshare1,ps2,psweight2,pswtshare2
0,0,0,1902,CAYUGA ISD,2012,ANDERSON,N,,,,...,0.377578,0.68824,0.698838,0.767644,0.909773,1.099175,0.119368,0.911242,1.097403,0.119636
1,1,1229,1902,CAYUGA ISD,2013,ANDERSON,N,M,,,...,0.377578,0.68824,0.698838,0.767644,0.909773,1.099175,0.119368,0.911242,1.097403,0.119636
2,2,2459,1902,CAYUGA ISD,2014,ANDERSON,N,M,,,...,0.377578,0.68824,0.698838,0.767644,0.909773,1.099175,0.119368,0.911242,1.097403,0.119636
3,3,3688,1902,CAYUGA ISD,2015,ANDERSON,N,M,Pass,True,...,0.377578,0.68824,0.698838,0.767644,0.909773,1.099175,0.119368,0.911242,1.097403,0.119636
4,4,4909,1902,CAYUGA ISD,2016,ANDERSON,N,M,Pass,True,...,0.377578,0.68824,0.698838,0.767644,0.909773,1.099175,0.119368,0.911242,1.097403,0.119636


In [55]:
def coef_with_stars(coef, pvalue):
    if pvalue >.05:
        coef = str(coef)
    if pvalue <= .05:
        coef = str(coef) + '*'
    if pvalue <= .01:
        coef = coef + '*'
    if pvalue <= .001:
        coef = coef + '*'
    return(coef)
test = coef_with_stars(.1, .005)

In [56]:
data.district.nunique()

924

In [57]:
data[pd.notnull(data.avescores)].district.nunique()

923

In [61]:
reg_vars = ['year_centered']
test = data.dropna(subset = reg_vars)
test.district.nunique()

853

# No Matching

In [25]:
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1',
                 data=data, return_type='dataframe', NA_action = "drop")
cluster = data[pd.notnull(data.avescores)].district
mod = smf.OLS(y, X)    # Describe model
res = mod.fit(cov_type = 'cluster',  cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model
row_1 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_1.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_1.append('(' + str(res.bse[coef].round(2)) + ')')
row_1.append('')
row_1.append('')
row_1.append(res.rsquared_adj.round(2))
row_1

ValueError: The weights and list don't have the same length.

# Matching without pre-tests


In [7]:
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1',
                 data=data, return_type='dataframe', NA_action = "drop")
cluster = data[pd.notnull(data.avescores)].distname
psweights = data[pd.notnull(data.avescores)].psweight1
mod = smf.WLS(y, X, weights = psweights)    # Describe model
res = mod.fit(cov_type = 'cluster',  cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model
row_2 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_2.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_2.append('(' + str(res.bse[coef].round(2)) + ')')
row_2.append('X')
row_2.append('')
row_2.append(res.rsquared_adj.round(2))
row_2

                            WLS Regression Results                            
Dep. Variable:              avescores   R-squared:                       0.176
Model:                            WLS   Adj. R-squared:                  0.176
Method:                 Least Squares   F-statistic:                     96.92
Date:                Wed, 10 Apr 2019   Prob (F-statistic):           6.21e-80
Time:                        10:20:14   Log-Likelihood:                -28928.
No. Observations:               34292   AIC:                         5.787e+04
Df Residuals:                   34286   BIC:                         5.792e+04
Df Model:                           5                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.2761      0.086      3.

['-0.45***',
 '(0.13)',
 '0.09***',
 '(0.01)',
 '-0.05***',
 '(0.01)',
 '-0.12***',
 '(0.02)',
 '0.14***',
 '(0.01)',
 'X',
 '',
 0.18]

# Matching with Pre-tests

In [8]:
data = data.dropna(subset = ['avescores', 'psweight2', 'scores15', 'scores14', 'scores13', 'scores12'], how = 'any')
y, X = dmatrices('avescores ~  treat + year_centered + treatyear + yearpost1 + treatpostyear1 + scores12 + scores13 + scores14 + scores15',
                 data=data, return_type='dataframe', NA_action = "drop")
cluster = data[pd.notnull(data.scores14)].distname
psweights = data[pd.notnull(data.scores14)].psweight2
mod = smf.WLS(y, X, weights = psweights)    # Describe model
res = mod.fit(cov_type = 'cluster',  cov_kwds={'groups': pd.DataFrame(cluster)})       # Fit model
print(res.summary())   # Summarize model
row_3 = []
for coef in ['treat', 'year_centered', 'treatyear', 'yearpost1', 'treatpostyear1']:
    row_3.append(coef_with_stars(res.params[coef].round(2), res.pvalues[coef]))
    row_3.append('(' + str(res.bse[coef].round(2)) + ')')
row_3.append('X')
row_3.append('X')
row_3.append(res.rsquared_adj.round(2))
row_3

                            WLS Regression Results                            
Dep. Variable:              avescores   R-squared:                       0.854
Model:                            WLS   Adj. R-squared:                  0.854
Method:                 Least Squares   F-statistic:                     2610.
Date:                Wed, 10 Apr 2019   Prob (F-statistic):               0.00
Time:                        10:20:15   Log-Likelihood:                 3169.7
No. Observations:               34279   AIC:                            -6319.
Df Residuals:                   34269   BIC:                            -6235.
Df Model:                           9                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1346      0.035      3.

['-0.01',
 '(0.03)',
 '0.07***',
 '(0.01)',
 '-0.03*',
 '(0.02)',
 '-0.14**',
 '(0.05)',
 '0.11***',
 '(0.01)',
 'X',
 'X',
 0.85]

In [9]:
df = pd.DataFrame(
    {'Model 1': row_1,
    'Model 2': row_2,
    'Model 3': row_3})
df

Unnamed: 0,Model 1,Model 2,Model 3
0,-0.31**,-0.45***,-0.01
1,(0.11),(0.13),(0.03)
2,0.1***,0.09***,0.07***
3,(0.01),(0.01),(0.01)
4,-0.06***,-0.05***,-0.03*
5,(0.01),(0.01),(0.02)
6,-0.12***,-0.12***,-0.14**
7,(0.02),(0.02),(0.05)
8,0.13***,0.14***,0.11***
9,(0.0),(0.01),(0.01)


In [10]:
file = table_path + 'results_main.xlsx'
wb = load_workbook(file)
ws = wb.active
df
for model, col in zip(['Model 1', 'Model 2', 'Model 3'], [2,3,4]):
    row_n = 3
    for ob in df[model]:
        ws.cell(row=row_n, column=col).value = ob
        row_n = row_n + 1
wb.save(file)