In [2]:
import sys
sys.path.append("../")

import pandas as pd
import os
import statsmodels.formula.api as sm
import numpy as np
from patsy import dmatrices
from openpyxl import load_workbook
import engarde.decorators as ed


from library import regulations
from library import characteristics
from library import analysis
from library import tables
from library import test

In [3]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
table_path = '/Users/kylieleblancKylie/domino/dofis/results/impact/'
data = pd.read_csv(os.path.join(data_path, 'clean', 'cits.csv'),
                  sep=",", low_memory= False)
@ed.verify(test.alleligible)
def load():
    return data
data = load()
data.head()
data = data[data.year == 2016]
data['doi_16'] = np.where(data.doi_year == 2016, 1, 0)
data['doi_17'] = np.where(data.doi_year == 2017, 1, 0)
data['doi_18'] = np.where(data.doi_year == 2018, 1, 0)
data['tpsd'] = np.where(data.doi == False, 1, 0)
data.head()


Unnamed: 0.1,Unnamed: 0,district,distname,year,cntyname,distischarter,rating_academic,rating_financial,eligible,type,...,treatyear,treatpost,treatpostyear,yearpost1,treatpostyear1,treatpostyear2,doi_16,doi_17,doi_18,tpsd
3712,4909,1902,CAYUGA ISD,2016,ANDERSON,N,M,Pass,True,H,...,-1.0,0.0,-0.0,0,0,0,0,1,0,0
3713,4910,1903,ELKHART ISD,2016,ANDERSON,N,M,Pass,True,G,...,-2.0,0.0,-0.0,0,0,0,0,0,1,0
3714,4911,1904,FRANKSTON ISD,2016,ANDERSON,N,M,Pass,True,H,...,-2.0,0.0,-0.0,0,0,0,0,0,1,0
3715,4912,1906,NECHES ISD,2016,ANDERSON,N,M,Pass,True,H,...,-1.0,0.0,-0.0,0,0,0,0,1,0,0
3716,4913,1907,PALESTINE ISD,2016,ANDERSON,N,M,Pass,True,E,...,0.0,1.0,0.0,0,0,0,1,0,0,0


In [85]:
y, X = dmatrices('students_num ~ 0 + tpsd + doi_16 + doi_17 + doi_18', data=data, return_type='dataframe', NA_action = "drop")
X.head()

Unnamed: 0,tpsd,doi_16,doi_17,doi_18
3365,0.0,0.0,1.0,0.0
3366,0.0,0.0,0.0,1.0
3367,0.0,0.0,0.0,1.0
3368,0.0,0.0,1.0,0.0
3369,0.0,1.0,0.0,0.0


In [86]:
rules = [(data.tpsd == 1), (data.doi_16 == 1), (data.doi_17 == 1), (data.doi_18 == 1) ]
coef_1 = []
coef_2 = []
coef_3 = []
coef_4 = []
cols  = [coef_1, coef_2, coef_3, coef_4]
char = 'students_num'
for rule, col in zip(rules, cols):
    col.append(float((data[rule][[char]].mean().round(2))))
coef_1
    

[6295.91]

In [87]:
def stars(pvalue):
    if pvalue >.05:
        stars = ''
    if pvalue <= .05:
        stars = '*'
    if pvalue <= .01:
        stars = '**'
    if pvalue <= .001:
        stars = '***'
    return(stars)
test = stars(.05)
test

'*'

In [99]:
def make_stat_df(data, y_list, y_labels):
    characteristics = []
    coef_1 = []
    coef_2 = []
    coef_3 = []
    coef_4 = []
    p_ftest = []
    for var in y_list:
        formula = var + ' ~ ' + '0 + tpsd + doi_16 + doi_17 + doi_18'
        y, X = dmatrices(formula, data=data, return_type='dataframe', NA_action = "drop") 
        mod = sm.OLS(y, X)    # Describe model
        res = mod.fit()       # Fit model
        print(res.fvalue)
        print(y_labels[var])
        print(res.summary())   # Summarize model
        p_ftest.append(stars(res.f_pvalue))
        p_ftest.append('')
        rules = [(data.tpsd == 1), (data.doi_16 == 1), (data.doi_17 == 1), (data.doi_18 == 1) ]
        cols  = [coef_1, coef_2, coef_3, coef_4]
        for rule, col in zip(rules, cols):
            col.append(float((data[rule][[var]].mean().round(2))))
            col.append('[' + str(float((data[rule][[var]].std().round(2)))) + ']')
        characteristics.append(y_labels[var])
        characteristics.append('')


    df = pd.DataFrame(
        {'Characteristic': characteristics,
         'TPSD': coef_1,
         'DOI_2016': coef_2,
         'DOI_2017': coef_3,
         'DOI_2018': coef_4,
         'P_ftest': p_ftest
         })
    return df
geo = make_stat_df(data = data, 
                   y_list = characteristics.geography,
                   y_labels = characteristics.labels)
geo

14.940378240545344
Urban
                            OLS Regression Results                            
Dep. Variable:             type_urban   R-squared:                       0.067
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     14.94
Date:                Sun, 17 Mar 2019   Prob (F-statistic):           8.42e-12
Time:                        08:35:21   Log-Likelihood:                 49.106
No. Observations:                 842   AIC:                            -90.21
Df Residuals:                     838   BIC:                            -71.27
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
tpsd           0.0660      

Unnamed: 0,Characteristic,TPSD,DOI_2016,DOI_2017,DOI_2018,P_ftest
0,Urban,0.07,0.13,0.04,0.08,***
1,,[0.25],[0.34],[0.2],[0.27],
2,Suburban,0.28,0.33,0.24,0.24,***
3,,[0.45],[0.47],[0.43],[0.43],
4,Town,0.23,0.21,0.29,0.31,***
5,,[0.42],[0.41],[0.45],[0.47],
6,Rural,0.42,0.33,0.43,0.38,***
7,,[0.5],[0.47],[0.5],[0.49],


In [100]:
teacher = make_stat_df(data = data, 
                   y_list = characteristics.teacher,
                   y_labels = characteristics.labels)
teacher

3159.049146441374
Ave. Experience Teaching
                             OLS Regression Results                            
Dep. Variable:     teachers_tenure_ave   R-squared:                       0.938
Model:                             OLS   Adj. R-squared:                  0.938
Method:                  Least Squares   F-statistic:                     3159.
Date:                 Sun, 17 Mar 2019   Prob (F-statistic):               0.00
Time:                         08:35:22   Log-Likelihood:                -1723.9
No. Observations:                  842   AIC:                             3456.
Df Residuals:                      838   BIC:                             3475.
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

Unnamed: 0,Characteristic,TPSD,DOI_2016,DOI_2017,DOI_2018,P_ftest
0,Ave. Experience Teaching,7.36,7.29,7.26,7.24,***
1,,[2.08],[1.59],[1.81],[1.78],
2,Teacher Turnover Ratio,0.19,0.17,0.18,0.18,***
3,,[0.1],[0.06],[0.08],[0.08],
4,Student-Teacher Ratio,6.49,6.95,6.68,6.84,***
5,,[1.21],[1.14],[1.17],[1.04],


In [101]:
student = make_stat_df(data = data, 
                   y_list = characteristics.student,
                   y_labels = characteristics.labels)
student

487.36191534011135
Percent Hispanic
                            OLS Regression Results                            
Dep. Variable:          students_hisp   R-squared:                       0.699
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     487.4
Date:                Sun, 17 Mar 2019   Prob (F-statistic):          5.83e-217
Time:                        08:35:22   Log-Likelihood:                -56.200
No. Observations:                 842   AIC:                             120.4
Df Residuals:                     838   BIC:                             139.3
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
tpsd           0

Unnamed: 0,Characteristic,TPSD,DOI_2016,DOI_2017,DOI_2018,P_ftest
0,Percent Hispanic,0.5,0.45,0.34,0.37,***
1,,[0.32],[0.25],[0.23],[0.25],
2,Percent White,0.42,0.44,0.55,0.53,***
3,,[0.3],[0.24],[0.24],[0.26],
4,Percent Black,0.05,0.07,0.06,0.07,***
5,,[0.08],[0.08],[0.09],[0.08],
6,Percent Econ. Disadvantaged,0.61,0.56,0.54,0.55,***
7,,[0.2],[0.21],[0.17],[0.17],
8,Average STAAR Performance (Std.),0.21,0.46,0.42,0.38,***
9,,[0.69],[0.89],[0.67],[0.63],


# To Table

In [104]:
rows = [6, 15, 22]
dfs = [geo, teacher, student]
file = table_path + 'characteristic_by_adoption_year.xlsx'
wb = load_workbook(file)
ws = wb.active
str_n = 'N = ' + str(len(data[data.tpsd == 1]))
ws.cell(row=4, column=2).value = str_n
str_n = 'N = ' + str(len(data[data.doi_16 == 1]))
ws.cell(row=4, column=3).value = str_n
str_n = 'N = ' + str(len(data[data.doi_17 == 1]))
ws.cell(row=4, column=4).value = str_n
str_n = 'N = ' + str(len(data[data.doi_18 == 1]))
ws.cell(row=4, column=5).value = str_n

for df, row in zip(dfs, rows):
    row_n = row
    for ob in df.TPSD:
        ws.cell(row=row_n, column=2).value = ob
        row_n = row_n + 1
    row_n = row
    for ob in df.DOI_2016:
        ws.cell(row=row_n, column=3).value = ob
        row_n = row_n + 1   
    row_n = row
    for ob in df.DOI_2017:
        ws.cell(row=row_n, column=4).value = ob
        row_n = row_n + 1 
    row_n = row
    for ob in df.DOI_2018:
        ws.cell(row=row_n, column=5).value = ob
        row_n = row_n + 1 
    row_n = row
    for ob in df.P_ftest:
        ws.cell(row=row_n, column=6).value = ob
        row_n = row_n + 1 
wb.save(file)