In [1]:
import sys
sys.path.append("../")


import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from patsy import dmatrices
from openpyxl import load_workbook

from linearmodels import PanelOLS

from library import print_statistics

from library import regulations
from library import characteristics
from library import analysis
from library import tables

  from pandas import (Categorical, DataFrame, Index, MultiIndex, Panel, Series,


In [2]:
def coef_with_stars(coef, pvalue):
    """Format a coefficient as text with significance stars appended.

    The coefficient is rounded to two decimals and converted with ``str``.
    One star is appended for every threshold the p-value meets:
    ``*`` for p <= .05, ``**`` for p <= .01 and ``***`` for p <= .001.

    Parameters
    ----------
    coef : float
        Estimated coefficient.
    pvalue : float
        P-value of the coefficient. A NaN p-value meets no threshold and
        therefore produces no stars.

    Returns
    -------
    str
        The rounded coefficient followed by zero to three ``*`` characters.
    """
    # Counting satisfied thresholds (instead of a chain of independent ifs)
    # guarantees a string result even when every comparison is False, which
    # is what happens for a NaN p-value.
    stars = int(sum(pvalue <= cutoff for cutoff in (.05, .01, .001)))
    return str(round(coef, 2)) + '*' * stars
test = coef_with_stars(9.1568, 0.8523)
test

'9.16'

In [3]:
def format_se(se):
    """Format a standard error as a parenthesised two-decimal string.

    Parameters
    ----------
    se : float
        Standard error to format.

    Returns
    -------
    str
        ``'(0.00)'`` for any value below .005, otherwise the value with
        exactly two decimals in parentheses, e.g. ``'(0.10)'``.
    """
    if se < .005:
        return '(0.00)'
    # Fixed-width formatting keeps trailing zeros ('(0.10)', not '(0.1)'),
    # matching the two-decimal form of the '(0.00)' branch above.
    return '({:.2f})'.format(se)
test = format_se(.0053)
test

'(0.01)'

In [4]:
# Input and output locations. The defaults are the directories this notebook
# was originally run with; set DOFIS_DATA_PATH / DOFIS_TABLE_PATH in the
# environment to run it on another machine without editing the cell.
data_path = os.environ.get(
    'DOFIS_DATA_PATH', '/Users/kylieleblancKylie/domino/dofis/data/')
# os.path.join(..., '') guarantees a trailing separator, so code that builds
# file names with `table_path + name` keeps working for overridden values.
table_path = os.path.join(
    os.environ.get(
        'DOFIS_TABLE_PATH',
        '/Users/kylieleblancKylie/domino/dofis/results/Who Needs Rules/'),
    '')
data = pd.read_csv(os.path.join(data_path, 'clean', 'gdid.csv'),
                   sep=",", low_memory=False)
# Number of distinct districts among the rows where `doi` is True.
print(data[data.doi == True].district.nunique())
# Seeded so the previewed row is the same on every run.
data.sample(random_state=8)

794


Unnamed: 0.1,Unnamed: 0,year,campus,campname,campischarter,district,distname,distischarter,rating_academic,rating_financial,...,yearpost,yearpre,pre5,pre4,pre3,pre2,pre1,post1,post2,post3
9679,13767,2013,113903002,LOVELADY J H H S,N,113903,LOVELADY ISD,N,M,,...,0.0,-5.0,1,0,0,0,0,0,0,0


In [5]:
# Column names of the average-score variables, grouped by subject.
# Math: grades 3-8 followed by Algebra.
math_tests = ['m_' + grade + '_avescore'
              for grade in ('3rd', '4th', '5th', '6th', '7th', '8th')]
math_tests.append('alg_avescore')
# Reading: grades 3-8 followed by English I.
reading_tests = ['r_' + grade + '_avescore'
                 for grade in ('3rd', '4th', '5th', '6th', '7th', '8th')]
reading_tests.append('eng1_avescore')
# Every tested subject: all math tests, all reading tests, then Biology.
subjects = math_tests + reading_tests + ['bio_avescore']

In [6]:
#convert year to datetime
df = data.reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
#add column year to index
df = df.set_index(['campus', 'year'])
#swap indexes
df.index = df.index.swaplevel(0,1)
df[['district', 'doi_year','treatpost', 'pre1']].sample(5, random_state = 8)

Unnamed: 0_level_0,Unnamed: 1_level_0,district,doi_year,treatpost,pre1
year,campus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01,126903001,126903,2017.0,True,0
2015-01-01,146905003,146905,2018.0,False,0
2012-01-01,101903117,101903,2018.0,False,0
2019-01-01,101920103,101920,2017.0,True,0
2012-01-01,31901131,31901,2018.0,False,0


In [7]:
def many_y_one_x(data, y_list, y_labels, x):
    """Regress each outcome in ``y_list`` on ``x`` and tabulate the results.

    For every outcome ``y`` a ``PanelOLS`` model ``y ~ 1 + x + EntityEffects``
    is fitted on the rows where ``y`` is not missing (infinite values are
    treated as missing), with standard errors clustered by both entity and
    time.

    Parameters
    ----------
    data : pandas.DataFrame
        Panel data passed to ``PanelOLS.from_formula``; it must contain ``x``
        and every column named in ``y_list``.
    y_list : iterable of str
        Outcome column names, one regression per name.
    y_labels : mapping
        Maps each outcome column name to the label shown in the table.
    x : str
        Name of the regressor whose coefficient is reported.

    Returns
    -------
    pandas.DataFrame
        One row per outcome with the columns ``Characteristic`` (label),
        ``Control`` (intercept), ``Difference`` (coefficient on ``x``),
        ``Std. Error`` and ``P-value``, all rounded to two decimals.
    """
    # Loop-invariant work, done once: map +/-inf to NaN so that dropna()
    # below removes those rows as well.
    cleaned = data.replace([np.inf, -np.inf], np.nan)
    # A boolean regressor is looked up under the name 'x[T.True]'
    # (presumably the formula engine's name for the True level).
    if str(data[x].dtypes) == 'bool':
        var = x + '[T.True]'
    else:
        var = x

    regs = []
    cons = []
    coef = []
    se = []
    pvalue = []

    for y in y_list:
        sample = cleaned.dropna(subset=[y])
        # The regressor comes from `x`; a hard-coded 'pre1' here made every
        # other value of `x` fail at the coefficient lookup below.
        formula = y + ' ~ + 1 + ' + x + ' + EntityEffects'
        print(formula)
        mod = PanelOLS.from_formula(formula, sample)
        result = mod.fit(cov_type='clustered',
                         cluster_entity=True, cluster_time=True)

        cons.append(result.params["Intercept"].round(2))
        coef.append(result.params[var].round(2))
        se.append(result.std_errors[var].round(2))
        pvalue.append(result.pvalues[var].round(2))
        regs.append(y_labels[y])

    return pd.DataFrame(
        {'Characteristic': regs,
         'Control': cons,
         'Difference': coef,
         'Std. Error': se,
         'P-value': pvalue,
         })


In [8]:
# Balance table for the geography outcomes listed in characteristics.geography.
district = many_y_one_x(
    data=df,
    y_list=characteristics.geography,
    y_labels=characteristics.labels,
    x='pre1',
)
district

type_urban ~ + 1 + pre1 + EntityEffects
type_suburban ~ + 1 + pre1 + EntityEffects
type_town ~ + 1 + pre1 + EntityEffects
type_rural ~ + 1 + pre1 + EntityEffects


Unnamed: 0,Characteristic,Control,Difference,Std. Error,P-value
0,Urban,0.28,-0.0,0.1,0.99
1,Suburban,0.44,0.0,0.02,0.97
2,Town,0.17,-0.0,0.03,0.97
3,Rural,0.12,0.0,0.05,0.98


In [9]:
# Quick look at the raw student-teacher ratio column.
data['students_teacher_ratio']

0        12.884018
1              NaN
2        13.655172
3         8.456555
4        10.099912
           ...    
49800    17.166667
49801    17.470588
49802    16.093750
49803     8.607595
49804    13.266332
Name: students_teacher_ratio, Length: 49805, dtype: float64

In [10]:
# This list is not used by the call below, which takes its outcomes from
# characteristics.teacher.
teacher_characteristics = [
    'teachers_tenure_ave',
    'teachers_turnover_ratio_d',
]
# Balance table for the teacher outcomes.
# NOTE(review): in the displayed result `teachers_exp_ave` is labelled
# "Ave. Experience in District" and `teachers_tenure_ave` is labelled
# "Ave. Experience Teaching" - the two labels look swapped; verify
# characteristics.labels.
teachers = many_y_one_x(
    data=df,
    y_list=characteristics.teacher,
    y_labels=characteristics.labels,
    x='pre1',
)
teachers

teachers_exp_ave ~ + 1 + pre1 + EntityEffects
teachers_tenure_ave ~ + 1 + pre1 + EntityEffects
teachers_turnover_ratio_d ~ + 1 + pre1 + EntityEffects
students_teacher_ratio ~ + 1 + pre1 + EntityEffects


Unnamed: 0,Characteristic,Control,Difference,Std. Error,P-value
0,Ave. Experience in District,11.68,-0.05,0.22,0.81
1,Ave. Experience Teaching,7.78,-0.01,0.15,0.97
2,Teacher Turnover Ratio,16.04,-0.1,1.11,0.93
3,Student-Teacher Ratio,14.4,-0.08,0.37,0.82


In [11]:
# Balance table for the student outcomes listed in characteristics.student.
students = many_y_one_x(
    data=df,
    y_list=characteristics.student,
    y_labels=characteristics.labels,
    x='pre1',
)
students

students_hisp ~ + 1 + pre1 + EntityEffects
students_white ~ + 1 + pre1 + EntityEffects
students_black ~ + 1 + pre1 + EntityEffects
students_frpl ~ + 1 + pre1 + EntityEffects
avescores ~ + 1 + pre1 + EntityEffects


Unnamed: 0,Characteristic,Control,Difference,Std. Error,P-value
0,Percent Hispanic,0.46,0.0,0.01,0.81
1,Percent White,0.36,-0.0,0.02,0.97
2,Percent Black,0.12,-0.0,0.01,0.82
3,Percent Econ. Disadvantaged,0.59,0.0,0.02,0.93
4,Average STAAR Performance (Std.),0.14,0.01,0.08,0.9


In [12]:
# Write the three balance tables into the Excel workbook.
balance_file = os.path.join(table_path, 'balance_controls.xlsx')
dfs = [district, teachers, students]
# First worksheet row for each table, in the same order as `dfs`
# (presumably matching the layout of the workbook template - verify).
rows = [5, 14, 21]
# Row counts for doi == 0 and doi == 1. These are lengths of `data`, i.e.
# counts of rows in the loaded file rather than of distinct districts.
tables.n_to_excel(file=balance_file,
                  col=2, row=4, n=len(data[data.doi == 0]))
tables.n_to_excel(file=balance_file,
                  col=3, row=4, n=len(data[data.doi == 1]))
# The loop variable must not be called `df`: that name holds the panel
# DataFrame used by the regression cells, and rebinding it here would break
# re-running those cells after this one.
for balance_table, row in zip(dfs, rows):
    tables.var_diff_to_excel(file=balance_file,
                             df=balance_table,
                             control_col='Control',
                             diff_col='Difference',
                             se_col='Std. Error', pvalue_col='P-value',
                             start_col=2, start_row=row)