In [1]:
import sys
sys.path.append("../")

import pandas as pd
import os
import statsmodels.formula.api as sm
import numpy as np
import engarde.decorators as ed


from library import regulations
from library import characteristics
from library import analysis
from library import tables
from library import test_data

In [2]:
# Locations of the cleaned district-level data and of the output tables.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run unchanged on a machine with this directory layout.
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
table_path = '/Users/kylieleblancKylie/domino/dofis/results/Who Needs Rules/'
data = pd.read_csv(os.path.join(data_path, 'clean', 'master_data_district.csv'),
                   sep=",", low_memory=False)
# Keep only the 2016 rows. Take an explicit copy so that columns added to
# `data` in later cells are written to an independent frame instead of a
# filtered slice of the full read (which raises SettingWithCopyWarning).
data = data[data.year == 2016].copy()
data.head()


Unnamed: 0.1,Unnamed: 0,year,district,distname,distischarter,rating_academic,rating_financial,type,type_description,cntyname,...,elem,middle_math,middle_reading,middle_science,algebra,biology,eng1,math,reading,avescores
4901,4901,2016,1902,CAYUGA ISD,N,M,Pass,H,RURAL,ANDERSON,...,1.282771,0.692503,0.844028,0.900302,1.881695,1.034905,1.097759,0.879946,1.17109,1.074875
4902,4902,2016,1903,ELKHART ISD,N,M,Pass,G,NON-METROPOLITAN STABLE,ANDERSON,...,0.8562,-0.172723,0.287515,1.239979,0.258778,0.14946,0.786351,0.401804,0.511792,0.475319
4903,4903,2016,1904,FRANKSTON ISD,N,M,Pass,H,RURAL,ANDERSON,...,0.539407,0.588836,0.881549,0.708117,0.628674,1.538693,0.489772,0.528941,0.745659,0.701539
4904,4904,2016,1906,NECHES ISD,N,M,Pass,H,RURAL,ANDERSON,...,0.38374,0.956068,0.58285,1.472389,0.180175,0.760112,-0.454338,0.824381,0.328818,0.622125
4905,4905,2016,1907,PALESTINE ISD,N,M,Pass,E,INDEPENDENT TOWN,ANDERSON,...,-0.523642,0.317496,-0.665815,0.256704,-0.203591,0.195259,-0.493882,-0.0051,-0.6927,-0.262562


In [3]:
# TODO: is -999 a "missing" code or NA? Decide whether the two are worth
# differentiating.
# Number of districts per doi_year, ordered by year.
data['doi_year'].value_counts().sort_index()

2016.0      3
2017.0    177
2018.0    505
2019.0    112
2020.0     14
Name: doi_year, dtype: int64

In [4]:
# table_col repeats doi_year where one is recorded and is 0 where doi_year is
# missing (NaN).
has_doi_year = data['doi_year'].notna()
data['table_col'] = np.where(has_doi_year, data['doi_year'], 0)
# Spot-check a handful of random rows (unseeded, so the rows shown vary).
data[['doi', 'doi_year', 'table_col']].sample(5)

Unnamed: 0,doi,doi_year,table_col
5641,True,2019.0,2019.0
5814,True,2018.0,2018.0
5302,False,,0.0
6001,True,2018.0,2018.0
5841,False,,0.0


# Geography

In [6]:
# Column keys for the summary tables, in display order: two string-keyed
# comparison groups ('texas', 'tps') around the integer doi_year cohorts.
years = [
    'texas',
    2017,
    2018,
    2019,
    'tps',
]

In [12]:
# Count districts in each detailed district-type category.
data['type_description'].value_counts()

RURAL                            459
CHARTER SCHOOL DISTRICTS         183
NON-METROPOLITAN STABLE          174
OTHER CENTRAL CITY SUBURBAN      161
MAJOR SUBURBAN                    79
INDEPENDENT TOWN                  68
OTHER CENTRAL CITY                41
NON-METROPOLITAN FAST GROWING     31
MAJOR URBAN                       11
Name: type_description, dtype: int64

In [11]:
# Count districts in each value of the `geography` column.
data['geography'].value_counts()

Rural       459
Town        273
Suburban    240
Urban        52
Name: geography, dtype: int64

In [13]:
# Count charter ('Y') versus non-charter ('N') districts.
data['distischarter'].value_counts()

N    1024
Y     183
Name: distischarter, dtype: int64

In [14]:
# Row labels: each geography characteristic's display label, followed by an
# empty label for the standard-deviation row printed beneath it.
row_labels = []
for var in characteristics.geography:
    row_labels.extend([characteristics.labels[var], ''])
geo_table = pd.DataFrame({'Characteristics': row_labels})

for yr in years:
    # Pick the group of districts summarised in this column:
    #   'texas' -> all non-charter districts
    #   'tps'   -> non-charter districts with doi == False
    #   a year  -> districts whose doi_year equals that year
    if yr == 'texas':
        subset = data[data.distischarter == 'N']
    elif yr == 'tps':
        subset = data[(data.doi == False) & (data.distischarter == 'N')]
    else:
        subset = data[data.doi_year == yr]
    # One numeric mean row and one bracketed "[sd]" string row per
    # characteristic, both rounded to two decimals.
    column = []
    for var in characteristics.geography:
        values = subset[var]
        column.append(round(values.mean(), 2))
        column.append('[%s]' % str(round(values.std(), 2)))
    geo_table[yr] = column

geo_table

Unnamed: 0,Characteristics,texas,2017,2018,2019,tps
0,Urban,0.05,0.1,0.04,0.04,0.06
1,,[0.22],[0.3],[0.19],[0.19],[0.25]
2,Suburban,0.23,0.32,0.22,0.21,0.2
3,,[0.42],[0.47],[0.41],[0.41],[0.4]
4,Town,0.27,0.27,0.28,0.26,0.25
5,,[0.44],[0.44],[0.45],[0.44],[0.44]
6,Rural,0.45,0.32,0.47,0.49,0.49
7,,[0.5],[0.47],[0.5],[0.5],[0.5]


# Teacher characteristics

In [15]:
# Label column: display label for every teacher characteristic, each followed
# by a blank entry that lines up with its standard-deviation row.
teacher_vars = characteristics.teacher
teacher_table = pd.DataFrame(
    {'Characteristics': [cell
                         for var in teacher_vars
                         for cell in (characteristics.labels[var], '')]})

is_non_charter = data.distischarter == 'N'
for yr in years:
    # 'texas' = non-charter districts, 'tps' = non-charter districts with
    # doi == False, anything else = districts with that doi_year.
    if yr == 'texas':
        mask = is_non_charter
    elif yr == 'tps':
        mask = (data.doi == False) & is_non_charter
    else:
        mask = data.doi_year == yr
    group = data[mask]
    # Alternate rounded mean and bracketed rounded SD for each characteristic.
    stats = []
    for var in teacher_vars:
        stats += [round(group[var].mean(), 2),
                  '[' + str(round(group[var].std(), 2)) + ']']
    teacher_table[yr] = stats

teacher_table

Unnamed: 0,Characteristics,texas,2017,2018,2019,tps
0,Ave. Experience Teaching,7.12,7.3,7.18,7.04,6.9
1,,[1.98],[1.65],[1.89],[1.98],[2.4]
2,Teacher Turnover Ratio,19.36,17.78,18.75,19.11,22.42
3,,[9.98],[6.94],[9.08],[8.53],[14.11]
4,Student-Teacher Ratio,12.69,13.36,12.49,12.75,12.53
5,,[2.47],[2.3],[2.35],[2.36],[2.74]


# Student characteristics

In [16]:
# First column of the table: a display label per student characteristic with
# an empty string after each one (the row that will hold the SD).
label_column = []
for var in characteristics.student:
    label_column.append(characteristics.labels[var])
    label_column.append('')
student_table = pd.DataFrame(data={'Characteristics': label_column})

for yr in years:
    # Integer entries select a doi_year cohort; the two string keys select
    # non-charter comparison groups.
    if yr not in ('texas', 'tps'):
        sample = data[data.doi_year == yr]
    elif yr == 'texas':
        sample = data[data.distischarter == 'N']
    else:
        sample = data[(data.doi == False) & (data.distischarter == 'N')]
    # Mean (number) then "[sd]" (string), both rounded to 2 decimals.
    entries = []
    for var in characteristics.student:
        mean_value = round(sample[var].mean(), 2)
        sd_text = '[' + str(round(sample[var].std(), 2)) + ']'
        entries.append(mean_value)
        entries.append(sd_text)
    student_table[yr] = entries

student_table

Unnamed: 0,Characteristics,texas,2017,2018,2019,tps
0,Percent Hispanic,0.39,0.37,0.34,0.39,0.53
1,,[0.27],[0.24],[0.24],[0.28],[0.31]
2,Percent White,0.5,0.51,0.55,0.51,0.39
3,,[0.27],[0.24],[0.25],[0.28],[0.29]
4,Percent Black,0.07,0.07,0.07,0.07,0.06
5,,[0.1],[0.09],[0.11],[0.09],[0.11]
6,Percent Econ. Disadvantaged,0.57,0.53,0.56,0.57,0.66
7,,[0.19],[0.2],[0.18],[0.19],[0.19]
8,Average STAAR Performance (Std.),0.25,0.44,0.31,0.19,-0.06
9,,[0.73],[0.78],[0.68],[0.73],[0.7]


# Export tables to Excel

In [17]:
# Inspect the column labels before export: the cohort columns are keyed by
# integer years while 'texas' and 'tps' are string keys, mirroring `years`.
geo_table.columns

Index(['Characteristics', 'texas', 2017, 2018, 2019, 'tps'], dtype='object')

In [18]:
# Send each summary table to the same workbook via the project helper, passing
# the `years` columns, a shared start column (2) and a per-table start row.
# NOTE(review): the start rows presumably match a pre-formatted template
# workbook; confirm against the xlsx file.
out_file = table_path + 'table2_district_characteristics_by_year.xlsx'
panels = [(geo_table, 4), (teacher_table, 13), (student_table, 20)]
for panel, first_row in panels:
    tables.df_to_excel(file=out_file, df=panel, df_columns=years,
                       start_col=2, start_row=first_row)

In [24]:
# Pass the number of districts in each table column's group to the project
# helper, all on row 31, in columns 2-6 (same order as `years`).
n_file = table_path + 'table2_district_characteristics_by_year.xlsx'
non_charter = data.distischarter == 'N'
group_masks = [
    (2, non_charter),                          # 'texas'
    (3, data.doi_year == years[1]),            # first cohort year
    (4, data.doi_year == years[2]),            # second cohort year
    (5, data.doi_year == years[3]),            # third cohort year
    (6, (data.doi == False) & non_charter),    # 'tps'
]
for col_idx, mask in group_masks:
    tables.n_to_excel(file=n_file, col=col_idx, row=31, n=len(data[mask]))