In [1]:
import sys
sys.path.append("../")

import pandas as pd
import os
import statsmodels.formula.api as sm
import numpy as np
import engarde.decorators as ed


from library import regulations
from library import characteristics
from library import analysis
from library import tables
from library import test_data

In [2]:
# Input-data and output-table locations. The defaults are the original
# author's local directories; set DOFIS_DATA_PATH / DOFIS_TABLE_PATH in the
# environment to run the notebook elsewhere without editing this cell.
data_path = os.environ.get('DOFIS_DATA_PATH',
                           '/Users/kylieleblancKylie/domino/dofis/data/')
# Later cells build file names with `table_path + <file name>`, so the
# directory must end with a path separator. Joining with '' appends one only
# when it is missing, leaving the default value unchanged.
table_path = os.path.join(
    os.environ.get('DOFIS_TABLE_PATH',
                   '/Users/kylieleblancKylie/domino/dofis/results/Who Needs Rules/'),
    '')

# Load the cleaned master data file. low_memory=False avoids the mixed-type
# inference that can result from pandas parsing the file in chunks.
data = pd.read_csv(os.path.join(data_path, 'clean', 'master_data_school.csv'),
                   sep=",", low_memory=False)
# Keep only rows whose `year` is 2016; every table below is built from this subset.
data = data[data.year == 2016]
data.head()

Unnamed: 0.1,Unnamed: 0,year,campus,campname,campischarter,district,distname,distischarter,rating_academic,rating_financial,...,district_status,type_urban,type_suburban,type_town,type_rural,eligible,teachers_nodegree,teachers_badegree,teachers_msdegree,teachers_phddegree
34304,34304,2016,1902001,CAYUGA H S,N,1902,CAYUGA ISD,N,M,Pass,...,doi,0,0,0,1,1,0.0,0.92381,0.07619,0.0
34305,34305,2016,1902041,CAYUGA MIDDLE,N,1902,CAYUGA ISD,N,M,Pass,...,doi,0,0,0,1,1,0.0,0.787611,0.212389,0.0
34306,34306,2016,1902103,CAYUGA EL,N,1902,CAYUGA ISD,N,M,Pass,...,doi,0,0,0,1,1,0.0,0.954751,0.045249,0.0
34307,34307,2016,1903001,ELKHART H S,N,1903,ELKHART ISD,N,M,Pass,...,doi,0,0,1,0,1,0.0,0.823353,0.176647,0.0
34308,34308,2016,1903041,ELKHART MIDDLE,N,1903,ELKHART ISD,N,M,Pass,...,doi,0,0,1,0,1,0.0,0.865217,0.134783,0.0


In [3]:
# Number of rows per doi_year value, ordered by year. value_counts() drops
# NaN by default, so rows without a doi_year are not shown here.
# TODO: is -999 missing or NA? Worth differentiating?
data['doi_year'].value_counts().sort_index()

2015.0       1
2016.0     928
2017.0    4232
2018.0     948
2019.0     204
Name: doi_year, dtype: int64

# Geography

In [4]:
# Row labels: each geography characteristic's display label, followed by a
# blank label for the standard-deviation row placed beneath its mean.
row_labels = []
for geo_char in characteristics.geography:
    row_labels += [characteristics.labels[geo_char], '']
geo_table = pd.DataFrame({'Characteristics': row_labels})

# One column per doi_year cohort: for every characteristic, the mean rounded
# to two decimals and then the rounded standard deviation in square brackets.
for cohort_year in (2016, 2017, 2018):
    cohort = data.loc[data.doi_year == cohort_year]
    column = []
    for geo_char in characteristics.geography:
        column.append(round(cohort[geo_char].mean(), 2))
        spread = round(cohort[geo_char].std(), 2)
        column.append('[' + str(spread) + ']')
    geo_table[cohort_year] = column

geo_table

Unnamed: 0,Characteristics,2016,2017,2018
0,Urban,0.38,0.26,0.28
1,,[0.48],[0.44],[0.45]
2,Suburban,0.48,0.42,0.42
3,,[0.5],[0.49],[0.49]
4,Town,0.09,0.19,0.17
5,,[0.29],[0.39],[0.37]
6,Rural,0.05,0.13,0.14
7,,[0.22],[0.34],[0.35]


# Teacher characteristics

In [5]:
# Row labels: each teacher characteristic's display label, followed by a
# blank label for the standard-deviation row placed beneath its mean.
row_labels = []
for teacher_char in characteristics.teacher:
    row_labels += [characteristics.labels[teacher_char], '']
teacher_table = pd.DataFrame({'Characteristics': row_labels})

# One column per doi_year cohort: for every characteristic, the mean rounded
# to two decimals and then the rounded standard deviation in square brackets.
for cohort_year in (2016, 2017, 2018):
    cohort = data.loc[data.doi_year == cohort_year]
    column = []
    for teacher_char in characteristics.teacher:
        column.append(round(cohort[teacher_char].mean(), 2))
        spread = round(cohort[teacher_char].std(), 2)
        column.append('[' + str(spread) + ']')
    teacher_table[cohort_year] = column

teacher_table

Unnamed: 0,Characteristics,2016,2017,2018
0,Ave. Experience Teaching,7.82,7.53,7.7
1,,[2.76],[2.67],[2.75]
2,Teacher Turnover Ratio,15.2,16.77,17.16
3,,[4.75],[5.9],[6.92]
4,Student-Teacher Ratio,14.78,14.09,14.16
5,,[4.17],[3.67],[5.66]


# Student characteristics

In [6]:
# Row labels: each student characteristic's display label, followed by a
# blank label for the standard-deviation row placed beneath its mean.
row_labels = []
for student_char in characteristics.student:
    row_labels += [characteristics.labels[student_char], '']
student_table = pd.DataFrame({'Characteristics': row_labels})

# One column per doi_year cohort: for every characteristic, the mean rounded
# to two decimals and then the rounded standard deviation in square brackets.
for cohort_year in (2016, 2017, 2018):
    cohort = data.loc[data.doi_year == cohort_year]
    column = []
    for student_char in characteristics.student:
        column.append(round(cohort[student_char].mean(), 2))
        spread = round(cohort[student_char].std(), 2)
        column.append('[' + str(spread) + ']')
    student_table[cohort_year] = column

student_table

Unnamed: 0,Characteristics,2016,2017,2018
0,Percent Hispanic,0.53,0.44,0.48
1,,[0.29],[0.28],[0.28]
2,Percent White,0.29,0.37,0.36
3,,[0.25],[0.28],[0.27]
4,Percent Black,0.11,0.13,0.11
5,,[0.12],[0.16],[0.15]
6,Percent Econ. Disadvantaged,0.59,0.59,0.58
7,,[0.29],[0.26],[0.25]
8,Average STAAR Performance (Std.),0.29,0.18,0.13
9,,[0.99],[0.88],[0.81]


# Export to Excel table

In [7]:
# Inspect the column labels before exporting: the cohort columns are keyed by
# the integers 2016, 2017 and 2018 (not strings), which is how the export
# cell selects them via its df_columns argument.
geo_table.columns

Index(['Characteristics', 2016, 2017, 2018], dtype='object')

In [8]:
# Send the three characteristic panels to the same workbook, each at its own
# starting row. tables.df_to_excel is a project helper; presumably it writes
# the listed columns beginning at (start_row, start_col) -- confirm in
# library/tables.
table2_file = table_path + 'table2_characteristics_by_year.xlsx'
panels = [(geo_table, 4), (teacher_table, 13), (student_table, 20)]
for panel, first_row in panels:
    tables.df_to_excel(file=table2_file,
                       df=panel,
                       df_columns=[2016, 2017, 2018],
                       start_col=2,
                       start_row=first_row)

In [9]:
# Record the number of rows in each doi_year cohort on row 31 of the
# workbook: 2016 -> column 2, 2017 -> column 3, 2018 -> column 4.
# tables.n_to_excel is a project helper; see library/tables for how the value
# is written.
table2_file = table_path + 'table2_characteristics_by_year.xlsx'
for sheet_col, cohort_year in enumerate((2016, 2017, 2018), start=2):
    cohort_size = len(data[data.doi_year == cohort_year])
    tables.n_to_excel(file=table2_file, col=sheet_col, row=31, n=cohort_size)