In [1]:
import sys
import os
import csv
import pandas as pd
from library import clean_tea
from library import clean_tea_schools
import os
import numpy as np
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

In [2]:
# Raise pandas' display limits to 500 rows / 500 columns so wide TEA tables
# are not truncated when a frame is shown.
pd.set_option('display.max_rows', 500, 'display.max_columns', 500)

In [3]:
# School-year key passed to the clean_tea loaders below.
year = "yr1617"

# Basic Descriptives - DREF
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html
Steps for cleaning dref can be found in cleaning.py.

In [19]:
# Load the district reference table for `year` and pass it through
# fix_duplicate_distname (presumably disambiguates repeated district names
# using the county column -- confirm in clean_tea_schools), then preview it.
dref = clean_tea_schools.fix_duplicate_distname(
    clean_tea.clean_dref(year=year),
    distname_col='distname',
    cntyname_col='cntyname',
)
dref.head()

There are  1203 districts in dref


Unnamed: 0,district,distname,distischarter,cntyname,rating_academic,rating_financial,eligible
0,1902,CAYUGA ISD,N,ANDERSON,M,Pass,True
1,1903,ELKHART ISD,N,ANDERSON,M,Pass,True
2,1904,FRANKSTON ISD,N,ANDERSON,M,Pass,True
3,1906,NECHES ISD,N,ANDERSON,M,Pass,True
4,1907,PALESTINE ISD,N,ANDERSON,M,Pass,True


# Demographic data - DDEM
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [20]:
# Load the district-level teacher and student counts for `year` and preview
# the first five rows.
ddem = clean_tea.clean_ddem(year=year)
ddem.head(n=5)

There are  1203 districts in ddem


Unnamed: 0,district,teachers_num,teachers_new_num,teachers_exp_ave,teachers_tenure_ave,teachers_turnover_ratio,teachers_nodegree_num,teachers_badegree_num,teachers_msdegree_num,teachers_phddegree_num,students_num,students_frpl_num,students_hisp_num,students_white_num,students_black_num,students_amind_num,students_asian_num,students_paci_num,students_tworaces_num,teachers_turnover_denom,teachers_turnover_num
0,1902,105.5,1.0,15.4,7.8,7.3,0.0,48.4,4.0,0,576,192,42,479,27,0,3,0,25,54.4,4.0
1,1903,179.6,2.0,14.9,9.4,11.0,0.0,84.0,19.0,0,1267,673,137,1011,67,1,11,2,38,100.3,11.0
2,1904,121.0,2.0,13.7,6.4,26.0,0.0,42.0,17.0,0,846,446,82,648,76,6,6,0,28,58.1,15.1
3,1906,64.2,0.3,11.9,6.6,7.2,0.0,35.3,0.0,0,377,172,51,286,28,1,1,0,10,34.5,2.5
4,1907,481.0,23.1,11.2,6.3,21.8,5.3,225.8,32.4,0,3453,2587,1382,974,949,10,28,4,106,270.3,58.9


# District type - DTYPE

data from: https://tea.texas.gov/acctres/analyze/years.html 
labels from: https://tea.texas.gov/acctres/analyze/1617/gloss1617.html

In [21]:
# Load the district type classification for `year` and preview the first
# five rows.
# NOTE(review): the preview shows `district` as floats (e.g. 57816.0) while
# dref/ddem show integers -- confirm the dtype before merging on district.
dtype = clean_tea.clean_dtype(year=year)
dtype.head(n=5)

There are  1206 districts in dref


Unnamed: 0,district,type,type_description
0,57816.0,I,CHARTER SCHOOL DISTRICTS
1,57829.0,I,CHARTER SCHOOL DISTRICTS
2,101871.0,I,CHARTER SCHOOL DISTRICTS
3,109901.0,H,RURAL
4,95901.0,H,RURAL


# Number of Schools
data from https://tea.texas.gov/perfreport/tapr/index.html

In [22]:
# Number of schools per district. Uses the notebook-wide `year` setting
# instead of a hard-coded 'yr1617', so changing `year` in the config cell is
# honoured here like in the other loaders.
cref = clean_tea.clean_cref_numschools(year)
# Same duplicate-district-name handling that is applied to dref, so both
# frames can be joined on `distname`.
cref = clean_tea_schools.fix_duplicate_distname(cref, distname_col='distname', cntyname_col='cntyname')

# Root data folder; files are read from <data_path>/tea/... below.
# Set the DOFIS_DATA_PATH environment variable to run on another machine;
# the original local path remains the default so existing setups keep working.
data_path = os.environ.get('DOFIS_DATA_PATH', '/Users/kylieleblancKylie/domino/dofis/data/')
def filter_and_rename_cols(df, dict):
    """
    Select a subset of columns from a data frame and rename them.

    Arguments:
    df = source data frame (a new frame is returned; df itself is not changed)
    dict = mapping of existing column name -> new column name; only the
           columns named by its keys are kept

    Returns a new data frame holding the kept columns under their new names.
    The index labels of the result are converted to strings.
    """
    wanted_cols = [col for col in dict.keys()]
    subset = df[wanted_cols]
    # index=str stringifies every index label, exactly as the rename call always did.
    return subset.rename(columns=dict, index=str)

def clean_cref_numschools(year):
    """
    Count the campus rows listed under each district name in the TEA campus
    reference (CREF) file.

    Source files: https://rptsvr1.tea.texas.gov/perfreport/tapr/2017/download/DownloadData.html

    :param year: school-year key such as 'yr1617'; it selects the sub-folder
        and the file name to read. 'yr1718' is read from the 'yr1617' folder.
    :return: data frame indexed by DISTNAME with one column, CAMPUS, holding
        the number of campus rows for that district name.
    :raises ValueError: if year is not 'yr1112', 'yr1213', or a key that sorts
        at or after 'yr1314' (no file name is known for it).
    """
    if year == 'yr1718':
        # NOTE(review): presumably no 2017-18 CREF file is available, so the
        # previous year is used -- confirm.
        year = 'yr1617'

    # The file name changed between releases. The chain is exhaustive so an
    # unsupported year fails with a clear message instead of an unbound
    # `filename` further down.
    if year == 'yr1112':
        filename = 'cref.dat'
    elif year == 'yr1213':
        filename = 'CREF.txt'
    elif year >= 'yr1314':
        filename = 'CREF.dat'
    else:
        raise ValueError("No CREF file name known for year %r" % (year,))

    cref = pd.read_csv(os.path.join(data_path, 'tea', 'cref', year, filename), sep=",")
    # NOTE(review): grouping is by district name only, so two districts that
    # share a name are counted together -- confirm whether that is intended.
    return pd.DataFrame(cref.groupby(cref.DISTNAME)['CAMPUS'].count())

# Cross-check of the total campus count using the local clean_cref_numschools.
# The result is kept under its own name: the local function returns a frame
# indexed by DISTNAME with a CAMPUS column, and assigning it to `cref` would
# replace the frame (with `distname` / `schools_num`) that the merge cells
# further down rely on.
cref_check = clean_cref_numschools(year)
cref_check.CAMPUS.sum()

# Scores - dscores
data from: https://tea.texas.gov/student.assessment/staar/aggregate/
labels from: https://tea.texas.gov/student.assessment/staar/variables/

In [9]:
# Score files to load: the six grade-level tests first, then the five
# end-of-course subjects, in the order they are merged below.
subject_list = (
    ['3rd', '4th', '5th', '6th', '7th', '8th']
    + ['Algebra', 'Biology', 'EnglishI', 'EnglishII', 'USHistory']
)

In [10]:
# Start from an empty frame that only has the join key, then outer-merge each
# subject's scores onto it so a district missing from one subject is still kept.
dscores = pd.DataFrame(columns=['district'])
for subject in subject_list:
    dscores_subject = clean_tea.clean_scores(year, subject)
    dscores = pd.merge(dscores, dscores_subject, on='district', how='outer')
dscores.head()

There are  1160 districts in  3rd dataset.
There are  1161 districts in  4th dataset.
There are  1162 districts in  5th dataset.
There are  1161 districts in  6th dataset.
There are  1150 districts in  7th dataset.
There are  1145 districts in  8th dataset.
There are  1121 districts in  Algebra dataset.
There are  1085 districts in  Biology dataset.
There are  1092 districts in  EnglishI dataset.
There are  1089 districts in  EnglishII dataset.
There are  1084 districts in  USHistory dataset.


Unnamed: 0,district,r_3rd_avescore,r_3rd_numtakers,m_3rd_avescore,m_3rd_numtakers,r_4th_avescore,r_4th_numtakers,m_4th_avescore,m_4th_numtakers,r_5th_avescore,r_5th_numtakers,m_5th_avescore,m_5th_numtakers,r_6th_avescore,r_6th_numtakers,m_6th_avescore,m_6th_numtakers,r_7th_avescore,r_7th_numtakers,m_7th_avescore,m_7th_numtakers,r_8th_avescore,r_8th_numtakers,m_8th_avescore,m_8th_numtakers,alg_avescore,alg_numtakers,bio_avescore,bio_numtakers,eng1_avescore,eng1_numtakers,eng2_avescore,eng2_numtakers,us_avescore,us_numtakers
0,1902,1421.0,44.0,1490.0,44.0,1567.0,36.0,1583.0,38.0,1583.0,37.0,1671.0,37.0,1634.0,44.0,1662.0,44.0,1641.0,51.0,1646.0,51.0,1713.0,42.0,1698.0,42.0,4074.0,39.0,4080.0,36.0,3928.0,41.0,4088.0,51.0,4047.0,38.0
1,1903,1465.0,73.0,1500.0,73.0,1530.0,78.0,1616.0,78.0,1578.0,83.0,1695.0,83.0,1607.0,88.0,1631.0,88.0,1651.0,97.0,1649.0,97.0,1663.0,109.0,1628.0,92.0,3990.0,103.0,4065.0,103.0,4022.0,106.0,4105.0,99.0,4056.0,84.0
2,1904,1414.0,61.0,1450.0,61.0,1542.0,56.0,1589.0,56.0,1608.0,60.0,1644.0,61.0,1625.0,75.0,1692.0,75.0,1669.0,65.0,1665.0,65.0,1711.0,53.0,1734.0,53.0,4085.0,58.0,4225.0,71.0,3992.0,67.0,4144.0,71.0,4182.0,71.0
3,1906,1516.0,22.0,1538.0,22.0,1508.0,24.0,1627.0,24.0,1540.0,31.0,1675.0,31.0,1639.0,34.0,1695.0,34.0,1675.0,30.0,1727.0,30.0,1712.0,25.0,1677.0,25.0,4087.0,39.0,4300.0,39.0,3966.0,44.0,3875.0,28.0,4091.0,24.0
4,1907,1413.0,223.0,1439.0,244.0,1471.0,257.0,1519.0,257.0,1507.0,250.0,1588.0,250.0,1527.0,242.0,1607.0,244.0,1595.0,241.0,1637.0,241.0,1635.0,240.0,1694.0,239.0,4017.0,280.0,4011.0,272.0,3822.0,331.0,3791.0,292.0,4096.0,224.0


# Merge

In [23]:
descriptives = ddem.merge(dref, on='district', how='inner')
# An inner merge silently drops districts present in only one frame, so the
# overlap is verified before the message below claims it.
assert len(descriptives) == len(ddem) == len(dref), (
    "ddem and dref do not overlap perfectly: %d merged rows from %d ddem rows and %d dref rows"
    % (len(descriptives), len(ddem), len(dref))
)
print("There is perfect overlap between ddem and dref. New dataset contains ", len(descriptives), " districts.")
# Names / ids present at this stage; a later cell uses correct_district to
# spot ids that only appear once the score data is merged in.
correct_distname = list(descriptives.distname)
correct_district = list(descriptives.district)

There is perfect overlap between ddem and dref. New dataset contains  1203  districts.


In [24]:
# Total schools before and after the merge are printed so any district lost
# by the inner join on name shows up as a changed sum.
print(cref['schools_num'].sum())
descriptives = pd.merge(descriptives, cref, how='inner', on='distname')
print("New dataset contains ", len(descriptives), " districts.")
print(descriptives['schools_num'].sum())

8757
New dataset contains  1203  districts.
8757


In [None]:
# Outer-merge the scores; indicator=True adds a `_merge` column recording
# whether each row came from the left frame, the right frame, or both.
descriptives = (
    descriptives
    .merge(dscores, on='district', how='outer', indicator=True)
    .dropna(how='all')  # drops only rows in which every column is missing
)
print(len(descriptives))

In [None]:
# Report every merged row whose district id was not in the ddem/dref merge.
# The three columns are iterated together, so the frame is not re-filtered for
# each row, and an explicit null check replaces the former bare `except:`
# around int(<Series>) (deprecated in pandas >= 2.1), which also hid
# unrelated errors.
known_districts = set(correct_district)
for num, name, takers in zip(descriptives.district,
                             descriptives.distname,
                             descriptives.r_3rd_numtakers):
    if num in known_districts:
        continue
    print('number: ', num)
    print('name: ', name)
    print('3rd Testers: ', 'missing' if pd.isnull(takers) else int(takers))
    print()

In [None]:
# Sum of the r_3rd_numtakers column over all rows of the merged frame.
descriptives.r_3rd_numtakers.sum()

In [None]:
# Rows that came from only one side of the scores merge.
descriptives.loc[descriptives['_merge'] != 'both']

In [None]:
print('Some charter/specialty schools do not have any test scores. We can drop these from the dataset.')
# District names that are present in the descriptive data but absent from the scores.
descriptives.loc[descriptives['_merge'] == 'left_only', 'distname']

In [None]:
print('All schools with test scores have demographic and basic information')
# District names that appear in the scores but not in the descriptive data;
# the message above holds only if this selection is empty.
descriptives.loc[descriptives['_merge'] == 'right_only', 'distname']

In [None]:
# Keep every row except those that exist only in the scores data.
descriptives = descriptives.loc[descriptives['_merge'] != 'right_only']

# Descriptives of Missing Data

In [None]:
# Average-score columns for the grade 3-6 tests.
elem_tests = [
    descriptives.r_3rd_avescore, descriptives.m_3rd_avescore,
    descriptives.r_4th_avescore, descriptives.m_4th_avescore,
    descriptives.r_5th_avescore, descriptives.m_5th_avescore,
    descriptives.r_6th_avescore, descriptives.m_6th_avescore,
]
# Every district name with a null in at least one of those columns, kept once
# each in order of first appearance (dict keys preserve insertion order).
missing_elem_tests = list(dict.fromkeys(
    name
    for scores in elem_tests
    for name in descriptives.distname[scores.isnull()]
))
print(len(missing_elem_tests), 'districts missing elementary test scores.')
missing_elem_tests

In [None]:
missing_hs_tests = []
# The original branched on `year in ['yr1112', 'yr1214']`, but both branches
# built this identical list, so the conditional had no effect and is removed.
# NOTE(review): if the early years were meant to use a different set of
# tests, reinstate a branch here with the intended columns.
hs_tests = [descriptives.alg_avescore, descriptives.bio_avescore,
            descriptives.eng1_avescore, descriptives.eng2_avescore,
            descriptives.us_avescore]
for test in hs_tests:
    # isnull() already yields a boolean mask; comparing it to True is redundant.
    for district in list(descriptives.distname[test.isnull()]):
        if district not in missing_hs_tests:  # add unique districts
            missing_hs_tests.append(district)
print(len(missing_hs_tests), 'districts missing hs test scores')
missing_hs_tests

In [None]:
# Districts that appear in both the elementary and the high-school missing lists.
missing_both = [value for value in missing_elem_tests
                if value in missing_hs_tests]
print(len(missing_both), 'districts missing elem and hs test scores')
# Both conditions are combined into a single mask over the full frame.
# Indexing the already-filtered frame with a mask built from the unfiltered
# one makes pandas reindex the mask and emit a UserWarning.
in_missing_both = descriptives.distname.isin(missing_both)
is_charter = descriptives.distischarter == 'Y'
print('of these',
      int((in_missing_both & is_charter).sum()),
      'are charters.')
missing_both

In [None]:
# Sum of the r_5th_numtakers column over all rows of the merged frame.
descriptives['r_5th_numtakers'].sum()

In [None]:
# Read the raw 5th-grade score file for a sanity check. The path is built from
# the notebook-level `data_path` instead of repeating the absolute folder.
# NOTE: this rebinds `dscores` (previously the merged per-subject scores) to
# the raw file; the cells below inspect the raw columns.
dscores = pd.read_csv(os.path.join(data_path, 'tea', 'dscores', '5th', 'dfy17e5.dat'), sep=",")

In [None]:
# Raw score rows for district id 35902.
dscores.loc[dscores['DISTRICT'] == 35902]

In [None]:
# Sum of the raw r_all_d column.
dscores['r_all_d'].sum()

In [None]:
# Sum of the raw r_all_docs_n column.
dscores['r_all_docs_n'].sum()

In [None]:
# Mean of the raw r_all_rs column.
dscores['r_all_rs'].mean()

In [None]:
# Reload the cleaned 3rd-grade scores on their own for comparison with the
# merged frame. Uses the notebook-wide `year` instead of a hard-coded
# 'yr1617' so the check follows the configured year.
dscores_subject = clean_tea.clean_scores(year, '3rd')

In [None]:
# Sum of r_3rd_numtakers in the freshly cleaned 3rd-grade frame.
dscores_subject.r_3rd_numtakers.sum()

In [None]:
# Same sum taken from the merged frame, for comparison with the cleaned
# 3rd-grade frame.
descriptives.r_3rd_numtakers.sum()
