In [26]:
import csv
import pandas as pd
import cleaning
import os

# Start by making 2016-17 dataset, then generalize to all years

In [27]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/tea/'

# Basic Descriptives - DREF
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [31]:
dref = pd.read_csv(os.path.join(data_path, 'dref', 'yr1617', 'DREF.dat'), sep = ",")
dref_tokeep = {'DISTRICT' :'district',
               'DISTNAME' : 'distname',
               'DFLCHART': 'distischarter',
               'D_RATING' : 'rating',
               'CNTYNAME' : 'cntyname'
                }
dref = cleaning.filter_and_rename_cols(dref, dref_tokeep)
dref.head()

Unnamed: 0,distname,rating,distischarter,district,cntyname
0,CAYUGA ISD,M,N,1902,ANDERSON
1,ELKHART ISD,M,N,1903,ANDERSON
2,FRANKSTON ISD,M,N,1904,ANDERSON
3,NECHES ISD,M,N,1906,ANDERSON
4,PALESTINE ISD,M,N,1907,ANDERSON


# Demographic data - DDEM
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [34]:
ddem = pd.read_csv(os.path.join(data_path, 'ddem', 'yr1617', 'DISTPROF.dat'), sep = ",")
ddem_tokeep = {
        'DISTRICT': 'district',
        'DPSATOFC': 'teachers_num',
        'DPST00FC': 'teachers_new_num',
        'DPSTEXPA': 'teachers_exp_ave',
        'DPSTTENA': 'teachers_tenure_ave',
        'DPSTURND': 'teachers_turnover_denom',
        'DPSTURNN': 'teacher_turnover_num',
        'DPSTURNR': 'teachers_turnover_ratio',
        'DPSTNOFC': 'teachers_nodegree_num',
        'DPSTBAFC': 'teachers_badegree_num',
        'DPSTMSFC': 'teachers_msdegree_num',
        'DPSTPHFC': 'teachers_phddegree_num',
        'DPETALLC': 'students_num',
        'DPETECOC': 'students_frpl_num',
        'DPETHISC': 'students_hisp_num',
        'DPETWHIC': 'students_hisp_num',
        'DPETWHIC': 'students_white_num',
        'DPETBLAC': 'students_black_num'} 
ddem = cleaning.filter_and_rename_cols(ddem, ddem_tokeep)
ddem.head()

Unnamed: 0,teachers_turnover_ratio,teachers_num,teachers_exp_ave,students_black_num,district,teacher_turnover_num,teachers_badegree_num,teachers_msdegree_num,students_white_num,students_hisp_num,teachers_nodegree_num,teachers_tenure_ave,teachers_phddegree_num,teachers_turnover_denom,students_frpl_num,teachers_new_num,students_num
0,7.3,105.5,15.4,27,1902,4.0,48.4,4.0,479,42,0.0,7.8,0,54.4,192,1.0,576
1,11.0,179.6,14.9,67,1903,11.0,84.0,19.0,1011,137,0.0,9.4,0,100.3,673,2.0,1267
2,26.0,121.0,13.7,76,1904,15.1,42.0,17.0,648,82,0.0,6.4,0,58.1,446,2.0,846
3,7.2,64.2,11.9,28,1906,2.5,35.3,0.0,286,51,0.0,6.6,0,34.5,172,0.3,377
4,21.8,481.0,11.2,949,1907,58.9,225.8,32.4,974,1382,5.3,6.3,0,270.3,2587,23.1,3453


# Scores - dscores
data from: https://tea.texas.gov/student.assessment/staar/aggregate/
labels from: https://tea.texas.gov/student.assessment/staar/variables/

In [52]:
dscores_3rd = pd.read_csv(os.path.join(data_path, 'dscores', '3rd', 'dfy12e3.dat'), sep = ",")
subject = '3rd'
dscores_tokeep = {'DISTRICT' : 'district',
            "r_all_rs": "r_" + subject + "_avescore", 
            "r_all_d": "r_" + subject + "_numtakers",
            "m_all_d": "m_" + subject + "_avescore", 
            "m_all_rs": "m_" + subject + "_numtakers"}
dscores_3rd = cleaning.filter_and_rename_cols(dscores_3rd, dscores_tokeep)
dscores_3rd.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore
0,1407.0,41,1488.0,1902,41
1,1423.0,88,1446.0,1903,90
2,1457.0,50,1509.0,1904,50
3,1550.0,28,1522.0,1906,28
4,1404.0,237,1428.0,1907,231


In [53]:
grade_list = ['4th', '5th', '6th', '7th', '8th']
filenames = ['dfy12e4.dat', 'dfy12e5.dat', 'dfy12e6.dat', 'dfy12e7.dat', 'dfy12e8.dat']

In [54]:
for file, grade in zip(filenames, grade_list):
    dscores_grade = pd.read_csv(os.path.join(data_path, 'dscores', grade, file), sep = ",")
    dscores_tokeep = {'DISTRICT' : 'district',
                "r_all_rs": "r_" + grade + "_avescore", 
                "r_all_d": "r_" + grade + "_numtakers",
                "m_all_d": "m_" + grade + "_avescore", 
                "m_all_rs": "m_" + grade + "_numtakers"}
    dscores_grade = cleaning.filter_and_rename_cols(dscores_grade, dscores_tokeep)
    dscores = dscores.merge(dscores_grade)
dscores.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore,r_4th_avescore,r_4th_numtakers,m_4th_numtakers,m_4th_avescore,r_5th_avescore,...,m_6th_numtakers,m_6th_avescore,r_7th_avescore,r_7th_numtakers,m_7th_numtakers,m_7th_avescore,r_8th_avescore,r_8th_numtakers,m_8th_numtakers,m_8th_avescore
0,1407.0,41,1488.0,1902,41,1572.0,24,1567.0,26,1577.0,...,1636.0,46,1656.0,50,1644.0,47,1670.0,48,1659.0,47
1,1423.0,88,1446.0,1903,90,1521.0,101,1549.0,98,1535.0,...,1574.0,96,1649.0,98,1603.0,98,1677.0,88,1651.0,76
2,1457.0,50,1509.0,1904,50,1542.0,49,1486.0,51,1577.0,...,1643.0,55,1635.0,56,1598.0,55,1669.0,51,1668.0,51
3,1550.0,28,1522.0,1906,28,1557.0,35,1565.0,36,1616.0,...,1653.0,30,1636.0,23,1617.0,23,1684.0,31,1685.0,30
4,1404.0,237,1428.0,1907,231,1482.0,235,1523.0,233,1523.0,...,1618.0,201,1606.0,219,1595.0,220,1644.0,178,1628.0,179


In [66]:
subject_list = ['Algebra', 'Biology', 'EnglishI', 'EnglishII', 'USHistory']
filenames = ['dfy17ea1.dat', 'dfy17ebi.dat', 'dfy17ee1.dat', 'dfy17ee2.dat', 'dfy17eus.dat']

In [69]:
for subject, file in zip(subject_list, filenames):
    dscores_subject = pd.read_csv(os.path.join(data_path, 'dscores', subject, file), sep = ",")
    if subject == 'Algebra':
        dscores_tokeep = {"DISTRICT" : "district",
                    "a1_all_rs": "alg_avescore", 
                    "a1_all_d": "alg_numtakers"}
    if subject == 'Biology':
        dscores_tokeep = {"DISTRICT" : "district",
                    "bi_all_rs": "bio_avescore", 
                    "bi_all_d": "bio_numtakers"}
    if subject == 'EnglishI':
        dscores_tokeep = {"DISTRICT" : "district",
                    "e1_all_rs": "eng1_avescore", 
                    "e1_all_d": "eng1_numtakers"}
    if subject == 'EnglishII':
        dscores_tokeep = {"DISTRICT" : "district",
                    "e2_all_rs": "eng2_avescore", 
                    "e2_all_d": "eng2_numtakers"}
    if subject == 'USHistory':
        dscores_tokeep = {"DISTRICT" : "district",
                    "us_all_rs": "us_avescore", 
                    "us_all_d": "us_numtakers"}
    dscores_subject = cleaning.filter_and_rename_cols(dscores_subject, dscores_tokeep)
    dscores = dscores.merge(dscores_subject)
dscores.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore,r_4th_avescore,r_4th_numtakers,m_4th_numtakers,m_4th_avescore,r_5th_avescore,...,alg_avescore,alg_numtakers,bio_numtakers,bio_avescore,eng1_numtakers,eng1_avescore,eng2_avescore,eng2_numtakers,us_numtakers,us_avescore
0,1407.0,41,1488.0,1902,41,1572.0,24,1567.0,26,1577.0,...,4074.0,39,36,4080.0,41,3928.0,4088.0,51,38,4047.0
1,1423.0,88,1446.0,1903,90,1521.0,101,1549.0,98,1535.0,...,3990.0,103,103,4065.0,106,4022.0,4105.0,99,84,4056.0
2,1457.0,50,1509.0,1904,50,1542.0,49,1486.0,51,1577.0,...,4085.0,58,71,4225.0,67,3992.0,4144.0,71,71,4182.0
3,1550.0,28,1522.0,1906,28,1557.0,35,1565.0,36,1616.0,...,4087.0,39,39,4300.0,44,3966.0,3875.0,28,24,4091.0
4,1404.0,237,1428.0,1907,231,1482.0,235,1523.0,233,1523.0,...,4017.0,280,272,4011.0,331,3822.0,3791.0,292,224,4096.0


In [70]:
dscores['year'] = 'yr1617'

In [72]:
dscores_subject = pd.read_csv(os.path.join(data_path, 'dscores', 'Algebra', 'dfy13ea1.dat'), sep=",")

CParserError: Error tokenizing data. C error: Expected 1068 fields in line 3, saw 2058
