In [1]:
import csv
import pandas as pd
import cleaning
import os

In [2]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/tea/'

# Basic Descriptives - DREF
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [24]:
dref = pd.read_csv(os.path.join(data_path, 'dref', 'yr1617', 'DREF.dat'), sep = ",")
dref_tokeep = {'DISTRICT' :'district',
               'DISTNAME' : 'distname',
               'DFLCHART': 'distischarter',
               'D_RATING' : 'rating',
               'CNTYNAME' : 'cntyname'
                }
dref = cleaning.filter_and_rename_cols(dref, dref_tokeep)
dref.head()

Unnamed: 0,distname,rating,distischarter,district,cntyname
0,CAYUGA ISD,M,N,1902,ANDERSON
1,ELKHART ISD,M,N,1903,ANDERSON
2,FRANKSTON ISD,M,N,1904,ANDERSON
3,NECHES ISD,M,N,1906,ANDERSON
4,PALESTINE ISD,M,N,1907,ANDERSON


In [25]:
dref = dref.set_index('district')
dref.index.get_level_values('district').get_duplicates()

[]

In [14]:
dref[dref.dup == False]

Unnamed: 0_level_0,rating,distischarter,district,cntyname,dup
distname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


# Demographic data - DDEM
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [4]:
ddem = pd.read_csv(os.path.join(data_path, 'ddem', 'yr1617', 'DISTPROF.dat'), sep = ",")
ddem_tokeep = {
        'DISTRICT': 'district',
        'DPSATOFC': 'teachers_num',
        'DPST00FC': 'teachers_new_num',
        'DPSTEXPA': 'teachers_exp_ave',
        'DPSTTENA': 'teachers_tenure_ave',
        'DPSTURND': 'teachers_turnover_denom',
        'DPSTURNN': 'teacher_turnover_num',
        'DPSTURNR': 'teachers_turnover_ratio',
        'DPSTNOFC': 'teachers_nodegree_num',
        'DPSTBAFC': 'teachers_badegree_num',
        'DPSTMSFC': 'teachers_msdegree_num',
        'DPSTPHFC': 'teachers_phddegree_num',
        'DPETALLC': 'students_num',
        'DPETECOC': 'students_frpl_num',
        'DPETHISC': 'students_hisp_num',
        'DPETWHIC': 'students_hisp_num',
        'DPETWHIC': 'students_white_num',
        'DPETBLAC': 'students_black_num'} 
ddem = cleaning.filter_and_rename_cols(ddem, ddem_tokeep)
ddem.head()

Unnamed: 0,teachers_turnover_ratio,teachers_num,teachers_exp_ave,students_black_num,district,teacher_turnover_num,teachers_badegree_num,teachers_msdegree_num,students_white_num,students_hisp_num,teachers_nodegree_num,teachers_tenure_ave,teachers_phddegree_num,teachers_turnover_denom,students_frpl_num,teachers_new_num,students_num
0,7.3,105.5,15.4,27,1902,4.0,48.4,4.0,479,42,0.0,7.8,0,54.4,192,1.0,576
1,11.0,179.6,14.9,67,1903,11.0,84.0,19.0,1011,137,0.0,9.4,0,100.3,673,2.0,1267
2,26.0,121.0,13.7,76,1904,15.1,42.0,17.0,648,82,0.0,6.4,0,58.1,446,2.0,846
3,7.2,64.2,11.9,28,1906,2.5,35.3,0.0,286,51,0.0,6.6,0,34.5,172,0.3,377
4,21.8,481.0,11.2,949,1907,58.9,225.8,32.4,974,1382,5.3,6.3,0,270.3,2587,23.1,3453


# Scores - dscores
data from: https://tea.texas.gov/student.assessment/staar/aggregate/
labels from: https://tea.texas.gov/student.assessment/staar/variables/

In [30]:
dscores_3rd = pd.read_csv(os.path.join(data_path, 'dscores', '3rd', 'dfy12e3.dat'), sep = ",")
subject = '3rd'
dscores_tokeep = {'DISTRICT' : 'district',
            "r_all_rs": "r_" + subject + "_avescore", 
            "r_all_d": "r_" + subject + "_numtakers",
            "m_all_d": "m_" + subject + "_avescore", 
            "m_all_rs": "m_" + subject + "_numtakers"}
dscores = cleaning.filter_and_rename_cols(dscores_3rd, dscores_tokeep)
dscores.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore
0,1407.0,41,1488.0,1902,41
1,1423.0,88,1446.0,1903,90
2,1457.0,50,1509.0,1904,50
3,1550.0,28,1522.0,1906,28
4,1404.0,237,1428.0,1907,231


In [27]:
grade_list = ['4th', '5th', '6th', '7th', '8th']
filenames = ['dfy12e4.dat', 'dfy12e5.dat', 'dfy12e6.dat', 'dfy12e7.dat', 'dfy12e8.dat']

In [50]:
for file, grade in zip(['dfy12e8.dat'], ['8th']):
    print(grade)
    dscores_grade = pd.read_csv(os.path.join(data_path, 'dscores', grade, file), sep = ",")
    dscores_tokeep = {'DISTRICT' : 'district',
                "r_all_rs": "r_" + grade + "_avescore", 
                "r_all_d": "r_" + grade + "_numtakers",
                "m_all_d": "m_" + grade + "_avescore", 
                "m_all_rs": "m_" + grade + "_numtakers"}
    merge = '_merge' + grade
    dscores_grade = cleaning.filter_and_rename_cols(dscores_grade, dscores_tokeep)
    #dscores = dscores.merge(dscores_grade, how = 'outer', indicator = True)
    #dscores = dscores.rename(columns = {'_merge': merge})
dscores.head()

8th


Unnamed: 0_level_0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,m_3rd_avescore,r_4th_avescore,r_4th_numtakers,m_4th_numtakers,m_4th_avescore,_merge
district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1902,1407.0,41.0,1488.0,41.0,1572.0,24.0,1567.0,26.0,both
1903,1423.0,88.0,1446.0,90.0,1521.0,101.0,1549.0,98.0,both
1904,1457.0,50.0,1509.0,50.0,1542.0,49.0,1486.0,51.0,both
1906,1550.0,28.0,1522.0,28.0,1557.0,35.0,1565.0,36.0,both
1907,1404.0,237.0,1428.0,231.0,1482.0,235.0,1523.0,233.0,both


In [51]:
#dscores = dscores.set_index('district')
dscores.index.get_level_values('district').get_duplicates()

[]

In [37]:
dscores[['district', '_merge4th', '_merge5th', '_merge6th', '_merge7th']].head()

Unnamed: 0,district,_merge4th,_merge5th,_merge6th,_merge7th
0,1902.0,both,both,both,both
1,1903.0,both,both,both,both
2,1904.0,both,both,both,both
3,1906.0,both,both,both,both
4,1907.0,both,both,both,both


In [38]:
subject_list = ['Algebra', 'Biology', 'EnglishI', 'EnglishII', 'USHistory']
filenames = ['dfy17ea1.dat', 'dfy17ebi.dat', 'dfy17ee1.dat', 'dfy17ee2.dat', 'dfy17eus.dat']

In [39]:
for subject, file in zip(subject_list, filenames):
    dscores_subject = pd.read_csv(os.path.join(data_path, 'dscores', subject, file), sep = ",")
    if subject == 'Algebra':
        dscores_tokeep = {"DISTRICT" : "district",
                    "a1_all_rs": "alg_avescore", 
                    "a1_all_d": "alg_numtakers"}
    if subject == 'Biology':
        dscores_tokeep = {"DISTRICT" : "district",
                    "bi_all_rs": "bio_avescore", 
                    "bi_all_d": "bio_numtakers"}
    if subject == 'EnglishI':
        dscores_tokeep = {"DISTRICT" : "district",
                    "e1_all_rs": "eng1_avescore", 
                    "e1_all_d": "eng1_numtakers"}
    if subject == 'EnglishII':
        dscores_tokeep = {"DISTRICT" : "district",
                    "e2_all_rs": "eng2_avescore", 
                    "e2_all_d": "eng2_numtakers"}
    if subject == 'USHistory':
        dscores_tokeep = {"DISTRICT" : "district",
                    "us_all_rs": "us_avescore", 
                    "us_all_d": "us_numtakers"}
    dscores_subject = cleaning.filter_and_rename_cols(dscores_subject, dscores_tokeep)
    merge = '_merge' + subject
    dscores = dscores.merge(dscores_subject, how = 'outer', indicator = True)
    dscores = dscores.rename(columns = {'_merge': merge})
    dscores = dscores.merge(dscores_subject)
dscores.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore,r_4th_avescore,r_4th_numtakers,m_4th_numtakers,m_4th_avescore,_merge4th,...,_mergeBiology,eng1_numtakers,eng1_avescore,_mergeEnglishI,eng2_avescore,eng2_numtakers,_mergeEnglishII,us_numtakers,us_avescore,_mergeUSHistory
0,1407.0,41.0,1488.0,1902.0,41.0,1572.0,24.0,1567.0,26.0,both,...,both,41.0,3928.0,both,4088.0,51.0,both,38.0,4047.0,both
1,1423.0,88.0,1446.0,1903.0,90.0,1521.0,101.0,1549.0,98.0,both,...,both,106.0,4022.0,both,4105.0,99.0,both,84.0,4056.0,both
2,1457.0,50.0,1509.0,1904.0,50.0,1542.0,49.0,1486.0,51.0,both,...,both,67.0,3992.0,both,4144.0,71.0,both,71.0,4182.0,both
3,1550.0,28.0,1522.0,1906.0,28.0,1557.0,35.0,1565.0,36.0,both,...,both,44.0,3966.0,both,3875.0,28.0,both,24.0,4091.0,both
4,1404.0,237.0,1428.0,1907.0,231.0,1482.0,235.0,1523.0,233.0,both,...,both,331.0,3822.0,both,3791.0,292.0,both,224.0,4096.0,both


In [40]:
dscores['year'] = 'yr1617'

In [53]:
descriptives = dscores
descriptives = dscores.merge(dref, how = 'inner')

descriptives.head()

AttributeError: 'DataFrame' object has no attribute '_merge'

In [51]:
descriptives = dscores
descriptives = dscores.merge(ddem, indicator = True)
descriptives[descriptives._merge != 'both']
descriptives.head()

Unnamed: 0,r_3rd_avescore,r_3rd_numtakers,m_3rd_numtakers,district,m_3rd_avescore,r_4th_avescore,r_4th_numtakers,m_4th_numtakers,m_4th_avescore,_merge4th,...,students_white_num,students_hisp_num,teachers_nodegree_num,teachers_tenure_ave,teachers_phddegree_num,teachers_turnover_denom,students_frpl_num,teachers_new_num,students_num,_merge
0,1407.0,41.0,1488.0,1902.0,41.0,1572.0,24.0,1567.0,26.0,both,...,479,42,0.0,7.8,0,54.4,192,1.0,576,both
1,1423.0,88.0,1446.0,1903.0,90.0,1521.0,101.0,1549.0,98.0,both,...,1011,137,0.0,9.4,0,100.3,673,2.0,1267,both
2,1457.0,50.0,1509.0,1904.0,50.0,1542.0,49.0,1486.0,51.0,both,...,648,82,0.0,6.4,0,58.1,446,2.0,846,both
3,1550.0,28.0,1522.0,1906.0,28.0,1557.0,35.0,1565.0,36.0,both,...,286,51,0.0,6.6,0,34.5,172,0.3,377,both
4,1404.0,237.0,1428.0,1907.0,231.0,1482.0,235.0,1523.0,233.0,both,...,974,1382,5.3,6.3,0,270.3,2587,23.1,3453,both


In [47]:
pd.set_option('display.max_rows', 500)