In [1]:
import csv
import pandas as pd
import clean
import os
import numpy as np
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from start import data_path

In [2]:
# Raise the display limits so the wide tables shown below are not truncated.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)

In [3]:
# School-year key passed to every clean.* loader in this notebook.
year = 'yr1718'

# Basic Descriptives - DREF
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html
The steps for cleaning dref can be found in clean.py (see `clean.clean_dref`).

In [4]:
# Load the cleaned district reference table for the selected year and preview it.
dref = clean.clean_dref(year=year)
dref.head(n=5)

There are  1203 districts in dref


Unnamed: 0,district,distname,distischarter,rating,cntyname
0,1902,CAYUGA ISD,N,M,ANDERSON
1,1903,ELKHART ISD,N,M,ANDERSON
2,1904,FRANKSTON ISD,N,M,ANDERSON
3,1906,NECHES ISD,N,M,ANDERSON
4,1907,PALESTINE ISD,N,M,ANDERSON


# Demographic data - DDEM
data from: https://tea.texas.gov/perfreport/tapr/index.html
reference of labels: https://rptsvr1.tea.texas.gov/perfreport/tapr/2016/download/dstaff.html

In [5]:
# Load the cleaned district demographics table for the selected year and preview it.
ddem = clean.clean_ddem(year=year)
ddem.head(n=5)

There are  1203 districts in ddem


Unnamed: 0,district,teachers_num,teachers_new_num,teachers_exp_ave,teachers_tenure_ave,teachers_turnover_denom,teacher_turnover_num,teachers_turnover_ratio,teachers_nodegree_num,teachers_badegree_num,teachers_msdegree_num,teachers_phddegree_num,students_num,students_frpl_num,students_hisp_num,students_white_num,students_black_num
0,1902,105.5,1.0,15.4,7.8,54.4,4.0,7.3,0.0,48.4,4.0,0,576,192,42,479,27
1,1903,179.6,2.0,14.9,9.4,100.3,11.0,11.0,0.0,84.0,19.0,0,1267,673,137,1011,67
2,1904,121.0,2.0,13.7,6.4,58.1,15.1,26.0,0.0,42.0,17.0,0,846,446,82,648,76
3,1906,64.2,0.3,11.9,6.6,34.5,2.5,7.2,0.0,35.3,0.0,0,377,172,51,286,28
4,1907,481.0,23.1,11.2,6.3,270.3,58.9,21.8,5.3,225.8,32.4,0,3453,2587,1382,974,949


# Scores - dscores
data from: https://tea.texas.gov/student.assessment/staar/aggregate/
labels from: https://tea.texas.gov/student.assessment/staar/variables/

In [6]:
# Test identifiers handed one at a time to clean.clean_scores.
subject_list = [
    # grade-level tests
    '3rd', '4th', '5th', '6th', '7th', '8th',
    # subject tests (treated as the high-school tests later in the notebook)
    'Algebra', 'Biology', 'EnglishI', 'EnglishII', 'USHistory',
]

In [7]:
# Accumulate one wide score table: start from an empty frame keyed on
# 'district' and outer-merge each subject's scores so that a district
# missing from one subject file is still kept.
dscores = pd.DataFrame(columns=['district'])
for subject in subject_list:
    dscores_subject = clean.clean_scores(year, subject)
    dscores = pd.merge(dscores, dscores_subject, on='district', how='outer')
dscores.head(n=5)

There are  1164 districts in  dataset.
There are  1161 districts in  dataset.
There are  1159 districts in  dataset.
There are  1164 districts in  dataset.
There are  1148 districts in  dataset.
There are  1147 districts in  dataset.
There are  1122 districts in  dataset.
There are  1087 districts in  dataset.
There are  1097 districts in  dataset.
There are  1088 districts in  dataset.
There are  1083 districts in  dataset.


Unnamed: 0,district,r_3rd_avescore,r_3rd_numtakers,m_3rd_avescore,m_3rd_numtakers,r_4th_avescore,r_4th_numtakers,m_4th_avescore,m_4th_numtakers,r_5th_avescore,r_5th_numtakers,m_5th_avescore,m_5th_numtakers,r_6th_avescore,r_6th_numtakers,m_6th_avescore,m_6th_numtakers,r_7th_avescore,r_7th_numtakers,m_7th_avescore,m_7th_numtakers,r_8th_avescore,r_8th_numtakers,m_8th_avescore,m_8th_numtakers,alg_avescore,alg_numtakers,bio_avescore,bio_numtakers,eng1_avescore,eng1_numtakers,eng2_avescore,eng2_numtakers,us_avescore,us_numtakers
0,1902,1466.0,51.0,51.0,1530.0,1554.0,51.0,50.0,1555.0,1607.0,36.0,36.0,1685.0,1576.0,40.0,40.0,1657.0,1698.0,44.0,44.0,1696.0,1677.0,52.0,52.0,1655.0,4254.0,43.0,4239.0,44.0,4082.0,47.0,3962.0,37.0,4172.0,46.0
1,1903,1473.0,93.0,93.0,1516.0,1554.0,73.0,73.0,1613.0,1585.0,81.0,81.0,1651.0,1616.0,85.0,85.0,1653.0,1656.0,87.0,87.0,1647.0,1698.0,94.0,76.0,1673.0,4157.0,116.0,4030.0,113.0,3962.0,129.0,4029.0,100.0,4542.0,81.0
2,1904,1463.0,76.0,76.0,1492.0,1516.0,67.0,67.0,1553.0,1634.0,58.0,58.0,1678.0,1587.0,63.0,63.0,1675.0,1667.0,71.0,71.0,1684.0,1680.0,74.0,43.0,1659.0,4278.0,90.0,4150.0,65.0,4033.0,67.0,3944.0,63.0,4111.0,60.0
3,1906,1502.0,30.0,30.0,1460.0,1528.0,20.0,20.0,1576.0,1619.0,24.0,24.0,1668.0,1577.0,25.0,25.0,1644.0,1690.0,29.0,29.0,1741.0,1702.0,27.0,27.0,1697.0,3823.0,21.0,4280.0,20.0,4054.0,22.0,4165.0,34.0,4131.0,25.0
4,1907,1395.0,231.0,255.0,1412.0,1493.0,243.0,243.0,1555.0,1539.0,254.0,254.0,1591.0,1554.0,234.0,235.0,1653.0,1610.0,248.0,248.0,1637.0,1654.0,246.0,245.0,1711.0,4062.0,240.0,4001.0,258.0,3781.0,336.0,3896.0,305.0,4086.0,222.0


# Merge

In [8]:
# Combine demographics with the district reference table, keeping only
# districts present in both.
descriptives = ddem.merge(dref, on='district', how='inner')

# Only claim perfect overlap when the two tables hold the same set of
# district keys; otherwise report the sizes so the mismatch is visible.
if set(ddem.district) == set(dref.district):
    print("There is perfect overlap between ddem and dref. New dataset contains ", len(descriptives), " districts.")
else:
    print("ddem and dref do NOT overlap perfectly: ddem has ", len(ddem),
          " districts, dref has ", len(dref),
          ". New dataset contains ", len(descriptives), " districts.")

There is perfect overlap between ddem and dref. New dataset contains  1203  districts.


In [9]:
# Left-join the test scores onto the district table. The indicator adds a
# `_merge` column recording whether each district matched a row in dscores.
descriptives = pd.merge(descriptives, dscores, how='left', on='district',
                        indicator=True)
# Discard any row in which every column is missing.
descriptives = descriptives.dropna(how='all')
print(len(descriptives))

1203


In [10]:
print('Some charter/specialty schools do not have any test scores. We can drop these from the dataset.')
# Names of the districts that found no match in dscores during the merge.
descriptives.loc[descriptives['_merge'] == 'left_only', 'distname']


Some charter/specialty schools do not have any test scores. We can drop these from the dataset.


62     SAN ANTONIO SCHOOL FOR INQUIRY & C
258                FOCUS LEARNING ACADEMY
265                  ALPHA CHARTER SCHOOL
499    HOUSTON HEIGHTS LEARNING ACADEMY I
508                  ZOE LEARNING ACADEMY
517               GLOBAL LEARNING VILLAGE
519                       C O R E ACADEMY
Name: distname, dtype: object

In [11]:
# The scores were joined onto `descriptives` with how='left', so `_merge`
# can never be 'right_only' and filtering on it cannot detect anything.
# Compare the district keys directly to find score rows with no district record.
scores_without_info = dscores.district[~dscores.district.isin(descriptives.district)]
if scores_without_info.empty:
    print('All schools with test scores have demographic and basic information')
else:
    print(len(scores_without_info),
          'districts with test scores lack demographic and basic information')
scores_without_info

All schools with test scores have demographic and basic information


Series([], Name: distname, dtype: object)

In [12]:
# Keep every row whose merge indicator is not 'right_only'.
# NOTE(review): the scores were joined with how='left', so no row can be
# 'right_only' and this filter looks like a no-op — confirm whether dropping
# 'left_only' (districts without any scores) was intended instead.
descriptives = descriptives.loc[descriptives['_merge'] != 'right_only']

# Descriptives of Missing Data

In [13]:
# Average-score columns for the 3rd-6th grade tests.
elem_tests = [descriptives[column] for column in (
    'r_3rd_avescore', 'm_3rd_avescore',
    'r_4th_avescore', 'm_4th_avescore',
    'r_5th_avescore', 'm_5th_avescore',
    'r_6th_avescore', 'm_6th_avescore',
)]

# Collect, in first-seen order, each district name that lacks an average
# score on at least one of those tests.
missing_elem_tests = []
for scores in elem_tests:
    for name in descriptives.loc[scores.isnull(), 'distname']:
        if name in missing_elem_tests:
            continue  # already recorded for an earlier test
        missing_elem_tests.append(name)
print(len(missing_elem_tests), 'districts missing elementary test scores.')
missing_elem_tests

79 districts missing elementary test scores.


['RICHARD MILBURN ALTER HIGH SCHOOL',
 'POR VIDA ACADEMY',
 'POSITIVE SOLUTIONS CHARTER SCHOOL',
 'SAN ANTONIO SCHOOL FOR INQUIRY & C',
 'HENRY FORD ACADEMY ALAMEDA SCHOOL',
 'CARPE DIEM SCHOOLS',
 'MORGAN ISD',
 'SAN VICENTE ISD',
 'SOUTH TEXAS ISD',
 'TRINITY CHARTER SCHOOL',
 'TEXANS CAN ACADEMIES',
 'ACADEMY FOR ACADEMIC EXCELLENCE',
 'FOCUS LEARNING ACADEMY',
 'WINFREE ACADEMY CHARTER SCHOOLS',
 'ALPHA CHARTER SCHOOL',
 'EVOLUTION ACADEMY CHARTER SCHOOL',
 'RICHLAND COLLEGIATE HIGH SCHOOL',
 'PIONEER TECHNOLOGY & ARTS ACADEMY',
 'RAMIREZ CSD',
 'PASO DEL NORTE ACADEMY CHARTER DIS',
 'EL PASO ACADEMY',
 'EL PASO LEADERSHIP ACADEMY',
 'PREMIER HIGH SCHOOLS',
 'ERATH EXCELS ACADEMY INC',
 'DOSS CONSOLIDATED CSD',
 'EAST TEXAS CHARTER SCHOOLS',
 'GEORGE I SANCHEZ CHARTER',
 'EXCEL ACADEMY',
 'HOUSTON HEIGHTS HIGH SCHOOL',
 'HOUSTON HEIGHTS LEARNING ACADEMY I',
 'CALVIN NELMS CHARTER SCHOOLS',
 'COMQUEST ACADEMY',
 'YES PREP PUBLIC SCHOOLS INC',
 'ZOE LEARNING ACADEMY',
 'THE LAWSON AC

In [14]:
# Average-score columns for the high-school subject tests.
# The original branched on `year in ['yr1112', 'yr1214']`, but both branches
# built exactly this list, so the conditional had no effect and is collapsed.
# NOTE(review): if the earlier years were meant to use a different set of
# tests, reintroduce a year-specific list here.
hs_tests = [descriptives.alg_avescore, descriptives.bio_avescore,
            descriptives.eng1_avescore, descriptives.eng2_avescore,
            descriptives.us_avescore]

# Collect, in first-seen order, each district name that lacks an average
# score on at least one high-school test.
missing_hs_tests = []
for test in hs_tests:
    for district in descriptives.distname[test.isnull()]:
        if district not in missing_hs_tests:  # add unique districts
            missing_hs_tests.append(district)
print(len(missing_hs_tests), 'districts missing hs test scores')
missing_hs_tests

160 districts missing hs test scores


["ST MARY'S ACADEMY CHARTER SCHOOL",
 'PAWNEE ISD',
 'BEXAR COUNTY ACADEMY',
 'SAN ANTONIO SCHOOL FOR INQUIRY & C',
 'LIGHTHOUSE CHARTER SCHOOL',
 'MALTA ISD',
 'HUBBARD ISD',
 'LEARY ISD',
 'BRAZOS SCHOOL FOR INQUIRY & CREATI',
 'ARROW ACADEMY',
 'SAN VICENTE ISD',
 'GROOM ISD',
 'WALNUT BEND ISD',
 'SIVELLS BEND ISD',
 'LUMIN EDUCATION',
 'NOVA ACADEMY',
 'ACADEMY OF DALLAS',
 'FOCUS LEARNING ACADEMY',
 'JEAN MASSIEU ACADEMY',
 'NOVA ACADEMY (SOUTHEAST)',
 'ALPHA CHARTER SCHOOL',
 'EDUCATION CENTER INTERNATIONAL ACA',
 'GOLDEN RULE CHARTER SCHOOL',
 'ST ANTHONY SCHOOL',
 'RICHLAND COLLEGIATE HIGH SCHOOL',
 'CITYSCAPE SCHOOLS',
 'TRINITY ENVIRONMENTAL ACADEMY',
 'DAWSON ISD',
 'WALCOTT ISD',
 'NORTH TEXAS COLLEGIATE ACADEMY',
 'TRIVIUM ACADEMY',
 'WESTHOFF ISD',
 'MEYERSVILLE ISD',
 'RAMIREZ CSD',
 'COMPASS ACADEMY CHARTER SCHOOL',
 'LA FE PREPARATORY SCHOOL',
 'VISTA DEL FUTURO CHARTER SCHOOL',
 'DOSS CONSOLIDATED CSD',
 'GRANDVIEW-HOPKINS ISD',
 'PRINGLE-MORSE CISD',
 'UNIVERSITY OF

In [15]:
# District names present in both missing-score lists, kept in the order of
# the elementary list. A set makes each membership test constant-time.
hs_missing_names = set(missing_hs_tests)
missing_both = [value for value in missing_elem_tests
                if value in hs_missing_names]
print(len(missing_both), 'districts missing elem and hs test scores')

# Build both conditions on the full frame and combine them into one mask.
# Chaining `df[mask_a][mask_b]` applies a mask indexed on the full frame to
# an already-filtered frame, which makes pandas warn and reindex the mask.
# NOTE(review): districts are matched by name here; if two districts share
# a name they are not distinguished — confirm names are unique or switch to
# the `district` id.
in_missing_both = descriptives.distname.isin(missing_both)
is_charter = descriptives.distischarter == 'Y'
print('of these',
      len(descriptives[in_missing_both & is_charter]),
      'are charters.')

41 districts missing elem and hs test scores
of these 25 are charters.


