In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Collecting State SAT & ACT Averages

In [2]:
# Build the state-level SAT table (2017 and 2018) from NCES Digest table 226.40.
sat_url = 'https://nces.ed.gov/programs/digest/d18/tables/xls/tabn226.40.xls'

# SAT titles spread across 4 rows of uneven merges.
# raw_cols is not used again in this cell; it holds the original header rows
# so the manual column labels below can be checked against them.
raw_cols = pd.read_excel(sat_url, header=None, index_col=0, skiprows=1, nrows=4)

# Keep raw data separate: skip the title/header rows and read the data rows only
sat = pd.read_excel(sat_url, header=None, index_col=0, skiprows=7)

# Manually relabel columns
sat.columns = ['sat_2017', 'sat_2017_sd',
               'sat_2017_erw', 'sat_2017_erw_sd',
               'sat_2017_math', 'sat_2017_math_sd',
               'sat_2017_part',
               'sat_2018', 'sat_2018_sd',
               'sat_2018_erw', 'sat_2018_erw_sd',
               'sat_2018_math', 'sat_2018_math_sd',
               'sat_2018_part']

# Remove 3 Footnotes
sat = sat.iloc[:-3, :]
# Remove blank lines (rows with no 2017 ERW value)
sat = sat.loc[sat['sat_2017_erw'].notnull(), :]
# Drop the standard-deviation columns. Each step returns a new frame instead of
# mutating a slice in place, which avoids pandas' SettingWithCopyWarning.
sat = sat.drop(columns=sat.columns[sat.columns.str.contains('sd')])

# Correct index: remove dot leaders, shorten the DC label, trim whitespace.
# NOTE(review): this replaces the bare word 'Columbia'. Presumably the label is
# split across rows so the data row reads only 'Columbia' -- verify; if the label
# were 'District of Columbia' this would yield 'District of DC'.
sat.index = (
    sat.index
    .str.strip('.')
    .str.replace('Columbia', 'DC')
    .str.strip()
)

# Every remaining column is cast to int
sat = sat.astype(int)

In [3]:
# Build the state-level ACT table (2017 and 2018) from NCES Digest table 226.60.
act_18_url = 'https://nces.ed.gov/programs/digest/d18/tables/xls/tabn226.60.xls'
act_17_url = 'https://nces.ed.gov/programs/digest/d17/tables/xls/tabn226.60.xls'

# ACT titles spread across 3 rows of uneven merges.
# raw_cols is not used again in this cell; it holds the original header rows
# so the manual column labels below can be checked against them.
raw_cols = pd.read_excel(act_18_url, header=None, index_col=0, skiprows=1, nrows=3)


def _load_act_table(url, year):
    """Read one edition of the ACT table and return the columns for `year`.

    Parameters
    ----------
    url : str
        Location of the Excel file to read.
    year : int
        Year used to label the score and participation columns.

    Returns
    -------
    pandas.DataFrame
        Columns act_<year>, act_<year>_english, act_<year>_math,
        act_<year>_reading, act_<year>_science and act_<year>_part,
        indexed by cleaned state label.
    """
    raw = pd.read_excel(url, header=None, index_col=0, skiprows=6)

    # Drop the 3 footnote rows and the first 5 data columns (the earlier
    # comparison year). .copy() makes the slice an independent frame.
    scores = raw.iloc[:-3, 5:].copy()
    scores.columns = [f'act_{year}', f'act_{year}_english',
                      f'act_{year}_math', f'act_{year}_reading',
                      f'act_{year}_science',
                      'earlier_year_part', f'act_{year}_part']

    # Participation for the earlier comparison year is not needed
    scores = scores.drop(columns=['earlier_year_part'])
    # Remove blank spacer rows (no composite score)
    scores = scores.loc[scores[f'act_{year}'].notnull(), :]

    # Correct index: remove dot leaders, shorten the DC label, trim whitespace.
    # Whitespace is trimmed here, before the merge below, so labels that differ
    # only by padding between the two editions still match.
    scores.index = (
        scores.index
        .str.strip('.')
        .str.replace('District of Columbia', 'DC')
        .str.strip()
    )
    return scores


act_18 = _load_act_table(act_18_url, 2018)
act_17 = _load_act_table(act_17_url, 2017)

# Inner join on the cleaned state labels
act = pd.merge(act_17, act_18, left_index=True, right_index=True)

In [4]:
# Create final panel dataset.
# Outer join on the state index: a state found in only one of the two tables
# is kept, with missing values for the other table's columns.
test_scores = sat.merge(act, how='outer', left_index=True, right_index=True)
test_scores.index.name = 'State'
test_scores.to_csv('../data/test_scores.csv')

#### QC Checks

In [5]:
# Groups of related column names, defined once for easy reference throughout.
all_cols = test_scores.columns
is_part = all_cols.str.contains('part')

# SAT score columns (totals and sections), excluding participation
sat_cols = all_cols[all_cols.str.contains('sat') & ~is_part]
# SAT section-score columns only (ERW and math)
is_erw = sat_cols.str.contains('erw')
is_math = sat_cols.str.contains('math')
sat_st_cols = sat_cols[is_erw | is_math]

# ACT score columns, excluding participation
act_cols = all_cols[all_cols.str.contains('act') & ~is_part]
# Participation columns for both exams
part_cols = all_cols[is_part]

In [6]:
# Return missing values count
print(f"Datasets contains {test_scores.isnull().sum().sum()} missing values.")

# Range checks. No conversion happens here: each column's max and min are
# compared against the bounds below, and a message is printed once for any
# column that falls outside them.
# Each entry is (columns to check, lowest valid value, highest valid value).
range_checks = [
    (sat_st_cols, 200, 800),                   # SAT section scores
    (['sat_2017', 'sat_2018'], 400, 1600),     # SAT totals
    (act_cols, 1, 36),                         # ACT scores
    (part_cols, 0, 100),                       # participation columns
]

for cols, lowest, highest in range_checks:
    for col in cols:
        if test_scores[col].max() > highest or test_scores[col].min() < lowest:
            print(f"Error: {col} out of valid range.")
        

Datasets contains 0 missing values.


## Collecting State NAEP Scores

The NAEP reading and math results are scraped to provide a comparison to ACT and SAT scores. NAEP state averages represent state education performance better than ACT and SAT averages because the exam is administered to a representative sample: students do not self-select into participation, and the way the exam is administered avoids the state-level policy differences that confound average SAT and ACT scores. However, we cannot strictly compare these measures because the NAEP has recently been administered only to 8th grade students. All comparisons are made with this important caveat in mind.

In [10]:
# Scrape the 2017 NAEP reading (Table 221.60) and math (Table 222.60) state
# averages and save them, with state ranks, to ../data/naep_by_state.csv.

def _fetch_table_body(url):
    """Download `url` and return (response, soup, first <tbody> element).

    Raises requests.HTTPError on a non-2xx response so a failed download is
    reported directly instead of surfacing later as a parsing error.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    return response, soup, soup.find('tbody')


def _collect_rows(table, n_cols, is_header_row):
    """Return the <td> text of each data row in `table` as a DataFrame.

    Rows for which `is_header_row(cells)` is true are skipped, as are spacer
    rows containing a lone non-breaking space. All rows are gathered in a list
    and the frame is built once; DataFrame.append was removed in pandas 2.0.

    Raises ValueError if a kept row does not have exactly `n_cols` cells.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if is_header_row(cells):
            continue
        texts = [cell.text for cell in cells]
        if '\xa0' in texts:  # Skip blank rows used as spacing
            continue
        if len(texts) != n_cols:
            raise ValueError(
                f"Expected {n_cols} cells in table row, found {len(texts)}."
            )
        rows.append(texts)
    return pd.DataFrame(rows, columns=range(n_cols))


# Read NAEP Table 221.60
naep_r_url = 'https://nces.ed.gov/programs/digest/d18/tables/dt18_221.60.asp'
naep_r_res, naep_r_soup, naep_r_table = _fetch_table_body(naep_r_url)

# Read NAEP Table 222.60
naep_m_url = 'https://nces.ed.gov/programs/digest/d18/tables/dt18_222.60.asp'
naep_m_res, naep_m_soup, naep_m_table = _fetch_table_body(naep_m_url)

# Extract column values; a row with exactly 10 cells is the first table row (hack)
naep_r_by_state = _collect_rows(naep_r_table, 20, lambda cells: len(cells) == 10)

# Keep only 2017 NAEP averages: second-to-last column, without the first and last rows
naep_r_by_state = pd.DataFrame(naep_r_by_state.iloc[1:-1, -2])

# Extract index values from the row headers, skipping spacer cells
states = [th.text.strip() for th in naep_r_table.find_all('th') if th.text != '\xa0']
states = states[2:-1]
states = [re.sub(r'[^a-z A-Z]', "", st) for st in states]  # Remove footnotes from index

naep_r_by_state = naep_r_by_state.set_index(pd.Index(states))
naep_r_by_state.columns = ['naep_2017_reading']
# NOTE(review): the scraped values are still text here, so this ranks strings.
# That matches numeric order only while every score has the same number of
# digits -- consider pd.to_numeric before ranking.
naep_r_by_state['naep_2017_reading_rank'] = naep_r_by_state['naep_2017_reading'].rank(ascending=False, method='min')

# Extract column values; rows with fewer than 20 cells are skipped (hack for first row)
naep_m_by_state = _collect_rows(naep_m_table, 24, lambda cells: len(cells) < 20)

# Keep only 2017 NAEP averages
naep_m_by_state = pd.DataFrame(naep_m_by_state.iloc[1:-1, -2])

# The math rows reuse the state labels scraped from the reading table.
# NOTE(review): assumes both tables list the same states in the same order -- confirm.
naep_m_by_state = naep_m_by_state.set_index(pd.Index(states))
naep_m_by_state.columns = ['naep_2017_math']
naep_m_by_state['naep_2017_math_rank'] = naep_m_by_state['naep_2017_math'].rank(ascending=False, method='min')

naep_by_state = naep_r_by_state.merge(naep_m_by_state, left_index=True, right_index=True)
naep_by_state.index = naep_by_state.index.str.replace('District of Columbia', 'DC')

naep_by_state.index.rename('State', inplace=True)
naep_by_state.to_csv('../data/naep_by_state.csv')

## Collecting State Education System Statistics

We use NCES tables to collect the following variables: spending per pupil, pupils per teacher, and average teacher salary. These variables could not all be collected for identical years due to delays in some aggregation. We do not expect radical changes year-to-year for any state and feel that a misalignment of only one year will not significantly alter our results.

In [11]:
# Collect three state-level education statistics from NCES Excel tables, join
# them, and attach them to the test-score panel.

def _tidy_state_labels(labels, dc_label):
    """Strip dot leaders, rename `dc_label` to 'DC', then trim whitespace."""
    return labels.str.strip('.').str.replace(dc_label, 'DC').str.strip()


# Pupils per K12 Teacher
student_teacher = pd.read_excel(
    'https://nces.ed.gov/programs/digest/d18/tables/xls/tabn208.40.xls',
    header=None, index_col=0, skiprows=5,
)
# Keep only US States and variable of interest: drop the last 20 rows and keep
# the last two columns (the second is discarded after the merge below)
student_teacher = student_teacher.iloc[:-20, -2:]
student_teacher.index = _tidy_state_labels(student_teacher.index, 'District of Columbia')
student_teacher.columns = ['pupil_teacher_ratio', 'todrop']
student_teacher = student_teacher[student_teacher['pupil_teacher_ratio'].notnull()]


# Spending per K12 Pupil in Attendance
spending_student = pd.read_excel(
    'https://nces.ed.gov/programs/digest/d19/tables/xls/tabn236.70.xls',
    header=None, index_col=0, skiprows=5,
)
# Keep only US States and variable of interest: drop the last 12 rows, keep one column
spending_student = spending_student.iloc[:-12, 14:15]
spending_student.columns = ['spending_per_student']
spending_student.index = _tidy_state_labels(spending_student.index, 'District of Columbia')
spending_student = spending_student[spending_student['spending_per_student'].notnull()]


# K12 Teacher Salary
teacher_sal = pd.read_excel(
    'https://nces.ed.gov/programs/digest/d19/tables/xls/tabn211.60.xls',
    header=None, index_col=0, skiprows=5,
)
# Keep only US States and variable of interest: drop the last 3 rows, keep one column
teacher_sal = teacher_sal.iloc[:-3, 6:7]
teacher_sal.columns = ['teacher_salary']
# NOTE(review): unlike the two tables above, this one replaces the bare word
# 'Columbia' -- presumably the label is split across rows here; verify.
teacher_sal.index = _tidy_state_labels(teacher_sal.index, 'Columbia')
teacher_sal = teacher_sal[teacher_sal['teacher_salary'].notnull()]


# Merge the three tables on state (inner joins), discard the helper column,
# and cast every remaining column to float
state_stats = (
    student_teacher
    .merge(spending_student, left_index=True, right_index=True)
    .merge(teacher_sal, left_index=True, right_index=True)
    .drop(columns=['todrop'])
    .astype(float)
)

# Merge with Test Scores (inner join on state)
test_scores_supp = test_scores.merge(state_stats, left_index=True, right_index=True)
test_scores_supp.index.name = 'State'
test_scores_supp.to_csv('../data/test_scores_supplemented.csv')