# School level data cleaning #

Since different states present their school level demographic data in different ways, we have no uniform way of dealing with the data. In order to simplify the process, in this notebook we both clean and standardize column names across states. We also convert the columns that contain entries such as "$5,234, 567" or "1,234" to floats, in order to be ready to train models/study the data. 

In [126]:
import pandas as pd
import numpy as np
import os

In [127]:
test_type_dict = {'MA': 'SAT', 'AR':'ACT'}

## Massachusetts school level data ##

In [72]:
state_code = 'MA'
test_type = test_type_dict[state_code]

# we will use this dictionary to rename the columns for MA data for all years. 

rename_dict = {'School Name': 'school_name', 
               'Tests Taken': 'tests_taken', 
               'Reading / Writing': test_type + '_reading_writing',
               'Math': test_type + '_math', 
               'Average Class Size': 'avg_class_size', 
               'Number of Students': 'number_of_students',
               'English Language Learner %': 'perc_english_learner',
               'Students with Disabilities %': 'perc_disabilities',
               'Economically Disadvantaged %': 'perc_econ_disadvg', 
               'African American %': 'perc_african_american',
               'Asian %': 'perc_asian', 
               'Hispanic %': 'perc_hispanic', 
               'White %': 'perc_white', 
               'Native American %':'perc_native_american',
               'Native Hawaiian, Pacific Islander %':'perc_hawaiian_pacific_isl', 
               'Multi-Race, Non-Hispanic %': 'perc_multi_race_non_hisp',
               'School Year': 'year',
               'Total Expenditures per Pupil':'total_exp_per_pupil'
              }

for i in range(19,24):
    folder = os.path.join(r'C:\Users\mauro\OneDrive\Desktop\MATH\Spring_2025\Erdos\project\project_files\data', state_code + str(i))
    filename = state_code + str(i) + '_Combined_Ed_School_Demographic.csv' 
    filepath = os.path.join(folder, filename)
    df = pd.read_csv(filepath)

    # we rename columns and drop rows with missing values
    df = df.rename(columns = rename_dict)
    df = df.dropna(subset = list(rename_dict.values()))

    # there are columns whose values are strings instead of floats or ints; we deal with these.
    for col in ['number_of_students', 'total_exp_per_pupil']:
        df[col] = df[col].str.strip().str.replace(',', '').str.replace('$', '')
        df[col] = df[col].astype(float)

    new_filename = state_code + str(i) + '_cleaned_ed_school_demographics.csv'
    new_filepath = os.path.join(folder, new_filename)

    df.to_csv(new_filepath, index = False)

    
    
    

## Arkansas school level data ##

In [131]:
state_code = 'AR'
test_type = test_type_dict[state_code]

# we will use this dictionary to rename the columns for MA data for all years. 

rename_dict = {'School Name': 'school_name',
               'Math: % Met Readiness Benchmark': 'perc_math_readiness', 
               'English: % Met Readiness Benchmark': 'perc_english_readiness',
               'Reading: % Met Readiness Benchmark': 'perc_reading_readiness',
                'Science: % Met Readiness Benchmark': 'perc_science_readiness',
               '% Met Readiness Benchmark in all Four Subjects': 'perc_all_subject_readiness', 
               ' School Enrollment ': 'number_of_students',
               'School % FRL': 'perc_frl', 
               'School % White': 'perc_white', 
               'School % Hispanic': 'perc_hispanic', 
               'School % Black': 'perc_african_american',
               'School % Other Races': 'perc_other_race', 
               'School Overall % Minority': 'perc_minority', 
               'Number of Students in Grade 11': 'number_grade_11_students',
               ' Number of Students in grade 11 that took the ACT': 'number_test_taking_grade_11_students',
               ' Average ACT Math Score ': test_type + '_math', 
               ' Average ACT English Score ': test_type + '_english',
               ' Average ACT Reading Score ': test_type + '_reading', 
               ' Average ACT Science Score ': test_type + '_science',
               'Is Open Enrollment Charter': 'is_open_enrollment_charter'
              }

for i in range(19,24):
    if i == 20:
        continue
    folder = os.path.join(r'C:\Users\mauro\OneDrive\Desktop\MATH\Spring_2025\Erdos\project\project_files\data', state_code + str(i))
    filename = state_code + str(i) + '_Combined_Ed_School_Demographic.csv' 
    filepath = os.path.join(folder, filename)
    df = pd.read_csv(filepath)

    df['year'] = pd.Series([int('20' + str(i))]*len(df))

    df = df.drop('Unnamed: 0', axis = 1)

    # we rename columns
    df = df.rename(columns = rename_dict)

    # there are columns that have the value "N<10", indicating that fewer than 10 students took the exam. we set these to NaN so that they are dropped
    df = df.replace('N<10', np.nan)
    

    for col in list(rename_dict.values()):
        if col in list(df.columns):
            df = df.dropna(subset = [col])

    

    # there are columns whose values are strings instead of floats or ints; we deal with these.
    for col in ['number_of_students', 'perc_math_readiness',
                'perc_english_readiness', 'perc_reading_readiness',
                'perc_science_readiness', 'perc_all_subject_readiness',
                'perc_frl', 'perc_white', 'perc_hispanic', 'perc_african_american','perc_other_race',
                'perc_minority', 'ACT_math', 'ACT_english','ACT_reading', 'ACT_science',
                 'year'
               ]:
        if col in list(df.columns):
            try:
                df[col] = df[col].str.strip().str.replace(',', '').str.replace('$', '').str.replace('%', '')
                df[col] = df[col].astype(float)
            except:
                print('non-float values in ', col)

    new_filename = state_code + str(i) + '_cleaned_ed_school_demographics.csv'
    new_filepath = os.path.join(folder, new_filename)

    df.to_csv(new_filepath, index = False)

    print('done with', i)

    
    

non-float values in  year
done with 19
non-float values in  year
done with 21
non-float values in  year
done with 22
non-float values in  perc_math_readiness
non-float values in  perc_english_readiness
non-float values in  perc_reading_readiness
non-float values in  perc_science_readiness
non-float values in  perc_all_subject_readiness
non-float values in  year
done with 23


In [116]:
df.head(5)

Unnamed: 0,school_name,Grades,District Name,perc_math_readiness,perc_english_readiness,perc_reading_readiness,perc_science_readiness,perc_all_subject_readiness,number_of_students,perc_frl,...,% Students in grade 11 that took the ACT in 2015-16,ACT_math,ACT_english,ACT_reading,ACT_science,is_open_enrollment_charter,County,School LEA,DLEA,year
0,Academic Center For Excellence,5-12,Cabot School District,9,28,21,13,5,350.0,47.0,...,105%,15.9,15.9,16.6,17.5,0,LONOKE,4304703,4304000,2023
1,Academies At Rivercrest High School,9-12,Rivercrest School District,14,39,24,13,6,346.0,71.0,...,89%,16.7,16.1,17.3,18.5,0,MISSISSIPPI,4706703,4706000,2023
2,Acorn High School,7-12,Ouachita River School District,2,46,31,21,2,267.0,61.0,...,100%,15.2,17.3,17.8,18.2,0,POLK,5706002,5706000,2023
4,Agee Lierly Life Preparation Services School,9-12,Fayetteville School District,4,28,19,11,2,254.0,52.0,...,47%,15.0,14.3,16.0,16.0,0,WASHINGTON,7203029,7203000,2023
5,Alma High School,9-12,Alma School District,18,49,33,25,13,978.0,45.0,...,89%,17.8,17.7,19.1,19.3,0,CRAWFORD,1701002,1701000,2023


In [120]:
list(df.columns)

['school_name',
 'Grades',
 'District Name',
 'perc_math_readiness',
 'perc_english_readiness',
 'perc_reading_readiness',
 'perc_science_readiness',
 'perc_all_subject_readiness',
 'number_of_students',
 'perc_frl',
 'perc_white',
 'perc_hispanic',
 'perc_african_american',
 'perc_other_race',
 'perc_minority',
 'Region',
 'number_grade_11_students',
 'number_test_taking_grade_11_students',
 ' % Students in grade 11 that took the ACT in 2015-16',
 'ACT_math',
 'ACT_english',
 'ACT_reading',
 'ACT_science',
 'is_open_enrollment_charter',
 'County',
 'School LEA',
 'DLEA',
 'year']

In [129]:

# Method 2: Check by column
float_status = df.applymap(lambda x: isinstance(x, float)).all()
print("Float status by column:")
print(float_status)

Float status by column:
school_name                                             False
Grades                                                  False
District Name                                           False
perc_math_readiness                                      True
perc_english_readiness                                   True
perc_reading_readiness                                   True
perc_science_readiness                                   True
perc_all_subject_readiness                               True
number_of_students                                       True
perc_frl                                                 True
perc_white                                               True
perc_hispanic                                            True
perc_african_american                                    True
perc_other_race                                          True
perc_minority                                            True
Region                                        

  float_status = df.applymap(lambda x: isinstance(x, float)).all()


In [84]:
state_code = 'AR'
for i in range(19,24):
    if i == 20:
        continue 
    folder = os.path.join(r'C:\Users\mauro\OneDrive\Desktop\MATH\Spring_2025\Erdos\project\project_files\data', state_code + str(i))
    filename = state_code + str(i) + '_Combined_Ed_School_Demographic.csv' 
    filepath = os.path.join(folder, filename)
    df = pd.read_csv(filepath)


In [85]:
filename

'AR23_Combined_Ed_School_Demographic.csv'

In [82]:
df.columns

Index(['Unnamed: 0', 'School Name', 'Grades', 'District Name',
       'Math: % Met Readiness Benchmark', 'English: % Met Readiness Benchmark',
       'Reading: % Met Readiness Benchmark',
       'Science: % Met Readiness Benchmark',
       '% Met Readiness Benchmark in all Four Subjects', ' School Enrollment ',
       'School % FRL', 'School % White', 'School % Hispanic', 'School % Black',
       'School % Other Races', 'School Overall % Minority', 'Region',
       'Number of Students in Grade 11',
       ' Number of Students in grade 11 that took the ACT',
       ' % Students in grade 11 that took the ACT in 2015-16',
       ' Average ACT Math Score ', ' Average ACT English Score ',
       ' Average ACT Reading Score ', ' Average ACT Science Score ',
       'Is Open Enrollment Charter', 'County', 'School LEA', 'DLEA'],
      dtype='object')