# School level data cleaning #

Since different states present their school-level demographic data in different ways, there is no uniform way of dealing with the data. To simplify the process, in this notebook we clean the data and standardize the column names across states. We also convert the columns that contain entries such as "$5,234,567" or "1,234" to floats, so that the data is ready for analysis and model training.

In [213]:
import pandas as pd
import numpy as np
import os

In [145]:
# Standardized test reported by each state, keyed by two-letter state code.
test_type_dict = dict(MA='SAT', AR='ACT')

## Massachusetts school level data ##

In [241]:
state_code = 'MA'
test_type = test_type_dict[state_code]

# Root folder that holds one sub-folder per state/year (e.g. "MA19").
# It defaults to the path used when the notebook was written; set the
# SCHOOL_DATA_DIR environment variable to run the notebook elsewhere without editing this cell.
data_dir = os.environ.get(
    'SCHOOL_DATA_DIR',
    r'C:\Users\mauro\OneDrive\Desktop\MATH\Spring_2025\Erdos\project\project_files\data'
)

# we will use this dictionary to rename the columns for MA data for all years. 

rename_dict = {'School Name': 'school_name', 
               'Tests Taken': 'tests_taken', 
               'Reading / Writing': test_type + '_reading_writing',
               'Math': test_type + '_math', 
               'Average Class Size': 'avg_class_size', 
               'Number of Students': 'number_of_students',
               'English Language Learner %': 'perc_english_learner',
               'Students with Disabilities %': 'perc_disabilities',
               'Economically Disadvantaged %': 'perc_econ_disadvg', 
               'African American %': 'perc_african_american',
               'Asian %': 'perc_asian', 
               'Hispanic %': 'perc_hispanic', 
               'White %': 'perc_white', 
               'Native American %':'perc_native_american',
               'Native Hawaiian, Pacific Islander %':'perc_hawaiian_pacific_isl', 
               'Multi-Race, Non-Hispanic %': 'perc_multi_race_non_hisp',
               'Year': 'year',
               'Total Expenditures per Pupil':'total_exp_per_pupil'
              }

# columns whose raw values may be strings such as "$5,234" or "1,234" instead of numbers.
string_number_cols = ['number_of_students', 'total_exp_per_pupil']

for i in range(19,24):
    state_year = state_code + str(i)
    folder = os.path.join(data_dir, state_year)
    filename = state_year + '_Combined_Ed_School_Demographic.csv'
    filepath = os.path.join(folder, filename)
    df = pd.read_csv(filepath)

    # we rename columns and drop rows with missing values in any of the renamed columns.
    # (a KeyError here means the raw file lacks one of the expected columns.)
    df = df.rename(columns = rename_dict)
    df = df.dropna(subset = list(rename_dict.values()))

    # there are columns whose values are strings instead of floats or ints; we deal with these.
    for col in string_number_cols:
        # Decide from the column dtype rather than from the first row only, so mixed
        # columns and empty frames are handled as well.
        if not pd.api.types.is_numeric_dtype(df[col]):
            # regex=False: '$' must be removed literally, not treated as a regex anchor.
            df[col] = (df[col].astype(str)
                              .str.strip()
                              .str.replace(',', '', regex=False)
                              .str.replace('$', '', regex=False))
        # Any value that still is not a number raises ValueError here, on purpose:
        # a silently unconverted column would end up in the cleaned CSV.
        df[col] = df[col].astype(float)

    new_filename = state_year + '_cleaned_ed_school_demographics.csv'
    new_filepath = os.path.join(folder, new_filename)

    df.to_csv(new_filepath, index = False)
    print('done with', i)

    
    
    

done with 19
done with 20
done with 21
done with 22
done with 23


In [211]:
# Display the test type that is currently selected.
# NOTE(review): the saved output of this cell is 'ACT', but at this point of a
# top-to-bottom run test_type is test_type_dict['MA'] == 'SAT'. The cell was evidently
# last executed after the Arkansas cell; re-run the notebook from a fresh kernel.
test_type

'ACT'

## Arkansas school level data ##

In [239]:
state_code = 'AR'
test_type = test_type_dict[state_code]

# Root folder that holds one sub-folder per state/year (e.g. "AR19").
# It defaults to the path used when the notebook was written; set the
# SCHOOL_DATA_DIR environment variable to run the notebook elsewhere without editing this cell.
data_dir = os.environ.get(
    'SCHOOL_DATA_DIR',
    r'C:\Users\mauro\OneDrive\Desktop\MATH\Spring_2025\Erdos\project\project_files\data'
)

# we will use this dictionary to rename the columns for AR data for all years. 
# Some headers are listed both with and without surrounding spaces; presumably the raw
# files differ between years (TODO confirm).

rename_dict = {'School Name': 'school_name',
               'Math: % Met Readiness Benchmark': 'perc_math_readiness', 
               'English: % Met Readiness Benchmark': 'perc_english_readiness',
               'Reading: % Met Readiness Benchmark': 'perc_reading_readiness',
                'Science: % Met Readiness Benchmark': 'perc_science_readiness',
               '% Met Readiness Benchmark in all Four Subjects': 'perc_all_subject_readiness', 
               ' School Enrollment ': 'number_of_students',
               'School % FRL': 'perc_frl', 
               'School % White': 'perc_white', 
               'School % Hispanic': 'perc_hispanic', 
               'School % Black': 'perc_african_american',
               'School % Other Races': 'perc_other_race', 
               'School Overall % Minority': 'perc_minority', 
               'Number of Students in Grade 11': 'number_grade_11_students',
               ' Number of Students in grade 11 that took the ACT': 'number_test_taking_grade_11_students',
               ' Average ACT Math Score ': test_type + '_math', 
               ' Average ACT English Score ': test_type + '_english',
               ' Average ACT Reading Score ': test_type + '_reading', 
               ' Average ACT Science Score ': test_type + '_science',
               'Average ACT Math Score': test_type + '_math', 
               'Average ACT English Score': test_type + '_english',
               'Average ACT Reading Score': test_type + '_reading', 
               'Average ACT Science Score': test_type + '_science',
               'Is Open Enrollment Charter': 'is_open_enrollment_charter',
               ' % Students in grade 11 that took the ACT in 2015-16': 'perc_test_taking_students_15_16',
               ' % Students in grade 9-12 that took the ACT': 'perc_test_taking_grades_9_12_students',
               ' Number of Students taking AP exams ': 'number_AP_exam_taking_students',
               ' Number of AP Exams Taken ': 'number_AP_exams_taken'
              }

# columns that must end up as floats; their raw values may be strings such as "1,234" or "45%".
numeric_cols = ['number_of_students', 'perc_math_readiness', 'number_test_taking_grade_11_students',
                'perc_english_readiness', 'perc_reading_readiness',
                'perc_science_readiness', 'perc_all_subject_readiness',
                'perc_frl', 'perc_white', 'perc_hispanic', 'perc_african_american','perc_other_race',
                'perc_minority', 'ACT_math', 'ACT_english','ACT_reading', 'ACT_science',
                'year', 'perc_test_taking_students_15_16', 'perc_test_taking_grades_9_12_students'
               ]

for i in range(19,24):
    # NOTE(review): year 20 is skipped; presumably there is no AR file for 2020 - confirm.
    if i == 20:
        continue
    state_year = state_code + str(i)
    folder = os.path.join(data_dir, state_year)
    filename = state_year + '_Combined_Ed_School_Demographic.csv'
    filepath = os.path.join(folder, filename)
    df = pd.read_csv(filepath)

    # the raw files carry no year column, so we add one (19 -> 2019, ...).
    df['year'] = 2000 + i

    # drop the index column left over from an earlier to_csv, if the file has one.
    df = df.drop(columns = 'Unnamed: 0', errors = 'ignore')

    # we rename columns
    df = df.rename(columns = rename_dict)

    # there are rows that have the value "N<10" for the 'ACT_math' column, indicating that fewer
    # than 10 students took the exam. errors='coerce' turns these (and any other non-numeric
    # entry) into NaN so that they are dropped below.
    math_col = test_type + '_math'
    df[math_col] = pd.to_numeric(df[math_col], errors='coerce')

    # we drop the rows that contain NaN values in any renamed column present in this year's file.
    present_cols = [col for col in dict.fromkeys(rename_dict.values()) if col in df.columns]
    df = df.dropna(subset = present_cols)

    # there are columns whose values are strings instead of floats or ints; we deal with these.
    converted_cols = []
    for col in numeric_cols:
        if col not in df.columns:
            continue
        values = df[col]
        # Decide from the column dtype rather than from the first row only.
        if not pd.api.types.is_numeric_dtype(values):
            cleaned = values.astype(str).str.strip()
            for symbol in (',', '$', '%'):
                # regex=False: '$' must be removed literally, not treated as a regex anchor.
                cleaned = cleaned.str.replace(symbol, '', regex=False)
            # Entries that still are not numbers (e.g. suppression markers) become NaN instead
            # of leaving the whole column as strings in the cleaned file.
            values = pd.to_numeric(cleaned.str.strip(), errors='coerce')
            n_bad = int((values.isna() & df[col].notna()).sum())
            if n_bad > 0:
                print('non-float values in ', col, '-', n_bad, 'row(s) set to NaN and dropped')
        df[col] = values.astype(float)
        converted_cols.append(col)

    # rows whose values could not be converted are dropped, like the "N<10" rows above.
    df = df.dropna(subset = converted_cols)

    new_filename = state_year + '_cleaned_ed_school_demographics.csv'
    new_filepath = os.path.join(folder, new_filename)

    df.to_csv(new_filepath, index = False)

    print('done with', i)
    # keep the 2022 frame in a separate variable (presumably for ad-hoc inspection).
    if i == 22:
        df_test = df

    
    

done with 19
done with 21
done with 22
non-float values in  perc_math_readiness
non-float values in  perc_english_readiness
non-float values in  perc_reading_readiness
non-float values in  perc_science_readiness
non-float values in  perc_all_subject_readiness
done with 23
