In [1]:
import pandas as pd
import numpy as np

# Input workbooks. Both paths are relative, so they resolve against the current
# working directory; the leading '..' steps up one level to reach the data folder.
acs_path = '../data/ACS_Estimates_2010-2022.xlsx'
grade_enrollment_path = '../data/Bldg_Grade_Enroll_Cohort_Survival.xlsx'

# Read each workbook into its own DataFrame (ACS first, then building/grade enrollment)
acs_data, grade_enrollment_data = (
    pd.read_excel(workbook_path) for workbook_path in (acs_path, grade_enrollment_path)
)

# Preview the ACS table to confirm it loaded (head(1000) shows at most the first 1000 rows)
acs_data.head(1000)

Unnamed: 0,Year,GEO,District,Total,In Households,Under 3,3_4,5,6-8,9-11,12-14,15-17,In Group Quarters,PK_Estimate,Elementary_Estimate,MS_Estimate,HS_Estimate
0,2010,9700000US2928860,"Springfield R-XII School District, Missouri",40689,40530,7332,5431,1783,7063,6157,7309,5455,159,5431,15003,7309,7255
1,2011,9700000US2928860,"Springfield R-XII School District, Missouri",38358,37960,7891,4152,3352,5434,6282,5810,5039,398,4152,15068,5810,6702
2,2012,9700000US2928860,"Springfield R-XII School District, Missouri",39045,38755,7424,5098,2634,6210,5648,6341,5400,290,5098,14492,6341,7182
3,2013,9700000US2928860,"Springfield R-XII School District, Missouri",41939,41644,7941,5600,2521,7014,6677,5118,6773,295,5600,16212,5118,9008
4,2014,9700000US2928860,"Springfield R-XII School District, Missouri",41738,41397,7568,4822,2637,6718,7365,6803,5484,341,4822,16720,6803,7294
5,2015,9700000US2928860,"Springfield R-XII School District, Missouri",40236,39960,7238,4956,1344,7144,6004,6645,6629,276,4956,14492,6645,8817
6,2016,9700000US2928860,"Springfield R-XII School District, Missouri",40188,39964,7081,4839,1821,7896,6138,6802,5387,224,4839,15855,6802,7165
7,2017,9700000US2928860,"Springfield R-XII School District, Missouri",39851,39602,6345,4376,3410,7906,5671,5985,5909,249,4376,16987,5985,7859
8,2018,9700000US2928860,"Springfield R-XII School District, Missouri",40333,40020,7607,4753,2438,5742,6562,6682,6236,313,4753,14742,6682,8294
9,2019,9700000US2928860,"Springfield R-XII School District, Missouri",41268,40712,5802,5214,1499,7103,6359,7700,7035,556,5214,14961,7700,9357


In [2]:
# Step 1: Add up the elementary, middle-school and high-school estimates on each ACS row
acs_data['RESIDENT_ESTIMATE'] = (
    acs_data['Elementary_Estimate'] + acs_data['MS_Estimate'] + acs_data['HS_Estimate']
)

# Year -> RESIDENT_ESTIMATE lookup used for the merge; exact duplicate rows are collapsed
resident_estimate_by_year = acs_data[['Year', 'RESIDENT_ESTIMATE']].drop_duplicates()

# Step 2: Attach the yearly estimate to every enrollment row.
# First remove the columns this cell creates, in case the cell has already been run:
# merging a frame that already carries RESIDENT_ESTIMATE would produce
# RESIDENT_ESTIMATE_x / RESIDENT_ESTIMATE_y and Step 3 would fail with a KeyError.
grade_enrollment_data = grade_enrollment_data.drop(
    columns=['RESIDENT_ESTIMATE', 'ENROLLMENT_RESIDENT_PROPORTION'],
    errors='ignore',
)

# Left join on YEAR (enrollment) = Year (ACS) keeps every enrollment row; years with
# no ACS row get NaN. validate='many_to_one' raises a MergeError if a year maps to
# more than one estimate, instead of silently duplicating enrollment rows.
grade_enrollment_data = pd.merge(
    grade_enrollment_data,
    resident_estimate_by_year,
    left_on='YEAR',
    right_on='Year',
    how='left',
    validate='many_to_one',
)

# The join leaves the ACS key next to YEAR; it carries no extra information
grade_enrollment_data = grade_enrollment_data.drop(columns=['Year'])

# Step 3: ENROLLMENT_RESIDENT_PROPORTION = TOTAL_ENROLLMENT / RESIDENT_ESTIMATE.
# A zero estimate yields NaN rather than inf; a missing estimate stays NaN.
resident_estimate = grade_enrollment_data['RESIDENT_ESTIMATE']
grade_enrollment_data['ENROLLMENT_RESIDENT_PROPORTION'] = np.where(
    resident_estimate != 0,
    grade_enrollment_data['TOTAL_ENROLLMENT'] / resident_estimate,
    np.nan,
)

# Display the updated grade_enrollment_data DataFrame to verify the new columns
grade_enrollment_data.head(1000)

Unnamed: 0,YEAR,COUNTY_DISTRICT_CODE,DISTRICT_NAME,SCHOOL_CODE,SCHOOL_NAME,GRADE,TOTAL_ENROLLMENT,COHORT_SURVIVAL_RATE,RESIDENT_ESTIMATE,ENROLLMENT_RESIDENT_PROPORTION
0,2006,39141,SPRINGFIELD R-XII,1050,CENTRAL HIGH,6,35,,,
1,2006,39141,SPRINGFIELD R-XII,1050,CENTRAL HIGH,7,40,,,
2,2006,39141,SPRINGFIELD R-XII,1050,CENTRAL HIGH,8,40,,,
3,2006,39141,SPRINGFIELD R-XII,1050,CENTRAL HIGH,9,397,,,
4,2006,39141,SPRINGFIELD R-XII,1050,CENTRAL HIGH,10,380,,,
...,...,...,...,...,...,...,...,...,...,...
995,2011,39141,SPRINGFIELD R-XII,3080,PERSHING MIDDLE,6,239,1.039130,27580.0,0.008666
996,2011,39141,SPRINGFIELD R-XII,3080,PERSHING MIDDLE,7,229,0.982833,27580.0,0.008303
997,2011,39141,SPRINGFIELD R-XII,3080,PERSHING MIDDLE,8,236,1.123810,27580.0,0.008557
998,2011,39141,SPRINGFIELD R-XII,3080,PERSHING MIDDLE,0-8,704,1.046062,27580.0,0.025526


In [4]:
# Keep only the enrollment rows whose YEAR lies in the 2010-2023 window (both ends included)
year_column = grade_enrollment_data['YEAR']
within_window = (year_column >= 2010) & (year_column <= 2023)
filtered_data = grade_enrollment_data.loc[within_window]

# Print a preview of the filtered rows (at most the first 1000) to verify the filter
print(filtered_data.head(1000))


      YEAR  COUNTY_DISTRICT_CODE      DISTRICT_NAME  SCHOOL_CODE  \
40    2010                 39141  SPRINGFIELD R-XII         1050   
41    2010                 39141  SPRINGFIELD R-XII         1050   
42    2010                 39141  SPRINGFIELD R-XII         1050   
43    2010                 39141  SPRINGFIELD R-XII         1050   
44    2010                 39141  SPRINGFIELD R-XII         1050   
...    ...                   ...                ...          ...   
1289  2015                 39141  SPRINGFIELD R-XII         3140   
1290  2016                 39141  SPRINGFIELD R-XII         3140   
1291  2016                 39141  SPRINGFIELD R-XII         3140   
1292  2016                 39141  SPRINGFIELD R-XII         3140   
1293  2016                 39141  SPRINGFIELD R-XII         3140   

       SCHOOL_NAME GRADE  TOTAL_ENROLLMENT  COHORT_SURVIVAL_RATE  \
40    CENTRAL HIGH     6                41              1.000000   
41    CENTRAL HIGH     7                43     

In [5]:
# Destination workbook for the filtered enrollment rows
output_path = '../data/enrollment_training_data.xlsx'

# Write the rows to that workbook, leaving out the DataFrame index column
filtered_data.to_excel(excel_writer=output_path, index=False)

# Report where the file was written
print(f"Dataset saved as '{output_path}'.")

Dataset saved as '../data/enrollment_training_data.xlsx'.


In [6]:
# Re-open the workbook written by the save step so cleaning starts from what is on disk
file_path = '../data/enrollment_training_data.xlsx'
data = pd.read_excel(file_path)

# Replace blank values with NaN.
# read_excel already loads truly empty cells as NaN, so an exact match on '' finds
# nothing left to change. Matching empty *or whitespace-only* strings also catches
# cells that look blank but contain spaces. Assigning the result back (instead of
# inplace=True) keeps the step explicit and safe to re-run.
data = data.replace(r'^\s*$', np.nan, regex=True)

# Save the DataFrame back to the same Excel file (index column omitted)
data.to_excel(file_path, index=False)

print(f"Blank values in '{file_path}' have been replaced with NaN.")

Blank values in '../data/enrollment_training_data.xlsx' have been replaced with NaN.
