## Add in School Districts
Starting from the school quality data, we import a GeoJSON file of the King County school district boundaries and attach each school to the district whose boundary contains it.

In [87]:
# import packages
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [88]:
# Read the inputs: the per-school table and the district boundary polygons
school_data = pd.read_csv(filepath_or_buffer='Cleaned-Data/merged-interpolated.csv')
district_geojson = gpd.read_file(
    'Original-Data/Geospatial/School Districts King County.geojson'
)

In [89]:
# Look up the school district whose boundary contains a school's coordinates
def find_district(row):
    """Return the name and number of the district containing a school.

    Parameters
    ----------
    row : pandas.Series
        One row of the school table. Must provide 'GeoCoded_X' and
        'GeoCoded_Y'.

    Returns
    -------
    pandas.Series
        Two fields, 'District_Name' and 'District_Num', taken from the
        'NAME' and 'DSTNUM' fields of the first polygon in
        ``district_geojson`` that contains the point. Both fields are None
        when no polygon contains it.

    Notes
    -----
    The same two-field Series is returned on every path. Mixing a bare None
    with a Series makes the shape of ``DataFrame.apply(..., axis=1)`` depend
    on which rows happen to match, which can break the two-column assignment
    that consumes the result.
    """
    # shapely's Point takes (x, y). The X/Y columns are passed in that order.
    # NOTE(review): presumably X is longitude and Y is latitude - confirm
    # against the geocoder output and the CRS of district_geojson.
    x = row['GeoCoded_X']
    y = row['GeoCoded_Y']
    point = Point(x, y)

    # Keep only the district polygons that contain the point
    containing_polygons = district_geojson[district_geojson.geometry.contains(point)]

    if containing_polygons.empty:
        # No district matched: still hand back both fields so every row
        # contributes the same columns.
        return pd.Series({'District_Name': None, 'District_Num': None})

    # If several polygons match, the first one wins
    district = containing_polygons.iloc[0]

    return pd.Series({
        'District_Name': district['NAME'],
        'District_Num': district['DSTNUM'],
    })

In [90]:
 # Apply the function and assign the result to new columns
# Run the lookup once per school row, then store the two returned fields
district_info = school_data.apply(find_district, axis=1)
school_data[['District_Name', 'District_Num']] = district_info

In [91]:
# Write the school table, now carrying its district columns, without the index
school_data.to_csv(
    path_or_buf='Cleaned-Data/scores-districts.csv',
    index=False,
)

## Finding Max Scores per District
For model training we need, per district and per year, the highest-quality Elementary, Middle, and High school (the one with the largest `QualityScore`). Let's compute that.

In [93]:
# The first four characters of SchoolYear are the starting year; keep it as an int
start_year_text = school_data['SchoolYear'].str[:4]
school_data['StartYear'] = start_year_text.astype(int)

# Distinct values that the per-year / per-district aggregation iterates over
years = sorted(school_data['StartYear'].unique())
grades = school_data['Grade Category'].unique()
districts = school_data['District_Num'].unique()

# Preview: years on one comma-separated line, then the raw arrays
print(''.join(f"{int(year)}, " for year in years))
print(grades)
print(districts)

2010, 2011, 2012, 2014, 2015, 2016, 2017, 2018, 2020, 2021, 2022, 2023, 
['Elementary School' 'Middle School' 'High School' 'Other' 'K-12' 'PK-12'
 nan]
['17001' '17210' '17414' '17412' '17408' '17411' '17405' '17417' '17403'
 '17401' '17216' '17407' '17415' '17406' '17410' '17409' '17402' '17400'
 '17404']


In [102]:
# storing all new rows made here
rows = []

def find_max(categories, school_year, school_district, data=None):
    """Find the highest-scoring school for one year, district and grade group.

    Parameters
    ----------
    categories : list
        'Grade Category' values that count as a match.
    school_year : int
        Value to match against 'StartYear'.
    school_district
        Value to match against 'District_Num'.
    data : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``school_data``.

    Returns
    -------
    tuple
        ``(row, score)`` where ``row`` is the matching row with the largest
        'QualityScore' and ``score`` is that value. ``(None, None)`` when no
        row matches or every matching row has a missing score.
    """
    if data is None:
        data = school_data

    matches = (
        (data['StartYear'] == school_year)
        & data['Grade Category'].isin(categories)
        & (data['District_Num'] == school_district)
        # A missing score can never be the maximum; excluding it here keeps
        # an all-NaN group from reaching the max lookup at all.
        & data['QualityScore'].notna()
    )
    filtered_data = data[matches]

    if filtered_data.empty:
        return None, None

    # Select by position rather than by index label so that repeated index
    # labels still yield exactly one row. Ties resolve to the first row.
    best_position = filtered_data['QualityScore'].to_numpy().argmax()
    max_row = filtered_data.iloc[best_position]
    return max_row, max_row['QualityScore']
    

# Output column -> grade categories whose best score fills that column
categories_by_column = {
    'Max Elementary Score': ['Elementary School'],
    'Max Middle Score': ['Middle School'],
    'Max High Score': ['High School'],
}

for year in years:
    for district in districts:
        # One record per (year, district); a score is None when nothing matched
        record = {'Year': year, 'District': district}
        for column_name, valid_categories in categories_by_column.items():
            best_row, best_score = find_max(valid_categories, year, district)
            record[column_name] = best_score

        # Append the row to the list
        rows.append(record)

# Build the frame in one go from the collected records
max_scores_df = pd.DataFrame(rows)

print(max_scores_df)


     Year District  Max Elementary Score  Max Middle Score  Max High Score
0    2010    17001              0.296145          0.928950        0.779117
1    2010    17210              0.418491          0.880900        0.452133
2    2010    17414                   NaN          0.927900        0.672500
3    2010    17412                   NaN          0.396931        0.563375
4    2010    17408                   NaN          0.451383        0.580000
..    ...      ...                   ...               ...             ...
223  2023    17410              0.948667          0.876333        0.591000
224  2023    17409              0.555600          0.491000        0.395333
225  2023    17402              0.770667          0.809667        0.857333
226  2023    17400              0.941333          0.541600        0.509000
227  2023    17404                   NaN               NaN        0.050000

[228 rows x 5 columns]


We still have some null values, so we will do some more interpolation on the data.

In [108]:
# Order rows by District, then Year, and renumber the index from zero
max_scores_df = max_scores_df.sort_values(by=['District', 'Year'])
max_scores_df = max_scores_df.reset_index(drop=True)

# Gap-filling helper: linear interpolation first, a constant fallback second
def interpolate_with_fallback(column):
    """Fill missing values in a numeric Series.

    Gaps are filled by linear interpolation in both directions. Anything
    still missing afterwards is replaced by a single fallback constant
    computed from the input: ``mean * (1 / variance)``, or just the mean
    when the variance is exactly zero.

    Parameters
    ----------
    column : pandas.Series
        Values to fill. The input is not modified.

    Returns
    -------
    pandas.Series
        A new Series with the gaps filled.
    """
    # Fallback constant, computed from the untouched input.
    # NOTE(review): mean / variance is not on the same scale as the data -
    # confirm this is the intended fallback.
    average = column.mean()
    spread = column.var()
    if spread != 0:
        filler = average * (1 / spread)
    else:
        filler = average

    # Linear interpolation, allowed to fill in both directions
    gap_filled = column.interpolate(method="linear", limit_direction="both")

    # Whatever interpolation could not fill gets the fallback constant
    return gap_filled.fillna(filler)

# Fill the gaps in each score column along time *within* a district.
# The frame is ordered by District and then Year, so interpolating a whole
# column in one pass would blend the last year of one district into the
# first year of the next one.
score_columns = ["Max Elementary Score", "Max Middle Score", "Max High Score"]

for score_column in score_columns:
    # Whole-column interpolation (the previous behaviour). Used only as a
    # last resort for rows whose own district cannot supply a value, e.g. a
    # district with no score at all in this column, so those rows are filled
    # exactly as they were before.
    across_districts = interpolate_with_fallback(max_scores_df[score_column])

    # Per-district interpolation: each district's series is filled on its own
    within_district = (
        max_scores_df
        .groupby("District")[score_column]
        .transform(interpolate_with_fallback)
    )

    max_scores_df[score_column] = within_district.fillna(across_districts)

# Display the modified DataFrame
max_scores_df

Unnamed: 0,Year,District,Max Elementary Score,Max Middle Score,Max High Score
0,2010,17001,0.296145,0.928950,0.779117
1,2011,17001,0.768102,0.944000,0.820967
2,2012,17001,0.768102,0.940900,0.873267
3,2014,17001,0.937333,0.689000,0.783000
4,2015,17001,0.765750,0.583429,0.587333
...,...,...,...,...,...
223,2018,17417,0.843000,0.533714,0.302714
224,2020,17417,0.795000,0.445600,0.267000
225,2021,17417,0.913333,0.569667,0.411000
226,2022,17417,0.794225,0.498750,0.310114


In [109]:
# Write the per-district, per-year maximum scores to disk without the index
max_scores_df.to_csv(
    path_or_buf='Cleaned-Data/max-district-scores.csv',
    index=False,
)