# lowercase suburb names and change object type

Final dataset:
- rows: one row per suburb
- features: primary school count per suburb, high school count per suburb, tertiary institution count per suburb, average rank of high schools, median rank of high schools, rank of suburb based on total education institution count

suburb = string
Institute_name = string
School_name = string
School_type = string
Postcode = int
Longitude = float
Latitude = float

#### !!  Important  !!
Before running this code make sure all of the education related datasets are properly downloaded / scraped and saved into the landing folder
- for datasets that are scraped check that they are in the landing folder 
- for datasets that need to be manually downloaded BEFORE running this notebook:

      - please move the csv file (university_locations.csv) to the data/landing folder

In [270]:
import pandas as pd
import geopandas as gpd
import numpy as np
from fuzzywuzzy import process, fuzz
import os


### Preprocessing for TAFE locations

In [391]:
# reading TAFE dataset from the landing folder
# (later cells use its 'Locations' and 'Institute Name' columns)
tafe_df = pd.read_csv('../data/landing/TAFE_locations.csv')

In [392]:
# extract suburbs where TAFE campuses are located
def extract_suburbs(location):
    """Return the suburb names contained in a TAFE ``Locations`` string.

    Campus entries are separated by semicolons, and the suburb is the text
    before the first colon of each entry, e.g.
    ``"bendigo: 1 Some St; echuca: 2 Other Rd"`` -> ``["bendigo", "echuca"]``.

    Args:
        location: raw ``Locations`` cell value. Non-string values (such as a
            NaN from a missing cell) are treated as "no campuses".

    Returns:
        list[str]: suburb names in their original order, stripped of
        surrounding whitespace. Empty entries (e.g. from a trailing
        semicolon) are skipped.
    """
    # a missing cell arrives as float NaN, which has no .split()
    if not isinstance(location, str):
        return []

    suburbs = []
    # split on the semicolon alone so "a: x;b: y" and "a: x; b: y" both work
    for part in location.split(';'):
        # everything before the first colon is the suburb; strip stray spaces
        # so the same suburb never appears under two different spellings
        suburb = part.split(':')[0].strip()
        if suburb:
            suburbs.append(suburb)
    return suburbs

# work out the list of campus suburbs for every institute ...
campus_suburbs = tafe_df['Locations'].apply(extract_suburbs)
# ... then unpack each list so every (institute, suburb) pair gets its own row
tafe_df = tafe_df.assign(suburbs=campus_suburbs).explode('suburbs')


In [394]:
# the raw 'Locations' text is no longer needed once suburbs are extracted;
# rename the remaining columns to the snake_case names shared by the other datasets
tafe_df = (
    tafe_df
    .drop(columns='Locations')
    .rename(columns={'Institute Name': 'institute_name', 'suburbs': 'suburb'})
)
# lowercase both text columns so names compare consistently across datasets
for text_column in ('institute_name', 'suburb'):
    tafe_df[text_column] = tafe_df[text_column].str.lower()

In [395]:
# preview the cleaned TAFE frame (one row per institute / suburb pair)
tafe_df

Unnamed: 0,institute_name,suburb
0,bendigo tafe,bendigo
0,bendigo tafe,bendigo
0,bendigo tafe,castlemaine
0,bendigo tafe,echuca
1,box hill institute,box hill
...,...,...
14,victoria university,sunshine
14,victoria university,werribee
15,william angliss institute,melbourne
16,wodonga tafe,barnawartha north


### Preprocessing for University locations

In [342]:
# reading university dataset
# (this csv must be placed in data/landing manually before running the notebook)
university_df = pd.read_csv('../data/landing/university_locations.csv')

In [343]:
# only the university name and the address are used from here on
unused_columns = ['Campus', 'City or Town', 'Post Code', 'Name']
university_df = (
    university_df
    .drop(columns=unused_columns)
    # same column name as the TAFE frame so the two can be concatenated
    .rename(columns={'University': 'institute_name'})
)
university_df['institute_name'] = university_df['institute_name'].str.lower()


In [345]:
# pull the suburb out of the free-text address column
# first alternative: the text between a comma and the next capital "V" (intended to be "VIC");
# second alternative (no comma): the single word right before that "V"
pattern = r'(?:,\s*(.*?)\s*V)|(?:\s+([^\s]+)\s*V)' # i want the text between , and VIC, if theres no , then between a space and VIC
captured_groups = university_df['Address'].str.extract(pattern)
# only one of the two capture groups is filled per row; take whichever one matched
suburb_raw = captured_groups.bfill(axis=1).iloc[:, 0]

# lowercase, then strip commas and quote characters left over from the address text
university_df['suburb'] = suburb_raw.str.lower().str.replace('[,"\']', '', regex=True)

# the address itself is not needed once the suburb has been extracted
university_df = university_df.drop(columns=['Address'])

##### Combine into Tertiary Institution location csv

In [396]:
# stack the TAFE and university rows into one frame (columns: institute_name, suburb);
# ignore_index gives a clean 0..n-1 index, replacing the repeated index left by explode()
tertiary_df = pd.concat([tafe_df, university_df], ignore_index=True)

# save as csv to data/raw folder 
tertiary_df.to_csv('../data/raw/tertiary_insitutions_suburbs.csv', index=False)

### Preprocessing for Primary and Secondary school locations

In [354]:
# reading the 2023 primary / secondary school locations dataset
# specify encoding to handle non-UTF-8 characters
prim_sec_df = pd.read_csv('../data/landing/2023_primary_secondary_locations.csv', encoding='ISO-8859-1')

In [None]:
# source column -> snake_case name used in the rest of the notebook
# (X / Y hold the coordinates and become longitude / latitude)
column_renames = {
    'School_Name': 'school_name',
    'School_Type': 'school_type',
    'Address_Town': 'suburb',
    'Address_Postcode': 'postcode',
    'X': 'longitude',
    'Y': 'latitude'
}
# list of columns to keep, in the same order as the mapping above
columns_to_keep = list(column_renames)

# keep selected columns and rename them in one pass
prim_sec_df = prim_sec_df[columns_to_keep].rename(columns=column_renames)

# lowercase the text columns so they can be matched against the other datasets
for text_column in ('school_name', 'suburb', 'school_type'):
    prim_sec_df[text_column] = prim_sec_df[text_column].str.lower()

In [371]:
# split df into primary schools and secondary schools;
# combined 'pri/sec' schools deliberately land in both frames
school_type_values = prim_sec_df['school_type']
primary_schools_df = prim_sec_df[school_type_values.isin(['primary', 'pri/sec'])]
secondary_schools_df = prim_sec_df[school_type_values.isin(['secondary', 'pri/sec'])]

Creating a ranking for primary schools, based on total primary enrolments (largest school = rank 1)

In [416]:
# read the enrolment counts for all schools (ISO-8859-1 to handle non-UTF-8 characters)
all_enrollments = pd.read_csv('../data/landing/All_schools_enrollments.csv', encoding='ISO-8859-1')

In [417]:
# inspect column names and dtypes - note that the enrolment column names contain literal double quotes
all_enrollments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2290 entries, 0 to 2289
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Education_Sector            2290 non-null   object 
 1   Entity_Type                 2290 non-null   int64  
 2   School_No                   2290 non-null   int64  
 3   School_Name                 2290 non-null   object 
 4   School_Type                 2290 non-null   object 
 5   School_Status               2290 non-null   object 
 6   "Prep Total"                2290 non-null   float64
 7   "Year 1 Total"              2290 non-null   float64
 8   "Year 2 Total"              2290 non-null   float64
 9   "Year 3 Total"              2290 non-null   float64
 10  "Year 4 Total"              2290 non-null   float64
 11  "Year 5 Total"              2290 non-null   float64
 12  "Year 6 Total"              2290 non-null   float64
 13  "Primary Ungraded Total"    2290 

In [423]:
# Step 3: Keep only the school name, school type and primary enrolment total
# (the enrolment header contains literal double quotes in the source file)
prim_enrollments = all_enrollments[['School_Name', 'School_Type', '"Primary Total"']]

# Step 4: Lowercase the headers (dropping the embedded quotes) and every text value
prim_enrollments = prim_enrollments.rename(columns=lambda label: label.lower().replace('"', '').strip())
prim_enrollments = prim_enrollments.apply(lambda series: series.str.lower() if series.dtype == 'object' else series)

# Step 5: Keep primary and combined pri/sec schools only
is_primary = prim_enrollments['school_type'].isin(['primary', 'pri/sec'])

# Step 6: Rename the total, order by size and rank the schools (largest enrolment = rank 1)
prim_enrollments = (
    prim_enrollments[is_primary]
    .rename(columns={'primary total': 'total_enrollments'})
    .sort_values(by='total_enrollments', ascending=False)
)
# tied totals share an averaged rank, which astype(int) then truncates
prim_enrollments['rank'] = prim_enrollments['total_enrollments'].rank(ascending=False).astype(int)

# Step 7: The school type was only needed for the filter above
prim_enrollments = prim_enrollments.drop(columns=['school_type'])

# Quick look at the largest schools
print(prim_enrollments.head())

                      school_name  total_enrollments  rank
1665         alamanda k-9 college             2560.0     1
1644        truganina p-9 college             1815.0     2
2033  tarneit rise primary school             1743.0     3
2099           haileybury college             1634.0     4
2189        bacchus marsh grammar             1610.0     5


In [424]:
# check row count and dtypes of the primary school locations before attaching ranks
primary_schools_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1823 entries, 0 to 2301
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   school_name  1823 non-null   object 
 1   school_type  1823 non-null   object 
 2   suburb       1823 non-null   object 
 3   postcode     1823 non-null   int64  
 4   longitude    1823 non-null   float64
 5   latitude     1823 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 99.7+ KB


Fuzzy match school names so that the rank from the primary enrolments df can be attached to the primary school locations df

In [442]:
# Step 1: Attach ranks wherever the school name matches exactly;
# a left merge keeps every location row and leaves rank as NaN when there is no match
merged_df = primary_schools_df.merge(
    prim_enrollments[['school_name', 'rank']],
    on='school_name',
    how='left',
)

# Step 2: Collect the names that did not get a rank from the exact match
has_no_rank = merged_df['rank'].isna()
unmatched_schools = merged_df.loc[has_no_rank, 'school_name'].tolist()

# Step 3: Perform fuzzy matching for the unmatched schools and track match scores
def fuzzy_match(school_name, choices, threshold=90):
    """Return the best fuzzy match for ``school_name`` among ``choices``.

    Args:
        school_name: the name to look up.
        choices: candidate names to match against.
        threshold: minimum score (inclusive) for a match to be accepted.

    Returns:
        tuple: ``(match, score)`` when the best candidate scores at least
        ``threshold``; otherwise ``(None, None)``. ``(None, None)`` is also
        returned when ``process.extractOne`` finds no candidate at all
        (e.g. ``choices`` is empty), instead of failing while unpacking None.
    """
    best = process.extractOne(school_name, choices)
    if best is None:
        return None, None

    # index rather than unpack: the result carries a third item (the key)
    # when `choices` is a mapping
    match, score = best[0], best[1]
    if score >= threshold:
        return match, score
    return None, None

# Get a list of school names from prim_enrollments for matching
prim_enrollment_names = prim_enrollments['school_name'].tolist()

# Create lists to store the best matches and their scores
# NOTE(review): these two lists are filled below but not read again in this cell
best_matches = []
match_scores = []

# Apply fuzzy matching for unmatched schools
# (a name that occurs several times in unmatched_schools is matched once per occurrence)
for school in unmatched_schools:
    match, score = fuzzy_match(school, prim_enrollment_names)
    best_matches.append(match)
    match_scores.append(score)

    if match:
        # Get the rank for the matched school and update merged_df
        # (.values[0] takes the first rank if the matched name occurs more than once in prim_enrollments;
        #  the rank is written to every merged_df row that carries this school name)
        rank = prim_enrollments[prim_enrollments['school_name'] == match]['rank'].values[0]
        merged_df.loc[merged_df['school_name'] == school, 'rank'] = rank

# Step 4: Remove rows where rank is NaN (no exact match and no fuzzy match above the threshold)
primary_schools_df_clean = merged_df.dropna(subset=['rank'])

# Step 5: Handle duplicate ranks
# Sort by 'rank', then 'school_name', so the first row seen for each rank value is deterministic
primary_schools_df_clean = primary_schools_df_clean.sort_values(by=['rank', 'school_name'])

# Create a set to track rank values that have already been kept once
assigned_ranks = set()

# Create a list to store the rows kept in this first pass
final_rows = []

# First pass: keep the first row encountered for every distinct rank value
# (rank values themselves are never modified in this cell)
for index, row in primary_schools_df_clean.iterrows():
    if row['rank'] not in assigned_ranks:
        # First time this rank value is seen: keep the row
        final_rows.append(row)
        assigned_ranks.add(row['rank'])
    else:
        # This rank value was already kept once; the row is reconsidered in Step 6
        continue

# Convert final_rows back to a DataFrame (row labels are the original index labels)
primary_schools_df_unique = pd.DataFrame(final_rows)

# Step 6: Second pass over the rows that were skipped above because their rank value was already kept
remaining_duplicates = primary_schools_df_clean[~primary_schools_df_clean.index.isin(primary_schools_df_unique.index)]

# Among the skipped rows keep one row per school name, preferring the lowest rank value;
# any further skipped rows with the same school name are dropped
remaining_unique = remaining_duplicates.sort_values(by='rank').drop_duplicates(subset=['school_name'], keep='first')

# Combine both passes; because ranks were not changed, the result can still contain repeated rank values
primary_schools_df_final = pd.concat([primary_schools_df_unique, remaining_unique]).sort_values(by='rank')

# Step 7: Convert 'rank' to int (it became float when NaN was introduced by the left merge)
primary_schools_df_final['rank'] = primary_schools_df_final['rank'].astype(int)


In [443]:
# Display the first rows of the primary school locations with their attached ranks
# (rows are ordered by rank, so these are the schools with the largest enrolments)
print("Updated primary_schools_df with unique ranks:")
print(primary_schools_df_final.head())

Updated primary_schools_df with unique ranks:
                      school_name school_type         suburb  postcode  \
1090         alamanda k-9 college     pri/sec     point cook      3030   
1070        truganina p-9 college     pri/sec      truganina      3029   
1207  tarneit rise primary school     primary        tarneit      3029   
3337           haileybury college     pri/sec    keysborough      3173   
4811        bacchus marsh grammar     pri/sec  bacchus marsh      3340   

      longitude  latitude  rank  
1090  144.74133 -37.90824     1  
1070  144.71909 -37.83877     2  
1207  144.66582 -37.83618     3  
3337  145.14643 -37.99598     4  
4811  144.43159 -37.69027     5  


In [444]:
# save the ranked primary school locations as csv to data/raw folder (index is not written)
primary_schools_df_final.to_csv('../data/raw/primary_school_locations.csv', index=False)

### Preprocessing for Highschool rankings

##### Creating highschool rankings using school completion and achievement information

In [211]:
# read the 2023 senior secondary completion and achievement data used to rank high schools
hs_achievement = pd.read_csv('../data/landing/2023SeniorSecondaryCompletionAndAchievementInformation.csv')

In [212]:
# removing trailing and leading spaces from column names
hs_achievement.columns = hs_achievement.columns.str.strip()
# remove trailing and leading spaces from string entries in df (non-string cells are left as they are)
hs_achievement = hs_achievement.apply(lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x))

# the school name plus the four measures that feed the ranking score
columns_to_keep = ['School', 
                   'Number of VCE and VCE Vocational Major (VM) studies at Units 3 and 4 level with enrolments', 
                   'Percentage of satisfactory VCE completions', 
                   'Median VCE study score', 
                   'Percentage of study scores of 40 and over']
# .copy() makes this an independent frame: the following cells assign into its columns,
# which would otherwise raise SettingWithCopyWarning on a column slice
hs_achievement = hs_achievement[columns_to_keep].copy()

In [213]:
# every column except the school name holds a measure that should be numeric
cols_to_convert = hs_achievement.columns.difference(['School'])

for col in cols_to_convert:
    # anything that cannot be parsed as a number becomes NaN ...
    numeric_values = pd.to_numeric(hs_achievement[col], errors='coerce')
    # ... and every NaN is then replaced by the median of that same column
    hs_achievement[col] = numeric_values.fillna(numeric_values.median())

In [216]:
# relative importance of each measure in the overall score (weights sum to 1)
weights = {
    'Percentage of study scores of 40 and over': 0.4,
    'Median VCE study score': 0.3,
    'Number of VCE and VCE Vocational Major (VM) studies at Units 3 and 4 level with enrolments': 0.15,
    'Percentage of satisfactory VCE completions': 0.15
}

# put every measure on a comparable scale by dividing it by its own maximum
for col in weights:
    column_max = hs_achievement[col].max()
    hs_achievement[col] = hs_achievement[col] / column_max

# total score = weighted sum of the scaled measures, added in the order the weights are listed
weighted_terms = [hs_achievement[name] * weight for name, weight in weights.items()]
hs_achievement['total_score'] = sum(weighted_terms[1:], weighted_terms[0])

# highest total score gets rank 1; tied scores share an averaged rank
hs_achievement['school_rank'] = hs_achievement['total_score'].rank(ascending=False)

In [217]:
# best-ranked schools first
hs_achievement = hs_achievement.sort_values(by='school_rank')
# averaged ranks from ties can be fractional; round them to whole numbers
hs_achievement['school_rank'] = hs_achievement['school_rank'].round().astype(int)

# place 'school_rank' directly after the first column, keeping the other columns in order
other_cols = [name for name in hs_achievement.columns.tolist() if name != 'school_rank']
cols = other_cols[:1] + ['school_rank'] + other_cols[1:]
hs_achievement = hs_achievement[cols]

In [218]:
# NOTE(review): this cell repeats the column reordering already done at the end of the previous cell.
# 'school_rank' is already in the second position by then, so re-running it leaves the frame unchanged
# and the cell looks safe to delete.
cols = hs_achievement.columns.tolist()
# move 'school_rank' to the second position
cols.insert(1, cols.pop(cols.index('school_rank')))
# reorder df
hs_achievement = hs_achievement[cols]

In [219]:
# snake_case headers: lowercase every column name and swap spaces for underscores
hs_achievement = hs_achievement.rename(columns=lambda label: label.lower().replace(' ', '_'))
# lowercase the school names so they can be matched against the school location data
hs_achievement['school'] = hs_achievement['school'].str.lower()

In [220]:
# use the same 'school_name' / 'rank' column names as the primary school data
final_column_names = {'school_rank': 'rank', 'school': 'school_name'}
hs_achievement = hs_achievement.rename(columns=final_column_names)

In [221]:
# save the high school rankings as csv to data/raw folder (index is not written)
hs_achievement.to_csv('../data/raw/highschool_rankings.csv', index=False)

##### fuzzy match school_names to map rankings to schools

In [298]:
# Step 1: Helper that strips generic words from a school name before fuzzy matching
def clean_school_name(name):
    """Return ``name`` with generic school words removed.

    The name is split on whitespace; every word whose lowercase form is one of
    the generic terms below is dropped and the remaining words are re-joined
    with single spaces. Kept words retain their original casing.
    """
    generic_words = {
        'college', 'school', 'centre', 'senior', 'prep', 'year', 'secondary', 'primary',
        'grammar', 'christian', 'catholic', 'p-9', 'p-12', '-9', 'sec',
    }
    kept_words = []
    for word in name.split():
        if word.lower() in generic_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Step 2: Define the matcher used in Step 4.
# NOTE(review): this cell previously called `fuzzy_match_school` although no cell in the notebook
# defines it, so a fresh "Restart & Run All" failed with NameError. By default this definition keeps
# the best candidate regardless of its score - presumably what the lost helper did, since the saved
# output shows a rank for every secondary school. Confirm against the original helper if available.
def fuzzy_match_school(school_name, choices, threshold=None):
    """Return ``(best_match, score)`` for ``school_name`` among ``choices``.

    Args:
        school_name: cleaned school name to look up.
        choices: cleaned candidate names.
        threshold: optional minimum score (inclusive). When given, weaker
            matches are reported as ``(None, None)``. ``None`` disables the cut-off.

    Returns:
        tuple: ``(match, score)``, or ``(None, None)`` when there is no
        candidate or the best score is below ``threshold``.
    """
    best = process.extractOne(school_name, choices)
    if best is None:
        return None, None
    match, score = best[0], best[1]
    if threshold is not None and score < threshold:
        return None, None
    return match, score

# Step 3: Create cleaned school name lists for matching
achievement_schools_cleaned = [clean_school_name(school) for school in hs_achievement['school_name'].tolist()]
secondary_schools_cleaned = [clean_school_name(school) for school in secondary_schools_df['school_name'].tolist()]

# Step 4: Perform fuzzy matching using the cleaned school names
best_matches = []
match_scores = []

for school in secondary_schools_cleaned:
    match, score = fuzzy_match_school(school, achievement_schools_cleaned)
    best_matches.append(match)  # Store the best match (from cleaned names)
    match_scores.append(score)  # Store the match score

# Step 5: Create a DataFrame to store the matches along with their scores
# (the lists are in the same row order as secondary_schools_df)
match_df = pd.DataFrame({
    'original_school_name_in_secondary': secondary_schools_df['school_name'],  # Original school names
    'cleaned_school_name_in_secondary': secondary_schools_cleaned,  # Cleaned school names
    'best_match_cleaned': best_matches,  # Best match (from cleaned names)
    'match_score': match_scores  # Match score
})

# Match score threshold for a valid match.
# NOTE(review): this value is defined but never applied in this cell - every school keeps its best
# match, however weak. Pass it to fuzzy_match_school(..., threshold=threshold) to enforce it.
threshold = 88

# Step 6: Pair every achievement school's cleaned name with its rank
hs_achievement_cleaned = pd.DataFrame({
    'original_school_name_in_achievement': hs_achievement['school_name'],  # Original names
    'cleaned_school_name_in_achievement': achievement_schools_cleaned,  # Cleaned names (same order as hs_achievement)
    'rank': hs_achievement['rank']  # Rank
})

# Step 7: Look up the rank of each best match via the cleaned names.
# Several achievement schools can share one cleaned name, so this merge can yield more than one row per school.
match_df = match_df.merge(hs_achievement_cleaned[['cleaned_school_name_in_achievement', 'rank']], 
                          left_on='best_match_cleaned', right_on='cleaned_school_name_in_achievement', how='left')

# Step 8: Attach the rank to the secondary school locations by original school name.
# Repeated school names multiply rows here as well.
final_secondary_schools_df = pd.merge(
    secondary_schools_df, 
    match_df[['original_school_name_in_secondary', 'rank']], 
    left_on='school_name', 
    right_on='original_school_name_in_secondary', 
    how='left'
)

# Drop the helper key column that the merge carried over
final_secondary_schools_df = final_secondary_schools_df.drop(columns=['original_school_name_in_secondary'])


In [363]:


# Order the rows so that, for every school name, its best (numerically smallest) rank comes first
final_secondary_schools_df = final_secondary_schools_df.sort_values(by=['school_name', 'rank'])

# Keep only the first row per school name, i.e. the one with the best rank
is_repeated_name = final_secondary_schools_df.duplicated(subset='school_name', keep='first')
final_secondary_schools_df_cleaned = final_secondary_schools_df[~is_repeated_name]

# Rows that share both school name and rank with at least one other row (kept for inspection only)
same_name_and_rank = final_secondary_schools_df.duplicated(subset=['school_name', 'rank'], keep=False)
duplicates_same_rank = final_secondary_schools_df[same_name_and_rank]

# Show the de-duplicated frame
print("Cleaned DataFrame without duplicates:")
display(final_secondary_schools_df_cleaned)

Cleaned DataFrame without duplicates:


Unnamed: 0,school_name,school_type,suburb,postcode,longitude,latitude,rank
396,academy of mary immaculate,secondary,fitzroy,3065,144.97441,-37.80371,143
480,adass israel school,pri/sec,elsternwick,3185,145.00820,-37.88390,409
627,aitken college,pri/sec,greenvale,3059,144.89070,-37.62771,249
671,al iman college,pri/sec,melton south,3338,144.56548,-37.70712,365
654,al siraat college,pri/sec,epping,3076,145.03753,-37.62426,202
...,...,...,...,...,...,...,...
220,yea high school,secondary,yea,3717,145.40854,-37.21501,483
483,yeshivah college,pri/sec,st kilda east,3183,145.00034,-37.86812,77
638,yesodei hatorah college,pri/sec,elwood,3184,144.98349,-37.87323,49
681,youth2industry college,secondary,south melbourne,3205,144.96665,-37.83731,565


In [375]:
# save the ranked secondary school locations as csv to data/raw folder (index is not written)
final_secondary_schools_df_cleaned.to_csv('../data/raw/secondary_school_locations.csv', index=False)

### Create features for final dataset

In [445]:
# sanity check of the primary school input to the per-suburb features (row count, nulls, dtypes)
primary_schools_df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1706 entries, 1090 to 445
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   school_name  1706 non-null   object 
 1   school_type  1706 non-null   object 
 2   suburb       1706 non-null   object 
 3   postcode     1706 non-null   int64  
 4   longitude    1706 non-null   float64
 5   latitude     1706 non-null   float64
 6   rank         1706 non-null   int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 106.6+ KB


In [378]:
# sanity check of the secondary school input to the per-suburb features (row count, nulls, dtypes)
final_secondary_schools_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 396 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   school_name  597 non-null    object 
 1   school_type  597 non-null    object 
 2   suburb       597 non-null    object 
 3   postcode     597 non-null    int64  
 4   longitude    596 non-null    float64
 5   latitude     596 non-null    float64
 6   rank         597 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 37.3+ KB


In [398]:
# sanity check of the tertiary institution input to the per-suburb features (row count, nulls, dtypes)
tertiary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   institute_name  131 non-null    object
 1   suburb          131 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB


In [460]:
# Step 1: Get unique suburbs from all datasets
suburbs_combined = pd.concat([
    primary_schools_df_final['suburb'], 
    final_secondary_schools_df_cleaned['suburb'], 
    tertiary_df['suburb']
]).drop_duplicates().reset_index(drop=True)

# Step 2: Create a dataframe with one row per unique suburb.
# to_frame(name=...) sets the column name explicitly; pd.DataFrame(series, columns=[...]) would
# silently give an all-NaN column if the series name ever differed from 'suburb'.
final_df = suburbs_combined.to_frame(name='suburb')

# Step 3: Add primary school count per suburb
# (all merges below are left merges, so a suburb without that institution type gets NaN for now)
primary_school_count = primary_schools_df_final.groupby('suburb').size().reset_index(name='primary_school_count')
final_df = final_df.merge(primary_school_count, on='suburb', how='left')

# Step 4: Add secondary school (highschool) count per suburb
highschool_count = final_secondary_schools_df_cleaned.groupby('suburb').size().reset_index(name='highschool_count')
final_df = final_df.merge(highschool_count, on='suburb', how='left')

# Step 5: Add tertiary institution count per suburb
tertiary_count = tertiary_df.groupby('suburb').size().reset_index(name='tertiary_institutions_count')
final_df = final_df.merge(tertiary_count, on='suburb', how='left')

# Step 6: Add average rank of highschools per suburb
avg_highschool_rank = final_secondary_schools_df_cleaned.groupby('suburb')['rank'].mean().reset_index(name='average_highschool_rank')
final_df = final_df.merge(avg_highschool_rank, on='suburb', how='left')

# Step 7: Add median rank of highschools per suburb
median_highschool_rank = final_secondary_schools_df_cleaned.groupby('suburb')['rank'].median().reset_index(name='median_highschool_rank')
final_df = final_df.merge(median_highschool_rank, on='suburb', how='left')

# Step 8: Add average and median rank for primary schools per suburb
avg_primary_school_rank = primary_schools_df_final.groupby('suburb')['rank'].mean().reset_index(name='average_primary_school_rank')
final_df = final_df.merge(avg_primary_school_rank, on='suburb', how='left')

median_primary_school_rank = primary_schools_df_final.groupby('suburb')['rank'].median().reset_index(name='median_primary_school_rank')
final_df = final_df.merge(median_primary_school_rank, on='suburb', how='left')

# Step 9: Create combined 'average_school_rank' and 'median_school_rank' using both primary and highschool ranks
# (row-wise mean / median skip NaN, so a suburb with only one school type uses that type's value)
final_df['average_school_rank'] = final_df[['average_primary_school_rank', 'average_highschool_rank']].mean(axis=1)
final_df['median_school_rank'] = final_df[['median_primary_school_rank', 'median_highschool_rank']].median(axis=1)

# Step 10: Calculate total educational institution count (missing counts are skipped by sum)
final_df['total_education_count'] = final_df[['primary_school_count', 'highschool_count', 'tertiary_institutions_count']].sum(axis=1)

# Step 11: Rank suburbs based on total education institutions count (most institutions = rank 1)
final_df['suburb_education_rank'] = final_df['total_education_count'].rank(ascending=False)

# Step 12: Rank suburbs based on median_school_rank (lowest median rank = rank 1)
final_df['suburb_median_school_rank'] = final_df['median_school_rank'].rank(ascending=True)

# Step 13: Fill NaN values in the rank columns before converting to int
# NOTE(review): a suburb without any ranked school has no median_school_rank and therefore ends up
# with suburb_median_school_rank == 0. Treat 0 as "not ranked", not as "better than rank 1".
final_df['suburb_education_rank'] = final_df['suburb_education_rank'].fillna(0)
final_df['suburb_median_school_rank'] = final_df['suburb_median_school_rank'].fillna(0)

# Step 14: Round rank columns to the nearest whole number and convert to int
# (tied suburbs share an averaged rank, which can be fractional)
final_df['suburb_education_rank'] = final_df['suburb_education_rank'].round().astype(int)
final_df['suburb_median_school_rank'] = final_df['suburb_median_school_rank'].round().astype(int)

# Step 15: A suburb with no institution of a given type has a count of 0
count_columns = ['primary_school_count', 'highschool_count', 'tertiary_institutions_count']
final_df[count_columns] = final_df[count_columns].fillna(0)

# The six average / median rank columns intentionally keep NaN where a suburb has no ranked school.
# They were previously filled with the string 'N/A', which turned these numeric columns into
# object dtype and had to be coerced back to NaN afterwards.


In [462]:
# The average / median rank columns must be numeric for the summary statistics below
columns_to_convert = [
    'average_highschool_rank', 'median_highschool_rank',
    'average_primary_school_rank', 'median_primary_school_rank',
    'average_school_rank', 'median_school_rank',
]

# Coerce column by column; any value that is not a number (e.g. an 'N/A' placeholder) becomes NaN
for rank_column in columns_to_convert:
    final_df[rank_column] = pd.to_numeric(final_df[rank_column], errors='coerce')

In [464]:
# summary statistics of the numeric per-suburb features
# (the 'count' row shows how many suburbs have a value for each rank statistic)
final_df.describe()

Unnamed: 0,primary_school_count,highschool_count,tertiary_institutions_count,average_highschool_rank,median_highschool_rank,average_primary_school_rank,median_primary_school_rank,average_school_rank,median_school_rank,total_education_count,suburb_education_rank,suburb_median_school_rank
count,944.0,944.0,944.0,374.0,374.0,914.0,914.0,934.0,934.0,944.0,944.0,944.0
mean,1.807203,0.632415,0.138771,282.005405,282.016043,1050.869209,1046.473742,931.804592,929.228319,2.57839,472.40678,462.554025
std,1.598286,1.033607,0.64706,156.878021,159.442325,508.642401,518.589531,534.038618,538.086732,2.719514,250.97864,272.558284
min,0.0,0.0,0.0,1.0,1.0,17.0,17.0,17.0,17.0,1.0,1.0,0.0
25%,1.0,0.0,0.0,146.5,144.125,624.75,615.0,464.625,458.1875,1.0,249.0,226.75
50%,1.0,0.0,0.0,285.25,286.0,1059.5,1074.0,840.25,844.0,1.0,695.0,462.5
75%,2.0,1.0,0.0,403.75,407.0,1528.0,1528.0,1474.25,1474.25,3.0,695.0,698.25
max,13.0,7.0,13.0,588.0,588.0,1820.0,1820.0,1820.0,1820.0,22.0,695.0,934.0


In [466]:
# save the per-suburb education features as csv to data/raw folder (index is not written)
final_df.to_csv('../data/raw/education_final_df.csv', index=False)