# Clean School Data

In [1]:
# Import the required libraries
import requests
import json
import pandas as pd


In [2]:
#Import School Data
#encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'utf-16']

file_path = "Resources\dv331_schoollocations2022.csv"
schools_df = pd.read_csv(file_path, encoding="latin-1")

columns_to_drop = ["Entity_Type","SCHOOL_NO","School_Status","Address_Line_1","Address_Line_2","Postal_Postcode",
                   "Address_State","Address_Town","Postal_Address_Line_1","Postal_Address_Line_1",
                   "Postal_Address_Line_2","Postal_State","Postal_Town","Full_Phone_No","LGA_ID","LGA_Name"]
schools_df = schools_df.drop(columns_to_drop, axis=1)

new_column_names = {
    "Address_Postcode": "POST_CODE",
    "X": "Longitude",
    "Y":"Latitude"
    }
schools_df = schools_df.rename(columns=new_column_names)
schools_df

Unnamed: 0,Education_Sector,School_Name,School_Type,POST_CODE,Longitude,Latitude
0,Government,Alberton Primary School,Primary,3971,146.66660,-38.61771
1,Government,Allansford and District Primary School,Primary,3277,142.59039,-38.38628
2,Government,Avoca Primary School,Primary,3467,143.47565,-37.08450
3,Government,Avenel Primary School,Primary,3664,145.23472,-36.90137
4,Government,Warrandyte Primary School,Primary,3113,145.21398,-37.74268
...,...,...,...,...,...,...
2294,Independent,Lysterfield Lake College,Primary,3804,145.31604,-37.97748
2295,Independent,Plenty River College,Secondary,3752,145.08148,-37.64875
2296,Catholic,Holy Cross Catholic Primary School,Primary,3064,144.90520,-37.53046
2297,Independent,Sidrah Gardens School,Primary,3804,145.31589,-37.97324


In [3]:
# Check for NaN values in a column
has_nan = schools_df['School_Name'].isna().any()

# Display the result - False (no NaN values in column), True (at least one NaN value in column)
print(has_nan)

False


In [4]:
#Import mapping file - use to filter to sample postcodes for further analysis
# Specify the file path and sheet name
file_path = "Resources/Melbourne Postcodes.xlsx"
sheet_name = "Mapping"

# Read the specified sheet into a DataFrame
melb_postcodes_df = pd.read_excel(file_path, sheet_name=sheet_name)
melb_postcodes_df= melb_postcodes_df.dropna(subset=['ABS_SA2_KEY'])
melb_postcodes_df= melb_postcodes_df.dropna(subset=['HOUSE_LOCALITY'])
melb_postcodes_df= melb_postcodes_df.dropna(subset=['SCHOOL_POST_CODE'])

# Display the DataFrame
melb_postcodes_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,RURAL_TOWNSHIP,POST_CODE,ABS_SA2_KEY,HOUSE_LOCALITY,SCHOOL_POST_CODE,SUBURB_NAME,COMMENTS,SUBURB_POSTCODE_COMMENTS
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3053,206041117: Carlton,CARLTON,3053.0,Carlton,,Carlton 3053
2,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3054,206071140: Carlton North - Princes Hill,CARLTON NORTH,3054.0,Carlton North,Shared with City of Yarra,Carlton North 3054 (Shared with City of Yarra)
6,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3031,206031115: Flemington,FLEMINGTON,3031.0,Flemington,Shared with City of Moonee Valley,Flemington 3031 (Shared with City of Moonee Va...
7,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3031,206031115: Flemington,KENSINGTON,3031.0,Kensington,,Kensington 3031
10,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3051,206041506: North Melbourne,NORTH MELBOURNE,3051.0,North Melbourne,Shared with City of Moonee Valley,North Melbourne 3051 (Shared with City of Moon...
...,...,...,...,...,...,...,...,...,...,...,...
997,Western municipalities and their suburbs,City of Wyndham,Outer,,3030,213011570: Derrimut,WERRIBEE,3030.0,Werribee,,Werribee 3030
1000,Western municipalities and their suburbs,City of Wyndham,Outer,,3030,213011570: Derrimut,WERRIBEE SOUTH,3030.0,Werribee South,,Werribee South 3030
1003,Western municipalities and their suburbs,City of Wyndham,Outer,,3024,213051579: Manor Lakes - Quandong,WYNDHAM VALE,3024.0,Wyndham Vale,,Wyndham Vale 3024
1005,Western municipalities and their suburbs,City of Wyndham,Outer,Rural localities,3338,213041571: Brookfield,EYNESBURY,3338.0,Eynesbury,Shared with the Shire of Melton,Eynesbury 3338 (Shared with the Shire of Melton)


In [5]:
# Concatenate SUBURB_NAME values for each POST_CODE
postcode_suburbs_df = melb_postcodes_df.groupby('POST_CODE')['SUBURB_NAME'].agg(lambda x: ', '.join(x)).reset_index()

# Display the resulting DataFrame
postcode_suburbs_df

Unnamed: 0,POST_CODE,SUBURB_NAME
0,3003,West Melbourne
1,3011,"Footscray, Seddon"
2,3012,"Brooklyn, Brooklyn, Kingsville, Maidstone, Wes..."
3,3013,"Yarraville, Aintree, Bonnie Brook"
4,3015,"Newport, Spotswood, South Kingsville"
...,...,...
188,3975,"Lynbrook, Lyndhurst"
189,3976,Hampton Park
190,3977,"Botanic Ridge, Cranbourne, Cranbourne East, Cr..."
191,3978,"Clyde, Clyde North"


In [6]:
school_clean_1_df = pd.merge(melb_postcodes_df, schools_df, how='inner', on='POST_CODE')
columns_to_drop = ["RURAL_TOWNSHIP","ABS_SA2_KEY","HOUSE_LOCALITY","SCHOOL_POST_CODE",
                   "COMMENTS","SUBURB_NAME","SUBURB_POSTCODE_COMMENTS"]
school_clean_1_df = school_clean_1_df.drop(columns_to_drop, axis=1)
school_clean_1_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,POST_CODE,Education_Sector,School_Name,School_Type,Longitude,Latitude
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,Government,Carlton Gardens Primary School,Primary,144.96951,-37.80205
1,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,Government,Carlton Primary School,Primary,144.97045,-37.79560
2,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Carlton North Primary School,Primary,144.97303,-37.79106
3,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Princes Hill Primary School,Primary,144.96810,-37.78040
4,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Princes Hill Secondary College,Secondary,144.96503,-37.78348
...,...,...,...,...,...,...,...,...,...
3237,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Iramoo Primary School,Primary,144.62857,-37.89587
3238,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Wyndham Vale Primary School,Primary,144.60961,-37.89556
3239,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Riverbend Primary School,Primary,144.62526,-37.87668
3240,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Manor Lakes P-12 College,Pri/Sec,144.60185,-37.87351


In [7]:
school_clean_final_df = pd.merge(school_clean_1_df, postcode_suburbs_df, how='inner', on='POST_CODE')
new_column_names = {
    "SUBURB_NAME": "SUBURB_NAMES",
    }
school_clean_final_df = school_clean_final_df.rename(columns=new_column_names)
school_clean_final_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,POST_CODE,Education_Sector,School_Name,School_Type,Longitude,Latitude,SUBURB_NAMES
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,Government,Carlton Gardens Primary School,Primary,144.96951,-37.80205,Carlton
1,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,Government,Carlton Primary School,Primary,144.97045,-37.79560,Carlton
2,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Carlton North Primary School,Primary,144.97303,-37.79106,"Carlton North, Carlton North, Princes Hill"
3,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Princes Hill Primary School,Primary,144.96810,-37.78040,"Carlton North, Carlton North, Princes Hill"
4,Inner City municipalities and their suburbs,City of Melbourne,Inner,3054,Government,Princes Hill Secondary College,Secondary,144.96503,-37.78348,"Carlton North, Carlton North, Princes Hill"
...,...,...,...,...,...,...,...,...,...,...
3237,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Iramoo Primary School,Primary,144.62857,-37.89587,"Manor Lakes, Wyndham Vale, Mambourin"
3238,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Wyndham Vale Primary School,Primary,144.60961,-37.89556,"Manor Lakes, Wyndham Vale, Mambourin"
3239,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Riverbend Primary School,Primary,144.62526,-37.87668,"Manor Lakes, Wyndham Vale, Mambourin"
3240,Western municipalities and their suburbs,City of Wyndham,Outer,3024,Government,Manor Lakes P-12 College,Pri/Sec,144.60185,-37.87351,"Manor Lakes, Wyndham Vale, Mambourin"


In [8]:
unique_count = school_clean_final_df['POST_CODE'].nunique()

# Display the count of unique strings
print(unique_count)

193


In [9]:
# List all the unique strings in a column
unique_strings = school_clean_final_df['Education_Sector'].unique()

# Display the unique strings
print(unique_strings)

['Government' 'Catholic' 'Independent']


In [10]:
# Check for NaN values in a column
has_nan = school_clean_final_df['SUBURB_GROUP'].isna().any()

# Display the result - False (no NaN values in column), True (at least one NaN value in column)
print(has_nan)

False


In [11]:
# Reset the index and make POST_CODE the new index
school_clean_final_df = school_clean_final_df.set_index('POST_CODE')

# Export cleaned population dataset to a CSV file
school_clean_final_df.to_csv("Cleaned_Data/school_clean_final.csv", index_label="POST_CODE")
school_clean_final_df


Unnamed: 0_level_0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,Education_Sector,School_Name,School_Type,Longitude,Latitude,SUBURB_NAMES
POST_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,Government,Carlton Gardens Primary School,Primary,144.96951,-37.80205,Carlton
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,Government,Carlton Primary School,Primary,144.97045,-37.79560,Carlton
3054,Inner City municipalities and their suburbs,City of Melbourne,Inner,Government,Carlton North Primary School,Primary,144.97303,-37.79106,"Carlton North, Carlton North, Princes Hill"
3054,Inner City municipalities and their suburbs,City of Melbourne,Inner,Government,Princes Hill Primary School,Primary,144.96810,-37.78040,"Carlton North, Carlton North, Princes Hill"
3054,Inner City municipalities and their suburbs,City of Melbourne,Inner,Government,Princes Hill Secondary College,Secondary,144.96503,-37.78348,"Carlton North, Carlton North, Princes Hill"
...,...,...,...,...,...,...,...,...,...
3024,Western municipalities and their suburbs,City of Wyndham,Outer,Government,Iramoo Primary School,Primary,144.62857,-37.89587,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,Government,Wyndham Vale Primary School,Primary,144.60961,-37.89556,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,Government,Riverbend Primary School,Primary,144.62526,-37.87668,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,Government,Manor Lakes P-12 College,Pri/Sec,144.60185,-37.87351,"Manor Lakes, Wyndham Vale, Mambourin"
