# Clean Population Data

In [1]:
# Import the required libraries
import requests
import json
import pandas as pd


In [2]:
# Import the population extract from the Resources folder.
# Encodings to try if utf-8 ever fails to decode the file:
#   'utf-8-sig', 'latin-1', 'cp1252', 'utf-16'

# Use a forward slash as the separator: the previous "Resources\ABS..." literal
# relied on "\A" being an invalid (and therefore preserved) escape sequence,
# which raises a SyntaxWarning on recent Python versions and only resolves on
# Windows. A forward slash works on every platform, Windows included.
file_path = "Resources/ABS_ERP_ASGS2021_1.0.0.csv"
population_df = pd.read_csv(file_path, encoding="utf-8")

# Columns from the raw extract that are not used by the rest of this notebook
columns_to_drop = [
    "DATAFLOW",
    "MEASURE: Measure",
    "UNIT_MEASURE: Unit of Measure",
    "OBS_STATUS: Observation Status",
    "OBS_COMMENT: Observation Comment",
]
population_df = population_df.drop(columns=columns_to_drop)

# Replace the "CODE: Label" style headers with short, code-friendly names.
# "ASGS_2021: Region" becomes ABS_SA2_KEY so it can be joined to the postcode
# mapping sheet on that column later on.
new_column_names = {
    "SEX: Sex": "Gender",
    "AGE: Age": "Age",
    "REGION_TYPE: Geography Level": "Geography_Level",
    "ASGS_2021: Region": "ABS_SA2_KEY",
    "FREQ: Frequency": "Frequency",
    "TIME_PERIOD: Time Period": "Year",
    "OBS_VALUE": "Estimated_Resident_Population",
}
population_df = population_df.rename(columns=new_column_names)
population_df


Unnamed: 0,Gender,Age,Geography_Level,ABS_SA2_KEY,Frequency,Year,Estimated_Resident_Population
0,3: Persons,TOT: All ages,AUS: Australia,AUS: Australia,A: Annual,2001,19274701
1,3: Persons,TOT: All ages,AUS: Australia,AUS: Australia,A: Annual,2002,19495210
2,3: Persons,TOT: All ages,AUS: Australia,AUS: Australia,A: Annual,2003,19720737
3,3: Persons,TOT: All ages,AUS: Australia,AUS: Australia,A: Annual,2004,19932722
4,3: Persons,TOT: All ages,AUS: Australia,AUS: Australia,A: Annual,2005,20176844
...,...,...,...,...,...,...,...
3482068,2: Females,8599: 85 and over,GCCSA: Greater Capital City Statistical Areas,9OTER: Other Territories,A: Annual,2017,28
3482069,2: Females,8599: 85 and over,GCCSA: Greater Capital City Statistical Areas,9OTER: Other Territories,A: Annual,2018,28
3482070,2: Females,8599: 85 and over,GCCSA: Greater Capital City Statistical Areas,9OTER: Other Territories,A: Annual,2019,32
3482071,2: Females,8599: 85 and over,GCCSA: Greater Capital City Statistical Areas,9OTER: Other Territories,A: Annual,2020,36


In [3]:
# Load the postcode mapping sheet; it is used to narrow the population data
# down to the sample postcodes for further analysis.
file_path = "Resources/Melbourne Postcodes.xlsx"
sheet_name = "Mapping"

# Keep only mapping rows where all three key fields are populated
required_fields = ['ABS_SA2_KEY', 'HOUSE_LOCALITY', 'SCHOOL_POST_CODE']
melb_postcodes_df = (
    pd.read_excel(file_path, sheet_name=sheet_name)
    .dropna(subset=required_fields)
)

# Show the filtered mapping
melb_postcodes_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,RURAL_TOWNSHIP,POST_CODE,ABS_SA2_KEY,HOUSE_LOCALITY,SCHOOL_POST_CODE,SUBURB_NAME,COMMENTS,SUBURB_POSTCODE_COMMENTS
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3053,206041117: Carlton,CARLTON,3053.0,Carlton,,Carlton 3053
2,Inner City municipalities and their suburbs,City of Yarra,Inner,,3054,206071140: Carlton North - Princes Hill,CARLTON NORTH,3054.0,Carlton North,Shared with City of Yarra,Carlton North 3054 (Shared with City of Yarra)
6,Northern municipalities and their suburbs,City of Moonee Valley,Mid,,3031,206031115: Flemington,FLEMINGTON,3031.0,Flemington,Shared with City of Moonee Valley,Flemington 3031 (Shared with City of Moonee Va...
7,Northern municipalities and their suburbs,City of Moonee Valley,Mid,,3031,206031115: Flemington,KENSINGTON,3031.0,Kensington,,Kensington 3031
10,Inner City municipalities and their suburbs,City of Melbourne,Inner,,3051,206041506: North Melbourne,NORTH MELBOURNE,3051.0,North Melbourne,Shared with City of Moonee Valley,North Melbourne 3051 (Shared with City of Moon...
...,...,...,...,...,...,...,...,...,...,...,...
997,Western municipalities and their suburbs,City of Wyndham,Outer,,3030,213011570: Derrimut,WERRIBEE,3030.0,Werribee,,Werribee 3030
1000,Western municipalities and their suburbs,City of Wyndham,Outer,,3030,213011570: Derrimut,WERRIBEE SOUTH,3030.0,Werribee South,,Werribee South 3030
1003,Western municipalities and their suburbs,City of Wyndham,Outer,,3024,213051579: Manor Lakes - Quandong,WYNDHAM VALE,3024.0,Wyndham Vale,,Wyndham Vale 3024
1005,Western municipalities and their suburbs,City of Melton,Outer,Rural localities,3338,213041571: Brookfield,EYNESBURY,3338.0,Eynesbury,Shared with the Shire of Melton,Eynesbury 3338 (Shared with the Shire of Melton)


In [4]:
# Build one comma-separated string of suburb names for every postcode
postcode_suburbs_df = (
    melb_postcodes_df
    .groupby('POST_CODE')['SUBURB_NAME']
    .agg(', '.join)
    .reset_index()
)

# Show the postcode -> suburb names lookup
postcode_suburbs_df

Unnamed: 0,POST_CODE,SUBURB_NAME
0,3003,West Melbourne
1,3011,"Footscray, Seddon"
2,3012,"Brooklyn, Brooklyn, Kingsville, Maidstone, Wes..."
3,3013,"Yarraville, Aintree, Bonnie Brook"
4,3015,"Newport, Spotswood, South Kingsville"
...,...,...
188,3975,"Lynbrook, Lyndhurst"
189,3976,Hampton Park
190,3977,"Botanic Ridge, Cranbourne, Cranbourne East, Cr..."
191,3978,"Clyde, Clyde North"


In [5]:
# Mapping/population columns that are no longer needed once the join is done
columns_to_drop = [
    "RURAL_TOWNSHIP",
    "ABS_SA2_KEY",
    "HOUSE_LOCALITY",
    "COMMENTS",
    "SUBURB_NAME",
    "SUBURB_POSTCODE_COMMENTS",
    "Geography_Level",
    "Frequency",
]

# Join population records onto the mapping rows via the shared ABS_SA2_KEY,
# discard rows without a HOUSE_LOCALITY, then trim the unused columns.
population_clean_1_df = (
    melb_postcodes_df
    .merge(population_df, how='inner', on='ABS_SA2_KEY')
    .dropna(subset=['HOUSE_LOCALITY'])
    .drop(columns=columns_to_drop)
)
population_clean_1_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,POST_CODE,SCHOOL_POST_CODE,Gender,Age,Year,Estimated_Resident_Population
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2001,9529
1,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2002,10022
2,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2003,10611
3,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2004,11224
4,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2005,11924
...,...,...,...,...,...,...,...,...,...
484780,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2017,21
484781,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2018,21
484782,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2019,18
484783,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2020,22


In [6]:
# Attach the per-postcode suburb list and give that column a plural name
new_column_names = {"SUBURB_NAME": "SUBURB_NAMES"}
population_clean_final_df = (
    population_clean_1_df
    .merge(postcode_suburbs_df, how='inner', on='POST_CODE')
    .rename(columns=new_column_names)
)
population_clean_final_df

Unnamed: 0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,POST_CODE,SCHOOL_POST_CODE,Gender,Age,Year,Estimated_Resident_Population,SUBURB_NAMES
0,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2001,9529,Carlton
1,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2002,10022,Carlton
2,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2003,10611,Carlton
3,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2004,11224,Carlton
4,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053,3053.0,3: Persons,TOT: All ages,2005,11924,Carlton
...,...,...,...,...,...,...,...,...,...,...
484780,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2017,21,"Manor Lakes, Wyndham Vale, Mambourin"
484781,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2018,21,"Manor Lakes, Wyndham Vale, Mambourin"
484782,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2019,18,"Manor Lakes, Wyndham Vale, Mambourin"
484783,Western municipalities and their suburbs,City of Wyndham,Outer,3024,3024.0,2: Females,8599: 85 and over,2020,22,"Manor Lakes, Wyndham Vale, Mambourin"


In [7]:
# Count how many distinct postcodes are present in the merged dataset
postcode_column = population_clean_final_df['POST_CODE']
unique_count = postcode_column.nunique()

print(unique_count)

193


In [8]:
# Collect the distinct age-group labels found in the Age column
age_column = population_clean_final_df['Age']
unique_strings = age_column.unique()

# Print the array of labels
print(unique_strings)

['TOT: All ages' 'A04: 0-4' 'A59: 5-9' 'A10: 10-14' 'A15: 15-19'
 'A20: 20-24' 'A25: 25-29' 'A30: 30-34' 'A35: 35-39' 'A40: 40-44'
 'A45: 45-49' 'A50: 50-54' 'A55: 55-59' 'A60: 60-64' 'A65: 65-69'
 'A70: 70-74' 'A75: 75-79' 'A80: 80-84' '8599: 85 and over']


In [9]:
# Does the population column contain any missing values?
population_values = population_clean_final_df['Estimated_Resident_Population']
has_nan = population_values.isna().any()

# True  -> at least one NaN is present in the column
# False -> the column has no NaN values
print(has_nan)

False


In [10]:
# Remove exact duplicate rows, then promote POST_CODE from a column to the index
population_clean_final_df = (
    population_clean_final_df
    .drop_duplicates()
    .set_index('POST_CODE')
)

# Write the cleaned population dataset out as CSV
population_clean_final_df.to_csv(
    "Cleaned_Data/population_clean_final.csv",
    index_label="POST_CODE",
)
population_clean_final_df


Unnamed: 0_level_0,MUNICIPALITY,CITY_SHIRE,SUBURB_GROUP,SCHOOL_POST_CODE,Gender,Age,Year,Estimated_Resident_Population,SUBURB_NAMES
POST_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053.0,3: Persons,TOT: All ages,2001,9529,Carlton
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053.0,3: Persons,TOT: All ages,2002,10022,Carlton
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053.0,3: Persons,TOT: All ages,2003,10611,Carlton
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053.0,3: Persons,TOT: All ages,2004,11224,Carlton
3053,Inner City municipalities and their suburbs,City of Melbourne,Inner,3053.0,3: Persons,TOT: All ages,2005,11924,Carlton
...,...,...,...,...,...,...,...,...,...
3024,Western municipalities and their suburbs,City of Wyndham,Outer,3024.0,2: Females,8599: 85 and over,2017,21,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,3024.0,2: Females,8599: 85 and over,2018,21,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,3024.0,2: Females,8599: 85 and over,2019,18,"Manor Lakes, Wyndham Vale, Mambourin"
3024,Western municipalities and their suburbs,City of Wyndham,Outer,3024.0,2: Females,8599: 85 and over,2020,22,"Manor Lakes, Wyndham Vale, Mambourin"
