# Pre-process External Datasets

Perform basic data type transformations and column renaming on the external datasets, then save the results into the `raw` folder.

In [14]:
import os
import pandas as pd
import geopandas as gpd

## ABS SA2 Boundary Shapefile

Read file

In [40]:
# Load the SA2 boundary shapefile from the landing folder
sf_path = "../data/landing/sa2_data/boundary/SA2_2021_AUST_GDA2020.shp"
sf = gpd.read_file(sf_path)

# Preview the first few rows
sf.head()

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21899 -35.36738, 149.218 -35.366..."
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.19572 -35.36126, 149.1997 -35.35..."


In [41]:
# List the column names as loaded, before any renaming
sf.columns

Index(['SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21',
       'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21',
       'STE_CODE21', 'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21',
       'LOCI_URI21', 'geometry'],
      dtype='object')

Edit column names so that they use a consistent case and match the naming used in the other datasets

In [42]:
# Drop the trailing "21" year suffix from each column name and lowercase it.
# Only a suffix is removed (rather than every "21" substring) so that a "21"
# appearing elsewhere in a name is left intact; names without the suffix
# (e.g. "geometry") are only lowercased.
sf.columns = [(c[:-2] if c.endswith("21") else c).lower() for c in sf.columns]

sf.columns

Index(['sa2_code', 'sa2_name', 'chg_flag', 'chg_lbl', 'sa3_code', 'sa3_name',
       'sa4_code', 'sa4_name', 'gcc_code', 'gcc_name', 'ste_code', 'ste_name',
       'aus_code', 'aus_name', 'areasqkm', 'loci_uri', 'geometry'],
      dtype='object')

Check data types

In [43]:
# Show the dtype of each column after renaming
sf.dtypes

sa2_code      object
sa2_name      object
chg_flag      object
chg_lbl       object
sa3_code      object
sa3_name      object
sa4_code      object
sa4_name      object
gcc_code      object
gcc_name      object
ste_code      object
ste_name      object
aus_code      object
aus_name      object
areasqkm     float64
loci_uri      object
geometry    geometry
dtype: object

Save into raw folder

In [44]:
sf_dir = "../data/raw/sa2_data/boundary"

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder already exists, so no separate existence check is needed
os.makedirs(sf_dir, exist_ok=True)

sf_output = f"{sf_dir}/sa2_boundary.shp"

# Write the renamed boundary data to the .shp path in the raw folder
sf.to_file(sf_output)

## ABS SA2 Historical Population Data

Read file, ignoring excess formatting cells

In [5]:
# Rows 5-6 (0-indexed) of "Table 1" form the two-level column header;
# skipfooter drops the last 2 rows of the sheet (treated here as formatting, not data).
pop_df = pd.read_excel(
    "../data/landing/sa2_data/sa2_population_2001-23.xlsx",
    sheet_name="Table 1",
    header=[5, 6],
    skipfooter=2,
)

pop_df.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,2001,2002,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
Unnamed: 0_level_1,GCCSA code,GCCSA name,SA4 code,SA4 name,SA3 code,SA3 name,SA2 code,SA2 name,no.,no.,...,no.,no.,no.,no.,no.,no.,no.,no.,no.,no.
0,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021007,Braidwood,2760,2811,...,3762,3849,3950,4041,4145,4218,4282,4332,4366,4396
1,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021008,Karabar,9129,9199,...,8731,8603,8531,8530,8516,8500,8535,8548,8528,8483
2,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021009,Queanbeyan,9717,9513,...,11199,11213,11230,11362,11460,11468,11460,11375,11391,11420
3,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021010,Queanbeyan - East,3925,4073,...,4967,4961,4970,5016,5079,5126,5089,5097,5091,5099
4,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021012,Queanbeyan West - Jerrabomberra,9425,10257,...,13193,13164,13150,13090,13022,12955,12821,12748,12781,12873


In [8]:
# Inspect the two-level (MultiIndex) header produced by header=[5, 6]
pop_df.columns

MultiIndex([('Unnamed: 0_level_0', 'GCCSA code'),
            ('Unnamed: 1_level_0', 'GCCSA name'),
            ('Unnamed: 2_level_0',   'SA4 code'),
            ('Unnamed: 3_level_0',   'SA4 name'),
            ('Unnamed: 4_level_0',   'SA3 code'),
            ('Unnamed: 5_level_0',   'SA3 name'),
            ('Unnamed: 6_level_0',   'SA2 code'),
            ('Unnamed: 7_level_0',   'SA2 name'),
            (                2001,        'no.'),
            (                2002,        'no.'),
            (                2003,        'no.'),
            (                2004,        'no.'),
            (                2005,        'no.'),
            (                2006,        'no.'),
            (                2007,        'no.'),
            (                2008,        'no.'),
            (                2009,        'no.'),
            (                2010,        'no.'),
            (                2011,        'no.'),
            (                2012,        'no.'),


Edit column names for clarity and consistency with other datasets

In [9]:
# Flatten the two-level header into a single list of column names
pop_new_cols = [
    # First 8 columns (geography codes/names): use the second header level,
    # lowercased, with spaces -> "_" and "gccsa" -> "gcc" to match other datasets.
    # Remaining columns: use the first header level (the year) as a string.
    bottom.lower().replace(" ", "_").replace("gccsa", "gcc") if position < 8 else str(top)
    for position, (top, bottom) in enumerate(pop_df.columns)
]

pop_new_cols

['gcc_code',
 'gcc_name',
 'sa4_code',
 'sa4_name',
 'sa3_code',
 'sa3_name',
 'sa2_code',
 'sa2_name',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023']

Assign new column names

In [10]:
# Replace the two-level header with the flat names in pop_new_cols.
# NOTE: this overwrites pop_df.columns in place. The cell that builds
# pop_new_cols indexes each column label as a (level 0, level 1) tuple, so it
# should only be re-run after re-reading the file.
pop_df.columns = pop_new_cols

pop_df.head()

Unnamed: 0,gcc_code,gcc_name,sa4_code,sa4_name,sa3_code,sa3_name,sa2_code,sa2_name,2001,2002,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021007,Braidwood,2760,2811,...,3762,3849,3950,4041,4145,4218,4282,4332,4366,4396
1,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021008,Karabar,9129,9199,...,8731,8603,8531,8530,8516,8500,8535,8548,8528,8483
2,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021009,Queanbeyan,9717,9513,...,11199,11213,11230,11362,11460,11468,11460,11375,11391,11420
3,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021010,Queanbeyan - East,3925,4073,...,4967,4961,4970,5016,5079,5126,5089,5097,5091,5099
4,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021012,Queanbeyan West - Jerrabomberra,9425,10257,...,13193,13164,13150,13090,13022,12955,12821,12748,12781,12873


Check data types

In [11]:
# Show the dtypes pandas inferred on load.
# NOTE(review): in the outputs recorded in this notebook the sa2/sa3/sa4 code
# columns are int64 here but object in the boundary shapefile - confirm the
# types are aligned before joining on these codes.
pop_df.dtypes

gcc_code    object
gcc_name    object
sa4_code     int64
sa4_name    object
sa3_code     int64
sa3_name    object
sa2_code     int64
sa2_name    object
2001         int64
2002         int64
2003         int64
2004         int64
2005         int64
2006         int64
2007         int64
2008         int64
2009         int64
2010         int64
2011         int64
2012         int64
2013         int64
2014         int64
2015         int64
2016         int64
2017         int64
2018         int64
2019         int64
2020         int64
2021         int64
2022         int64
2023         int64
dtype: object

Save into raw folder

In [12]:
pop_df_dir = "../data/raw/sa2_data"

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder already exists, so no separate existence check is needed
os.makedirs(pop_df_dir, exist_ok=True)

pop_df_output = f"{pop_df_dir}/sa2_population_2001-23.csv"

# index=False: the default integer row index is not written to the CSV
pop_df.to_csv(pop_df_output, index = False)

## ABS SA2 Income Data

Read file, ignoring excess formatting cells

In [6]:
# Rows 5-6 (0-indexed) of "Table 2.4" form the two-level column header and the
# last 5 rows of the sheet are skipped. "," is treated as a thousands separator
# and cells containing "np" are read as missing values.
income_df = pd.read_excel(
    "../data/landing/sa2_data/sa2_income_2021.xlsx",
    sheet_name="Table 2.4",
    header=[5, 6],
    skipfooter=5,
    thousands=",",
    na_values="np",
)

income_df.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Earners,Median age of earners,Sum,Median,Mean,P80/P20,P80/P50,P20/P50,P10/P50,Gini coefficient,Top 1%,Top 5%,Top 10%,Lowest Quartile,Second Quartile,Third Quartile,Highest Quartile
Unnamed: 0_level_1,SA2,SA2 NAME,(persons),years,$,$,$,ratio,ratio,ratio,ratio,coef.,%,%,%,%,%,%,%
0,Australia,,14760008.0,42.0,1040901000000.0,54890.0,70522.0,5.0,1.88,0.38,0.14,0.483,9.8,22.5,33.4,25.0,25.0,25.0,25.0
1,New South Wales,,4603736.0,42.0,341107500000.0,55854.0,74094.0,5.0,1.9,0.38,0.15,0.497,11.2,24.3,35.2,24.6,24.6,24.7,26.1
2,101021007,Braidwood,2467.0,51.0,169986700.0,46640.0,68904.0,6.84,2.05,0.3,0.07,0.615,21.2,32.6,42.7,31.8,24.3,21.7,22.2
3,101021008,Karabar,5103.0,42.0,355538300.0,65564.0,69672.0,3.37,1.57,0.47,0.23,0.365,4.4,14.5,24.1,17.8,22.6,32.9,26.7
4,101021009,Queanbeyan,7028.0,39.0,486157400.0,63528.0,69174.0,3.24,1.58,0.49,0.26,0.368,4.6,14.7,24.5,16.8,24.0,33.4,25.7


In [7]:
# Inspect the two-level (MultiIndex) header produced by header=[5, 6]
income_df.columns

MultiIndex([(   'Unnamed: 0_level_0',       'SA2'),
            (   'Unnamed: 1_level_0',  'SA2 NAME'),
            (              'Earners', '(persons)'),
            ('Median age of earners',     'years'),
            (                  'Sum',         '$'),
            (               'Median',         '$'),
            (                 'Mean',         '$'),
            (              'P80/P20',     'ratio'),
            (              'P80/P50',     'ratio'),
            (              'P20/P50',     'ratio'),
            (              'P10/P50',     'ratio'),
            (     'Gini coefficient',     'coef.'),
            (               'Top 1%',         '%'),
            (               'Top 5%',         '%'),
            (              'Top 10%',         '%'),
            (      'Lowest Quartile',         '%'),
            (      'Second Quartile',         '%'),
            (       'Third Quartile',         '%'),
            (     'Highest Quartile',         '%')],
           

Edit column names for clarity and consistency with other datasets

In [8]:
# Flatten the two-level header into a single list of lowercase, underscore-separated names
income_new_cols = [
    # First 2 columns: the label is in the second header level.
    # Remaining columns: the label is in the first header level.
    (levels[1] if position < 2 else levels[0]).lower().replace(" ", "_")
    for position, levels in enumerate(income_df.columns)
]

income_new_cols

['sa2',
 'sa2_name',
 'earners',
 'median_age_of_earners',
 'sum',
 'median',
 'mean',
 'p80/p20',
 'p80/p50',
 'p20/p50',
 'p10/p50',
 'gini_coefficient',
 'top_1%',
 'top_5%',
 'top_10%',
 'lowest_quartile',
 'second_quartile',
 'third_quartile',
 'highest_quartile']

Other datasets have `sa2_code` instead of `sa2`. Change for consistency with other datasets

In [10]:
# Rename the first entry ("sa2") so the SA2 code column has the same name as in the other datasets
income_new_cols[0] = "sa2_code"

Assign new column names

In [11]:
# Replace the two-level header with the flat names in income_new_cols.
# NOTE: this overwrites income_df.columns in place. The cell that builds
# income_new_cols indexes each column label as a (level 0, level 1) tuple, so
# it should only be re-run after re-reading the file.
income_df.columns = income_new_cols

income_df.head()

Unnamed: 0,sa2_code,sa2_name,earners,median_age_of_earners,sum,median,mean,p80/p20,p80/p50,p20/p50,p10/p50,gini_coefficient,top_1%,top_5%,top_10%,lowest_quartile,second_quartile,third_quartile,highest_quartile
0,Australia,,14760008.0,42.0,1040901000000.0,54890.0,70522.0,5.0,1.88,0.38,0.14,0.483,9.8,22.5,33.4,25.0,25.0,25.0,25.0
1,New South Wales,,4603736.0,42.0,341107500000.0,55854.0,74094.0,5.0,1.9,0.38,0.15,0.497,11.2,24.3,35.2,24.6,24.6,24.7,26.1
2,101021007,Braidwood,2467.0,51.0,169986700.0,46640.0,68904.0,6.84,2.05,0.3,0.07,0.615,21.2,32.6,42.7,31.8,24.3,21.7,22.2
3,101021008,Karabar,5103.0,42.0,355538300.0,65564.0,69672.0,3.37,1.57,0.47,0.23,0.365,4.4,14.5,24.1,17.8,22.6,32.9,26.7
4,101021009,Queanbeyan,7028.0,39.0,486157400.0,63528.0,69174.0,3.24,1.58,0.49,0.26,0.368,4.6,14.7,24.5,16.8,24.0,33.4,25.7


Check data types

In [12]:
# Show the dtypes pandas inferred on load.
# NOTE(review): the recorded preview shows aggregate rows (e.g. "Australia",
# "New South Wales") in sa2_code, presumably why it is object rather than
# numeric - confirm whether these rows should be filtered out downstream.
income_df.dtypes

sa2_code                  object
sa2_name                  object
earners                  float64
median_age_of_earners    float64
sum                      float64
median                   float64
mean                     float64
p80/p20                  float64
p80/p50                  float64
p20/p50                  float64
p10/p50                  float64
gini_coefficient         float64
top_1%                   float64
top_5%                   float64
top_10%                  float64
lowest_quartile          float64
second_quartile          float64
third_quartile           float64
highest_quartile         float64
dtype: object

Save into raw folder

In [13]:
income_df_dir = "../data/raw/sa2_data"

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder already exists, so no separate existence check is needed
os.makedirs(income_df_dir, exist_ok=True)

income_df_output = f"{income_df_dir}/sa2_income_2021.csv"

# index=False: the default integer row index is not written to the CSV
income_df.to_csv(income_df_output, index = False)

## Department of Education School Location Data

Read file

In [19]:
# encoding_errors="replace": bytes that cannot be decoded are substituted
# instead of raising an error while reading the file
school_path = "../data/landing/school_data/school_location.csv"
school_df = pd.read_csv(school_path, encoding_errors="replace")

school_df.head()

Unnamed: 0,Education_Sector,Entity_Type,School_No,School_Name,School_Type,School_Status,Address_Line_1,Address_Line_2,Address_Town,Address_State,...,Postal_Address_Line_1,Postal_Address_Line_2,Postal_Town,Postal_State,Postal_Postcode,Full_Phone_No,LGA_ID,LGA_Name,X,Y
0,Government,1,1,Alberton Primary School,Primary,O,21 Thomson Street,,Alberton,VIC,...,21 Thomson Street,,ALBERTON,VIC,3971,03 5183 2412,681,Wellington (S),146.6666,-38.61771
1,Government,1,3,Allansford and District Primary School,Primary,O,Frank Street,,Allansford,VIC,...,Frank Street,,ALLANSFORD,VIC,3277,03 5565 1382,673,Warrnambool (C),142.59039,-38.38628
2,Government,1,4,Avoca Primary School,Primary,O,118 Barnett Street,,Avoca,VIC,...,P O Box 12,,AVOCA,VIC,3467,03 5465 3176,599,Pyrenees (S),143.47565,-37.0845
3,Government,1,8,Avenel Primary School,Primary,O,40 Anderson Street,,Avenel,VIC,...,40 Anderson Street,,AVENEL,VIC,3664,03 5796 2264,643,Strathbogie (S),145.23472,-36.90137
4,Government,1,12,Warrandyte Primary School,Primary,O,5-11 Forbes Street,,Warrandyte,VIC,...,5-11 Forbes Street,,WARRANDYTE,VIC,3113,03 9844 3537,421,Manningham (C),145.21398,-37.74268


In [20]:
# List the column names as loaded, before any renaming
school_df.columns

Index(['Education_Sector', 'Entity_Type', 'School_No', 'School_Name',
       'School_Type', 'School_Status', 'Address_Line_1', 'Address_Line_2',
       'Address_Town', 'Address_State', 'Address_Postcode',
       'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Town',
       'Postal_State', 'Postal_Postcode', 'Full_Phone_No', 'LGA_ID',
       'LGA_Name', 'X', 'Y'],
      dtype='object')

Make column name casing consistent

In [21]:
# Lowercase every column name
school_df.columns = list(map(str.lower, school_df.columns))

school_df.columns

Index(['education_sector', 'entity_type', 'school_no', 'school_name',
       'school_type', 'school_status', 'address_line_1', 'address_line_2',
       'address_town', 'address_state', 'address_postcode',
       'postal_address_line_1', 'postal_address_line_2', 'postal_town',
       'postal_state', 'postal_postcode', 'full_phone_no', 'lga_id',
       'lga_name', 'x', 'y'],
      dtype='object')

Check data types

In [22]:
# Show the dtypes pandas inferred on load
school_df.dtypes

education_sector          object
entity_type                int64
school_no                  int64
school_name               object
school_type               object
school_status             object
address_line_1            object
address_line_2            object
address_town              object
address_state             object
address_postcode           int64
postal_address_line_1     object
postal_address_line_2     object
postal_town               object
postal_state              object
postal_postcode            int64
full_phone_no             object
lga_id                     int64
lga_name                  object
x                        float64
y                        float64
dtype: object

Save into raw folder

In [23]:
school_df_dir = "../data/raw/school_data"

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder already exists, so no separate existence check is needed
os.makedirs(school_df_dir, exist_ok=True)

school_df_output = f"{school_df_dir}/school_location.csv"

# index=False: the default integer row index is not written to the CSV
school_df.to_csv(school_df_output, index = False)

## DFFH Suburb Historical Rent Data

Read file, ignoring excess formatting cells

In [80]:
# Rows 1-2 (0-indexed) of the "All properties" sheet form the two-level column
# header, the first two columns become the row index, and "-" cells are read
# as missing values.
rent_df = pd.read_excel(
    "../data/landing/suburb_data/suburb_rent_hist.xlsx",
    sheet_name="All properties",
    header=[1, 2],
    index_col=[0, 1],
    na_values="-",
)

rent_df.head()

Unnamed: 0_level_0,All properties,Mar 2000,Mar 2000,Jun 2000,Jun 2000,Sep 2000,Sep 2000,Dec 2000,Dec 2000,Mar 2001,Mar 2001,...,Mar 2022,Mar 2022,Jun 2022,Jun 2022,Sep 2022,Sep 2022,Dec 2022,Dec 2022,Mar 2023,Mar 2023
Unnamed: 0_level_1,Unnamed: 1_level_1,Count,Median,Count,Median,Count,Median,Count,Median,Count,Median,...,Count,Median,Count,Median,Count,Median,Count,Median,Count,Median
Inner Melbourne,Albert Park-Middle Park-West St Kilda,1143.0,260.0,1134,260,1177.0,270.0,1178.0,275.0,1208.0,275.0,...,867,500,855,515,881,500,832,525,786,545
Inner Melbourne,Armadale,733.0,200.0,737,200,738.0,205.0,739.0,210.0,718.0,215.0,...,805,430,851,450,852,450,840,460,751,490
Inner Melbourne,Carlton North,864.0,260.0,814,260,799.0,265.0,736.0,270.0,718.0,270.0,...,581,580,535,595,547,600,546,600,490,620
Inner Melbourne,Carlton-Parkville,1339.0,260.0,1304,260,1300.0,260.0,1320.0,260.0,1273.0,260.0,...,6143,310,6018,319,6871,340,6627,350,6690,400
Inner Melbourne,CBD-St Kilda Rd,2132.0,320.0,2264,320,2358.0,320.0,2361.0,320.0,2591.0,320.0,...,17845,365,16792,390,18284,419,17627,450,17426,500


In [82]:
# Inspect the two-level (MultiIndex) header produced by header=[1, 2]
rent_df.columns

MultiIndex([('Mar 2000',  'Count'),
            ('Mar 2000', 'Median'),
            ('Jun 2000',  'Count'),
            ('Jun 2000', 'Median'),
            ('Sep 2000',  'Count'),
            ('Sep 2000', 'Median'),
            ('Dec 2000',  'Count'),
            ('Dec 2000', 'Median'),
            ('Mar 2001',  'Count'),
            ('Mar 2001', 'Median'),
            ...
            ('Mar 2022',  'Count'),
            ('Mar 2022', 'Median'),
            ('Jun 2022',  'Count'),
            ('Jun 2022', 'Median'),
            ('Sep 2022',  'Count'),
            ('Sep 2022', 'Median'),
            ('Dec 2022',  'Count'),
            ('Dec 2022', 'Median'),
            ('Mar 2023',  'Count'),
            ('Mar 2023', 'Median')],
           names=['All properties', None], length=186)

Combine the two header levels into a single column name and make the case consistent

In [83]:
# Join the two header levels with "_" (e.g. "Mar 2000" + "Count" -> "mar_2000_count"),
# lowercasing and replacing spaces with underscores
rent_flat_cols = []
for period, measure in rent_df.columns:
    rent_flat_cols.append(f"{period}_{measure}".lower().replace(" ", "_"))

rent_df.columns = rent_flat_cols

rent_df.head()

Unnamed: 0,Unnamed: 1,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,mar_2001_count,mar_2001_median,...,mar_2022_count,mar_2022_median,jun_2022_count,jun_2022_median,sep_2022_count,sep_2022_median,dec_2022_count,dec_2022_median,mar_2023_count,mar_2023_median
Inner Melbourne,Albert Park-Middle Park-West St Kilda,1143.0,260.0,1134,260,1177.0,270.0,1178.0,275.0,1208.0,275.0,...,867,500,855,515,881,500,832,525,786,545
Inner Melbourne,Armadale,733.0,200.0,737,200,738.0,205.0,739.0,210.0,718.0,215.0,...,805,430,851,450,852,450,840,460,751,490
Inner Melbourne,Carlton North,864.0,260.0,814,260,799.0,265.0,736.0,270.0,718.0,270.0,...,581,580,535,595,547,600,546,600,490,620
Inner Melbourne,Carlton-Parkville,1339.0,260.0,1304,260,1300.0,260.0,1320.0,260.0,1273.0,260.0,...,6143,310,6018,319,6871,340,6627,350,6690,400
Inner Melbourne,CBD-St Kilda Rd,2132.0,320.0,2264,320,2358.0,320.0,2361.0,320.0,2591.0,320.0,...,17845,365,16792,390,18284,419,17627,450,17426,500


Check data types

In [84]:
# Show the dtypes pandas inferred on load.
# NOTE(review): the recorded output mixes int64 and float64 across quarters -
# presumably the float64 columns contain missing values (from "-"); confirm.
rent_df.dtypes

mar_2000_count     float64
mar_2000_median    float64
jun_2000_count       int64
jun_2000_median      int64
sep_2000_count     float64
                    ...   
sep_2022_median      int64
dec_2022_count       int64
dec_2022_median      int64
mar_2023_count       int64
mar_2023_median      int64
Length: 186, dtype: object

Save into raw folder

In [85]:
rent_df_dir = "../data/raw/suburb_data"

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder already exists, so no separate existence check is needed
os.makedirs(rent_df_dir, exist_ok=True)

rent_df_output = f"{rent_df_dir}/suburb_rent_hist.csv"

# The index is written (no index=False): it holds the two leading sheet columns
# that were set as index_col=[0, 1] when the file was read
rent_df.to_csv(rent_df_output)