# ABS preprocessing

This notebook aggregates the Openspace data such that the relevant fields are extracted and the data is neatly formatted into a csv wherein the rows are associated with an SA2

In [30]:
import pandas as pd
import geopandas as gpd
from shapely import Point

In [31]:
business_listing = pd.read_csv("../../data/landing/business_listing.csv")
SAL = gpd.read_file('../../data/landing/SAL_data/SAL_2021_AUST_GDA2020.shp')

business_listing

Unnamed: 0,census_year,block_id,property_id,base_property_id,clue_small_area,trading_name,business_address,industry_anzsic4_code,industry_anzsic4_description,longitude,latitude
0,2017,266,109851,109851,Carlton,Metropoli's Research Pty Ltd,"Level 1, 74 Victoria Street CARLTON 3053",6950,Market Research and Statistical Services,144.965352,-37.806701
1,2017,266,109851,109851,Carlton,J Hong Restaurant,"Ground , 74 Victoria Street CARLTON 3053",4511,Cafes and Restaurants,144.965352,-37.806701
2,2017,266,534003,534003,Carlton,St2 Expresso,70 Victoria Street CARLTON 3053,4512,Takeaway Food Services,144.965473,-37.806714
3,2017,266,664003,664003,Carlton,RMIT Resources Ltd,20 Cardigan Street CARLTON 3053,8102,Higher Education,144.964753,-37.806312
4,2017,266,664005,664005,Carlton,vacant,24 Cardigan Street CARLTON 3053,0,Vacant Space,144.964772,-37.806203
...,...,...,...,...,...,...,...,...,...,...,...
374205,2017,266,106082,106082,Carlton,RMIT University (BLD 42) (Ericson Building),36-40 Lygon Street CARLTON 3053,8102,Higher Education,144.965375,-37.805471
374206,2017,266,106082,106082,Carlton,RMIT University (BLD 95),24-26 Lygon Street CARLTON 3053,8102,Higher Education,144.965375,-37.805471
374207,2017,266,107083,107083,Carlton,RMIT University,11-13 Orr Street CARLTON 3053,8102,Higher Education,144.965017,-37.806389
374208,2017,266,107087,107087,Carlton,Vacant,8-14 Orr Street CARLTON 3053,0,Vacant Space,144.965370,-37.806513


In [32]:
def convert_ansic_code_with_name(ansic_code: int, version: str = "2006") -> str:
    """
    Convert an ANZSIC code into its division and return the name of the division based on ANZSIC version (2006 or 1993).
    
    Args:
    ansic_code (int): The ANZSIC code to convert.
    version (str): The ANZSIC version ("2006" or "1993"). Default is "2006".
    
    Returns:
    str: The division name corresponding to the given ANZSIC code.
    """
    
    # ANZSIC 2006 Division Code Ranges with Names
    anzsic_2006_divisions = {
        "V": ("Vacant Land", range(0,1)),
        "A": ("Agriculture, Forestry and Fishing", range(100, 600)),
        "B": ("Mining", range(600, 1100)),
        "C": ("Manufacturing", range(1100, 2600)),
        "D": ("Electricity, Gas, Water and Waste Services", range(2600, 3000)),
        "E": ("Construction", range(3000, 3300)),
        "F": ("Wholesale Trade", range(3300, 3900)),
        "G": ("Retail Trade", range(3900, 4400)),
        "H": ("Accommodation and Food Services", range(4400, 4600)),
        "I": ("Transport, Postal and Warehousing", range(4600, 5400)),
        "J": ("Information Media and Telecommunications", range(5400, 6100)),
        "K": ("Financial and Insurance Services", range(6200, 6500)),
        "L": ("Rental, Hiring and Real Estate Services", range(6600, 6800)),
        "M": ("Professional, Scientific and Technical Services", range(6900, 7100)),
        "N": ("Administrative and Support Services", range(7200, 7400)),
        "O": ("Public Administration and Safety", range(7500, 7800)),
        "P": ("Education and Training", range(8000, 8300)),
        "Q": ("Health Care and Social Assistance", range(8400, 8800)),
        "R": ("Arts and Recreation Services", range(8900, 9300)),
        "S": ("Other Services", range(9400, 9700))
    }
    

    
    # Find and return the corresponding division name
    for division, (name, code_range) in anzsic_2006_divisions.items():
        if ansic_code in code_range:
            return f"{name}"
    
    return "Unknown Division"


In [33]:
# group by year and count
business_listing['division'] = business_listing['industry_anzsic4_code'].apply(convert_ansic_code_with_name)

In [34]:
business_listing

Unnamed: 0,census_year,block_id,property_id,base_property_id,clue_small_area,trading_name,business_address,industry_anzsic4_code,industry_anzsic4_description,longitude,latitude,division
0,2017,266,109851,109851,Carlton,Metropoli's Research Pty Ltd,"Level 1, 74 Victoria Street CARLTON 3053",6950,Market Research and Statistical Services,144.965352,-37.806701,"Professional, Scientific and Technical Services"
1,2017,266,109851,109851,Carlton,J Hong Restaurant,"Ground , 74 Victoria Street CARLTON 3053",4511,Cafes and Restaurants,144.965352,-37.806701,Accommodation and Food Services
2,2017,266,534003,534003,Carlton,St2 Expresso,70 Victoria Street CARLTON 3053,4512,Takeaway Food Services,144.965473,-37.806714,Accommodation and Food Services
3,2017,266,664003,664003,Carlton,RMIT Resources Ltd,20 Cardigan Street CARLTON 3053,8102,Higher Education,144.964753,-37.806312,Education and Training
4,2017,266,664005,664005,Carlton,vacant,24 Cardigan Street CARLTON 3053,0,Vacant Space,144.964772,-37.806203,Vacant Land
...,...,...,...,...,...,...,...,...,...,...,...,...
374205,2017,266,106082,106082,Carlton,RMIT University (BLD 42) (Ericson Building),36-40 Lygon Street CARLTON 3053,8102,Higher Education,144.965375,-37.805471,Education and Training
374206,2017,266,106082,106082,Carlton,RMIT University (BLD 95),24-26 Lygon Street CARLTON 3053,8102,Higher Education,144.965375,-37.805471,Education and Training
374207,2017,266,107083,107083,Carlton,RMIT University,11-13 Orr Street CARLTON 3053,8102,Higher Education,144.965017,-37.806389,Education and Training
374208,2017,266,107087,107087,Carlton,Vacant,8-14 Orr Street CARLTON 3053,0,Vacant Space,144.965370,-37.806513,Vacant Land


In [35]:
business_listing['industry_anzsic4_description'].unique()

business_listing.groupby('census_year').size()
by_industry = business_listing.groupby(['census_year', 'industry_anzsic4_description']).size().unstack(fill_value=0)
by_industry

top_20_industries = by_industry.sum().nlargest(20)
top_20_industries

industry_anzsic4_description
Vacant Space                                       57392
Cafes and Restaurants                              29666
Legal Services                                     13415
Takeaway Food Services                             11304
Computer System Design and Related Services         9580
Other Auxiliary Finance and Investment Services     9354
Management Advice and Other Consulting Services     8790
Specialist Medical Services                         7027
Hairdressing and Beauty Services                    6327
Clothing Retailing                                  5750
Accommodation                                       5524
Accounting Services                                 5412
Womens Clothing Retailing                           5245
Employment Placement and Recruitment Services       5201
Pubs, Taverns and Bars                              4980
Real Estate Services                                4951
Other Interest Group Services n.e.c.                4753
Ar

In [36]:
# spatial join
SAL = SAL.to_crs('EPSG:4326')

# Create geometry from the latitude and longitude
business_listing['geometry'] = business_listing.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

# Convert to a GeoDataFrame
gdf = gpd.GeoDataFrame(business_listing, geometry='geometry')

In [37]:
businesses_and_SAL = gpd.sjoin(SAL, gdf , how="inner", predicate="intersects")

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  businesses_and_SAL = gpd.sjoin(SAL, gdf , how="inner", predicate="intersects")


In [38]:
businesses_and_SAL

Unnamed: 0,SAL_CODE21,SAL_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,...,property_id,base_property_id,clue_small_area,trading_name,business_address,industry_anzsic4_code,industry_anzsic4_description,longitude,latitude,division
5038,20495,Carlton (Vic.),2,Victoria,AUS,Australia,1.7728,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.062266,0.000181,...,102602,102602,Carlton,Minh Doan,"Ground 0, 3 Drummond Street CARLTON 3053",6931,Legal Services,144.967055,-37.806885,"Professional, Scientific and Technical Services"
5038,20495,Carlton (Vic.),2,Victoria,AUS,Australia,1.7728,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.062266,0.000181,...,102602,102602,Carlton,Anthony Harvey,3 Drummond Street CARLTON 3053,8512,Specialist Medical Services,144.967055,-37.806885,Health Care and Social Assistance
5038,20495,Carlton (Vic.),2,Victoria,AUS,Australia,1.7728,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.062266,0.000181,...,102602,102602,Carlton,Anthony Harvey,3 Drummond Street CARLTON 3053,8512,Specialist Medical Services,144.967055,-37.806885,Health Care and Social Assistance
5038,20495,Carlton (Vic.),2,Victoria,AUS,Australia,1.7728,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.062266,0.000181,...,102602,102602,Carlton,Anthony Harvey,3 Drummond Street CARLTON 3053,8512,Specialist Medical Services,144.967055,-37.806885,Health Care and Social Assistance
5038,20495,Carlton (Vic.),2,Victoria,AUS,Australia,1.7728,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.062266,0.000181,...,102602,102602,Carlton,Anthony Harvey,3 Drummond Street CARLTON 3053,8512,Specialist Medical Services,144.967055,-37.806885,Health Care and Social Assistance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7300,22757,West Melbourne,2,Victoria,AUS,Australia,6.5822,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.139928,0.000673,...,111299,111299,West Melbourne (Industrial),Reserve,120-180 Sims Street WEST MELBOURNE VIC 3003,8922,Nature Reserves and Conservation Parks Operation,144.908307,-37.805751,Arts and Recreation Services
7300,22757,West Melbourne,2,Victoria,AUS,Australia,6.5822,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.139928,0.000673,...,111299,111299,West Melbourne (Industrial),Reserve,120-180 Sims Street WEST MELBOURNE 3003,8922,Nature Reserves and Conservation Parks Operation,144.908307,-37.805751,Arts and Recreation Services
7300,22757,West Melbourne,2,Victoria,AUS,Australia,6.5822,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.139928,0.000673,...,111299,111299,West Melbourne (Industrial),Reserve,120-180 Sims Street WEST MELBOURNE 3003,8922,Nature Reserves and Conservation Parks Operation,144.908307,-37.805751,Arts and Recreation Services
7300,22757,West Melbourne,2,Victoria,AUS,Australia,6.5822,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.139928,0.000673,...,111299,111299,West Melbourne (Industrial),Reserve,120-180 Sims Street WEST MELBOURNE VIC 3003,8922,Nature Reserves and Conservation Parks Operation,144.908307,-37.805751,Arts and Recreation Services


In [39]:
sorted_census_years = business_listing['census_year'].sort_values().unique()
sorted_census_years

array([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [43]:
import os

n = 30
aggregated = business_listing.pivot_table(index='SAL_CODE21', columns='industry_anzsic4_description', aggfunc='size', fill_value=0)
top_n_industries = aggregated.sum().nlargest(n).index

# Ensure the directory exists
os.makedirs("../../data/curated/business_data", exist_ok=True)

df_years = []

for year in sorted_census_years:
    businesses_and_SAL_year = businesses_and_SAL[businesses_and_SAL['census_year'] == year]
    aggregated = businesses_and_SAL_year.pivot_table(index='SAL_CODE21', columns='industry_anzsic4_description', aggfunc='size', fill_value=0)
    # Get top n industries
    
    aggregated = aggregated[top_n_industries]

    aggregated_by_division = businesses_and_SAL_year.pivot_table(index='SAL_CODE21', columns='division', aggfunc='size', fill_value=0)

    combined = aggregated.add(aggregated_by_division, fill_value=0)

    combined.to_csv(f"../../data/curated/business_data/businesses_and_SAL_{year}.csv")

    df_years.append(combined)


    

In [44]:
df_years[-1]

Unnamed: 0_level_0,Accommodation,Accommodation and Food Services,Accounting Services,Administrative and Support Services,"Agriculture, Forestry and Fishing",Architectural Services,Arts and Recreation Services,Cafes and Restaurants,Clothing Retailing,Computer System Design and Related Services,...,Retail Trade,Specialist Medical Services,Takeaway Food Services,Technical and Vocational Education and Training,"Transport, Postal and Warehousing",Vacant Land,Vacant Space,Watch and Jewellery Retailing,Wholesale Trade,Womens Clothing Retailing
SAL_CODE21,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20495,77.0,290.0,17.0,12.0,0.0,22.0,33.0,160.0,6.0,8.0,...,94.0,9.0,33.0,4.0,2.0,375.0,375.0,2.0,11.0,10.0
20496,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
20766,16.0,267.0,17.0,21.0,2.0,13.0,59.0,165.0,22.0,47.0,...,149.0,3.0,72.0,10.0,34.0,646.0,646.0,2.0,14.0,17.0
20830,23.0,60.0,5.0,4.0,0.0,5.0,40.0,31.0,0.0,9.0,...,12.0,156.0,3.0,1.0,4.0,140.0,140.0,0.0,2.0,0.0
20929,0.0,7.0,0.0,0.0,1.0,0.0,19.0,4.0,0.0,0.0,...,5.0,1.0,3.0,0.0,1.0,11.0,11.0,0.0,0.0,0.0
21327,1.0,47.0,1.0,6.0,0.0,1.0,69.0,23.0,1.0,5.0,...,23.0,2.0,15.0,1.0,10.0,136.0,136.0,0.0,28.0,0.0
21640,136.0,1669.0,140.0,263.0,4.0,107.0,190.0,940.0,165.0,261.0,...,1147.0,117.0,390.0,170.0,82.0,3220.0,3220.0,114.0,96.0,87.0
21966,20.0,124.0,11.0,29.0,2.0,21.0,47.0,57.0,1.0,14.0,...,70.0,29.0,32.0,9.0,14.0,440.0,440.0,2.0,37.0,0.0
22038,21.0,62.0,3.0,2.0,0.0,1.0,42.0,18.0,0.0,0.0,...,8.0,12.0,18.0,5.0,3.0,90.0,90.0,0.0,1.0,0.0
22107,1.0,17.0,1.0,21.0,1.0,1.0,12.0,16.0,3.0,17.0,...,25.0,1.0,0.0,6.0,28.0,212.0,212.0,0.0,50.0,1.0
