In [8]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from os import listdir
from os.path import isfile, join
import statsmodels.api as sm
import statsmodels.formula.api as smf

sns.set(style="whitegrid", font_scale=1.25)

file_folder = "./root/data/SAL/ACT/"
files = listdir(file_folder)

suburbs = pd.read_csv("./act_suburbs.csv", index_col="code")
suburbs.index = ["SAL" + str(code) for code in suburbs.index]

metadata = pd.read_excel(
    "./root/metadata/Metadata_2021_GCP_DataPack_R1.xlsx",
    sheet_name=1,
    header=10,
    index_col="Sequential"
)

for i, file in enumerate(files):
    key = file.split("_")[1]
    
    file_data = pd.read_csv(
        f"{file_folder}/{file}",
        index_col="SAL_CODE_2021"
    )
    file_data.columns = [metadata[(metadata["DataPackfile"] == key) & (metadata["Short"] == string)]["Long"][0] for string in file_data.columns]

    suburbs = suburbs.join(file_data, lsuffix=f"_v{i}")

suburbs.index.name = "code"
suburbs.to_csv("./raw_suburb_data.csv")

In [33]:
data = suburbs.loc[:, "north":"name"].copy()

data["population"] = suburbs["Total_Persons_Persons"]
data["age_4"] = suburbs["Age_groups_0_4_years_Persons"] / suburbs["Total_Persons_Persons"]
data["age_14"] = data["age_4"] + suburbs["Age_groups_5_14_years_Persons"] / data["population"]
data["age_19"] = data["age_14"] + suburbs["Age_groups_15_19_years_Persons"] / data["population"]
data["age_24"] = data["age_19"] + suburbs["Age_groups_20_24_years_Persons"] / data["population"]
data["age_34"] = data["age_24"] + suburbs["Age_groups_25_34_years_Persons"] / data["population"]
data["age_44"] = data["age_34"] + suburbs["Age_groups_35_44_years_Persons"] / data["population"]
data["age_54"] = data["age_44"] + suburbs["Age_groups_45_54_years_Persons"] / data["population"]
data["age_64"] = data["age_54"] + suburbs["Age_groups_55_64_years_Persons"] / data["population"]
data["age_74"] = data["age_64"] + suburbs["Age_groups_65_74_years_Persons"] / data["population"]
data["age_84"] = data["age_74"] + suburbs["Age_groups_75_84_years_Persons"] / data["population"]
data["indigenous"] = suburbs["Aboriginal_and_or_Torres_Strait_Islander_Persons_Total_Persons"] / data["population"]
data["born_overseas"] = suburbs["Birthplace_Elsewhere_Persons"] / (suburbs["Birthplace_Australia_Persons"] + suburbs["Birthplace_Elsewhere_Persons"])
data["other_language"] = suburbs["Language_used_at_home_Other_Language_Persons"] / (suburbs["Language_used_at_home_Other_Language_Persons"] + suburbs["Language_used_at_home_English_only_Persons"])
data["citizens"] = suburbs["Australian_citizen_Persons"] / data["population"]
data["median_age"] = suburbs["Median_age_of_persons"]
data["median_personal_income"] = suburbs["Median_total_personal_income_weekly"]
data["median_family_income"] = suburbs["Median_total_family_income_weekly"]
data["people_per_bedroom"] = suburbs["Average_number_of_Persons_per_bedroom"]
data["median_household_income"] = suburbs["Median_total_household_income_weekly"]
data["household_size"] = suburbs["Average_household_size"]
data["married"] = suburbs["PERSONS_Total_Married"] / suburbs["PERSONS_Total_Total_v6"]
data["divorced"] = suburbs["PERSONS_Total_Divorced"] / suburbs["PERSONS_Total_Total_v6"]
data["parent_overseas"] = (suburbs["Australian_Both_parents_born_overseas"] + suburbs["Australian_Father_only_born_overseas"] + suburbs["Australian_Mother_only_born_overseas"]) / suburbs["Australian_Total_responses"]
data["afghanistan_born"] = suburbs["PERSONS_Afghanistan_Age_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["bangladesh_born"] = suburbs["PERSONS_Bangladesh_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["bosnia_born"] = suburbs["PERSONS_Bosnia_and_Herzegovina_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["brazil_born"] = suburbs["PERSONS_Brazil_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["cambodia_born"] = suburbs["PERSONS_Cambodia_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["canada_born"] = suburbs["PERSONS_Canada_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["chile_born"] = suburbs["PERSONS_Chile_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["china_born"] = suburbs["PERSONS_China_excludes_SARs_and_Taiwan_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["eygypt_born"] = suburbs["PERSONS_Egypt_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["england_born"] = suburbs["PERSONS_England_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["france_born"] = suburbs["PERSONS_France_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["fiji_born"] = suburbs["PERSONS_Fiji_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["greece_born"] = suburbs["PERSONS_Greece_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["germany_born"] = suburbs["PERSONS_Germany_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["hongkong_born"] = suburbs["PERSONS_Hong_Kong_SAR_of_China_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["india_born"] = suburbs["PERSONS_India_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["indonesia_born"] = suburbs["PERSONS_Indonesia_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["iran_born"] = suburbs["PERSONS_Iran_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["iraq_born"] = suburbs["PERSONS_Iraq_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["ireland_born"] = suburbs["PERSONS_Ireland_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["italy_born"] = suburbs["PERSONS_Italy_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["japan_born"] = suburbs["PERSONS_Japan_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["korea_born"] = suburbs["PERSONS_Korea_Republic_of_South_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["lebanon_born"] = suburbs["PERSONS_Lebanon_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["malaysia_born"] = suburbs["PERSONS_Malaysia_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["malta_born"] = suburbs["PERSONS_Malta_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["mauritius_born"] = suburbs["PERSONS_Mauritius_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["myanmar_born"] = suburbs["PERSONS_Myanmar_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["nepal_born"] = suburbs["PERSONS_Nepal_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["netherlands_born"] = suburbs["PERSONS_Netherlands_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["newzealand_born"] = suburbs["PERSONS_New_Zealand_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["northmacedonia_born"] = suburbs["PERSONS_North_Macedonia_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["pakistan_born"] = suburbs["PERSONS_Pakistan_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["png_born"] = suburbs["PERSONS_Papua_New_Guinea_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["philippines_born"] = suburbs["PERSONS_Philippines_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["poland_born"] = suburbs["PERSONS_Poland_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["samoa_born"] = suburbs["PERSONS_Samoa_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["scotland_born"] = suburbs["PERSONS_Scotland_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["singapore_born"] = suburbs["PERSONS_Singapore_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["southafrica_born"] = suburbs["PERSONS_South_Africa_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["srilanka_born"] = suburbs["PERSONS_Sri_Lanka_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["taiwan_born"] = suburbs["PERSONS_Sri_Lanka_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["thailand_born"] = suburbs["PERSONS_Thailand_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["turkey_born"] = suburbs["PERSONS_Turkey_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["us_born"] = suburbs["PERSONS_United_States_of_America_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["vietnam_born"] = suburbs["PERSONS_Vietnam_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["wales_born"] = suburbs["PERSONS_Wales_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["zimbabwe_born"] = suburbs["PERSONS_Zimbabwe_Total"] / suburbs["PERSONS_Total_Total_v30"]
data["buddhist"] = suburbs["Buddhism_Persons"] / suburbs["Total_Persons_v32"]
data["christian"] = suburbs["Christianity_Total_Persons"] / suburbs["Total_Persons_v32"]
data["hindu"] = suburbs["Hinduism_Persons"] / suburbs["Total_Persons_v32"]
data["islam"] = suburbs["Islam_Persons"] / suburbs["Total_Persons_v32"]
data["jewish"] = suburbs["Judaism_Persons"] / suburbs["Total_Persons_v32"]
data["secular"] = suburbs["Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation_Total_Persons"] / suburbs["Total_Persons_v32"]
data["public_school_primary"] = (suburbs["Primary_Government_Persons"]) / suburbs["Primary_Total_Primary_Persons"]
data["public_school_secondary"] = (suburbs["Secondary_Government_Persons"]) / suburbs["Secondary_Total_Secondary_Persons"]
data["personal_income_nil"] = suburbs["PERSONS_Negative_Nil_income_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_149"] = data["personal_income_nil"] + suburbs["PERSONS_1_149_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_299"] = data["personal_income_149"] + suburbs["PERSONS_150_299_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_399"] = data["personal_income_299"] + suburbs["PERSONS_300_399_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_499"] = data["personal_income_399"] + suburbs["PERSONS_400_499_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_649"] = data["personal_income_499"] + suburbs["PERSONS_500_649_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_799"] = data["personal_income_649"] + suburbs["PERSONS_650_799_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_999"] = data["personal_income_799"] + suburbs["PERSONS_800_999_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_999"] = data["personal_income_799"] + suburbs["PERSONS_800_999_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_1249"] = data["personal_income_999"] + suburbs["PERSONS_1000_1249_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_1499"] = data["personal_income_1249"] + suburbs["PERSONS_1250_1499_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_1749"] = data["personal_income_1499"] + suburbs["PERSONS_1500_1749_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_1999"] = data["personal_income_1749"] + suburbs["PERSONS_1750_1999_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_2999"] = data["personal_income_1999"] + suburbs["PERSONS_2000_2999_more_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_3499"] = data["personal_income_2999"] + suburbs["PERSONS_3000_3499_Total"] / suburbs["PERSONS_Total_Total_v38"]
data["personal_income_3500_plus"] = data["personal_income_3499"] + suburbs["PERSONS_3500_or_more_Total"] / suburbs["PERSONS_Total_Total_v38"]

data

Unnamed: 0_level_0,north,name,population,age_4,age_14,age_19,age_24,age_34,age_44,age_54,...,personal_income_649,personal_income_799,personal_income_999,personal_income_1249,personal_income_1499,personal_income_1749,personal_income_1999,personal_income_2999,personal_income_3499,personal_income_3500_plus
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL80017,1,Acton,2848,0.001756,0.001756,0.510885,0.883076,0.976826,0.994733,0.997191,...,0.738748,0.774262,0.793952,0.807314,0.813291,0.816104,0.821378,0.825246,0.826653,0.827707
SAL80018,1,Ainslie,5376,0.048549,0.166481,0.221726,0.294457,0.406622,0.535714,0.672619,...,0.302388,0.352823,0.401027,0.473555,0.537603,0.602990,0.661906,0.813435,0.852711,0.938407
SAL80019,1,Amaroo,6129,0.060858,0.226138,0.305760,0.370370,0.498287,0.654430,0.827704,...,0.290587,0.339019,0.407033,0.494630,0.572331,0.652137,0.729417,0.889450,0.919773,0.960623
SAL80020,1,Aranda,2605,0.054511,0.196929,0.251440,0.314395,0.418042,0.550480,0.680998,...,0.255847,0.304535,0.357041,0.439141,0.520764,0.590931,0.657757,0.827685,0.868258,0.962291
SAL80021,0,Banks,5100,0.062549,0.212353,0.280588,0.349412,0.497255,0.648627,0.800196,...,0.282657,0.342374,0.418263,0.516546,0.623787,0.726798,0.797462,0.923115,0.944016,0.967405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAL80132,1,Weetangera,2795,0.046512,0.184258,0.260107,0.318068,0.419678,0.547406,0.691950,...,0.281195,0.325571,0.387961,0.463533,0.528998,0.610281,0.682337,0.839631,0.883568,0.970123
SAL80133,0,Weston,4000,0.057000,0.189500,0.241500,0.288500,0.393500,0.535500,0.672250,...,0.274467,0.322322,0.382834,0.460019,0.529793,0.603890,0.666564,0.823094,0.858290,0.916641
SAL80134,1,Whitlam,7,0.571429,0.571429,0.571429,0.571429,0.571429,1.428571,1.428571,...,1.500000,1.500000,1.500000,1.500000,1.500000,1.500000,1.500000,1.500000,1.500000,1.500000
SAL80135,0,Wright,3808,0.073792,0.202994,0.239758,0.313813,0.581408,0.794118,0.895221,...,0.178442,0.226750,0.293789,0.391719,0.498850,0.596122,0.689451,0.861321,0.900099,0.953336


In [34]:
for feature in data.columns[3:]:
    model = smf.logit(formula=f"north ~ {feature} + I({feature}**2)", data=data.dropna())
    results = model.fit(disp=0)
    print("****", feature, "****")
    print(results.llr_pvalue)
    print()

**** age_4 ****
0.01749173454547416

**** age_14 ****
0.02224997340771653

**** age_19 ****
0.09904271060499947

**** age_24 ****
0.0007838316228083787

**** age_34 ****
0.00012589212096362633

**** age_44 ****
0.00017792231483742846

**** age_54 ****
0.00010361645462535013

**** age_64 ****
0.002032668913990906

**** age_74 ****
0.020309294804324307

**** age_84 ****
0.012746777500572284

**** indigenous ****
0.5214123798319187

**** born_overseas ****
0.016279618929696846

**** other_language ****
0.005443085392029421

**** citizens ****
0.002296294085135461

**** median_age ****
0.00018177440987576297

**** median_personal_income ****
0.006648817110800808

**** median_family_income ****
0.01567338223951382

**** people_per_bedroom ****
0.015167960312086169

**** median_household_income ****
0.17694397791791128

**** household_size ****
0.14873540183152778

**** married ****
0.13172693333704089

**** divorced ****
0.002015313402820484

**** parent_overseas ****
0.0830585408414982

**



0.0004933547933388006

**** bangladesh_born ****
1.0

**** bosnia_born ****
0.0011513243211434755

**** brazil_born ****
0.3171725133463517

**** cambodia_born ****
4.78873780102849e-05

**** canada_born ****
1.0

**** chile_born ****
0.03583485643484543

**** china_born ****
1.7147838730391048e-05

**** eygypt_born ****
0.24297991023995533

**** england_born ****
0.013189808662288464

**** france_born ****
0.05326789994860867

**** fiji_born ****
1.0

**** greece_born ****
1.0

**** germany_born ****
1.0

**** hongkong_born ****
1.0

**** india_born ****
0.11212650497575258

**** indonesia_born ****
0.002948206797648485

**** iran_born ****
1.0

**** iraq_born ****
1.0

**** ireland_born ****
1.0





**** italy_born ****
1.0

**** japan_born ****
0.2581105951611878

**** korea_born ****
1.0

**** lebanon_born ****
0.1743387686918411

**** malaysia_born ****
1.0

**** malta_born ****
0.06371836761742546

**** mauritius_born ****
0.15157490996901754

**** myanmar_born ****
0.001822211628911343

**** nepal_born ****
0.2495858879900315

**** netherlands_born ****
1.0

**** newzealand_born ****
0.9858530083112738

**** northmacedonia_born ****
0.9914285031612575

**** pakistan_born ****
0.0012235371494875393

**** png_born ****
1.0

**** philippines_born ****
0.8041755014888003

**** poland_born ****
1.0

**** samoa_born ****
1.0

**** scotland_born ****
1.0

**** singapore_born ****
0.8934821350567558

**** southafrica_born ****
1.0

**** srilanka_born ****
0.8529881792767989

**** taiwan_born ****
0.8529881792767989





**** thailand_born ****
0.2554108836881575

**** turkey_born ****
1.0

**** us_born ****
1.0

**** vietnam_born ****
1.0

**** wales_born ****
0.7138830215587312

**** zimbabwe_born ****
1.0

**** buddhist ****
0.0007948999610683769

**** christian ****
0.00020836010256857068

**** hindu ****
0.15052059265649798

**** islam ****
0.0523581811453852

**** jewish ****
1.0

**** secular ****
0.0052372645664541815

**** public_school_primary ****
0.007384040099295186

**** public_school_secondary ****
0.011538530272473075

**** personal_income_nil ****
0.023487716909584226

**** personal_income_149 ****
0.13041766474842448

**** personal_income_299 ****
0.04451083225249247

**** personal_income_399 ****
0.05927451194426134

**** personal_income_499 ****
0.20528138812648974

**** personal_income_649 ****
0.25614190579774443

**** personal_income_799 ****
0.08827897731775664

**** personal_income_999 ****
0.04724529510896789

**** personal_income_1249 ****
0.033986586177212695

**** personal_

In [35]:
predictions = data.iloc[:, :2]
predictions["prediction"] = results.predict(data)
predictions = predictions.dropna()
for i in predictions.index:
    predictions.at[i, "accurate"] = True if abs(predictions.at[i, "north"] - predictions.at[i, "prediction"]) <= .5 else False

print()
print("sensitivity:", f"{len(predictions[(predictions['prediction'] > .5) & (predictions['north'] == 1)]) / len(predictions[predictions['north'] == 1]):.1%}")
print("specificity:", f"{len(predictions[(predictions['prediction'] <= .5) & (predictions['north'] == 0)]) / len(predictions[predictions['north'] == 0]):.1%}")
print("accuracy:", f"{len(predictions[predictions['accurate']]) / len(predictions):.1%}")



sensitivity: 88.1%
specificity: 14.0%
accuracy: 51.7%
