In [174]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from os import listdir
from os.path import isfile, join
import statsmodels.api as sm
import statsmodels.formula.api as smf

sns.set(style="whitegrid", font_scale=1.25)

# Folder holding the DataPack CSV files that are joined onto the suburb list.
file_folder = "./root/data/SAL/ACT/"

# os.listdir returns entries in arbitrary order and may include entries that
# are not CSV files. Keep only regular .csv files and sort them so that the
# `_v{i}` suffixes and the joined column order are the same on every run.
files = sorted(
    entry
    for entry in listdir(file_folder)
    if entry.lower().endswith(".csv") and isfile(join(file_folder, entry))
)

suburbs = pd.read_csv("./act_suburbs.csv", index_col="code")
# Prefix the codes with "SAL"; presumably this makes them line up with the
# SAL_CODE_2021 index of the data files joined below (the join is index-based).
suburbs.index = ["SAL" + str(code) for code in suburbs.index]

# Lookup table used below to translate "Short" column names into "Long" ones.
metadata = pd.read_excel(
    "./root/metadata/Metadata_2021_GCP_DataPack_R1.xlsx",
    sheet_name=1,
    header=10,
    index_col="Sequential"
)

for i, file in enumerate(files):
    # The second underscore-separated token of the file name is the value
    # matched against the metadata "DataPackfile" column.
    key = file.split("_")[1]

    file_data = pd.read_csv(
        join(file_folder, file),
        index_col="SAL_CODE_2021"
    )

    # Build the Short -> Long mapping once per file instead of filtering the
    # metadata frame for every column. Keeping the first match mirrors taking
    # the first row of the filtered frame, and a plain dict lookup does not
    # depend on positional indexing of a Series that has a non-default index.
    table_meta = metadata[metadata["DataPackfile"] == key].drop_duplicates(
        subset="Short", keep="first"
    )
    short_to_long = dict(zip(table_meta["Short"], table_meta["Long"]))
    # A column without a metadata entry raises KeyError naming that column.
    file_data.columns = [short_to_long[short] for short in file_data.columns]

    # Columns already present in `suburbs` that clash with the new table get
    # the `_v{i}` suffix; the incoming table keeps the plain name.
    suburbs = suburbs.join(file_data, lsuffix=f"_v{i}")

suburbs.index.name = "code"
suburbs.to_csv("./raw_suburb_data.csv")

# Working frame for the analysis: the label-based column slice from "north"
# through "name", copied so later column additions do not touch `suburbs`.
data = suburbs.loc[:, "north":"name"].copy()

In [175]:
# Person count per suburb; also the denominator for every age share below.
total_persons = suburbs["Total_Persons_Persons"]
data["population"] = total_persons

# (lower, upper) age bounds of each source column, youngest band first.
age_bands = [
    (0, 4), (5, 14), (15, 19), (20, 24), (25, 34),
    (35, 44), (45, 54), (55, 64), (65, 74), (75, 84),
]

# age_<upper> holds the running total of band shares up to and including
# that band, accumulated one band at a time from youngest to oldest.
cumulative_share = None
for lower, upper in age_bands:
    band_share = suburbs[f"Age_groups_{lower}_{upper}_years_Persons"] / total_persons
    if cumulative_share is None:
        cumulative_share = band_share
    else:
        cumulative_share = cumulative_share + band_share
    data[f"age_{upper}"] = cumulative_share

data

Unnamed: 0_level_0,north,name,population,age_4,age_14,age_19,age_24,age_34,age_44,age_54,age_64,age_74,age_84
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
SAL80017,1,Acton,2848,0.001756,0.001756,0.510885,0.883076,0.976826,0.994733,0.997191,0.997191,0.997191,0.997191
SAL80018,1,Ainslie,5376,0.048549,0.166481,0.221726,0.294457,0.406622,0.535714,0.672619,0.800967,0.896763,0.954613
SAL80019,1,Amaroo,6129,0.060858,0.226138,0.305760,0.370370,0.498287,0.654430,0.827704,0.919400,0.970958,0.993637
SAL80020,1,Aranda,2605,0.054511,0.196929,0.251440,0.314395,0.418042,0.550480,0.680998,0.811132,0.903647,0.979271
SAL80021,0,Banks,5100,0.062549,0.212353,0.280588,0.349412,0.497255,0.648627,0.800196,0.916078,0.969216,0.994314
...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAL80132,1,Weetangera,2795,0.046512,0.184258,0.260107,0.318068,0.419678,0.547406,0.691950,0.796064,0.894812,0.973166
SAL80133,0,Weston,4000,0.057000,0.189500,0.241500,0.288500,0.393500,0.535500,0.672250,0.765250,0.865750,0.957500
SAL80134,1,Whitlam,7,0.571429,0.571429,0.571429,0.571429,0.571429,1.428571,1.428571,1.428571,1.428571,1.428571
SAL80135,0,Wright,3808,0.073792,0.202994,0.239758,0.313813,0.581408,0.794118,0.895221,0.957458,0.992122,1.001838


In [176]:
# Predictors are every column from position 3 onward, i.e. everything after
# north, name and population in the frame displayed above.
predictor_columns = data.columns[3:]
model_formula = "north ~ " + " + ".join(predictor_columns)

# Logistic regression of the `north` indicator on those predictors.
log_reg = smf.logit(formula=model_formula, data=data).fit()

log_reg.summary()


Optimization terminated successfully.
         Current function value: 0.453844
         Iterations 9


0,1,2,3
Dep. Variable:,north,No. Observations:,116.0
Model:,Logit,Df Residuals:,105.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 25 Jul 2022",Pseudo R-squ.:,0.3451
Time:,21:30:51,Log-Likelihood:,-52.646
converged:,True,LL-Null:,-80.388
Covariance Type:,nonrobust,LLR p-value:,2.564e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-6.0071,12.304,-0.488,0.625,-30.123,18.109
age_4,-9.6008,18.664,-0.514,0.607,-46.182,26.980
age_14,12.1235,41.915,0.289,0.772,-70.029,94.276
age_19,-89.8755,45.798,-1.962,0.050,-179.638,-0.113
age_24,113.8598,33.626,3.386,0.001,47.954,179.766
age_34,-15.3103,9.763,-1.568,0.117,-34.445,3.824
age_44,-2.7316,19.479,-0.140,0.888,-40.910,35.447
age_54,52.8845,27.841,1.900,0.057,-1.683,107.452
age_64,-74.0382,27.703,-2.673,0.008,-128.335,-19.741


In [184]:
# Take an explicit copy of the first two columns so that adding columns below
# writes to an independent frame rather than to a slice of `data`.
predictions = data.iloc[:, :2].copy()
predictions["prediction"] = log_reg.predict(data)
# Drop rows that have a missing value in any column, including rows for
# which the model produced no prediction.
predictions = predictions.dropna()

# A row is accurate when the predicted probability lies within 0.5 of the
# observed value of `north`. Computed for the whole column at once instead of
# row by row, which also yields a proper boolean column.
predictions["accurate"] = (predictions["north"] - predictions["prediction"]).abs() <= .5

# This is the share of all scored rows that were classified correctly, which
# is overall accuracy; sensitivity would only consider rows where north == 1.
print("accuracy:", f"{predictions['accurate'].mean():.1%}")


sensitivity: 81.0%


In [165]:
# Scratch check that abs() of a negative float returns its magnitude; the
# result is only displayed and is not used by any other cell shown here.
abs(-0.3)

0.3