In [99]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from os import listdir
from os.path import isfile, join
import statsmodels.api as sm
import statsmodels.formula.api as smf

sns.set(style="whitegrid", font_scale=1.25)

file_folder = "./root/data/SAL/ACT/"
# listdir() returns entries in arbitrary order and can include entries that are
# not data files. Keep only CSV files and sort them so that the `_v{i}` suffixes
# assigned in the join below are reproducible from run to run.
files = sorted(
    name for name in listdir(file_folder)
    if name.lower().endswith(".csv") and isfile(join(file_folder, name))
)

suburbs = pd.read_csv("./act_suburbs.csv", index_col="code")
# Prefix every code with "SAL"; the data files are joined onto this index via
# their SAL_CODE_2021 column below.
suburbs.index = ["SAL" + str(code) for code in suburbs.index]

# Second sheet of the workbook, with the column names taken from row index 10.
metadata = pd.read_excel(
    "./root/metadata/Metadata_2021_GCP_DataPack_R1.xlsx",
    sheet_name=1,
    header=10,
    index_col="Sequential"
)

for i, file in enumerate(files):
    # The second underscore-separated token of the file name is matched against
    # the metadata's "DataPackfile" column.
    key = file.split("_")[1]

    file_data = pd.read_csv(
        join(file_folder, file),
        index_col="SAL_CODE_2021"
    )

    # Build the Short -> Long column-name map once per file instead of filtering
    # the whole metadata frame for every column. drop_duplicates keeps the first
    # occurrence, i.e. the first matching metadata row wins.
    table_metadata = metadata[metadata["DataPackfile"] == key].drop_duplicates("Short")
    short_to_long = dict(zip(table_metadata["Short"], table_metadata["Long"]))

    unmapped = [column for column in file_data.columns if column not in short_to_long]
    if unmapped:
        raise KeyError(f"No metadata entry for columns {unmapped} of {file} (DataPackfile {key})")
    file_data.columns = [short_to_long[column] for column in file_data.columns]

    # On a name clash the columns already in `suburbs` receive the `_v{i}` suffix
    # and the newly joined file keeps the plain name.
    suburbs = suburbs.join(file_data, lsuffix=f"_v{i}")

suburbs.index.name = "code"
suburbs.to_csv("./raw_suburb_data.csv")

In [100]:
# Start from the identifying columns ("north" through "name") and append the
# derived features one by one; the column order matters to later cells, which
# select features by position.
data = suburbs.loc[:, "north":"name"].copy()

population = suburbs["Total_Persons_Persons"]
data["population"] = population

# Cumulative age shares: age_N is the running sum of the population share of
# every age band up to and including the band that ends at N years.
age_bands = [
    ("age_4", "Age_groups_0_4_years_Persons"),
    ("age_14", "Age_groups_5_14_years_Persons"),
    ("age_19", "Age_groups_15_19_years_Persons"),
    ("age_24", "Age_groups_20_24_years_Persons"),
    ("age_34", "Age_groups_25_34_years_Persons"),
    ("age_44", "Age_groups_35_44_years_Persons"),
    ("age_54", "Age_groups_45_54_years_Persons"),
    ("age_64", "Age_groups_55_64_years_Persons"),
    ("age_74", "Age_groups_65_74_years_Persons"),
    ("age_84", "Age_groups_75_84_years_Persons"),
]
cumulative_share = None
for feature_name, census_column in age_bands:
    band_share = suburbs[census_column] / population
    if cumulative_share is None:
        cumulative_share = band_share
    else:
        cumulative_share = cumulative_share + band_share
    data[feature_name] = cumulative_share

# Shares of the population, or of the people who gave the relevant response.
data["indigenous"] = (
    suburbs["Aboriginal_and_or_Torres_Strait_Islander_Persons_Total_Persons"] / population
)

born_elsewhere = suburbs["Birthplace_Elsewhere_Persons"]
data["born_overseas"] = born_elsewhere / (
    suburbs["Birthplace_Australia_Persons"] + born_elsewhere
)

speaks_other_language = suburbs["Language_used_at_home_Other_Language_Persons"]
data["other_language"] = speaks_other_language / (
    speaks_other_language + suburbs["Language_used_at_home_English_only_Persons"]
)

data["citizens"] = suburbs["Australian_citizen_Persons"] / population

# Columns that are copied across unchanged under a shorter name.
copied_columns = {
    "median_age": "Median_age_of_persons",
    "median_personal_income": "Median_total_personal_income_weekly",
    "median_family_income": "Median_total_family_income_weekly",
    "people_per_bedroom": "Average_number_of_Persons_per_bedroom",
    "median_household_income": "Median_total_household_income_weekly",
    "household_size": "Average_household_size",
}
for feature_name, census_column in copied_columns.items():
    data[feature_name] = suburbs[census_column]

marital_total = suburbs["PERSONS_Total_Total"]
data["married"] = suburbs["PERSONS_Total_Married"] / marital_total
data["divorced"] = suburbs["PERSONS_Total_Divorced"] / marital_total

# At least one parent born overseas, as a share of all responses.
any_parent_overseas = (
    suburbs["Australian_Both_parents_born_overseas"]
    + suburbs["Australian_Father_only_born_overseas"]
    + suburbs["Australian_Mother_only_born_overseas"]
)
data["parent_overseas"] = any_parent_overseas / suburbs["Australian_Total_responses"]

data

Unnamed: 0_level_0,north,name,population,age_4,age_14,age_19,age_24,age_34,age_44,age_54,...,citizens,median_age,median_personal_income,median_family_income,people_per_bedroom,median_household_income,household_size,married,divorced,parent_overseas
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL80017,1,Acton,2848,0.001756,0.001756,0.510885,0.883076,0.976826,0.994733,0.997191,...,0.687851,19,257,0,1.0,0,1.0,0.234694,0.025510,0.379348
SAL80018,1,Ainslie,5376,0.048549,0.166481,0.221726,0.294457,0.406622,0.535714,0.672619,...,0.915551,42,1236,3578,0.8,2434,2.5,0.364002,0.085363,0.287307
SAL80019,1,Amaroo,6129,0.060858,0.226138,0.305760,0.370370,0.498287,0.654430,0.827704,...,0.895252,35,1207,3089,0.8,2769,2.9,0.437733,0.057360,0.255382
SAL80020,1,Aranda,2605,0.054511,0.196929,0.251440,0.314395,0.418042,0.550480,0.680998,...,0.888292,42,1372,3533,0.8,3161,2.8,0.465343,0.057154,0.309245
SAL80021,0,Banks,5100,0.062549,0.212353,0.280588,0.349412,0.497255,0.648627,0.800196,...,0.934118,35,1168,2741,0.8,2419,2.8,0.392169,0.073240,0.241791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAL80132,1,Weetangera,2795,0.046512,0.184258,0.260107,0.318068,0.419678,0.547406,0.691950,...,0.901968,41,1332,3730,0.8,3349,2.9,0.493882,0.041898,0.211702
SAL80133,0,Weston,4000,0.057000,0.189500,0.241500,0.288500,0.393500,0.535500,0.672250,...,0.885000,42,1242,3082,0.8,2361,2.5,0.442865,0.089940,0.245311
SAL80134,1,Whitlam,7,0.571429,0.571429,0.571429,0.571429,0.571429,1.428571,1.428571,...,0.714286,27,75,900,1.5,900,6.0,0.714286,0.000000,
SAL80135,0,Wright,3808,0.073792,0.202994,0.239758,0.313813,0.581408,0.794118,0.895221,...,0.830357,32,1446,3150,1.0,2559,2.5,0.394312,0.051955,0.298551


In [128]:
# Fit one logistic regression per feature (linear + squared term) predicting
# `north`, and print the likelihood-ratio p-value of each model.

# Rows with a missing value in any column are dropped once, up front, so every
# model is fitted on the same rows (this was recomputed on each iteration).
complete_cases = data.dropna()

# Every fitted result is kept, keyed by feature name, so a later cell can pick a
# specific model explicitly instead of relying on whichever fit ran last.
fitted_models = {}

# The first three columns (north, name, population) are not used as features.
for feature in data.columns[3:]:
    model = smf.logit(formula=f"north ~ {feature} + I({feature}**2)", data=complete_cases)
    results = model.fit()
    fitted_models[feature] = results
    print()
    print("****", feature, "****")
    print(results.llr_pvalue)
    print()

# NOTE: after the loop, `model` and `results` still refer to the fit for the
# last feature in `data.columns`.

Optimization terminated successfully.
         Current function value: 0.664380
         Iterations 7

**** age_4 ****
0.0405962086899047

Optimization terminated successfully.
         Current function value: 0.673543
         Iterations 5

**** age_14 ****
0.11328429202643299

Optimization terminated successfully.
         Current function value: 0.673849
         Iterations 6

**** age_19 ****
0.11724175196704407

Optimization terminated successfully.
         Current function value: 0.612091
         Iterations 9

**** age_24 ****
0.00011616954873924895

Optimization terminated successfully.
         Current function value: 0.608480
         Iterations 7

**** age_34 ****
7.752482337617909e-05

Optimization terminated successfully.
         Current function value: 0.620876
         Iterations 6

**** age_44 ****
0.0003107412504987613

Optimization terminated successfully.
         Current function value: 0.622031
         Iterations 6

**** age_54 ****
0.0003536814209315125

Optimi

In [104]:
# Classification metrics for the fitted model held in `results`.
# NOTE(review): `results` is not defined in this cell; it is whatever model an
# earlier cell fitted last. Confirm that this is the model meant to be evaluated.

# A suburb is classified as north when its predicted probability exceeds this.
decision_threshold = .5

# Copy the slice so that adding columns does not write into a view of `data`.
predictions = data.iloc[:, :2].copy()
predictions["prediction"] = results.predict(data)
predictions = predictions.dropna()

is_north = predictions["north"] == 1
is_south = predictions["north"] == 0
predicted_north = predictions["prediction"] > decision_threshold

# Accuracy is derived from the same decision rule as sensitivity and specificity.
# The previous |north - prediction| <= .5 test counted a prediction of exactly
# 0.5 as correct for both classes, which disagreed with the two rates below.
predictions["accurate"] = (predicted_north & is_north) | (~predicted_north & is_south)

# Each rate is the mean of a boolean Series; it is NaN when the class is absent.
sensitivity = predicted_north[is_north].mean()
specificity = (~predicted_north)[is_south].mean()
accuracy = predictions["accurate"].mean()

print()
print("sensitivity:", f"{sensitivity:.1%}")
print("specificity:", f"{specificity:.1%}")
print("accuracy:", f"{accuracy:.1%}")



sensitivity: 59.6%
specificity: 57.1%
accuracy: 58.4%
