In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import csv

raw_data = "../data/ABS/"
curated_data = "../data/curated/"
meta_data = "../data/meta/"

abs = gpd.read_file("../data/ABS/SA2 shapefile/SA2_2021_AUST_GDA2020.shp")[["SA2_CODE21", "geometry"]]
abs = abs[abs["geometry"] != None]
post = gpd.read_file("../data/ABS/post_shapefile/POA_2021_AUST_GDA2020.shp")[["POA_CODE21", "geometry"]]

In [2]:
gdf = gpd.overlay(abs, post, how="intersection", keep_geom_type=False)

In [3]:
gdf['area'] = gdf.apply(lambda row: row.geometry.area,axis=1)
gdf = gdf[["SA2_CODE21", "POA_CODE21", "area"]]

In [4]:
with open(meta_data + "pobox.csv") as f:
    pobox = list(csv.reader(f))[1:]
pobox = {i[0]: i[1:] for i in pobox}

In [5]:
sa2_poa = gdf
for i in pobox:
    cur = gdf[gdf["POA_CODE21"].isin(pobox[i])]
    cur = cur.assign(POA_CODE21=i)
    sa2_poa = pd.concat([sa2_poa, cur])
sa2_poa.reset_index(inplace=True, drop=True)

In [6]:
sa2_poa["area%"] = sa2_poa["area"] / sa2_poa.groupby("POA_CODE21")["area"].transform("sum")
sa2_poa["SA2_CODE21"] = sa2_poa["SA2_CODE21"].astype(str)
sa2_poa["POA_CODE21"] = sa2_poa["POA_CODE21"].astype(str)

<h2>1. Clean external data</h2>

<h3>1.1 Clean "annual income"</h3>

In [7]:
tbl_income = pd.read_excel(raw_data+"annual income by SA2 regions.xlsx")
tbl_population = pd.read_excel(raw_data+"population estimates by SA2 regions.xlsx")

In [8]:
tbl_income.drop(np.arange(0,8), inplace=True)
tbl_income.drop(np.arange(2303,2306), inplace=True)

tbl_income.drop(tbl_income.columns[[2,3,4,5,7,8,9,10,12,13,14,15,17,18,19,20,22,23,24,25,27]], axis=1, inplace=True)
tbl_income = tbl_income.reset_index(drop=True)
tbl_income.columns = ['sa2', 'sa2_name', 'total_earners', 'median_age', 'income_sum', 'income_median', 'income_mean']
tbl_income["sa2"] = tbl_income["sa2"].astype(str)
tbl_income

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149
1,101021008,Karabar,5100,42,338308979,61777,66335
2,101021009,Queanbeyan,6697,39,441160946,60119,65874
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860
4,101021011,Queanbeyan Region,12821,44,1050285692,71420,81919
...,...,...,...,...,...,...,...
2290,801101137,Molonglo,np,np,np,np,np
2291,801101138,Molonglo - North,np,np,np,np,np
2292,801101139,Wright,2121,35,182421537,79150,86007
2293,801111140,ACT - South West,355,40,26069449,64227,73435


In [9]:
tbl_income[tbl_income["sa2"] == "101021007"]

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149


In [10]:
stats = tbl_income.columns[2:]
to_do = []
for i in stats:
    to_do += list(tbl_income[tbl_income[i] == "np"]["sa2"])
to_do = set(to_do)
abs["neighbours"] = None
for i, sa2 in abs.iterrows():
    if sa2.SA2_CODE21 in to_do:
        neighbours = abs[~abs.geometry.disjoint(sa2.geometry)].SA2_CODE21.tolist()
        neighbours = [ name for name in neighbours if sa2.SA2_CODE21 != name ]
        abs.at[i, "neighbours"] = neighbours

In [11]:
neighbours = abs.set_index("SA2_CODE21")["neighbours"].to_dict()
neighbours = {i: neighbours[i] for i in neighbours if neighbours[i]}
tbl_income_clean = tbl_income.merge(abs[["SA2_CODE21", "neighbours"]], left_on="sa2", right_on="SA2_CODE21")

while neighbours:
    for i in neighbours:
        try:
            cur = {}
            for j in stats:
                cur[j] = tbl_income_clean[tbl_income_clean["sa2"].isin(neighbours[i])][j].mean()
            neighbours[i] = cur
        except:
            pass
    cur = {i: neighbours[i] for i in neighbours if type(neighbours[i]) == dict}
    neighbours = {i: neighbours[i] for i in neighbours if i not in cur}
    for i, data in tbl_income_clean.iterrows():
        for j in stats:
            if data[j] == "np" and data.sa2 in cur:
                tbl_income_clean.at[i,j] = cur[data.sa2][j]
    for i in neighbours:
        neighbours[i] = [j for j in neighbours[i] if j not in neighbours]
tbl_income_clean = tbl_income_clean.drop(["SA2_CODE21", "neighbours"], axis=1)
tbl_income_clean

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149
1,101021008,Karabar,5100,42,338308979,61777,66335
2,101021009,Queanbeyan,6697,39,441160946,60119,65874
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860
4,101021012,Queanbeyan West - Jerrabomberra,8476,44,725602722,73851,85607
...,...,...,...,...,...,...,...
2142,801101136,Denman Prospect,416,38,41056005,90337,98692
2143,801101137,Molonglo,1602.333333,36.0,134437238.666667,81222.0,87973.666667
2144,801101139,Wright,2121,35,182421537,79150,86007
2145,801111140,ACT - South West,355,40,26069449,64227,73435


In [12]:
to_do1 = []
for i in tbl_income_clean.columns[2:]:
    to_do1 += list(tbl_income_clean[tbl_income_clean[i] == "np"]["sa2"])
to_do1 = set(to_do1)
to_do1 # this is Torrens Island, a small island home to 2 electric power plants so safe to ignore

{'404021103'}

In [25]:
tbl_income_clean = tbl_income_clean[tbl_income_clean["sa2"] != "404021103"]
tbl_income_clean["income_mean"] = tbl_income_clean["income_sum"].astype(float) / tbl_income_clean["total_earners"].astype(float)

In [14]:
tbl_income_clean.to_csv(curated_data+"clean_annual.csv")

<h3>1.2 Clean "population"</h3>

In [15]:
tbl_population.drop(tbl_population.columns[0:8], axis=1, inplace=True)
tbl_population.drop(tbl_population.columns[1:22], axis=1, inplace=True)
tbl_population.drop(tbl_population.columns[2:6], axis=1, inplace=True)

In [16]:
tbl_population.columns = ["sa2", "2021_population", "km2", "persons/km2"]
tbl_population = tbl_population.dropna(axis=0, how='any', thresh=None, subset=["sa2"], inplace=False)
tbl_population['sa2'] = tbl_population['sa2'].astype(int).astype(str)
tbl_population

Unnamed: 0,sa2,2021_population,km2,persons/km2
0,101021007,4330.0,3418.4,1.3
1,101021008,8546.0,7.0,1223.9
2,101021009,11370.0,4.8,2387.7
3,101021010,5093.0,13.0,391.7
4,101021012,12743.0,13.7,931.9
...,...,...,...,...
2449,801111141,67.0,1202.8,0.1
2450,901011001,1716.0,136.1,12.6
2451,901021002,602.0,13.7,43.9
2452,901031003,310.0,67.2,4.6


In [17]:
tbl_population.to_csv(curated_data+"clean_population.csv")

<h2>2. Join external data</h2>

In [18]:
merge_income_population = tbl_income_clean.merge(tbl_population, on='sa2', how='inner')
merge_income_population

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,101021007,Braidwood,2361,51,120763285,41593,51149.210080,4330.0,3418.4,1.3
1,101021008,Karabar,5100,42,338308979,61777,66335.093922,8546.0,7.0,1223.9
2,101021009,Queanbeyan,6697,39,441160946,60119,65874.413319,11370.0,4.8,2387.7
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860.199234,5093.0,13.0,391.7
4,101021012,Queanbeyan West - Jerrabomberra,8476,44,725602722,73851,85606.739264,12743.0,13.7,931.9
...,...,...,...,...,...,...,...,...,...,...
2141,801101136,Denman Prospect,416,38,41056005,90337,98692.319712,2719.0,4.7,573.2
2142,801101137,Molonglo,1602.333333,36.0,134437238.666667,81222.0,83900.918660,0.0,2.2,0.0
2143,801101139,Wright,2121,35,182421537,79150,86007.325318,3806.0,1.3,2993.1
2144,801111140,ACT - South West,355,40,26069449,64227,73435.067606,554.0,416.8,1.3


In [19]:
merge_post_sa2 = sa2_poa.merge(merge_income_population, left_on="SA2_CODE21", right_on="sa2", how="inner")
stats = merge_post_sa2.columns[6:]
for i in stats:
    if ((i != "median_age") & (i != "income_median") & (i != "income_mean") & (i != "persons/km2")):
        merge_post_sa2[i] = merge_post_sa2["area%"] * merge_post_sa2[i].astype(float)
merge_post_sa2

Unnamed: 0,SA2_CODE21,POA_CODE21,area,area%,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,101021007,2536,2.436836e-11,2.511085e-10,101021007,Braidwood,5.928672e-07,51,3.032469e-02,41593,51149.210080,1.087300e-06,8.583894e-07,1.3
1,101021007,2537,3.274170e-11,2.809841e-10,101021007,Braidwood,6.634034e-07,51,3.393256e-02,41593,51149.210080,1.216661e-06,9.605159e-07,1.3
2,101021007,2545,1.716204e-11,1.956121e-10,101021007,Braidwood,4.618402e-07,51,2.362276e-02,41593,51149.210080,8.470005e-07,6.686805e-07,1.3
3,101021007,2580,5.258742e-02,9.695094e-02,101021007,Braidwood,2.289012e+02,51,1.170811e+07,41593,51149.210080,4.197976e+02,3.314171e+02,1.3
4,101021007,2621,2.399097e-11,2.832682e-10,101021007,Braidwood,6.687961e-07,51,3.420839e-02,41593,51149.210080,1.226551e-06,9.683239e-07,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13239,801071079,2903,1.270430e-12,1.974653e-09,801071079,Gowrie (ACT),4.046063e-06,44,2.949746e-01,65283,72904.091264,6.172764e-06,3.751840e-09,1618.3
13240,801071081,2904,4.374485e-12,4.520853e-09,801071081,Isabella Plains,1.269004e-05,41,8.446155e-01,62676,66557.383327,1.954817e-05,1.130213e-08,1733.1
13241,801071081,2905,2.476339e-04,1.259155e-01,801071081,Isabella Plains,3.534448e+02,41,2.352436e+07,62676,66557.383327,5.444586e+02,3.147887e-01,1733.1
13242,801071087,2904,1.260680e-12,1.302861e-09,801071087,Richardson,2.519734e-06,41,1.627368e-01,60262,64584.901241,3.969818e-06,2.866295e-09,1362.0


In [20]:
post_sa2_agg = merge_post_sa2.groupby("POA_CODE21").agg({"total_earners": "sum", "median_age": "mean", "income_sum": "sum", "2021_population": "sum", "km2": "sum"}).reset_index()
post_sa2_agg["income_mean"] = post_sa2_agg["income_sum"] / post_sa2_agg["total_earners"]
post_sa2_agg["persons/km2"] = post_sa2_agg["2021_population"] / post_sa2_agg["km2"]
post_sa2_agg.rename({"POA_CODE21": "postcode"}, axis=1, inplace=True)
post_sa2_agg

Unnamed: 0,postcode,total_earners,median_age,income_sum,2021_population,km2,income_mean,persons/km2
0,0800,5631.999974,36.750000,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494
1,0804,1313.062168,39.285714,1.045872e+08,1836.446110,9.143080,79651.358377,200.856395
2,0810,2247.557682,39.761905,1.709450e+08,3872.057394,3.353824,76058.127407,1154.520024
3,0811,2247.557682,39.761905,1.709450e+08,3872.057394,3.353824,76058.127407,1154.520024
4,0812,2177.164726,39.761905,1.583888e+08,1453.078809,16.957135,72750.040663,85.691292
...,...,...,...,...,...,...,...,...
2834,8004,190.248783,37.750000,1.399039e+07,250.462574,0.037371,73537.336914,6702.079900
2835,8008,4460.774965,39.500000,5.572979e+08,5963.800769,2.073457,124932.967832,2876.259511
2836,8009,3770.000000,54.000000,3.098132e+08,6021.000000,287.400000,82178.560477,20.949896
2837,8012,8745.157691,36.500000,6.251141e+08,12376.663636,2.804492,71481.165677,4413.157058


In [21]:
merchant_consumer_info = pd.read_parquet(curated_data+"merchant_consumer_info")
merchant_consumer_info["consumer_postcode"] = merchant_consumer_info["consumer_postcode"].str.pad(width=4, fillchar="0", side="left")

In [22]:
final_table = post_sa2_agg.merge(merchant_consumer_info, left_on='postcode', right_on='consumer_postcode', how='inner')
# len(final_table[final_table["income_sum"].isnull()]["consumer_postcode"].unique())
print(f"Mismatched (null) transactions dropped: {merchant_consumer_info.shape[0] - final_table.shape[0]}")

Mismatched (null) transactions dropped: 1385911


In [23]:
final_table

Unnamed: 0,postcode,total_earners,median_age,income_sum,2021_population,km2,income_mean,persons/km2,merchant_name,revenue_level,...,dollar_value,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender,fraud_group
0,0800,5631.999974,36.750,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494,Elit Sed Consequat Associates,a,...,375.167732,2022,4,27,Kelly Clayton,4211 Rodney Tunnel Suite 525,NT,0800,Female,0
1,0800,5631.999974,36.750,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494,Elit Sed Consequat Associates,a,...,617.879131,2022,10,7,Corey Estrada,1703 Boyd Shore,NT,0800,Male,0
2,0800,5631.999974,36.750,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494,Elit Sed Consequat Associates,a,...,766.507707,2022,2,28,Nicole Bishop,467 Robert Islands Apt. 834,NT,0800,Female,0
3,0800,5631.999974,36.750,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494,Malesuada Vel Ltd,b,...,258.865959,2021,7,24,Natalie Herrera,88798 Saunders Hills Apt. 945,NT,0800,Female,0
4,0800,5631.999974,36.750,4.206090e+08,7678.999968,3.200000,74682.001299,2399.687494,Varius Orci Institute,a,...,4.159038,2021,8,8,Shannon Mann,00817 Owens Circle,NT,0800,Female,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11920448,9726,2986.497456,44.375,2.162287e+08,5009.421490,3.582657,72402.110175,1398.242174,Purus Gravida Sagittis Ltd,a,...,270.148399,2022,2,1,Kenneth Powers,04022 Leah Run,QLD,9726,Male,0
11920449,9726,2986.497456,44.375,2.162287e+08,5009.421490,3.582657,72402.110175,1398.242174,Amet Diam Corporation,b,...,4.662204,2021,8,19,Stanley Franklin,0759 Pamela Mall Suite 325,QLD,9726,Male,0
11920450,9726,2986.497456,44.375,2.162287e+08,5009.421490,3.582657,72402.110175,1398.242174,Lorem Foundation,a,...,24.106644,2022,4,14,Elizabeth Lewis,1271 Cheryl Meadow,QLD,9726,Female,0
11920451,9726,2986.497456,44.375,2.162287e+08,5009.421490,3.582657,72402.110175,1398.242174,Purus Gravida Sagittis Ltd,a,...,18.715692,2022,5,15,Elizabeth Lewis,1271 Cheryl Meadow,QLD,9726,Female,0


In [24]:
final_table.to_parquet("../data/curated/merchant_consumer_abs")