In [1]:
import geopandas as gpd

abs = gpd.read_file("../data/ABS/SA2 shapefile/SA2_2021_AUST_GDA2020.shp")[["SA2_CODE21", "geometry"]]
abs = abs[abs["geometry"] != None]
post = gpd.read_file("../data/ABS/post_shapefile/POA_2021_AUST_GDA2020.shp")[["POA_CODE21", "geometry"]]

In [2]:
gdf = gpd.overlay(abs, post, how="intersection", keep_geom_type=False)

In [3]:
gdf['area'] = gdf.apply(lambda row: row.geometry.area,axis=1)
gdf = gdf[["SA2_CODE21", "POA_CODE21", "area"]]

In [4]:
gdf["area%"] = gdf["area"] / gdf.groupby("POA_CODE21")["area"].transform("sum")
gdf["SA2_CODE21"] = gdf["SA2_CODE21"].astype(str)
gdf["POA_CODE21"] = gdf["POA_CODE21"].astype(str)

In [37]:
gdf[gdf["POA_CODE21"] == "2536"]

Unnamed: 0,SA2_CODE21,POA_CODE21,area,area%
0,101021007,2536,2.436836e-11,2.511085e-10
1,101041017,2536,0.006344484,0.06537797
2,101041018,2536,0.003049899,0.03142828
3,101041021,2536,1.547158e-11,1.5943e-10
4,101041024,2536,0.08764876,0.9031937
5,101041026,2536,5.054865e-12,5.208884e-11
6,114011275,2536,1.983351e-11,2.043783e-10
7,114011283,2536,2.975013e-11,3.06566e-10


In [5]:
import pandas as pd
import numpy as np

raw_data = "../data/ABS/"
curated_data = "../data/curated/"

tbl_income = pd.read_excel(raw_data+"annual income by SA2 regions.xlsx")
tbl_population = pd.read_excel(raw_data+"population estimates by SA2 regions.xlsx")

<h2>1. Clean external data</h2>

<h3>1.1 Clean "annual income"</h3>

In [6]:
tbl_income.drop(np.arange(0,8), inplace=True)
tbl_income.drop(np.arange(2303,2306), inplace=True)

tbl_income.drop(tbl_income.columns[[2,3,4,5,7,8,9,10,12,13,14,15,17,18,19,20,22,23,24,25,27]], axis=1, inplace=True)
tbl_income = tbl_income.reset_index(drop=True)
tbl_income.columns = ['sa2', 'sa2_name', 'total_earners', 'median_age', 'income_sum', 'income_median', 'income_mean']
tbl_income["sa2"] = tbl_income["sa2"].astype(str)
tbl_income

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149
1,101021008,Karabar,5100,42,338308979,61777,66335
2,101021009,Queanbeyan,6697,39,441160946,60119,65874
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860
4,101021011,Queanbeyan Region,12821,44,1050285692,71420,81919
...,...,...,...,...,...,...,...
2290,801101137,Molonglo,np,np,np,np,np
2291,801101138,Molonglo - North,np,np,np,np,np
2292,801101139,Wright,2121,35,182421537,79150,86007
2293,801111140,ACT - South West,355,40,26069449,64227,73435


In [7]:
stats = tbl_income.columns[2:]
to_do = []
for i in stats:
    to_do += list(tbl_income[tbl_income[i] == "np"]["sa2"])
to_do = set(to_do)
abs["neighbours"] = None
for i, sa2 in abs.iterrows():
    if sa2.SA2_CODE21 in to_do:
        neighbours = abs[~abs.geometry.disjoint(sa2.geometry)].SA2_CODE21.tolist()
        neighbours = [ name for name in neighbours if sa2.SA2_CODE21 != name ]
        abs.at[i, "neighbours"] = neighbours

In [8]:
neighbours = abs.set_index("SA2_CODE21")["neighbours"].to_dict()
neighbours = {i: neighbours[i] for i in neighbours if neighbours[i]}
tbl_income_clean = tbl_income.merge(abs[["SA2_CODE21", "neighbours"]], left_on="sa2", right_on="SA2_CODE21")

while neighbours:
    for i in neighbours:
        try:
            cur = {}
            for j in stats:
                cur[j] = tbl_income_clean[tbl_income_clean["sa2"].isin(neighbours[i])][j].mean()
            neighbours[i] = cur
        except:
            pass
    cur = {i: neighbours[i] for i in neighbours if type(neighbours[i]) == dict}
    neighbours = {i: neighbours[i] for i in neighbours if i not in cur}
    for i, data in tbl_income_clean.iterrows():
        for j in stats:
            if data[j] == "np" and data.sa2 in cur:
                tbl_income_clean.at[i,j] = cur[data.sa2][j]
    for i in neighbours:
        neighbours[i] = [j for j in neighbours[i] if j not in neighbours]
tbl_income_clean = tbl_income_clean.drop(["SA2_CODE21", "neighbours"], axis=1)
tbl_income_clean

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149
1,101021008,Karabar,5100,42,338308979,61777,66335
2,101021009,Queanbeyan,6697,39,441160946,60119,65874
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860
4,101021012,Queanbeyan West - Jerrabomberra,8476,44,725602722,73851,85607
...,...,...,...,...,...,...,...
2142,801101136,Denman Prospect,416,38,41056005,90337,98692
2143,801101137,Molonglo,1602.333333,36.0,134437238.666667,81222.0,87973.666667
2144,801101139,Wright,2121,35,182421537,79150,86007
2145,801111140,ACT - South West,355,40,26069449,64227,73435


In [9]:
to_do1 = []
for i in tbl_income_clean.columns[2:]:
    to_do1 += list(tbl_income_clean[tbl_income_clean[i] == "np"]["sa2"])
to_do1 = set(to_do1)
to_do1 # this is Torrens Island, a small island home to 2 electric power plants so safe to ignore

{'404021103'}

In [10]:
tbl_income_clean = tbl_income_clean[tbl_income_clean["sa2"] != "404021103"]

In [11]:
tbl_income_clean.to_csv(curated_data+"clean_annual.csv")

<h3>1.2 Clean "population"</h3>

In [12]:
tbl_population.drop(tbl_population.columns[0:8], axis=1, inplace=True)
tbl_population.drop(tbl_population.columns[1:22], axis=1, inplace=True)
tbl_population.drop(tbl_population.columns[2:6], axis=1, inplace=True)

In [13]:
tbl_population.columns = ["sa2", "2021_population", "km2", "persons/km2"]
tbl_population = tbl_population.dropna(axis=0, how='any', thresh=None, subset=["sa2"], inplace=False)
tbl_population['sa2'] = tbl_population['sa2'].astype(int).astype(str)
tbl_population

Unnamed: 0,sa2,2021_population,km2,persons/km2
0,101021007,4330.0,3418.4,1.3
1,101021008,8546.0,7.0,1223.9
2,101021009,11370.0,4.8,2387.7
3,101021010,5093.0,13.0,391.7
4,101021012,12743.0,13.7,931.9
...,...,...,...,...
2449,801111141,67.0,1202.8,0.1
2450,901011001,1716.0,136.1,12.6
2451,901021002,602.0,13.7,43.9
2452,901031003,310.0,67.2,4.6


In [14]:
tbl_population.to_csv(curated_data+"clean_population.csv")

<h2>2. Join external data</h2>

In [15]:
merge_income_population = tbl_income_clean.merge(tbl_population, on='sa2', how='inner')
merge_income_population

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,101021007,Braidwood,2361,51,120763285,41593,51149,4330.0,3418.4,1.3
1,101021008,Karabar,5100,42,338308979,61777,66335,8546.0,7.0,1223.9
2,101021009,Queanbeyan,6697,39,441160946,60119,65874,11370.0,4.8,2387.7
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860,5093.0,13.0,391.7
4,101021012,Queanbeyan West - Jerrabomberra,8476,44,725602722,73851,85607,12743.0,13.7,931.9
...,...,...,...,...,...,...,...,...,...,...
2141,801101136,Denman Prospect,416,38,41056005,90337,98692,2719.0,4.7,573.2
2142,801101137,Molonglo,1602.333333,36.0,134437238.666667,81222.0,87973.666667,0.0,2.2,0.0
2143,801101139,Wright,2121,35,182421537,79150,86007,3806.0,1.3,2993.1
2144,801111140,ACT - South West,355,40,26069449,64227,73435,554.0,416.8,1.3


In [16]:
merge_post_sa2 = gdf.merge(merge_income_population, left_on="SA2_CODE21", right_on="sa2", how="inner")
stats = merge_post_sa2.columns[6:]
for i in stats:
    merge_post_sa2[i] = merge_post_sa2["area%"] * merge_post_sa2[i].astype(float)
merge_post_sa2

Unnamed: 0,SA2_CODE21,POA_CODE21,area,area%,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,101021007,2536,2.436836e-11,2.511085e-10,101021007,Braidwood,5.928672e-07,1.280653e-08,3.032469e-02,0.000010,0.000013,1.087300e-06,8.583894e-07,3.264411e-10
1,101021007,2537,3.274170e-11,2.809841e-10,101021007,Braidwood,6.634034e-07,1.433019e-08,3.393256e-02,0.000012,0.000014,1.216661e-06,9.605159e-07,3.652793e-10
2,101021007,2545,1.716204e-11,1.956121e-10,101021007,Braidwood,4.618402e-07,9.976218e-09,2.362276e-02,0.000008,0.000010,8.470005e-07,6.686805e-07,2.542957e-10
3,101021007,2580,5.258742e-02,9.695094e-02,101021007,Braidwood,2.289012e+02,4.944498e+00,1.170811e+07,4032.480388,4958.943558,4.197976e+02,3.314171e+02,1.260362e-01
4,101021007,2621,2.399097e-11,2.832682e-10,101021007,Braidwood,6.687961e-07,1.444668e-08,3.420839e-02,0.000012,0.000014,1.226551e-06,9.683239e-07,3.682486e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,801071079,2903,1.270430e-12,1.974653e-09,801071079,Gowrie (ACT),4.046063e-06,8.688471e-08,2.949746e-01,0.000129,0.000144,6.172764e-06,3.751840e-09,3.195580e-06
11822,801071081,2904,4.374485e-12,4.520853e-09,801071081,Isabella Plains,1.269004e-05,1.853550e-07,8.446155e-01,0.000283,0.000301,1.954817e-05,1.130213e-08,7.835091e-06
11823,801071081,2905,2.476339e-04,1.259155e-01,801071081,Isabella Plains,3.534448e+02,5.162535e+00,2.352436e+07,7891.879623,8380.557663,5.444586e+02,3.147887e-01,2.182241e+02
11824,801071087,2904,1.260680e-12,1.302861e-09,801071087,Richardson,2.519734e-06,5.341731e-08,1.627368e-01,0.000079,0.000084,3.969818e-06,2.866295e-09,1.774497e-06


In [17]:
post_sa2_agg = merge_post_sa2.groupby("POA_CODE21").sum().reset_index()
post_sa2_agg.drop(["area", "area%"], axis=1, inplace=True)
post_sa2_agg.rename({"POA_CODE21": "postcode"}, axis=1, inplace=True)
post_sa2_agg

Unnamed: 0,postcode,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,0800,5631.999974,33.000000,4.206090e+08,57789.000082,74682.000125,7678.999968,3.200000,2419.999994
1,0810,2247.557682,38.689316,1.709450e+08,64195.425046,73447.813891,3872.057394,3.353824,1482.569926
2,0812,2177.164726,39.676957,1.583888e+08,63606.250469,72132.529895,1453.078809,16.957135,554.150966
3,0820,1313.062168,39.800996,1.045872e+08,62694.098403,71491.215692,1836.446110,9.143080,541.657373
4,0822,899.191123,40.878605,4.633381e+07,36623.557892,44584.908043,4905.148329,31941.621280,0.250779
...,...,...,...,...,...,...,...,...,...
2629,7466,1984.999996,45.000000,1.157059e+08,49788.999965,58289.999976,4372.999980,3931.600008,1.100000
2630,7467,1984.999999,45.000000,1.157059e+08,49788.999992,58289.999995,4372.999996,3931.600002,1.100000
2631,7468,1984.999999,45.000000,1.157059e+08,49788.999992,58289.999994,4372.999995,3931.600002,1.100000
2632,7469,2140.391676,45.756209,1.119052e+08,45800.898515,52653.496678,4241.731828,4105.857702,1.044130


In [18]:
merchant_consumer_info = pd.read_parquet(curated_data+"merchant_consumer_info")
merchant_consumer_info["consumer_postcode"] = merchant_consumer_info["consumer_postcode"].str.pad(width=4, fillchar="0", side="left")

In [19]:
merchant_consumer_info["consumer_postcode"].unique()

array(['6947', '2805', '5063', ..., '3921', '1120', '3205'], dtype=object)

In [20]:
final_table = post_sa2_agg.merge(merchant_consumer_info, left_on='postcode', right_on='consumer_postcode', how='inner')
# len(final_table[final_table["income_sum"].isnull()]["consumer_postcode"].unique())
print(f"Mismatched (null) transactions dropped: {merchant_consumer_info.shape[0] - final_table.shape[0]}")

Mismatched (null) transactions dropped: 2218439


In [21]:
final_table

Unnamed: 0,postcode,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2,merchant_name,...,dollar_value,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender,fraud_group
0,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,...,375.167732,2022,4,27,Kelly Clayton,4211 Rodney Tunnel Suite 525,NT,0800,Female,0
1,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,...,617.879131,2022,10,7,Corey Estrada,1703 Boyd Shore,NT,0800,Male,0
2,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,...,766.507707,2022,2,28,Nicole Bishop,467 Robert Islands Apt. 834,NT,0800,Female,0
3,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Malesuada Vel Ltd,...,258.865959,2021,7,24,Natalie Herrera,88798 Saunders Hills Apt. 945,NT,0800,Female,0
4,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Varius Orci Institute,...,4.159038,2021,8,8,Shannon Mann,00817 Owens Circle,NT,0800,Female,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11087920,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Ipsum Primis Associates,...,56.702387,2021,7,17,Shannon Petty,386 Aaron Manors,TAS,7470,Female,0
11087921,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Ipsum Primis Associates,...,84.885059,2022,3,11,Michael Bush,47924 Jessica Drive Apt. 344,TAS,7470,Male,0
11087922,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Lorem Foundation,...,41.906740,2021,5,12,Kevin Brown,1861 Tina Junction Suite 680,TAS,7470,Undisclosed,0
11087923,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Lorem Foundation,...,9.540977,2022,1,6,Kevin Brown,1861 Tina Junction Suite 680,TAS,7470,Undisclosed,0


In [22]:
final_table.to_parquet("../data/curated/merchant_consumer_abs")

In [36]:
len(set(merchant_consumer_info["consumer_postcode"].unique()) - set(post["POA_CODE21"].unique()))

527

In [37]:
post["POA_CODE21"].unique()

array(['0800', '0810', '0812', ..., '9494', '9797', 'ZZZZ'], dtype=object)

In [38]:
set(merchant_consumer_info["consumer_postcode"].unique()) - set(post["POA_CODE21"].unique())

{'0200',
 '0801',
 '0804',
 '0811',
 '0813',
 '0814',
 '0815',
 '0821',
 '0831',
 '0851',
 '0861',
 '0871',
 '0881',
 '0906',
 '0907',
 '0909',
 '1001',
 '1002',
 '1003',
 '1004',
 '1005',
 '1006',
 '1007',
 '1008',
 '1009',
 '1010',
 '1020',
 '1021',
 '1022',
 '1023',
 '1025',
 '1026',
 '1027',
 '1028',
 '1029',
 '1030',
 '1031',
 '1032',
 '1033',
 '1034',
 '1035',
 '1036',
 '1037',
 '1038',
 '1039',
 '1040',
 '1041',
 '1042',
 '1043',
 '1044',
 '1045',
 '1046',
 '1100',
 '1101',
 '1105',
 '1106',
 '1107',
 '1108',
 '1109',
 '1110',
 '1112',
 '1113',
 '1114',
 '1115',
 '1116',
 '1117',
 '1118',
 '1119',
 '1120',
 '1121',
 '1122',
 '1123',
 '1124',
 '1125',
 '1126',
 '1127',
 '1128',
 '1129',
 '1130',
 '1131',
 '1132',
 '1133',
 '1134',
 '1135',
 '1136',
 '1137',
 '1138',
 '1139',
 '1140',
 '1141',
 '1142',
 '1143',
 '1144',
 '1145',
 '1146',
 '1147',
 '1148',
 '1149',
 '1150',
 '1151',
 '1152',
 '1153',
 '1154',
 '1155',
 '1156',
 '1157',
 '1158',
 '1159',
 '1160',
 '1161',
 '1162',
 