# ***Notebook for testing***

In [1]:
import os
import sys
import pandas as pd
sys.path.append("../")
from pyspark.sql import SparkSession

## Correspondence

In [2]:
correspondence = pd.read_csv("../data/tables/abs/postcode_correspondences_2021.csv")
correspondence.head(15)

Unnamed: 0,POSTCODE,SA2_CODE_2021,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,800.0,701011002,Darwin City,1.0,Good,Poor,0.0
1,810.0,701021018,Jingili,0.0534036,Poor,Poor,0.0
2,810.0,701021029,Wanguri,0.053635,Poor,Poor,0.0
3,810.0,701021024,Moil,0.058218,Poor,Poor,0.0
4,810.0,701021028,Wagaman,0.0589462,Poor,Poor,0.0
5,810.0,701021010,Alawa,0.0609595,Poor,Poor,0.0
6,810.0,701021027,Tiwi,0.0728956,Poor,Poor,0.0
7,810.0,701021023,Millner,0.0747341,Poor,Poor,0.0
8,810.0,701021016,Coconut Grove,0.0839578,Poor,Poor,0.0
9,810.0,701021026,Rapid Creek,0.0952216,Poor,Poor,0.0


Postcodes may be matched to multiple SA2 regions, therefore we will choose the region with the highest ratio, i.e. the SA2 region with the highest percentage of population for that postcode. For example, in the table above, you can see the postcode 0810 is matched to 12 different SA2 regions. Of the twelve, we will choose code 701021021 - Lyons (NT) - as about 17% of all people from the 0810 postcode are within this region, which is the highest percentage.

In [3]:
# get indices of max ratio values for each postcode
max_indices = correspondence.groupby("POSTCODE")["RATIO_FROM_TO"].idxmax()

In [4]:
correspondence_filtered = correspondence.loc[max_indices].reset_index(drop=True)
correspondence_filtered.head(5)

Unnamed: 0,POSTCODE,SA2_CODE_2021,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,800.0,701011002,Darwin City,1.0,Good,Poor,0.0
1,810.0,701021021,Lyons (NT),0.17456,Poor,Poor,0.0
2,812.0,701021019,Karama,0.257474,Poor,Poor,0.0
3,820.0,701011008,Stuart Park,0.206533,Poor,Poor,0.0
4,822.0,702041063,East Arnhem,0.209947,Poor,Poor,0.0


We can then get rid of all columns except postcode and SA2 code, which will be of use for combining our ABS data with the synthetic data.

In [5]:
correspondence_filtered = correspondence_filtered[["POSTCODE", "SA2_CODE_2021"]]
correspondence_filtered.head(5)

Unnamed: 0,POSTCODE,SA2_CODE_2021
0,800.0,701011002
1,810.0,701021021
2,812.0,701021019
3,820.0,701011008
4,822.0,702041063


We can now assign a SA2 code for each customer in our synthetic dataset.

In [6]:
# read in data
# data is seperated by pipe "|", not comma
customers = pd.read_csv("../data/tables/synthetic/tbl_consumer.csv", sep="|")
customers

Unnamed: 0,name,address,state,postcode,gender,consumer_id
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975
...,...,...,...,...,...,...
499994,Jessica Avila,508 Miranda Overpass Apt. 218,QLD,4400,Female,1385608
499995,Steven Thornton,7913 Schwartz Mission Suite 483,VIC,3097,Undisclosed,1466964
499996,Christy Smith,5681 Zachary Mountain Apt. 060,NSW,2756,Undisclosed,1253484
499997,Donna Sutton,54140 Jacob Point,VIC,3989,Female,175005


In [7]:
customers_merged = pd.merge(customers, correspondence_filtered, left_on="postcode", right_on="POSTCODE", how="left")
customers_merged = customers_merged.drop("POSTCODE", axis=1)
customers_merged.head(5)

Unnamed: 0,name,address,state,postcode,gender,consumer_id,SA2_CODE_2021
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,124011455.0
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,702021055.0
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,124011452.0
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,509031247.0


In [8]:
customers_merged["SA2_CODE_2021"].isna().sum()

83181

83,181 missing SA2 codes - maybe impute with mean/median values for state when using ABS data later? Or find similar postcodes or use latitude/longitude data?

In [9]:
# replace NaN values with zeroes (helps with merging later)
customers_merged["SA2_CODE_2021"] = customers_merged["SA2_CODE_2021"].fillna(0)
customers_merged.head(5)

Unnamed: 0,name,address,state,postcode,gender,consumer_id,SA2_CODE_2021
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,0
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,124011455
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,702021055
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,124011452
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,509031247


## ABS Data Cleaning

In [10]:
abs_df = pd.read_csv("../data/tables/abs/ABS_2021.csv")
abs_df.head(15)

Unnamed: 0,DATAFLOW,MEDAVG: Median/Average,REGION: Region,REGION_TYPE: Region Type,STATE: State,TIME_PERIOD: Time Period,OBS_VALUE
0,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,AUS: Australia,AUS: Australia,AUS: Australia,2021,38.0
1,ABS:C21_G02_SA2(1.0.0),2: Median total personal income ($/weekly),AUS: Australia,AUS: Australia,AUS: Australia,2021,805.0
2,ABS:C21_G02_SA2(1.0.0),3: Median total family income ($/weekly),AUS: Australia,AUS: Australia,AUS: Australia,2021,2120.0
3,ABS:C21_G02_SA2(1.0.0),4: Median total household income ($/weekly),AUS: Australia,AUS: Australia,AUS: Australia,2021,1746.0
4,ABS:C21_G02_SA2(1.0.0),5: Median mortgage repayment ($/monthly),AUS: Australia,AUS: Australia,AUS: Australia,2021,1863.0
5,ABS:C21_G02_SA2(1.0.0),6: Median rent ($/weekly),AUS: Australia,AUS: Australia,AUS: Australia,2021,375.0
6,ABS:C21_G02_SA2(1.0.0),7: Average number of persons per bedroom,AUS: Australia,AUS: Australia,AUS: Australia,2021,0.8
7,ABS:C21_G02_SA2(1.0.0),8: Average household size,AUS: Australia,AUS: Australia,AUS: Australia,2021,2.5
8,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,112: Richmond - Tweed,SA4: Statistical Area Level 4,1: New South Wales,2021,46.0
9,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,127: Sydney - South West,SA4: Statistical Area Level 4,1: New South Wales,2021,35.0


In [11]:
# Before filtering:
abs_df.shape

(23632, 7)

First, let's get state-wide statistics for imputation purposes later. Since we are focusing on median personal income, we will only get each state's median personal income number.

In [12]:
stat = "2: Median total personal income ($/weekly)"

NSW_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "1: New South Wales")]["OBS_VALUE"])

VIC_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "2: Victoria")]["OBS_VALUE"])

QLD_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "3: Queensland")]["OBS_VALUE"])

SA_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "4: South Australia")]["OBS_VALUE"])

WA_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "5: Western Australia")]["OBS_VALUE"])

TAS_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "6: Tasmania")]["OBS_VALUE"])

NT_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "7: Northern Territory")]["OBS_VALUE"])

ACT_median_income = float(abs_df[(abs_df["MEDAVG: Median/Average"] == stat) &
                           (abs_df["REGION: Region"] == "8: Australian Capital Territory")]["OBS_VALUE"])

Firstly, we only want SA2 data, and data from 2021.

In [13]:
abs_filtered = abs_df[(abs_df["REGION_TYPE: Region Type"] == "SA2: Statistical Area Level 2") & 
                      (abs_df["TIME_PERIOD: Time Period"] == 2021)]
abs_filtered.shape

(19722, 7)

In [14]:
abs_filtered.head(5)

Unnamed: 0,DATAFLOW,MEDAVG: Median/Average,REGION: Region,REGION_TYPE: Region Type,STATE: State,TIME_PERIOD: Time Period,OBS_VALUE
34,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,101031014: Cooma,SA2: Statistical Area Level 2,1: New South Wales,2021,44.0
35,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,101051540: Goulburn Surrounds,SA2: Statistical Area Level 2,1: New South Wales,2021,48.0
36,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,102021052: Summerland Point - Gwandalan,SA2: Statistical Area Level 2,1: New South Wales,2021,45.0
37,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,103031075: Wollangambe - Wollemi,SA2: Statistical Area Level 2,1: New South Wales,2021,0.0
38,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,105031106: Wellington,SA2: Statistical Area Level 2,1: New South Wales,2021,41.0


Now, we can convert the SA2 region into only its code, and also remove unnecessary features. We will also rename the relevant columns for ease of access.

In [15]:
# change SA2 feature
abs_filtered["REGION: Region"] = abs_filtered["REGION: Region"].str[:9].astype("int64")
abs_filtered.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abs_filtered["REGION: Region"] = abs_filtered["REGION: Region"].str[:9].astype("int64")


Unnamed: 0,DATAFLOW,MEDAVG: Median/Average,REGION: Region,REGION_TYPE: Region Type,STATE: State,TIME_PERIOD: Time Period,OBS_VALUE
34,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,101031014,SA2: Statistical Area Level 2,1: New South Wales,2021,44.0
35,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,101051540,SA2: Statistical Area Level 2,1: New South Wales,2021,48.0
36,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,102021052,SA2: Statistical Area Level 2,1: New South Wales,2021,45.0
37,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,103031075,SA2: Statistical Area Level 2,1: New South Wales,2021,0.0
38,ABS:C21_G02_SA2(1.0.0),1: Median age of persons,105031106,SA2: Statistical Area Level 2,1: New South Wales,2021,41.0


In [16]:
abs_filtered = abs_filtered.drop(["DATAFLOW", "REGION_TYPE: Region Type", "STATE: State",
                                  "TIME_PERIOD: Time Period"], axis=1) # drop useless columns
abs_filtered = abs_filtered.rename(columns={"MEDAVG: Median/Average": "statistic",
                                            "REGION: Region": "region",
                                            "OBS_VALUE": "value"}) # rename columns
abs_filtered.shape

(19722, 3)

We should engineer new columns based on the categorical values of the `Statistic` column.

In [17]:
abs_filtered = abs_filtered.pivot_table(index="region", columns="statistic", values="value", aggfunc="sum")
abs_filtered = abs_filtered.reset_index()
abs_filtered

statistic,region,1: Median age of persons,2: Median total personal income ($/weekly),3: Median total family income ($/weekly),4: Median total household income ($/weekly),5: Median mortgage repayment ($/monthly),6: Median rent ($/weekly),7: Average number of persons per bedroom,8: Average household size
0,101021007,51.0,760.0,1886.0,1429.0,1732.0,330.0,0.8,2.2
1,101021008,38.0,975.0,2334.0,1989.0,1950.0,350.0,0.8,2.6
2,101021009,37.0,996.0,2233.0,1703.0,1700.0,330.0,0.9,2.1
3,101021010,36.0,1104.0,2412.0,1796.0,1700.0,310.0,0.9,2.1
4,101021012,37.0,1357.0,3332.0,3014.0,2300.0,430.0,0.8,2.9
...,...,...,...,...,...,...,...,...,...
2467,901021002,40.0,741.0,1678.0,2519.0,1972.0,231.0,1.1,3.8
2468,901031003,38.0,585.0,1687.0,1825.0,1849.0,90.0,0.8,3.0
2469,901041004,50.0,736.0,1630.0,1184.0,1300.0,240.0,0.8,2.1
2470,997979799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# rename all columns for simplicity
abs_filtered.columns = ["region", "median_age", "median_personal_income",
                        "median_family_income", "median_household_income", "median_mortgage",
                        "median_rent", "avg_bedroom", "avg_household"]
abs_filtered.shape

(2472, 9)

In [19]:
abs_filtered.drop("region", axis=1).describe()

Unnamed: 0,median_age,median_personal_income,median_family_income,median_household_income,median_mortgage,median_rent,avg_bedroom,avg_household
count,2472.0,2472.0,2463.0,2463.0,2463.0,2463.0,2463.0,2463.0
mean,39.728964,835.014159,2115.051969,1762.125863,1805.879009,358.440926,0.811246,2.508242
std,8.617423,286.225602,745.723936,621.424829,688.744319,129.056917,0.17231,0.537332
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,675.0,1655.5,1346.0,1430.0,290.0,0.8,2.3
50%,39.0,798.0,2079.0,1735.0,1815.0,365.0,0.8,2.5
75%,44.0,959.25,2477.0,2135.5,2167.0,426.0,0.9,2.8
max,91.0,3250.0,7000.0,7000.0,9999.0,1200.0,2.0,6.0


We should remove instances which include outlier values for some features. For example, we can see that minimum values for a lot of the statistics are zero, which doesn't make sense. We should also remove any NaN values.

In [20]:
# remove outliers
abs_filtered = abs_filtered[(abs_filtered["median_age"] > 0) & 
                            (abs_filtered["median_personal_income"] > 0) &
                            (abs_filtered["median_family_income"] > 0) &
                            (abs_filtered["median_household_income"] > 0) &
                            (abs_filtered["median_mortgage"] > 0) &
                            (abs_filtered["median_rent"] > 0) &
                            (abs_filtered["avg_bedroom"] > 0) &
                            (abs_filtered["avg_household"] > 0)]

abs_filtered = abs_filtered.dropna() # remove NaN values

abs_filtered.drop("region", axis=1).describe()

Unnamed: 0,median_age,median_personal_income,median_family_income,median_household_income,median_mortgage,median_rent,avg_bedroom,avg_household
count,2350.0,2350.0,2350.0,2350.0,2350.0,2350.0,2350.0,2350.0
mean,40.180851,845.071064,2167.366809,1799.73617,1881.170213,370.586383,0.82234,2.560596
std,6.407814,235.630911,639.611874,540.384664,585.24616,112.146069,0.117765,0.371431
min,23.0,240.0,551.0,575.0,109.0,20.0,0.5,1.3
25%,36.0,684.25,1695.5,1373.0,1500.0,300.0,0.8,2.3
50%,39.0,803.0,2094.5,1755.0,1849.0,370.0,0.8,2.5
75%,44.0,957.75,2487.0,2148.75,2167.0,430.0,0.9,2.8
max,70.0,2150.0,7000.0,4799.0,9999.0,1200.0,1.9,5.1


We can see that the distributions of these numeric features now look much more sensible.

In [21]:
# external dataset shape after filtering
abs_filtered.shape

(2350, 9)

Now we can take the median personal income (our variable of interest) and merge this with our customer dataset according to SA2 region code.

In [23]:
median_personal_income_df = abs_filtered.loc[:, ["region", "median_personal_income"]]
customers_merged["SA2_CODE_2021"] = customers_merged["SA2_CODE_2021"].astype("int") # helps with merging

df_merged = pd.merge(customers_merged, median_personal_income_df, left_on="SA2_CODE_2021", right_on="region", how="left")
df_merged = df_merged.drop("SA2_CODE_2021", axis=1) # drop duplicate column
df_merged.head(5)

Unnamed: 0,name,address,state,postcode,gender,consumer_id,region,median_personal_income
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,,
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,124011455.0,740.0
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,702021055.0,416.0
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,124011452.0,687.0
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,509031247.0,897.0


Since we are missing some income numbers (see the `NaN` value), we will impute each missing value with the median income of the state where the customer is from.

In [24]:
replacement_values = {"NSW": NSW_median_income, # values determined through initial ABS dataset
                      "VIC": VIC_median_income,
                      "QLD": QLD_median_income,
                      "SA": SA_median_income,
                      "WA": WA_median_income,
                      "TAS": TAS_median_income,
                      "NT": NT_median_income,
                      "ACT": ACT_median_income}

replacement_series = df_merged["state"].map(replacement_values)

df_merged["median_personal_income"] = df_merged["median_personal_income"].fillna(replacement_series)
df_merged.head(5)

Unnamed: 0,name,address,state,postcode,gender,consumer_id,region,median_personal_income
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,,848.0
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,124011455.0,740.0
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,702021055.0,416.0
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,124011452.0,687.0
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,509031247.0,897.0


In [25]:
df_merged["median_personal_income"].isna().any()

False

We can see that our median personal income column now has no missing values, and hence the data has been properly imputed.

In [28]:
# define the folder path and filename
output_path = "../data/curated/"
file_name = "consumers_median_income"

# create the folder if it doesn't exist
if not os.path.exists(output_path):
    os.makedirs(output_path)

# save df to csv in the specified folder
file_path = os.path.join(output_path, file_name)
df_merged.to_csv(file_path, index=False)