# Consumer dataset merging process

In [2]:
import folium
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [7]:
# Consumer records come from the curated parquet directory.
df = pd.read_parquet("../data/curated/consumer/")

# Postcode -> SA2 lookup; keep only the two columns used for the join.
processed_postcode = (
    pd.read_csv('../data/curated/processed_postcode.csv')
    .loc[:, ['postcode', 'SA2_code']]
)
processed_postcode

Unnamed: 0,postcode,SA2_code
0,200,801051049.0
1,800,701011002.0
2,801,701011002.0
3,804,701011009.0
4,810,701021010.0
...,...,...
3162,9013,310031284.0
3163,9015,310031284.0
3164,9464,302031038.0
3165,9726,309101268.0


In [8]:
# Several ACT areas share postcode 2611, so check whether any consumer
# records actually exist for that postcode/state combination.
df[df['postcode'].eq(2611) & df['state'].eq('ACT')]

Unnamed: 0,consumer_id,user_id,name,address,state,postcode,gender


In [9]:
# Attach an SA2 code to each consumer through their postcode. The left join
# keeps every consumer row even when the postcode has no entry in the lookup.
df = pd.merge(df, processed_postcode, how='left', on='postcode')
df

Unnamed: 0,consumer_id,user_id,name,address,state,postcode,gender,SA2_code
0,28,458885,Gregory Barrett,7083 Carson Lane,WA,6176,Male,507051314.0
1,78,319257,Cory Best,992 Becky Junction,SA,5410,Male,405011111.0
2,101,9180,Jasmine Sanchez,598 Johnson Motorway Apt. 104,SA,5554,Female,405041124.0
3,108,191536,Karen Robinson,638 Chen Islands Suite 258,SA,5052,Female,403031065.0
4,133,234634,Christopher Cook,701 Diaz Walks Apt. 086,WA,6985,Male,501021007.0
...,...,...,...,...,...,...,...,...
499994,1499570,400937,Terry Johnson,884 Nicole Pine,VIC,3579,Female,215031401.0
499995,1499611,206860,Lisa King,95327 Walter Island,SA,5134,Female,401021010.0
499996,1499869,381121,Elizabeth Moore,21645 Jackson Landing Apt. 066,QLD,4403,Female,307021180.0
499997,1499911,56561,Rhonda Berry,02494 Day Islands Apt. 198,WA,6105,Female,506021121.0


In [10]:
# TODO: reading the .xlsx file needs the optional 'openpyxl' dependency;
# install it with pip or conda if pandas reports it as missing.

# Raw sheet as-is: title/notes rows and the real header row are still part
# of the data at this point.
income_df = pd.read_excel(
    '../data/external/total_income.xlsx',
    sheet_name='Table 1.4',
)
income_df

Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,Personal Income in Australia Table 1. Total In...,,,,,,,,,,...,,,,,,,,,,
1,Released at 11:30 am (Canberra time) 22/01/2021,,,,,,,,,,...,,,,,,,,,,
2,Table 1.4,Total Income by Statistical Area Level 2 (2014...,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,Earners (persons),,,,,Median age of earners (years),,,...,,,,,Mean ($),,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2301,801111140,ACT - South West,349,289,347,364,355,40,39,39,...,62592,55384,61096,64227,62505,72858,70503,67445,73435,
2302,801111141,Namadgi,np,np,21,19,18,np,np,34,...,np,49981,58498,40479,np,np,57309,56357,39278,
2303,,,,,,,,,,,...,,,,,,,,,,
2304,Totals may not align with the sum of their com...,,,,,,,,,,...,,,,,,,,,,


In [11]:
# The first six sheet rows are title/notes/header lines; row 5 holds the
# actual column labels, so strip those rows and promote row 5 to the header.
income_df1 = income_df.drop(index=income_df.index[:6]).reset_index(drop=True)
income_df1.columns = income_df.iloc[5].tolist()

# Keep the SA2 code, the SA2 name and column 26 (the '2018-19' column that is
# renamed to 'Mean Total Income'), drop the trailing footer rows by position,
# then discard rows with any missing value.
# Note: 'np' placeholders in the income column are intentionally left as-is
# here; later cells count them.
income_df1 = (
    income_df1
    .iloc[:, [0, 1, 26]]
    .pipe(lambda frame: frame.drop(frame.index[2297:2300]))
    .rename(columns={'2018-19': 'Mean Total Income', 'SA2': 'SA2_code'})
    .dropna()
    .reset_index()
)
income_df1

Unnamed: 0,index,SA2_code,SA2 NAME,Mean Total Income
0,2,101021007,Braidwood,51149
1,3,101021008,Karabar,66335
2,4,101021009,Queanbeyan,65874
3,5,101021010,Queanbeyan - East,69860
4,6,101021011,Queanbeyan Region,81919
...,...,...,...,...
2283,2292,801101137,Molonglo,np
2284,2293,801101138,Molonglo - North,np
2285,2294,801101139,Wright,86007
2286,2295,801111140,ACT - South West,73435


In [15]:
# Number of SA2 rows whose mean total income is the 'np' placeholder.
income_df1['Mean Total Income'].eq('np').sum()

44

In [16]:
# Missing-value check: collect the SA2 names whose mean total income is 'np'
# and report how many there are.
SA2_missing = income_df1.loc[
    income_df1['Mean Total Income'] == 'np', 'SA2 NAME'
].dropna()
SA2_missing.size

44

In [12]:
# Join the income figures onto the consumer table by SA2 code. With a left
# join, consumers whose SA2 code is absent from income_df1 keep NaN income.
df = pd.merge(df, income_df1, how='left', on='SA2_code')
df

Unnamed: 0,consumer_id,user_id,name,address,state,postcode,gender,SA2_code,index,SA2 NAME,Mean Total Income
0,28,458885,Gregory Barrett,7083 Carson Lane,WA,6176,Male,507051314.0,,,
1,78,319257,Cory Best,992 Becky Junction,SA,5410,Male,405011111.0,1682.0,Light,55650
2,101,9180,Jasmine Sanchez,598 Johnson Motorway Apt. 104,SA,5554,Female,405041124.0,1695.0,Kadina,51413
3,108,191536,Karen Robinson,638 Chen Islands Suite 258,SA,5052,Female,403031065.0,1636.0,Belair,73814
4,133,234634,Christopher Cook,701 Diaz Walks Apt. 086,WA,6985,Male,501021007.0,1749.0,Capel,60552
...,...,...,...,...,...,...,...,...,...,...,...
499994,1499570,400937,Terry Johnson,884 Nicole Pine,VIC,3579,Female,215031401.0,1008.0,Gannawarra,46093
499995,1499611,206860,Lisa King,95327 Walter Island,SA,5134,Female,401021010.0,1580.0,Uraidla - Summertown,71963
499996,1499869,381121,Elizabeth Moore,21645 Jackson Landing Apt. 066,QLD,4403,Female,307021180.0,1221.0,Jondaryan,50927
499997,1499911,56561,Rhonda Berry,02494 Day Islands Apt. 198,WA,6105,Female,506021121.0,1869.0,Perth Airport,74118


In [13]:
# Persist the consumer table enriched with SA2 code and mean total income.
df.to_csv(path_or_buf='../data/curated/consumer_income.csv')


In [24]:
# Persist the cleaned SA2-level income table for reuse elsewhere.
income_df1.to_csv(path_or_buf='../data/curated/income.csv')