In [1]:
from pyspark.sql import SparkSession

# Session settings, applied one by one to the builder below: eager evaluation
# in the REPL, Parquet metadata caching, a UTC session time zone, and explicit
# executor/driver memory sizes.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("ADS project 2")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

22/09/12 16:02:31 WARN Utils: Your hostname, mugis-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.13.24.151 instead (on interface en0)
22/09/12 16:02:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/12 16:02:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/12 16:02:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/12 16:02:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/09/12 16:02:33 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
import pandas as pd
import numpy as np

# Input and output directories, relative to the notebook's working directory.
raw_data = "../data/ABS/"
curated_data = "../data/curated/"

# Load the two ABS workbooks with pandas' default sheet/header handling; the
# header and footer rows are trimmed in the cleaning cells that follow.
tbl_income = pd.read_excel(f"{raw_data}annual income by SA2 regions.xlsx")
tbl_population = pd.read_excel(f"{raw_data}population estimates by SA2 regions.xlsx")


<h2>1. Clean external data</h2>

<h3>1.1 Clean "annual income"</h3>

In [3]:
# Remove the leading rows (labels 0-7) and trailing rows (labels 2303-2305)
# of the sheet by index label.
# NOTE(review): presumably spreadsheet header/footer rows - confirm against the workbook.
tbl_income = tbl_income.drop(index=np.arange(0, 8))
tbl_income = tbl_income.drop(index=np.arange(2303, 2306))

# Positions of the columns that are not needed; everything else is kept.
unused_positions = [2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 27]
tbl_income = (
    tbl_income
    .drop(columns=tbl_income.columns[unused_positions])
    .reset_index(drop=True)
)

# Give the seven remaining columns short snake_case names.
tbl_income.columns = [
    'sa2',
    'sa2_name',
    'total_earners',
    'median_age',
    'income_sum',
    'income_median',
    'income_mean',
]

In [4]:
# Keep only rows whose earner count is not the "np" placeholder.
# NOTE(review): "np" is presumably the ABS "not published" marker - confirm.
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignments below do not trigger SettingWithCopyWarning.
tbl_income = tbl_income[tbl_income["total_earners"] != "np"].copy()

# Convert every measure column from its spreadsheet dtype to a numeric dtype.
# pd.to_numeric raises if a value cannot be parsed, exactly as before.
for numeric_column in ["total_earners", "median_age", "income_sum", "income_median", "income_mean"]:
    tbl_income[numeric_column] = pd.to_numeric(tbl_income[numeric_column])

In [5]:
# Display the cleaned income table as the cell output.
tbl_income

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean
0,101021007,Braidwood,2361,51,120763285,41593,51149
1,101021008,Karabar,5100,42,338308979,61777,66335
2,101021009,Queanbeyan,6697,39,441160946,60119,65874
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860
4,101021011,Queanbeyan Region,12821,44,1050285692,71420,81919
...,...,...,...,...,...,...,...
2288,801101135,Coombs,2270,35,179834174,74179,79222
2289,801101136,Denman Prospect,416,38,41056005,90337,98692
2292,801101139,Wright,2121,35,182421537,79150,86007
2293,801111140,ACT - South West,355,40,26069449,64227,73435


In [6]:
# Persist the cleaned income table to the curated folder. With to_csv's default
# settings the DataFrame index is written as the first, unnamed column.
tbl_income.to_csv(f"{curated_data}clean_annual.csv")

<h3>1.2 Clean "population"</h3>

In [7]:
# Drop unwanted columns by position in three passes. Each slice indexes into
# the columns that remain after the previous pass, so the order matters.
for column_positions in (slice(0, 8), slice(1, 22), slice(2, 6)):
    tbl_population = tbl_population.drop(columns=tbl_population.columns[column_positions])

In [8]:
# Name the four remaining columns.
tbl_population.columns = ["sa2", "2021_population", "km2", "persons/km2"]

# Drop rows without an SA2 code. Only `subset` is passed: supplying both `how`
# and `thresh` (even thresh=None) raises TypeError on pandas >= 2.0. The
# default how="any" on a single-column subset removes the same rows. The
# .copy() keeps the assignment below from acting on a derived frame.
tbl_population = tbl_population.dropna(subset=["sa2"]).copy()

# Cast via float first so values that arrive as floats (or float-like strings)
# end up as plain integer SA2 codes.
tbl_population['sa2'] = tbl_population['sa2'].astype(float).astype(int)

# Display the cleaned population table as the cell output.
tbl_population

Unnamed: 0,sa2,2021_population,km2,persons/km2
0,101021007,4330.0,3418.4,1.3
1,101021008,8546.0,7.0,1223.9
2,101021009,11370.0,4.8,2387.7
3,101021010,5093.0,13.0,391.7
4,101021012,12743.0,13.7,931.9
...,...,...,...,...
2449,801111141,67.0,1202.8,0.1
2450,901011001,1716.0,136.1,12.6
2451,901021002,602.0,13.7,43.9
2452,901031003,310.0,67.2,4.6


In [9]:
# Persist the cleaned population table to the curated folder. With to_csv's
# default settings the DataFrame index is written as the first, unnamed column.
tbl_population.to_csv(f"{curated_data}clean_population.csv")

<h3>1.3 Clean "postcode_sa2"</h3>

In [10]:
# Read sheet "Table 3" of the postcode-to-SA2 workbook. The ExcelFile is used
# as a context manager so the underlying file handle is closed as soon as the
# sheet has been parsed; previously it was left open.
# The name `postcode_2011_SA2_2011` stays bound afterwards, but to a closed workbook.
with pd.ExcelFile(raw_data + "1270055006_CG_POSTCODE_2011_SA2_2011.xls") as postcode_2011_SA2_2011:
    postcode_sa2 = pd.read_excel(postcode_2011_SA2_2011, sheet_name="Table 3")

In [11]:
# Remove the leading rows (labels 0-5) and trailing rows (labels 5994-5996)
# of the sheet by index label.
# NOTE(review): presumably spreadsheet header/footer rows - confirm against the workbook.
postcode_sa2 = postcode_sa2.drop(index=np.arange(0, 6))
postcode_sa2 = postcode_sa2.drop(index=np.arange(5994, 5997))

# Drop columns by position in two passes; the second pass indexes into the
# columns that are left after the first.
postcode_sa2 = postcode_sa2.drop(columns=postcode_sa2.columns[3:6])
postcode_sa2 = postcode_sa2.drop(columns=postcode_sa2.columns[0])
postcode_sa2 = postcode_sa2.reset_index(drop=True)

# Name the two remaining columns and make the SA2 code numeric so it can be
# joined against the other tables' `sa2` column.
postcode_sa2.columns = ['postcode', 'sa2']
postcode_sa2['sa2'] = pd.to_numeric(postcode_sa2['sa2'])

In [12]:
# Persist the postcode-to-SA2 lookup to the curated folder. With to_csv's
# default settings the DataFrame index is written as the first, unnamed column.
postcode_sa2.to_csv(f"{curated_data}postcode_sa2.csv")

<h2>2. Join external data</h2>

<h3>2.1 Join "annual income" and "population"</h3>

In [13]:
# Inner join on the SA2 code: only regions present in both the income table
# and the population table are kept.
merge_income_population = pd.merge(
    tbl_income,
    tbl_population,
    how='inner',
    on='sa2',
)

In [14]:
# Display the joined income/population table as the cell output.
merge_income_population

Unnamed: 0,sa2,sa2_name,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2
0,101021007,Braidwood,2361,51,120763285,41593,51149,4330.0,3418.4,1.3
1,101021008,Karabar,5100,42,338308979,61777,66335,8546.0,7.0,1223.9
2,101021009,Queanbeyan,6697,39,441160946,60119,65874,11370.0,4.8,2387.7
3,101021010,Queanbeyan - East,3393,40,237035656,63051,69860,5093.0,13.0,391.7
4,101021012,Queanbeyan West - Jerrabomberra,8476,44,725602722,73851,85607,12743.0,13.7,931.9
...,...,...,...,...,...,...,...,...,...,...
2099,801101135,Coombs,2270,35,179834174,74179,79222,4834.0,2.3,2085.6
2100,801101136,Denman Prospect,416,38,41056005,90337,98692,2719.0,4.7,573.2
2101,801101139,Wright,2121,35,182421537,79150,86007,3806.0,1.3,2993.1
2102,801111140,ACT - South West,355,40,26069449,64227,73435,554.0,416.8,1.3


<h3>2.2 Join "merge_income_population" and "postcode_sa2"</h3>

In [15]:
# Attach postcodes to each SA2 region. The left join keeps every SA2 row, and
# rows that end up without a postcode are removed immediately afterwards.
merge_income_population_postcode = merge_income_population.merge(postcode_sa2, on='sa2', how='left')

# Only `subset` is passed: supplying both `how` and `thresh` (even thresh=None)
# raises TypeError on pandas >= 2.0. The default how="any" on a single-column
# subset removes the same rows.
merge_income_population_postcode = merge_income_population_postcode.dropna(subset=["postcode"])

In [16]:
# Per-postcode roll-up of the SA2-level figures: counts, income, population
# and area are summed, while median_age is a plain (unweighted) mean of the
# SA2 values that share the postcode.
postcode_aggregations = {
    "total_earners": "sum",
    "median_age": "mean",
    "income_sum": "sum",
    "2021_population": "sum",
    "km2": "sum",
}

# as_index=False keeps `postcode` as an ordinary column on a fresh RangeIndex.
agg_income_population_postcode = (
    merge_income_population_postcode
    .groupby("postcode", as_index=False)
    .agg(postcode_aggregations)
)

In [17]:
# Display the per-postcode aggregate table as the cell output.
agg_income_population_postcode

Unnamed: 0,postcode,total_earners,median_age,income_sum,2021_population,km2
0,0800,5632,33.000000,420609031,7679.0,3.2
1,0810,21932,39.583333,1574969237,36216.0,24.3
2,0812,11443,42.000000,801236575,19888.0,12.0
3,0820,15219,39.555556,1235996605,22877.0,67.8
4,0822,21869,40.142857,1425664118,61406.0,248437.3
...,...,...,...,...,...,...
2416,7466,1985,45.000000,115705864,4373.0,3931.6
2417,7467,1985,45.000000,115705864,4373.0,3931.6
2418,7468,1985,45.000000,115705864,4373.0,3931.6
2419,7469,1985,45.000000,115705864,4373.0,3931.6


<h3>2.3 Join "agg_income_population_postcode" and "merchant_consumer_info" (internal data)</h3>

In [18]:
# Load the internal merchant/consumer transaction data from the curated folder.
# The dataset name on disk is spelled "mechant_consumer_info".
merchant_consumer_info = pd.read_parquet(f"{curated_data}mechant_consumer_info")

In [19]:
# Left-pad postcodes with zeros to four characters so they line up with the
# zero-padded `postcode` values (e.g. "0800") on the ABS side of the join.
padded_postcodes = merchant_consumer_info["consumer_postcode"].str.rjust(4, fillchar="0")
merchant_consumer_info["consumer_postcode"] = padded_postcodes

In [20]:
# Right join: every transaction row is kept, and the ABS columns are left null
# where the consumer's postcode has no match in the aggregated ABS table.
final_table = pd.merge(
    agg_income_population_postcode,
    merchant_consumer_info,
    how='right',
    left_on='postcode',
    right_on='consumer_postcode',
)

# Number of distinct consumer postcodes that found no ABS match.
rows_without_abs = final_table["income_sum"].isnull()
len(final_table.loc[rows_without_abs, "consumer_postcode"].unique())

746

In [21]:
# Display the transaction table enriched with the ABS columns as the cell output.
final_table

Unnamed: 0,postcode,total_earners,median_age,income_sum,2021_population,km2,merchant_name,products,revenue_level,take_rate,dollar_value,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender
0,2021,16590.0,42.00,2.483261e+09,22413.0,5.0,Nullam Enim Sed Incorporated,"[tent, and, awning, shops]",e,0.27,24.224445,2021,7,29,Todd Long,883 Patty Mountains Apt. 285,NSW,2021,Male
1,2021,16590.0,42.00,2.483261e+09,22413.0,5.0,Felis Limited,"[furniture, home, furnishings, and, equipment,...",e,0.18,212.857318,2021,5,2,Todd Long,883 Patty Mountains Apt. 285,NSW,2021,Male
2,3874,2856.0,51.00,1.385853e+08,5556.0,1931.9,Non Lobortis Corporation,"[computer, programming, data, processing, and,...",e,0.36,168.526052,2021,3,11,Anthony Bryant,76494 Derrick Court,VIC,3874,Male
3,2299,32828.0,38.75,2.002030e+09,64000.0,49.8,Eget Metus Eu Institute,"[shoe, shops]",e,0.42,328.258033,2021,7,2,Christopher Rodriguez,30554 Evans Stream Apt. 379,NSW,2299,Male
4,2798,10769.0,46.50,6.337809e+08,19335.0,6122.8,Et Nunc Consulting,"[books, periodicals, and, newspapers]",e,0.16,979.978796,2021,8,15,Sara Allen,92887 Stewart Flat,NSW,2798,Undisclosed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6988071,3201,22248.0,40.50,1.274879e+09,39621.0,32.7,Dictum Ultricies Ltd,"[computer, programming, data, processing, and,...",a,6.43,570.485193,2021,6,19,Michelle Matthews,2887 Joanne Vista,VIC,3201,Female
6988072,3201,22248.0,40.50,1.274879e+09,39621.0,32.7,Ornare Fusce Inc.,"[hobby, toy, and, game, shops]",a,5.98,189.266530,2021,7,20,Michelle Matthews,2887 Joanne Vista,VIC,3201,Female
6988073,3201,22248.0,40.50,1.274879e+09,39621.0,32.7,Orci In Consequat Corporation,"[gift, card, novelty, and, souvenir, shops]",a,6.61,77.808667,2021,7,3,Michelle Matthews,2887 Joanne Vista,VIC,3201,Female
6988074,3201,22248.0,40.50,1.274879e+09,39621.0,32.7,Vel Est Tempor LLP,"[computers, computer, peripheral, equipment, a...",a,5.76,10.768560,2021,6,28,Michelle Matthews,2887 Joanne Vista,VIC,3201,Female


In [24]:
# Persist the enriched transaction table to the curated folder as Parquet.
final_table.to_parquet(f"{curated_data}transact_abs.parquet")