# The following code is for external data processing

In [1]:
import re
import numpy as np
import pandas as pd
import re
import geopandas as gpd
import os

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 Preprocessing")
    .config("spark.driver.memory", '4g')
    .config("spark.executor.memory", '8g')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.parquet.enableVectorizedReader","false")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
    .getOrCreate()
)

22/09/22 22:26:46 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.31.183.205 instead (on interface eth0)
22/09/22 22:26:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/22 22:26:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/22 22:26:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/22 22:26:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Mapping Postcode to ABS Postal Areas

In [None]:
def postcode_to_str(col):
    return col.astype(str).str.zfill(4)

In [None]:
# Read in data
postal_areas_gdf = gpd.read_file('../data/raw/postcodes/abs_postal_areas.zip')
consumer_details_df = pd.read_parquet('../data/curated/cleaned_consumers.parquet')
postcode_df = pd.read_csv('../data/raw/postcodes/postcodes.csv').drop_duplicates('postcode')

consumer_details_df['postcode'] = postcode_to_str(consumer_details_df['postcode'])
postcode_df['postcode'] = postcode_to_str(postcode_df['postcode'])

# Convert postcode dataframe to geodataframe
postcode_gdf = gpd.GeoDataFrame(
    postcode_df, geometry=gpd.points_from_xy(postcode_df['long'], postcode_df['lat'])
)
postcode_gdf.crs = postal_areas_gdf.crs

# Get list of postcodes not listed as abs postal areas and filter geodataframe to just these postcodes
unmapped = consumer_details_df[~consumer_details_df['postcode'].astype(str).str.zfill(4).isin(postal_areas_gdf['POA_CODE21'])]['postcode'].unique()
postcodes_gdf = postcode_gdf[postcode_gdf['postcode'].isin(unmapped)]

# Spatially join unmapped postcodes and abs postal areas
postcode_poa_gdf = postcodes_gdf.sjoin(postal_areas_gdf, how = 'inner')

# Remove and rename columns 
postcode_poa_df = postcode_poa_gdf[['postcode', 'POA_CODE21']]
postcode_poa_df = postcode_poa_df.rename(columns = {'POA_CODE21' : 'poa'})

# Combine abs mapped postcodes with unmapped postcodes
postcode_poa_df = pd.concat([postcode_poa_df, postal_areas_gdf[['POA_CODE21', 'POA_CODE21']].set_axis(['postcode', 'poa'], axis = 1)], ignore_index = True).reset_index(drop = True)

In [None]:
output_dir = '../data/curated/census/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

postcode_poa_df.to_parquet(output_dir + 'postcode_poa.parquet', index = False)

All but 2 postcodes could be mapped to abs postal areas. Niether of these could be found in the Australia post website. https://postcodes-australia.com/postcodes/6958 says 6958 is a Western Australian postcode reserved for non standard use<br><br>
This results in the removal of 317 consumers from the dataset

In [None]:
removed_consumers = consumer_details_df[~consumer_details_df['postcode'].astype(str).str.zfill(4).isin(postcode_poa_df['postcode'])]
len(removed_consumers), removed_consumers['postcode'].unique()

# Age/Gender Weighting

Read in data

In [3]:
transaction_sdf = spark.read.parquet(
    '../data/tables/transactions_20210228_20210827_snapshot/'
).union(
    spark.read.parquet(
        '../data/tables/transactions_20210828_20220227_snapshot/'
    )
).union(
    spark.read.parquet(
        '../data/tables/transactions_20220228_20220828_snapshot/'
    )
)

id_df = pd.read_parquet(
    '../data/tables/consumer_user_details.parquet'
)

consumer_df = pd.read_parquet(
    '../data/curated/cleaned_consumers.parquet'
)

age_df = pd.read_parquet(
    '../data/curated/census/age_data.parquet'
)

postcode_poa_df = pd.read_parquet(
    '../data/curated/census/postcode_poa.parquet'
)

                                                                                

Generate population by age intervals (18-24, 25-34, 35-44, 45-54, 55-64, 65+)

In [4]:
cols = []
for start_yr, end_yr in zip([18,25,35,45,55], [25,35,45,55,65]):
    for g in ['m', 'f']:
        col = f'age_{start_yr}_{end_yr - 1}_{g}'
        cols.append(col)
        age_df[col] = age_df.filter(regex = '|'.join([f'(age_yr_{x}_{g})'for x in range(start_yr,end_yr)])).astype(int).sum(axis = 1)

for g in ['m', 'f']:
    col = f'age_65+_{g}'
    cols.append(col)
    age_df[col] = age_df.filter(regex = '|'.join([f'(age_yr_{x}_{x+4}_{g})'for x in range(65,100, 5)] + ['age_yr_100_yr_over_[mf]'])).astype(int).sum(axis = 1)


age_df = age_df[['poa'] + cols].melt(id_vars = 'poa')
age_df['gender'] = age_df['variable'].apply(lambda x : 'Male' if x[-1] == 'm' else 'Female')
age_df['variable'] = age_df['variable'].apply(lambda x : x[:-2])
age_df = pd.pivot_table(age_df, values = 'value', index =['poa', 'gender'], columns='variable')


idx = pd.IndexSlice
aus_age_m = age_df.loc[idx[:, 'Male'], :].sum()
aus_age_f = age_df.loc[idx[:, 'Female'], :].sum()

aus_prob_m = aus_age_m.sum()/(aus_age_m.sum() + aus_age_f.sum())

aus_age_m *= 1/aus_age_m.sum()
aus_age_f *= 1/aus_age_f.sum()


age_df = age_df.apply(lambda x : x/x.sum(), axis = 1)
age_df = age_df.fillna(age_df.mean())
age_df = age_df.reset_index()


Add ABS postal area (poa) to each consumer

In [5]:
consumer_df = pd.merge(consumer_df, postcode_poa_df, how = 'inner', on = 'postcode')

Create dataframe grouped by merchant and postal area (poa) with propn of customers for each corresponding postcode

In [6]:

merchant_df = transaction_sdf.join(
    spark.createDataFrame(id_df),
    on = 'user_id'
).join(
    spark.createDataFrame(consumer_df),
    on = 'consumer_id'
).groupBy(
    'merchant_abn', 'poa'
).count().toPandas()



[Stage 7:>                  (0 + 8) / 8][Stage 8:>                  (0 + 0) / 8]

22/09/22 22:28:05 WARN TaskSetManager: Stage 8 contains a task of very large size (4035 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [7]:
def get_propn(df):
    df['propn'] = df['count']/df['count'].sum()
    return df
merchant_df = merchant_df.groupby('merchant_abn').apply(get_propn)

Join to age data and scale the values according to the proportion of customers from each postcode

In [8]:

# Joins merchantsand poa data with abs data for population by age for each poa
merchant_df = pd.merge(merchant_df, age_df, on = 'poa')


In [9]:
# Creates scaled version of each population metric by age
for col in age_df.columns:
    if col.startswith('age'):
        merchant_df[col] = merchant_df[col]*merchant_df['propn']

merchant_df = merchant_df.drop(['poa', 'count', 'propn'], axis = 1).groupby(['merchant_abn', 'gender']).sum()


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 34312)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/root/.virtualenvs/ads_proj2/lib/python3.10/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/root/.virtualenvs/ads_proj2/lib/python3.10/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/root/.virtualenvs/ads_proj2/lib/python3.10/site-packages/pyspark/accumulators.py", line 257, in accum_u

The final result is an estimate of the probability that a given customer comes from each age group (assumes that all customers are above the age of 18)

In [10]:
merchant_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,age_18_24,age_25_34,age_35_44,age_45_54,age_55_64,age_65+
merchant_abn,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10023283211,Female,0.090436,0.155516,0.155512,0.1675,0.175922,0.255115


Calculating weights

In [11]:
# Constants
prob_bnpl = 0.05
prob_female_g_bnpl = 0.57
prob_male_g_bnpl = 0.43

prob_age_g_bnpl = pd.Series(data = {'age_18_24' : 0.26, 'age_25_34' : 0.35, 'age_35_44' : 0.2, 'age_45_54' : 0.12, 'age_55_64' : 0.04,'age_65+' : 0.01})

prob_bnpl_g_age_m = prob_age_g_bnpl*prob_male_g_bnpl*prob_bnpl/aus_age_m/aus_prob_m
prob_bnpl_g_age_f = prob_age_g_bnpl*prob_female_g_bnpl*prob_bnpl/aus_age_f/(1-aus_prob_m)

In [12]:
def get_prob_bnpl(row):
    if row.name[1] == 'Male':
        return (row*prob_bnpl_g_age_m).sum()
    if row.name[1] == 'Female':
        return (row*prob_bnpl_g_age_f).sum()

merchant_df['weight'] = merchant_df.apply(get_prob_bnpl, axis = 1)
merchant_df = merchant_df[['weight']]

In [16]:
merchant_df

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
merchant_abn,gender,Unnamed: 2_level_1
10023283211,Female,0.049574
10023283211,Male,0.038421
10142254217,Female,0.049346
10142254217,Male,0.038263
10165489824,Female,0.051551
...,...,...
99987905597,Male,0.037582
99989036621,Female,0.046277
99989036621,Male,0.039041
99990536339,Female,0.046267


In [19]:
merchant_df.to_parquet('../data/curated/merchant_gender_weights.parquet')