# Run "ETL" script

In [111]:
# Open Spark session 
from pyspark.sql import SparkSession, functions as F


# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [112]:
# Read the files 
transactions_sample = spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot')
transactions_sample2 = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
transactions_sample3 = spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot')
transactions_sample.unionByName(transactions_sample2, True)
transactions_sample.unionByName(transactions_sample3, True)
consumer_details = spark.read.parquet('../data/tables/consumer_user_details.parquet')
merchants_tbl = spark.read.parquet('../data/tables/tbl_merchants.parquet')
customer_tbl = spark.read.option("delimiter", "|").option("header",True).csv('../data/tables/tbl_consumer.csv')

                                                                                

In [113]:
merchants = merchants_tbl.toPandas()

In [114]:
import re
# this function standardises the tags attribute, creating a list with the 'description', 'revenue band' and 'BNPL service charge'
def tag_extract(tag_string): 
    # first need to preprocess
    string =  re.sub('\[','(', tag_string.lower())
    string = re.sub('\]',')', string)
    # break the string into sections
    string_cut = string.split('),')
    new_string = []
    # first extract the description 
    new_string.append(str(string_cut[0].strip('((')))
    # second extract the band
    new_string.append(str(re.search(r'[a-z]',string_cut[1]).group()))
    # finally the take rate
    new_string.append(float(re.search(r'[0-9]+\.[0-9]+',string_cut[2]).group()))
    return(new_string)
################
# now we can run the algorithm
tags = merchants['tags']
processed_tags = []
for i in tags:
    processed_tags.append(tag_extract(i))

In [115]:
import pandas as pd
merchant_tbl = pd.DataFrame(processed_tags, columns=('Description', 'Earnings_Class', 'BNPL_Fee'))
merchant_tbl = pd.concat([merchants, merchant_tbl], axis=1)
# drop the tags column 
merchant_tbl.drop(columns='tags', inplace=True)

In [116]:
# and convert back to spark dataframe 
merchants_tbl = spark.createDataFrame(merchant_tbl)

In [117]:
# This could be further expanded in breaking the discription up further

In [119]:
customer_tbl = customer_tbl.join(consumer_details, ['consumer_id'])

In [130]:
full_dataset = transactions_sample.join(customer_tbl, ['user_id'])
merchants_tbl = merchants_tbl.withColumnRenamed('name','company_name')
full_dataset = full_dataset.join(merchants_tbl, ['merchant_abn'])

In [131]:
# lets add a day (mon,...), weekly & monthly attribute
import pyspark.sql.functions as F
full_dataset = full_dataset.withColumn('Day', F.dayofweek('order_datetime'))
full_dataset = full_dataset.withColumn('Month', F.month('order_datetime'))
# now we can also add the bnpl revenue from a transaction 
full_dataset = full_dataset.withColumn('BNPL_Revenue', F.col('dollar_value') * 0.01 * F.col('BNPL_Fee'))
full_dataset.createOrReplaceTempView('data')
# we can remove name, location and customerID for now, due to being unnnesesary attributes (although company_name could also be removed)
full_dataset = spark.sql("""
select merchant_abn, user_id, dollar_value, order_id, order_datetime, state, postcode, gender, company_name, 
        Description, Earnings_Class, BNPL_Fee, BNPL_Revenue, Day, Month, weekofyear(order_datetime) as weekofyear from data
""")

# Standardisation of Customers 
The objective of this section is to verify if a customer's details have been recorded correctly

In [132]:
import pandas as pd
# dataset link 
link = 'https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv'
postcodes = pd.read_csv("../data/tables/australian_postcodes.csv")

In [133]:
postcodes['postcode'] = postcodes['postcode'].astype('str')

In [134]:
cust = customer_tbl.toPandas()

In [135]:
cust

Unnamed: 0,consumer_id,name,address,state,postcode,gender,user_id
0,1195503,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1
1,179208,Mary Smith,3764 Amber Oval,NSW,2782,Female,2
2,1194530,Jill Jones MD,40693 Henry Greens,NT,862,Female,3
3,154128,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,4
4,712975,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,5
...,...,...,...,...,...,...,...
499994,1385608,Jessica Avila,508 Miranda Overpass Apt. 218,QLD,4400,Female,499995
499995,1466964,Steven Thornton,7913 Schwartz Mission Suite 483,VIC,3097,Undisclosed,499996
499996,1253484,Christy Smith,5681 Zachary Mountain Apt. 060,NSW,2756,Undisclosed,499997
499997,175005,Donna Sutton,54140 Jacob Point,VIC,3989,Female,499998


In [157]:
postcodes_sub = postcodes[['postcode', 'state', 'sa3name', 'sa4name', 'SA3_NAME_2016', 'electoraterating', 'electorate']]

In [158]:
# First imputate missing values
postcodes_sub["sa3name"] = postcodes_sub.groupby("state")["sa3name"].transform(lambda x: x.fillna(x.mode()))
postcodes_sub["sa4name"] = postcodes_sub.groupby("state")["sa4name"].transform(lambda x: x.fillna(x.mode()))
postcodes_sub["SA3_NAME_2016"] = postcodes_sub.groupby("state")["SA3_NAME_2016"].transform(lambda x: x.fillna(x.mode()))
postcodes_sub["electoraterating"] = postcodes_sub.groupby("state")["electoraterating"].transform(lambda x: x.fillna(x.mode()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postcodes_sub["sa3name"] = postcodes_sub.groupby("state")["sa3name"].transform(lambda x: x.fillna(x.mode()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postcodes_sub["sa4name"] = postcodes_sub.groupby("state")["sa4name"].transform(lambda x: x.fillna(x.mode()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

In [162]:
postcodes_sub.groupby("state")["sa3name"].transform(lambda x: x.fillna(x.mode()))

0                        Woden
1                          NaN
2                  Darwin City
3                  Darwin City
4                  Darwin City
                 ...          
18437    Brisbane Inner - West
18438    Brisbane Inner - West
18439                   Nundah
18440         Surfers Paradise
18441           Melbourne City
Name: sa3name, Length: 18442, dtype: object

In [159]:
postcodes_sub.isna().sum()

postcode               0
state                  0
sa3name              397
sa4name              397
SA3_NAME_2016        177
electoraterating    1502
electorate             0
dtype: int64

In [186]:
import numpy as np

In [225]:
postcodes_agg = postcodes_sub.groupby(['state', 'postcode'], as_index=False).agg(sa3name = pd.NamedAgg('sa3name',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 sa4name = pd.NamedAgg('sa4name',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 electoraterating = pd.NamedAgg('electoraterating',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 SA3_NAME_2016 = pd.NamedAgg('SA3_NAME_2016',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 electorate = pd.NamedAgg('electorate',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN)
                                                 )

In [226]:
# Imputate
imputation = postcodes_agg.groupby('state', as_index=False).agg(sa3name_mode = pd.NamedAgg('sa3name',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 sa4name_mode = pd.NamedAgg('sa4name',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 electoraterating_mode = pd.NamedAgg('electoraterating',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 SA3_NAME_2016_mode = pd.NamedAgg('SA3_NAME_2016',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN),
                                                 electorate_mode = pd.NamedAgg('electorate',lambda x: pd.Series.mode(x) if len(pd.Series.mode(x))>0 else np.NaN)
                                                 )

In [230]:
postcodes_agg = postcodes_agg.merge(imputation, on='state', how='left')

In [231]:
postcodes_agg.sa3name.fillna(postcodes_agg.sa3name_mode, inplace=True)
postcodes_agg.sa4name.fillna(postcodes_agg.sa4name_mode, inplace=True)
postcodes_agg.electoraterating.fillna(postcodes_agg.electoraterating_mode, inplace=True)
postcodes_agg.SA3_NAME_2016.fillna(postcodes_agg.SA3_NAME_2016_mode, inplace=True)
postcodes_agg.electorate.fillna(postcodes_agg.electorate_mode, inplace=True)

In [234]:
postcodes_agg = postcodes_agg.drop(['sa3name_mode', 'sa4name_mode', 'electoraterating_mode', 'SA3_NAME_2016_mode', 'electorate_mode'], axis = 1)

In [236]:
full_cust = cust.merge(postcodes_agg, on=['postcode', 'state'], how='left')