In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from sklearn.metrics import mean_absolute_error
import sklearn

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

# Summary

### Introduction

Buy Now Pay Later (BNPL) services are a popular method of financing that allows consumers to make purchases and pay for them at a later date. Users pay off what is owed through interest free timely installments and don't have to worry about having the given funds at the time of purchase. Your BNPL firm has recently started their own “pay in 5 installments plan” and is only able to onboard 100 merchants every year for their plan due to limited resources. Choosing 100 merchants at random could cost the company millions of dollars in revenue down the line so it is vital to choose the top 100 merchants available to maximize profits. This is no easy task however and that is where I and my team come in. I’m Aryan Shahi, this is Jai Bretherton, this is Eesha Syed, and this is Minma Herath. We are all bachelor of science students at the University of Melbourne who are majoring in data science and we all accepted to offer our experience to help solve this problem. Our task was to determine the top 100 merchants to be selected by the firm based on given and external data.

### Datasets

##### Given

In [3]:
merchantdf = spark.read.parquet("../data/curated/merchantdf.parquet/")
userdf = spark.read.parquet("../data/tables/consumer_user_details.parquet")
transactiondf = spark.read.parquet("../data/curated/transactiondf.parquet/")
consumerdf = spark.read.option("header","true").csv("../data/tables/tbl_consumer.csv", sep="|")
consumerfrauddf = spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
merchantfrauddf = spark.read.parquet("../data/curated/merchantfrauddf.parquet/")

                                                                                

In [4]:
merchantdf.limit(3)

                                                                                

company_name,tags,merchant_abn,take_rate,revenue_band
Felis Limited,"furniture, home f...",10023283211,0.18,e
Arcu Ac Orci Corp...,"cable, satellite,...",10142254217,4.22,b
Nunc Sed Company,"jewelry, watch, c...",10165489824,4.4,b


In [5]:
userdf.limit(3)

user_id,consumer_id
1,1195503
2,179208
3,1194530


In [6]:
transactiondf.limit(3)

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26


In [7]:
consumerdf.limit(3)

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530


In [8]:
consumerfrauddf.limit(3)

user_id,order_datetime,fraud_probability
3753,2022-02-16,48.85325253622543
9646,2021-09-23,47.83931206340956
243,2021-09-02,50.88971939168309


In [9]:
merchantfrauddf.limit(3)

merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095


##### External

In [10]:
populationdf = pd.read_excel("../data/tables/population.xlsx", sheet_name = 'Table 3', skiprows = 7)
incomedf = spark.read.option("header","false").csv("../data/tables/datasource-AU_Govt_ABS-UoM_AURIN_DB_3abs_personal_income_total_income_sa2_2011_2018.csv")
postcodedf = spark.read.option("header","true").csv("../data/tables/australian_postcodes.csv")
postcode_ratio_df = pd .read_excel("../data/tables/1270055006_CG_POSTCODE_2011_SA2_2011.xls",sheet_name = 'Table 3',skiprows = 5)

In [11]:
populationdf.head(3)

Unnamed: 0,S/T code,S/T name,GCCSA code,GCCSA name,SA4 code,SA4 name,SA3 code,SA3 name,SA2 code,SA2 name,...,no..9,no..10,no..11,no..12,no..13,no..14,no..15,no..16,no..17,no..18
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,New South Wales,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021007.0,Braidwood,...,300.0,352.0,362.0,424.0,328.0,293.0,237.0,123.0,82.0,4330.0
2,1.0,New South Wales,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021008.0,Karabar,...,556.0,590.0,580.0,516.0,412.0,320.0,216.0,159.0,90.0,8546.0


In [12]:
incomedf.limit(3)

22/10/10 01:05:23 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22,_c23,_c24,_c25,_c26,_c27,_c28,_c29,_c30,_c31,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39
FID,fid,sa2_code,sa2_name,earners_persons_2...,earners_persons_2...,earners_persons_2...,earners_persons_2...,earners_persons_2...,earners_persons_2...,earners_persons_2...,median_age_of_ear...,median_age_of_ear...,median_age_of_ear...,median_age_of_ear...,median_age_of_ear...,median_age_of_ear...,median_age_of_ear...,sum_aud_2011_12,sum_aud_2012_13,sum_aud_2013_14,sum_aud_2014_15,sum_aud_2015_16,sum_aud_2016_17,sum_aud_2017_18,median_aud_2011_12,median_aud_2012_13,median_aud_2013_14,median_aud_2014_15,median_aud_2015_16,median_aud_2016_17,median_aud_2017_18,mean_aud_2011_12,mean_aud_2012_13,mean_aud_2013_14,mean_aud_2014_15,mean_aud_2015_16,mean_aud_2016_17,mean_aud_2017_18,wkb_geometry
abs_personal_inco...,74,103031075,Wollangambe - Wol...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-...
abs_personal_inco...,131,107011133,Port Kembla Indus...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MULTIPOLYGON (((-...


In [13]:
postcodedf.limit(3)

id,postcode,locality,state,long,lat,dc,type,status,sa3,sa3name,sa4,sa4name,region,Lat_precise,Long_precise,SA1_MAINCODE_2011,SA1_MAINCODE_2016,SA2_MAINCODE_2016,SA2_NAME_2016,SA3_CODE_2016,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone,phn_code,phn_name,lgaregion,electorate,electoraterating
230,200,ANU,ACT,149.119,-35.2777,,,,,,,,R1,-35.2777,149.119,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
21820,200,Australian Nation...,ACT,149.1189,-35.2777,,,Added 19-Jan-2020,,,,,R1,-35.2776999,149.118527,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,Darwin City,701.0,Darwin,R1,-12.3932794,130.7766611,70101100203,70101100218,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,Northern Territory,Darwin,Solomon,Inner Metropolitan


In [14]:
postcode_ratio_df.head(3)

Unnamed: 0,POSTCODE,POSTCODE.1,SA2_MAINCODE_2011,SA2_NAME_2011,RATIO,PERCENTAGE
0,,,,,,
1,800.0,800.0,701011002.0,Darwin City,1.0,99.999998
2,810.0,810.0,701021010.0,Alawa,0.071997,7.199707


The datasets we were given included user data, fraud data, consumer data, merchant data, and 3 batches of transactional data. The transactional datasets when combined had a total of 1.42 million rows with each row representing a transaction and containing features such as the amount, the datetime, and the merchant/user involved. The user dataset had 499,999 rows with each row representing a user and the features included were user id and consumer id. It was used to join with the consumer dataset which also contained 499,999 rows with each row representing a consumer but had useful features such as the consumers gender, and address. The term consumer and user will be used interchangeably throughout this presentation but will refer to the same thing, a person that is involved in a transaction with any merchant. <br>
The fraud data came from 2 datasets, one for user fraud and the other for merchant fraud. Both had the same features including id, the datetime, and the fraud probability with each row respectively representing a particular user or merchant at a given datetime. The merchant fraud dataset contained 114 rows whilst the consumer fraud dataset contained 34,864 rows. The final dataset we were given was the merchant data where each of the 4026 rows represented the information for a given merchant with the columns being merchant abn, the company name, and their respective tags.
<br>
3 external datasets were also used, the first of which was SA2 population data containing population numbers for each SA2 code. The second dataset was income data by SA2 region. It was obtained from the Australian Bureau of Statistics (ABS) website and consisted of 2289 rows with each row representing a SA2 region and its respective statistics such as income earned and the demographics of the people that earned it for each year starting from 2011 and ending in 2018. The final external dataset used was the postcode and SA2 dataset which allowed for the joining of given and external datasets.

In [15]:
futureuserfrauddf = spark.read.parquet("../data/curated/future_user_fraud.parquet")
futuremerchantfrauddf = spark.read.parquet("../data/curated/future_merchant_fraud.parquet")

In [16]:
futureuserfrauddf.limit(3)

user_id,max_amount,max_transactions,transaction_std,amount_sd
19979,8150.115728134807,9,0.84476618165645,400.16781038916304
23492,4097.470542751887,9,0.8232347161406164,294.7157993790497
12568,2042.0161577505255,8,0.8757538133134091,239.9449439981541


In [17]:
futuremerchantfrauddf.limit(3)

merchant_abn,max_amount,max_transactions,transaction_sd,amount_sd
83412691377,201.92111142420248,57,6.131770235768689,24.483086120347316
73256306726,2074.313458274829,26,3.277427700964838,230.4986067967289
38700038932,5444.634544750903,23,3.5588728058858576,713.0931587735073


### Pre Processing and Outlier Analysis

For the datasets provided, the majority of fields contained valid entries for all transactions and no preprocessing was required. One problematic field however was the dollar amount of transactions, as some transactions were found to have clearly nonsensical dollar values, such as less than a cent. Since we are not given information about the product range of these merchants, we made the safe assumption that a transaction needed a dollar value higher than a cent to be considered valid.
<br>
Outside of this, the only field that required significant preprocessing was the tags field located in the merchant data. This field in fact contained 3 separate pieces of important information: a category for the merchant, a label indicating their revenue band on a scale from a-e, as well as their negotiated take rate with the BNPL firm. This field was therefore split into 3 separate columns.
<br>


The population dataset contained, for each SA2 code, the total population and the population of each age group in intervals of 4, example: 0-4, 5-9, 10-14, etc.. Because these age groups alone don’t have much meaning, custom age groups were created: under 10, adolescent (aged 10-17), young adult (18-34), middle age (35-59), and old (60+). The age ranges selected for these age groups were based on similar popular age groups globally defined and noting the fact that the average retirement age in Australia is 65.
<br>
Using these age groups, it was found that their distribution was almost the same on the state and territory level (show pie charts), with the middle age population being the most dominant followed by old and young adults, except in the territories where the young adults population was larger than old population. Then came the adolescent and under 10 population which had minor differences in their distribution.
<br>
It was found that a total of 38 SA2 regions had a population of zero. Upon closer inspection, these regions were places such as airports, creeks, and other ‘green’ uninhabitable places. On the opposite end, for a large population of young adults, it was found that these places were cities or near the cities. Another find was that there were some places populated entirely by either young adults, middle aged, or old people. Upon analyzing further, the total population of each of these regions was less than or equal to 10, which makes the find understandable.
<br>

An obstacle we faced was the fact that our given data was postcode-based and external data (population and income) was SA2 based. Hence, we needed data that would correspond each postcode to SA2 codes. We found 2 such datasets, however, as a postcode region is larger than a SA2 region, one postcode can have many SA2 codes. So how to accurately assign our external data to our existing data?
<br>
One of our postcode datasets contained ratio, i.e. how the population of a postcode is distributed amongst the SA2 codes. So we could use weighted averaging as opposed to just averaging to get a single value from population/income data for each postcode. However, the postcode ratio dataset contained less correspondences than our postcode dataset without ratio, which would lead to more loss of records when joining, almost 1.5 million to be exact. Hence, we decided that the weighted averaging provided by the postcode ratio dataset would not be worth the loss of these many records. After combining our given data with external data, we had a total of 11,818,811 records.
<br>

The income dataset contained the same few columns for each year ranging from 2011 to 2018. Seeing as the transaction data started from 2021 we removed all columns 2017 and below to get the latest SA2 income data available. Useless columns such as sum of everyone's income were removed and that left us with 5 columns which included SA2 code, number of earners, median age of earners, median income, and mean income. Distributions were made for each feature to see how they varied for the different SA2 regions and based on them only 50 rows were removed. 48 of these were removed because the entire row was filled with nulls and the other 2 were removed because they had implausible median ages. This brought the final dataset down to 2239 entries.
<br>

When inner joining all the given data, the only decrease in the number of rows was when there was a join with the merchants dataframe. The number of rows decreased from 14.1 million to 13.6 million because the merchants dataframe didn’t have the abn’s of some of the merchants that were showing up in the transactional data. When inner joining with external datasets the number of rows lost were 1.8 million because of SA2 mismatches and rows containing null values for all columns. This meant our final dataset consisted of 11.8 million rows.
<br>


We were given two sets of fraud data one for consumer fraud and the other for merchant fraud. We decided to set the threshold to 1% so anything above 1% would be considered fraud and that way we could be confident that all the fraud transactions had been removed. Since the minimum user fraud was 8 percent and the minimum merchant fraud was 18% this meant that all fraud transactions got removed. This only removed 60 thousand rows which was only 0.005% of the overall data so nothing major had been lost, but removing this fraud meant that we were left with transactions that only had 0.01% fraud chance which meant they were almost guaranteed to be accurate. This allowed us to create distributions for both merchants and users in terms of total transactions and total money spent/made in a day which will allow us to predict future fraud. If a user or merchant has 1.5 times their standard deviation in the number of transactions or transaction amounts in a day then they will be considered fraud. It was interesting to find that there were also some duplicate entries in the user fraud dataset.
<br>

This gave us our final dataset to be used in modelling and thus ranking:

In [18]:
finaldf = spark.read.parquet("../data/curated/finaldf.parquet/")
finaldf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-21,80682333501,3146,5651,604753,0.3672339667473312,2a59c978-f760-42d...,SA,Male,Orci Corp.,florists supplies...,4.88,b,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,99478391356,3146,5651,604753,3035.1695642706595,82e100bc-25c2-4e3...,SA,Male,Orci Quis Foundation,"equipment, tool, ...",1.52,c,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,86578477987,3146,5651,604753,61.05946896765003,9e3c8e62-9e8e-4e8...,SA,Male,Leo In Consulting,"watch, clock, and...",6.43,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-14,32361057556,3146,5651,604753,155.3456409871304,e4ff9499-e96d-4e6...,SA,Male,Orci In Consequat...,"gift, card, novel...",6.61,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-16,20445424481,3146,5651,604753,57.66971365811276,4a36f2ed-7bcc-43d...,SA,Male,Amet Industries,digital goods: bo...,6.29,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1


### Modelling

One of our very first ideas we had for the modeling and ranking of merchants was to simply onboard the merchants that had generated the most overall revenue. 

In [20]:
modeldf1 = finaldf.withColumn("year", date_format(col("order_datetime"), "yyyy").cast("long"))
modeldf1 = modeldf1.withColumn("month", date_format(col("order_datetime"), "MM").cast("long"))
modeldf1 = modeldf1.withColumn('profit', round(finaldf.dollar_value * (finaldf.take_rate / 100), 2))

for field in ("postcode","take_rate"):
    modeldf1 = modeldf1.withColumn(
        field,
        col(field).cast('double')
    )

modeldf1 = modeldf1.drop("order_datetime","order_id","company_name","consumer_fraud_%","merchant_fraud_%")

modeldf1 = modeldf1.groupBy("year","month","merchant_abn").agg(
    round(sum("profit"),2).alias("monthly_profit"),
)
modeldf1.limit(3)

                                                                                

year,month,merchant_abn,monthly_profit
2022,10,50315283629,7611.69
2022,9,40279146273,3666.26
2022,5,60956456424,17330.74


However, what we as a group realised is that a simple model such as this would be making the rather significant assumption that once a merchant is onboarded by the firm, every single transaction made with that merchant would now be made with the BNPL technology, which clearly isn’t a very realistic assumption.
 
So to improve on this idea, we decided we would still create this model, but we would also build 2 additional models. In order to determine these other features, we reviewed surveys from [C+R research](https://www.crresearch.com/blog/buy_now_pay_later_statistics) as well as [Bankrate](https://www.bankrate.com/loans/personal-loans/buy-now-pay-later-statistics/) regarding BNPL usage statistics. Some interesting statistics we found was that only 3% of respondents owed over $2 500 currently to BNPL services, and this debt was owed across an average of 3.8 different purchased items. Considering that under the current structure, consumers would only be able to pay back items in exactly 5 installments, we determined that there was likely to be a limit as to how much people would spend in a single BNPL transaction, and hence we would prefer merchants who not only generated large revenue, but did so through many smaller value transactions vs fewer larger transactions. This was the motivation behind our second model being based around the number of transactions for a given merchant.

In [37]:
modeldf2 = finaldf.withColumn("year", date_format(col("order_datetime"), "yyyy").cast("long"))
modeldf2 = modeldf2.withColumn("month", date_format(col("order_datetime"), "MM").cast("long"))

for field in ("postcode","take_rate"):
    modeldf2 = modeldf2.withColumn(
        field,
        col(field).cast('double')
    )

modeldf2 = modeldf2.drop("order_datetime","order_id","company_name","consumer_fraud_%","merchant_fraud_%")

modeldf2 = modeldf2.groupBy("year","month","merchant_abn").agg(
    count("dollar_value").alias("monthly_transactions"),
)

modeldf2.limit(5)

                                                                                

year,month,merchant_abn,monthly_transactions
2022,10,50315283629,1359
2022,9,40279146273,868
2022,5,60956456424,4213
2022,9,10323485998,476
2022,5,94472466107,1070


However, by similar logic, we should also prefer merchants with a wider consumer base, or a merchant that makes a large amount of revenue, from frequent, smaller value transactions that are likely to be from different customers. So while having our third model be based on the number of unique customers for a merchant would be justified, another interesting fact that we found was that amongst certain predefined income ranges, the highest proportion of BNPL users came from the group with average household income between \\$50 000-\\$75 000. Combining these two ideas led to our final model, which was based on the number of unique customers for a given merchant who resides in a postcode with median income between \\$50 000 and \\$75 000.

In [25]:
LOWER_INCOME_THRESHOLD = 50000
UPPER_INCOME_THRESHOLD = 75000
modeldf3 = finaldf.withColumn("year", date_format(col("order_datetime"), "yyyy").cast("long"))
modeldf3 = modeldf3.withColumn("month", date_format(col("order_datetime"), "MM").cast("long"))
modeldf3 = modeldf3.filter((modeldf3.median_income > LOWER_INCOME_THRESHOLD) &
                         (modeldf3.median_income < UPPER_INCOME_THRESHOLD))
                                     
for field in ("postcode","take_rate"):
    modeldf3 = modeldf3.withColumn(
        field,
        col(field).cast('double')
    )

modeldf3 = modeldf3.drop("order_datetime","order_id","company_name","consumer_fraud_%","merchant_fraud_%")
modeldf3 = modeldf3.groupBy("year","month","merchant_abn").agg(
    countDistinct("consumer_id").alias("monthly_middle_customers"),
)
modeldf3

                                                                                

year,month,merchant_abn,monthly_middle_customers
2022,5,94472466107,300
2022,4,16165222459,27
2022,5,60956456424,1107
2022,5,82539239304,24
2021,11,53655334735,7
2021,3,33163130598,34
2021,10,24852446429,2818
2021,4,31585975447,191
2022,9,46987545043,245
2022,4,52160665475,162


Additionally, to improve these ideas further, rather than just derive statistics such as overall revenue based on the previous transactions seen in the data, we aimed to identify any potential trends in these statistics for merchants, and hence use these trends to try and predict, for example the future revenue, of every merchant in every month of 2023. 
 
Such predictions were done through the use of a simple time series regression model, which is essentially just a typical linear regression model, but with the addition of what are called lagged variables. To take the example of our model to predict future revenue, the lagged variables in this time series regression would be the predicted revenue of the merchant for the previous certain amount of months. 

In [32]:
dfp = spark.read.parquet("../data/curated/trans_num_predictions.parquet").toPandas()
print(dfp)
print(mean_absolute_error(dfp["monthly_transactions"],  dfp["prediction"]))

       merchant_abn  int_month  monthly_transactions  merchant_abn_class  \
0       10023283211         19                   146                 0.0   
1       10023283211         20                   173                 0.0   
2       10023283211         21                   164                 0.0   
3       10142254217         19                   149                 1.0   
4       10142254217         20                   130                 1.0   
...             ...        ...                   ...                 ...   
12049   99987905597         20                    15              4016.0   
12050   99987905597         21                     8              4016.0   
12051   99990536339         19                     2              4017.0   
12052   99990536339         20                     1              4017.0   
12053   99990536339         21                     1              4017.0   

                                        merchant_abn_ohe  \
0      (1.0, 0.0, 0.0, 0.0,

In [33]:
dfp = pd.read.parquet("../data/curated/profit_merch_predictions.parquet").toPandas()
print(dfp)
print(mean_absolute_error(dfp["monthly_transactions"],  dfp["prediction"]))

AttributeError: module 'pandas' has no attribute 'read'

In [None]:
dfp = pd.read.parquet("../data/curated/medium_customers_predictions.parquet").toPandas()
print(dfp)
print(mean_absolute_error(dfp["monthly_transactions"],  dfp["prediction"]))

This leaves the important question of how many months should we use to predict the next month’s value? On one hand, having more lags means a smaller sample size of months to build our model on, as for example, the month of February 2021 doesn’t have any data from a previous month to use as a feature, meaning it is of no use to us. And so the more lags we add, the more months we discard. However, on the other hand, more lags means our model can more accurately predict certain trends over time. Extended analysis (see appropriate notebooks for further analysis) determined that for a large majority of merchants, two lags was seen as being the best of both worlds.

### Results