# Preliminary Analysis

BNPL Data timeline: 2021-2-28 to 2022-10-26

In [33]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import* 
from pyspark.sql.functions import regexp_replace, col, trim, split

In [34]:
spark = (
    SparkSession.builder.appName("Preliminary Analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.driver.memory","4G")
    .config("spark.executor.memory","4G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

24/09/03 13:42:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# BNPL dataset

# Table 1

In [35]:
consumer_fraud = spark.read.csv('../data/tables/tables 1/consumer_fraud_probability.csv', header=True, inferSchema=True)
merchant_fraud = spark.read.csv('../data/tables/tables 1/merchant_fraud_probability.csv', header=True, inferSchema=True)
consumer_user_details = spark.read.parquet('../data/tables/tables 1/consumer_user_details.parquet')
tbl_consumer = spark.read.csv('../data/tables/tables 1/tbl_consumer.csv', header=True, inferSchema=True)
tbl_merchants = spark.read.parquet('../data/tables/tables 1/tbl_merchants.parquet')

In [36]:
# split tbl_consumer table
# single column into multiple columns
split_col = split(tbl_consumer['name|address|state|postcode|gender|consumer_id'], r'\|')

# create separate columns for each part
tbl_consumer = tbl_consumer.withColumn('name', split_col.getItem(0)) \
                           .withColumn('address', split_col.getItem(1)) \
                           .withColumn('state', split_col.getItem(2)) \
                           .withColumn('postcode', split_col.getItem(3)) \
                           .withColumn('gender', split_col.getItem(4)) \
                           .withColumn('consumer_id', split_col.getItem(5))

tbl_consumer = tbl_consumer.drop('name|address|state|postcode|gender|consumer_id')

tbl_consumer.show(3)

+----------------+--------------------+-----+--------+------+-----------+
|            name|             address|state|postcode|gender|consumer_id|
+----------------+--------------------+-----+--------+------+-----------+
|Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|      Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
|   Jill Jones MD|  40693 Henry Greens|   NT|     862|Female|    1194530|
+----------------+--------------------+-----+--------+------+-----------+
only showing top 3 rows



# Join all comsumer, merchant table

In [37]:
consumer_table = consumer_fraud.join(consumer_user_details, on="user_id", how="inner")
consumer_table = consumer_table.join(tbl_consumer, on="consumer_id", how="inner")
consumer_table.show(10)

+-----------+-------+--------------+------------------+-----------------+--------------------+-----+--------+-----------+
|consumer_id|user_id|order_datetime| fraud_probability|             name|             address|state|postcode|     gender|
+-----------+-------+--------------+------------------+-----------------+--------------------+-----+--------+-----------+
|    1174371|     95|    2021-09-22|10.950213110987248|      Linda Burns|76786 Stephanie I...|  NSW|    2352|     Female|
|    1174371|     95|    2021-11-03| 9.077685805360991|      Linda Burns|76786 Stephanie I...|  NSW|    2352|     Female|
|     921339|    152|    2021-12-17|14.821132072309535|       Tina Clark|     637 Daniel View|  NSW|    2161|     Female|
|     612215|    275|    2021-06-04| 12.90435899477071|Dr. Nicholas Hill|36154 Rivera Neck...|  VIC|    3147|Undisclosed|
|    1493354|    378|    2021-10-24|15.080740281581361| William Sullivan|716 Samuel Rapids...|  VIC|    3533|       Male|
|    1493354|    378|   

In [38]:
merchant_table = merchant_fraud.join(tbl_merchants, on="merchant_abn", how="inner")
merchant_table.show(10)

+------------+--------------+------------------+--------------------+--------------------+
|merchant_abn|order_datetime| fraud_probability|                name|                tags|
+------------+--------------+------------------+--------------------+--------------------+
| 11149063370|    2022-02-25| 51.01538421455241|     Et Arcu Limited|([art dealers and...|
| 11149063370|    2021-11-14|52.407803322764764|     Et Arcu Limited|([art dealers and...|
| 11149063370|    2021-08-28| 56.43761254995139|     Et Arcu Limited|([art dealers and...|
| 11470993597|    2021-09-28| 63.37734364737917|      Sed Associates|((watch, clock, a...|
| 11590404675|    2021-12-21|29.607818240092094|         Arcu Sed PC|((antique shops -...|
| 14530561097|    2021-09-15| 80.80054474543395|        Duis At Inc.|[[jewelry, watch,...|
| 15043504837|    2021-10-08|25.054391991473924|   Odio Incorporated|([jewelry, watch,...|
| 15043504837|    2021-12-14| 26.12523097610844|   Odio Incorporated|([jewelry, watch,...|

### Consumer table preprocessing

In [39]:
# check for invalid fraud probabilities
invalid_fraud_prob = consumer_table.filter((col("fraud_probability") < 0) | (col("fraud_probability") > 100))
invalid_fraud_prob.show(truncate=False)

+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+
|consumer_id|user_id|order_datetime|fraud_probability|name|address|state|postcode|gender|
+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+
+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+



### Merchant table preprocessing

In [40]:
# convert all string to lowercase
merchant_table = merchant_table.withColumn("name", lower(col("name"))) \
                               .withColumn("tags", lower(col("tags")))



In [41]:
# convert all brackets to []

# replace '(' with '['
merchant_table = merchant_table.withColumn("tags_converted", regexp_replace(col("tags"), r'\(', '['))

# replace ')' with ']'
merchant_table = merchant_table.withColumn("tags_converted", regexp_replace(col("tags_converted"), r'\)', ']'))



In [42]:
# split the elements by '], [' to get the three parts
split_col = split(col("tags_converted"), r'\], \[')

# clean up each part and assign them to separate columns
merchant_table = merchant_table.withColumn("category", trim(regexp_replace(split_col.getItem(0), r'^\[|\]$', ''))) \
                               .withColumn("subcategory", trim(regexp_replace(split_col.getItem(1), r'^\[|\]$', ''))) \
                               .withColumn("take_rate", trim(regexp_replace(split_col.getItem(2), r'^\[take rate: |\]$', '')))

# keep only numeric values
merchant_table = merchant_table.withColumn("category", regexp_replace(col("category"), r'^\[|\]$', ''))
merchant_table = merchant_table.withColumn("take_rate", regexp_replace(col("take_rate"), r'[^\d.]+', ''))

merchant_table.drop('tags', 'tags_converted')

merchant_abn,order_datetime,fraud_probability,name,category,subcategory,take_rate
11149063370,2022-02-25,51.01538421455241,et arcu limited,art dealers and g...,b,4.84
11149063370,2021-11-14,52.40780332276477,et arcu limited,art dealers and g...,b,4.84
11149063370,2021-08-28,56.43761254995139,et arcu limited,art dealers and g...,b,4.84
11470993597,2021-09-28,63.37734364737917,sed associates,"watch, clock, and...",d,1.35
11590404675,2021-12-21,29.607818240092094,arcu sed pc,antique shops - s...,b,4.19
14530561097,2021-09-15,80.80054474543395,duis at inc.,"jewelry, watch, c...",c,1.69
15043504837,2021-10-08,25.054391991473924,odio incorporated,"jewelry, watch, c...",b,4.62
15043504837,2021-12-14,26.12523097610844,odio incorporated,"jewelry, watch, c...",b,4.62
15043504837,2021-08-29,59.77648897297805,odio incorporated,"jewelry, watch, c...",b,4.62
15157368385,2021-12-13,64.2774131928303,tempus non lacini...,artist supply and...,b,3.98


In [43]:
# check for invalid fraud probabilities
invalid_fraud_prob = merchant_table.filter((col("fraud_probability") < 0) | (col("fraud_probability") > 100))
invalid_fraud_prob.show(truncate=False)


+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+
|merchant_abn|order_datetime|fraud_probability|name|tags|tags_converted|category|subcategory|take_rate|
+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+
+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+



# Table 2 3 4 - transactions

In [44]:
# 3 transactions tables
tables_2 = spark.read.parquet('../data/tables/tables 2')
tables_3 = spark.read.parquet('../data/tables/tables 3')
tables_4 = spark.read.parquet('../data/tables/tables 4')

In [45]:
print('number of transactions in table 2 3 4: ', tables_2.count(), tables_3.count(), tables_4.count())


number of transactions in table 2 3 4:  3643266 4508106 6044133


In [46]:
# combine all transactions - 14195505 transactions with no duplicate record
transaction_table = tables_2.union(tables_3).union(tables_4)

In [47]:
transaction_table.show(5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|
|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|
|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [48]:
# Check duplicate transaction records

# group by all columns and count occurrences
duplicates = transaction_table.groupBy(transaction_table.columns).count()

# keep only duplicate records
duplicates = duplicates.filter(col("count") > 1)

# duplicate row
duplicates.show()



+-------+------------+------------+--------+--------------+-----+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|count|
+-------+------------+------------+--------+--------------+-----+
+-------+------------+------------+--------+--------------+-----+



                                                                                

# External dataset

### Location datasets (Suburb and postcode)
1. https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip
2. https://github.com/schappim/australian-postcodes/blob/master/australian-postcodes-2021-04-23.csv

In [61]:
import geopandas as gpd
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

# SA2 dataset
gda = gpd.read_file('../data/tables/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp')

# convert suburb column to lowercase
gda['Suburb'] = gda['SA2_NAME21'].str.lower()

# convert GeoPandas DataFrame to a Pandas DataFrame (dropping the geometry temporarily)
gda_pandas = pd.DataFrame(gda.drop(columns=gda.geometry.name))

# convert the Pandas DataFrame to a PySpark DataFrame
gda_spark = spark.createDataFrame(gda_pandas)



# postcode dataset
postcode = spark.read.csv('../data/tables/australian-postcodes-2021-04-23.csv', header=True)

# convert suburb column to lowercase
postcode = postcode.withColumn("Suburb", lower(col("Suburb")))



# join two dataset on Suburb name
joined_df = gda_spark.join(postcode, on='Suburb', how='inner')

# convert back to GeoPandas and reattach the geometry
result_gda = gpd.GeoDataFrame(joined_df.toPandas(), geometry=gda.geometry)

# select interested columns
result_gda = result_gda[['Suburb', 'Zip', 'geometry']]

result_gda.head(10)


Unnamed: 0,Suburb,Zip,geometry
0,karabar,2620,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,queanbeyan,2620,"POLYGON ((149.21899 -35.36738, 149.218 -35.366..."
2,googong,2620,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,batemans bay,2536,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,batemans bay,2536,"POLYGON ((149.19572 -35.36126, 149.1997 -35.35..."
5,eden,2551,"POLYGON ((149.23125 -35.41517, 149.23142 -35.4..."
6,goulburn,2580,"POLYGON ((149.26183 -35.37044, 149.26359 -35.3..."
7,yass,2582,"POLYGON ((148.67996 -36.99305, 148.52357 -36.9..."
8,young,2594,"POLYGON ((149.07507 -36.24048, 149.07508 -36.2..."
9,kariong,2250,"POLYGON ((148.60438 -36.13514, 148.60447 -36.1..."


### Income dataset (Suburb income and population)
1. income
https://explore.data.abs.gov.au/vis?tm=2021%20income&pg=30&df[ds]=C21_SUA&df[id]=C21_G17_SUA&df[ag]=ABS&df[vs]=1.0.0&pd=2021%2C&dq=3._T._T...&ly[rs]=REGION

2. population 
https://explore.data.abs.gov.au/vis?tm=2021%20population&pg=0&df[ds]=PEOPLE_TOPICS&df[id]=ERP_COMP_LGA2021&df[ag]=ABS&df[vs]=1.0.0&pd=2021%2C&dq=10.LGA2021..A&ly[rw]=REGION&hc[dataflowId]=ERP_COMP_LGA2021&hc[People]=Population

In [66]:
income = spark.read.csv('../data/tables/ABS_C21_G17_SUA_1.0.0_3._T._T....csv', header=True)

In [68]:
income.show(5)

+--------------------+----------+------------------------------------+---------+--------------------+------------------------+--------------------+------------------------+---------+
|            DATAFLOW| SEXP: Sex|INCP: Total personal income (weekly)|AGEP: Age|      REGION: Region|REGION_TYPE: Region Type|        STATE: State|TIME_PERIOD: Time Period|OBS_VALUE|
+--------------------+----------+------------------------------------+---------+--------------------+------------------------+--------------------+------------------------+---------+
|ABS:C21_G17_SUA(1...|3: Persons|                           _T: Total|_T: Total|     5005: Esperance|    SUA: Significant ...|5: Western Australia|                    2021|     9700|
|ABS:C21_G17_SUA(1...|3: Persons|                           _T: Total|_T: Total| 4002: Mount Gambier|    SUA: Significant ...|  4: South Australia|                    2021|    24089|
|ABS:C21_G17_SUA(1...|3: Persons|                           _T: Total|_T: Total|4000:

In [73]:
income = income.select('REGION: Region', 'OBS_VALUE')
income.show(5)

+--------------------+---------+
|      REGION: Region|OBS_VALUE|
+--------------------+---------+
|     5005: Esperance|     9700|
| 4002: Mount Gambier|    24089|
|4000: Not in any ...|   226935|
|2020: Warragul - ...|    34506|
|1021: Morisset - ...|    23078|
+--------------------+---------+
only showing top 5 rows



In [71]:
population = spark.read.csv('../data/tables/ABS_ERP_COMP_LGA2021_1.0.0_10.LGA2021..A.csv', header=True)

In [72]:
population.show(5)

+--------------------+------------------------------+------------------------+-------------------+---------------+------------------------+---------+-----------------------------+------------------------------+--------------------------------+
|            DATAFLOW|POP_COMP: Population Component|REGION_TYPE: Region Type|     REGION: Region|FREQ: Frequency|TIME_PERIOD: Time Period|OBS_VALUE|UNIT_MEASURE: Unit of Measure|OBS_STATUS: Observation Status|OBS_COMMENT: Observation Comment|
+--------------------+------------------------------+------------------------+-------------------+---------------+------------------------+---------+-----------------------------+------------------------------+--------------------------------+
|ABS:ERP_COMP_LGA2...|          10: Estimated Res...|    LGA2021: Local Go...|      53570: Gingin|      A: Annual|                    2021|     5419|                PSNS: Persons|                          NULL|                            NULL|
|ABS:ERP_COMP_LGA2...|  