# Preliminary Analysis

BNPL Data timeline: 2021-2-28 to 2022-10-26

In [63]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import* 
from pyspark.sql.functions import regexp_replace, col, trim, split

In [64]:
spark = (
    SparkSession.builder.appName("Preliminary Analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.driver.memory","4G")
    .config("spark.executor.memory","4G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Table 1

In [65]:
consumer_fraud = spark.read.csv('../data/tables/tables 1/consumer_fraud_probability.csv', header=True, inferSchema=True)
merchant_fraud = spark.read.csv('../data/tables/tables 1/merchant_fraud_probability.csv', header=True, inferSchema=True)
consumer_user_details = spark.read.parquet('../data/tables/tables 1/consumer_user_details.parquet')
tbl_consumer = spark.read.csv('../data/tables/tables 1/tbl_consumer.csv', header=True, inferSchema=True)
tbl_merchants = spark.read.parquet('../data/tables/tables 1/tbl_merchants.parquet')

In [66]:
# split tbl_consumer table
# single column into multiple columns
split_col = split(tbl_consumer['name|address|state|postcode|gender|consumer_id'], r'\|')

# create separate columns for each part
tbl_consumer = tbl_consumer.withColumn('name', split_col.getItem(0)) \
                           .withColumn('address', split_col.getItem(1)) \
                           .withColumn('state', split_col.getItem(2)) \
                           .withColumn('postcode', split_col.getItem(3)) \
                           .withColumn('gender', split_col.getItem(4)) \
                           .withColumn('consumer_id', split_col.getItem(5))

tbl_consumer = tbl_consumer.drop('name|address|state|postcode|gender|consumer_id')

tbl_consumer.show(3)

+----------------+--------------------+-----+--------+------+-----------+
|            name|             address|state|postcode|gender|consumer_id|
+----------------+--------------------+-----+--------+------+-----------+
|Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|      Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
|   Jill Jones MD|  40693 Henry Greens|   NT|     862|Female|    1194530|
+----------------+--------------------+-----+--------+------+-----------+
only showing top 3 rows



# Join all comsumer, merchant table

In [67]:
consumer_table = consumer_fraud.join(consumer_user_details, on="user_id", how="inner")
consumer_table = consumer_table.join(tbl_consumer, on="consumer_id", how="inner")
consumer_table.show(10)

+-----------+-------+--------------+------------------+-----------------+--------------------+-----+--------+-----------+
|consumer_id|user_id|order_datetime| fraud_probability|             name|             address|state|postcode|     gender|
+-----------+-------+--------------+------------------+-----------------+--------------------+-----+--------+-----------+
|    1174371|     95|    2021-09-22|10.950213110987248|      Linda Burns|76786 Stephanie I...|  NSW|    2352|     Female|
|    1174371|     95|    2021-11-03| 9.077685805360991|      Linda Burns|76786 Stephanie I...|  NSW|    2352|     Female|
|     921339|    152|    2021-12-17|14.821132072309535|       Tina Clark|     637 Daniel View|  NSW|    2161|     Female|
|     612215|    275|    2021-06-04| 12.90435899477071|Dr. Nicholas Hill|36154 Rivera Neck...|  VIC|    3147|Undisclosed|
|    1493354|    378|    2021-10-24|15.080740281581361| William Sullivan|716 Samuel Rapids...|  VIC|    3533|       Male|
|    1493354|    378|   

In [68]:
merchant_table = merchant_fraud.join(tbl_merchants, on="merchant_abn", how="inner")
merchant_table.show(10)

+------------+--------------+------------------+--------------------+--------------------+
|merchant_abn|order_datetime| fraud_probability|                name|                tags|
+------------+--------------+------------------+--------------------+--------------------+
| 11149063370|    2022-02-25| 51.01538421455241|     Et Arcu Limited|([art dealers and...|
| 11149063370|    2021-11-14|52.407803322764764|     Et Arcu Limited|([art dealers and...|
| 11149063370|    2021-08-28| 56.43761254995139|     Et Arcu Limited|([art dealers and...|
| 11470993597|    2021-09-28| 63.37734364737917|      Sed Associates|((watch, clock, a...|
| 11590404675|    2021-12-21|29.607818240092094|         Arcu Sed PC|((antique shops -...|
| 14530561097|    2021-09-15| 80.80054474543395|        Duis At Inc.|[[jewelry, watch,...|
| 15043504837|    2021-10-08|25.054391991473924|   Odio Incorporated|([jewelry, watch,...|
| 15043504837|    2021-12-14| 26.12523097610844|   Odio Incorporated|([jewelry, watch,...|

### Consumer table preprocessing

In [77]:
# check for invalid fraud probabilities
invalid_fraud_prob = consumer_table.filter((col("fraud_probability") < 0) | (col("fraud_probability") > 100))
invalid_fraud_prob.show(truncate=False)

+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+
|consumer_id|user_id|order_datetime|fraud_probability|name|address|state|postcode|gender|
+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+
+-----------+-------+--------------+-----------------+----+-------+-----+--------+------+



### Merchant table preprocessing

In [69]:
# convert all string to lowercase
merchant_table = merchant_table.withColumn("name", lower(col("name"))) \
                               .withColumn("tags", lower(col("tags")))



In [70]:
# convert all brackets to []

# replace '(' with '['
merchant_table = merchant_table.withColumn("tags_converted", regexp_replace(col("tags"), r'\(', '['))

# replace ')' with ']'
merchant_table = merchant_table.withColumn("tags_converted", regexp_replace(col("tags_converted"), r'\)', ']'))



In [71]:


# split the elements by '], [' to get the three parts
split_col = split(col("tags_converted"), r'\], \[')

# clean up each part and assign them to separate columns
merchant_table = merchant_table.withColumn("category", trim(regexp_replace(split_col.getItem(0), r'^\[|\]$', ''))) \
                               .withColumn("subcategory", trim(regexp_replace(split_col.getItem(1), r'^\[|\]$', ''))) \
                               .withColumn("take_rate", trim(regexp_replace(split_col.getItem(2), r'^\[take rate: |\]$', '')))

# keep only numeric values
merchant_table = merchant_table.withColumn("category", regexp_replace(col("category"), r'^\[|\]$', ''))
merchant_table = merchant_table.withColumn("take_rate", regexp_replace(col("take_rate"), r'[^\d.]+', ''))

merchant_table.drop('tags', 'tags_converted')

merchant_abn,order_datetime,fraud_probability,name,category,subcategory,take_rate
11149063370,2022-02-25,51.01538421455241,et arcu limited,art dealers and g...,b,4.84
11149063370,2021-11-14,52.40780332276477,et arcu limited,art dealers and g...,b,4.84
11149063370,2021-08-28,56.43761254995139,et arcu limited,art dealers and g...,b,4.84
11470993597,2021-09-28,63.37734364737917,sed associates,"watch, clock, and...",d,1.35
11590404675,2021-12-21,29.607818240092094,arcu sed pc,antique shops - s...,b,4.19
14530561097,2021-09-15,80.80054474543395,duis at inc.,"jewelry, watch, c...",c,1.69
15043504837,2021-10-08,25.054391991473924,odio incorporated,"jewelry, watch, c...",b,4.62
15043504837,2021-12-14,26.12523097610844,odio incorporated,"jewelry, watch, c...",b,4.62
15043504837,2021-08-29,59.77648897297805,odio incorporated,"jewelry, watch, c...",b,4.62
15157368385,2021-12-13,64.2774131928303,tempus non lacini...,artist supply and...,b,3.98


In [72]:
# check for invalid fraud probabilities
invalid_fraud_prob = merchant_table.filter((col("fraud_probability") < 0) | (col("fraud_probability") > 100))
invalid_fraud_prob.show(truncate=False)


+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+
|merchant_abn|order_datetime|fraud_probability|name|tags|tags_converted|category|subcategory|take_rate|
+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+
+------------+--------------+-----------------+----+----+--------------+--------+-----------+---------+



# Table 2 3 4 - transactions

In [73]:
# 3 transactions tables
tables_2 = spark.read.parquet('../data/tables/tables 2')
tables_3 = spark.read.parquet('../data/tables/tables 3')
tables_4 = spark.read.parquet('../data/tables/tables 4')

In [74]:
print('number of transactions in table 2 3 4: ', tables_2.count(), tables_3.count(), tables_4.count())


number of transactions in table 2 3 4:  3643266 4508106 6044133


In [75]:
# combine all transactions - 14195505 transactions with no duplicate record
transaction_table = tables_2.union(tables_3).union(tables_4)

In [76]:
# Check duplicate transaction records

# group by all columns and count occurrences
duplicates = transaction_table.groupBy(transaction_table.columns).count()

# keep only duplicate records
duplicates = duplicates.filter(col("count") > 1)

# duplicate row
duplicates.show()

                                                                                

+-------+------------+------------+--------+--------------+-----+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|count|
+-------+------------+------------+--------+--------------+-----+
+-------+------------+------------+--------+--------------+-----+



                                                                                