## Transaction Data Visualisation

In [23]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import lit, when

# Build (or reuse) the Spark session shared by every cell in this notebook.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "4g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("BNPL Project")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

### Load Dataset

In [41]:
# Read the curated parquet data and preview its first ten rows.
curated_path = "../../data/curated/process_data.parquet/"
dataset = spark.read.parquet(curated_path)
dataset.limit(10)


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,state,postcode,gender,merchant_name,tag,revenue,rate,category,subcategory,merchant_fraud_probability,user_fraud_probability,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg
32709545238,561,14108,361.79,7f2cbe00-1d8c-49b...,2021-08-21,NSW,2293,Male,Tempor Est Founda...,"stationery, offic...",c,3.04,retail_and_wholes...,others_retailing,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
19839532017,561,14108,157.0,01a35e05-f763-4a1...,2021-08-21,NSW,2293,Male,Pellentesque Habi...,"cable, satellite,...",b,4.94,info_media_and_te...,,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
14639489823,561,14108,217.71,21ea8392-7fc6-467...,2021-08-21,NSW,2293,Male,Auctor Mauris Inc...,"furniture, home f...",a,6.36,retail_and_wholes...,household_goods_r...,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
69666829657,561,14108,178.13,44a8f7af-113c-4e0...,2021-08-21,NSW,2293,Male,Sem Ut Institute,bicycle shops - s...,c,2.86,retail_and_wholes...,department_stores,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
26148653604,561,14108,15.6,7a8996c8-220c-41d...,2021-08-21,NSW,2293,Male,Volutpat Nulla In...,"furniture, home f...",a,6.59,retail_and_wholes...,household_goods_r...,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
29521780474,561,14108,7.24,17838284-d80d-400...,2021-08-19,NSW,2293,Male,At Sem Corp.,"cable, satellite,...",a,5.93,info_media_and_te...,,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
43186523025,561,14108,69.02,d59a5f33-b131-465...,2021-08-22,NSW,2293,Male,Lorem Ipsum Sodal...,florists supplies...,b,4.47,retail_and_wholes...,department_stores,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
46804135891,561,14108,4.6,c53d22fe-e476-446...,2021-08-14,NSW,2293,Male,Suspendisse Dui C...,"opticians, optica...",c,2.93,retail_and_wholes...,others_retailing,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
93429143824,561,14108,64.55,9a398824-5d79-441...,2021-08-14,NSW,2293,Male,Fames Ac Associates,lawn and garden s...,c,1.64,retail_and_wholes...,food_retailing,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0
68216911708,561,14108,13.92,cae2549d-7585-47c...,2021-08-14,NSW,2293,Male,Placerat Eget Ven...,"computers, comput...",c,3.05,retail_and_wholes...,household_goods_r...,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0


## Outliers

In [42]:
# Flag outliers in `dollar_value` with the 1.5 * IQR rule.
#
# Fences are computed per category, except for SUBCATEGORY_LEVEL_CATEGORY,
# which is fenced per subcategory. The result is a string column `isOutlier`
# holding 'True' / 'False' (strings, not booleans, because later cells filter
# on the literal 'False'). Rows belonging to a group for which no quartiles
# could be computed are left null.
SUBCATEGORY_LEVEL_CATEGORY = 'retail_and_wholesale_trade'
IQR_MULTIPLIER = 1.5


def _matches(column_name, value):
    """Return a condition selecting rows where `column_name` holds `value`.

    `value` may be None, in which case the condition is an explicit null
    check: an equality comparison against null never matches any row.
    """
    column = F.col(column_name)
    return column.isNull() if value is None else column == value


def _iqr_fences(frame):
    """Return the (lower, upper) IQR fences of `dollar_value` in `frame`.

    Returns None when `approxQuantile` yields no quartiles (for example, a
    group without any non-null `dollar_value`), instead of failing on
    tuple unpacking.
    """
    # relativeError=0 requests exact quantiles.
    quartiles = frame.approxQuantile('dollar_value', [0.25, 0.75], 0)
    if len(quartiles) < 2:
        return None
    q1, q3 = quartiles
    spread = q3 - q1
    return q1 - IQR_MULTIPLIER * spread, q3 + IQR_MULTIPLIER * spread


# Pass 1: collect one (row condition, lower fence, upper fence) per group.
group_fences = []
for cat_row in dataset.select('category').distinct().collect():
    # collect() yields Row objects; compare the contained value, never the
    # Row itself (a Row is never equal to a plain string).
    cat_name = cat_row[0]
    in_category = _matches('category', cat_name)

    if cat_name == SUBCATEGORY_LEVEL_CATEGORY:
        subcat_rows = (
            dataset.filter(in_category).select('subcategory').distinct().collect()
        )
        # Keep the category condition so an identically named subcategory
        # under a different category is not affected.
        group_conditions = [
            in_category & _matches('subcategory', subcat_row[0])
            for subcat_row in subcat_rows
        ]
    else:
        group_conditions = [in_category]

    for in_group in group_conditions:
        fences = _iqr_fences(dataset.filter(in_group))
        if fences is not None:
            group_fences.append((in_group, *fences))

# Pass 2: fold every group into a single when-chain so the column is added
# with one withColumn call instead of one nested projection per group.
outlier_flag = None
for in_group, lower_fence, upper_fence in group_fences:
    within_fences = in_group & F.col('dollar_value').between(lower_fence, upper_fence)
    add_branch = when if outlier_flag is None else outlier_flag.when
    # Rows of the group that are not within the fences (including a null
    # dollar_value) fall through to the second branch and are flagged 'True'.
    outlier_flag = add_branch(within_fences, lit('False')).when(in_group, lit('True'))

if outlier_flag is None:
    # No group produced fences (e.g. empty dataset): still create the column.
    outlier_flag = lit(None).cast('string')

dataset = dataset.withColumn('isOutlier', outlier_flag)



                                                                                

In [43]:
# Preview two rows to confirm the `isOutlier` column is present.
dataset.limit(2)

                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,state,postcode,gender,merchant_name,tag,revenue,rate,category,subcategory,merchant_fraud_probability,user_fraud_probability,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg,isOutlier
32709545238,561,14108,361.79,7f2cbe00-1d8c-49b...,2021-08-21,NSW,2293,Male,Tempor Est Founda...,"stationery, offic...",c,3.04,retail_and_wholes...,others_retailing,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0,True
19839532017,561,14108,157.0,01a35e05-f763-4a1...,2021-08-21,NSW,2293,Male,Pellentesque Habi...,"cable, satellite,...",b,4.94,info_media_and_te...,,,,9099,5350.0,66013.0,353171988.0,56552.0,38.0,False


## Distributions of Variables with and without Outliers

In [44]:
# `isOutlier` holds the strings 'True' / 'False'; keep only rows flagged 'False'.
# Rows whose flag is null do not satisfy the comparison and are dropped too.
not_outlier = F.col("isOutlier") == 'False'
dataset_wo_outliers = dataset.filter(not_outlier)

## Relationship between Features