# Fraudulent Transaction Classifier
---
We try to use 3 weeks of fraud probability data to build a Machine Learning model and classify all fraudulent transactions.

### 1. Datasets
We begin by loading our curated dataset into the notebook.

In [34]:
from pyspark.sql import SparkSession, Window, functions as F

# Spark session shared by every cell in this notebook.
def _build_spark_session():
    """Return the "BNPL Project" Spark session, creating one if none exists yet."""
    settings = [
        # Lets a bare DataFrame expression at the end of a cell render as a table.
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.sql.parquet.cacheMetadata", "true"),
        ("spark.driver.memory", "4g"),
        ("spark.sql.session.timeZone", "Etc/UTC"),
    ]
    builder = SparkSession.builder.appName("BNPL Project")
    for key, value in settings:
        builder = builder.config(key, value)
    return builder.getOrCreate()


spark = _build_spark_session()

def p(x, type):
    """Build the relative path of file `x` inside the `type` sub-folder of ../../data.

    Example: p('process_data.parquet', 'curated')
             -> '../../data/curated/process_data.parquet'
    """
    # `type` shadows the builtin only inside this function; the parameter name is
    # kept so existing positional and keyword callers keep working.
    return '../../data/{}/{}'.format(type, x)

In [36]:
# Load the curated transaction table. Parquet files embed their own schema, so the
# CSV-only `header` reader option is not passed.
sdf = spark.read.parquet(p('process_data.parquet', 'curated'))

# Preview a single row to check which columns were loaded.
sdf.limit(1)

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,state,postcode,gender,merchant_name,tag,revenue,rate,category,subcategory,merchant_fraud_probability,user_fraud_probability,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg
31585975447,1656,8913,51.28,00001f53-b987-4b4...,2021-07-24,NSW,1163,Male,Dolor Dapibus Gra...,digital goods: bo...,b,3.12,retail_and_wholes...,household_goods_r...,,,31499,28552.0,70738.0,2019717290.0,28339.0,31.0


In [37]:
# Count the transactions in which each fraud probability column is not null.
for prob_col in ('user_fraud_probability', 'merchant_fraud_probability'):
    populated = sdf.filter(F.col(prob_col).isNotNull())
    print(f'existing {prob_col} count:', populated.count())

existing user_fraud_probability count: 71636
existing merchant_fraud_probability count: 4003


### 2. Visualisations

## **Testings**

1. Impute missing values (probably with the mean?)
   If `merchant_fraud_probability` is null, fill it with the mean of that merchant's other transactions; do the same per user for `user_fraud_probability`.

In [38]:
# One window per grouping key: all transactions of a merchant / of a user.
merchant_window = Window.partitionBy("merchant_abn")
user_window = Window.partitionBy("user_id")


def _fill_with_group_mean(frame, column, window):
    """Replace nulls in `column` with the mean of `column` over `window`.

    Non-null values are kept unchanged. F.avg skips nulls, so a group with no
    known value at all stays null.
    """
    group_mean = F.avg(F.col(column)).over(window)
    return frame.withColumn(column, F.coalesce(F.col(column), group_mean))


# 1. Impute `merchant_fraud_probability` nulls with the per-merchant mean
sdf = _fill_with_group_mean(sdf, 'merchant_fraud_probability', merchant_window)

# 2. Impute `user_fraud_probability` nulls with the per-user mean
sdf = _fill_with_group_mean(sdf, 'user_fraud_probability', user_window)

In [28]:
# Re-count the populated probabilities after the group-mean imputation.
# F.count(column) ignores nulls, so a single aggregation yields both non-null
# counts in one Spark job instead of filtering and counting the frame twice.
non_null_counts = sdf.agg(
    F.count('user_fraud_probability').alias('user'),
    F.count('merchant_fraud_probability').alias('merchant'),
).first()
print('existing user_fraud_probability count:', non_null_counts['user'])
print('existing merchant_fraud_probability count:', non_null_counts['merchant'])

                                                                                

11032855

In [29]:
# Number of rows whose merchant fraud probability is not null at this point.
sdf.filter('merchant_fraud_probability IS NOT NULL').count()

                                                                                

583545

We impute the remaining missing probabilities with 0.5 because we have no information about them: a value of 0.5 expresses that the transaction is equally likely to be fraudulent or legitimate.

In [32]:
# Any probability still missing gets the placeholder value 0.5.
sdf = sdf.na.fill(
    0.5,
    subset=['merchant_fraud_probability', 'user_fraud_probability'],
)

In [33]:
# Show the imputed DataFrame; the session enables spark.sql.repl.eagerEval,
# so a bare DataFrame expression is rendered as a table of its first rows.
sdf

                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,state,postcode,gender,merchant_name,tag,revenue,rate,category,subcategory,merchant_fraud_probability,user_fraud_probability,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg
10023283211,1226530,19,291.32,526a2ebc-5203-44e...,2022-01-13,TAS,7276,Female,Felis Limited,"furniture, home f...",e,0.18,retail_and_wholes...,household_goods_r...,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
10648956813,1226530,19,27.93,87813983-3c53-49e...,2021-06-15,TAS,7276,Female,Proin Nisl Institute,"computers, comput...",a,6.66,retail_and_wholes...,household_goods_r...,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
15253672771,1226530,19,143.84,8dcbac1c-f0ab-4fe...,2021-12-11,TAS,7276,Female,Nascetur Ridiculu...,"gift, card, novel...",b,3.26,retail_and_wholes...,department_stores,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
35927874515,1226530,19,101.24,58fd2b2c-4e48-48e...,2021-10-24,TAS,7276,Female,Ac Orci Ut Indust...,shoe shops,a,5.85,retail_and_wholes...,department_stores,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
39759375662,1226530,19,448.74,43bfa43f-9f00-446...,2021-11-29,TAS,7276,Female,Elit Aliquam Auct...,"books, periodical...",a,6.96,info_media_and_te...,,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
43127814599,1226530,19,987.99,b815e072-7d31-4d9...,2021-07-18,TAS,7276,Female,Nam Ligula Elit F...,lawn and garden s...,b,3.58,retail_and_wholes...,food_retailing,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
45899477665,1226530,19,33.18,bcc502c4-54e4-4c8...,2021-10-24,TAS,7276,Female,At Incorporated,"books, periodical...",a,5.87,info_media_and_te...,,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
51561881468,1226530,19,122.88,6c2a1d46-cde5-47c...,2022-01-10,TAS,7276,Female,Tristique Pellent...,"cable, satellite,...",c,2.0,info_media_and_te...,,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
62170730910,1226530,19,51.47,2ab1cbbd-de4b-462...,2021-06-07,TAS,7276,Female,Volutpat Nunc Sit...,"gift, card, novel...",b,3.8,retail_and_wholes...,department_stores,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
62981771325,1226530,19,27.34,5dd90513-1ac2-41b...,2022-01-24,TAS,7276,Female,Eu Neque Pellente...,motor vehicle sup...,b,4.0,retail_and_wholes...,others_retailing,0.5,0.14,6086,3337.0,52656.0,175714684.0,43673.0,50.0
