In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/09/20 19:18:59 WARN Utils: Your hostname, DESKTOP-LNDD2A2 resolves to a loopback address: 127.0.1.1; using 172.20.232.118 instead (on interface eth0)
22/09/20 19:18:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/20 19:19:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/20 19:19:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/20 19:19:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/09/20 19:19:04 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/09/20 19:19:04 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/09/20 19:19:04 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


## Combining datasets

In [2]:
sdf = spark.read.parquet("../data/curated/mergedf.parquet/")
print(sdf.count())
sdf.limit(5)

                                                                                

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a
21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a
60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b
39211701585,30,13842,105.80444352294496,810594a7-c21a-4dd...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Diam Eu Dolor PC,shoe shops,4.76,b
27326652377,30,13842,1179.908032136875,7ef554a5-02a8-435...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Tellus Aenean Cor...,music shops - mus...,6.33,a


In [3]:
postcodedf = spark.read.parquet("../data/curated/postcodedf.parquet/")
print(postcodedf.count())
postcodedf.limit(5)

5492


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


In [4]:
pop_sdf = spark.read.parquet("../data/curated/pop_sdf.parquet/")
print(pop_sdf.count())
populationdf = pop_sdf.withColumnRenamed("SA2 code","sa2_code")
populationdf.limit(5)

2450


State/Terr,sa2_code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,101021007,Braidwood,4330,473,403,495,1472,1487
New South Wales,101021008,Karabar,8546,1082,1075,1818,2858,1713
New South Wales,101021009,Queanbeyan,11370,1275,916,3129,3681,2369
New South Wales,101021010,Queanbeyan - East,5093,588,406,1460,1718,921
New South Wales,101021012,Queanbeyan West -...,12743,1796,1910,2266,4933,1838


In [5]:
incomedf = spark.read.parquet("../data/curated/incomedf.parquet/")
print(incomedf.count())
incomedf.limit(5)

2239


sa2_code,num_earners,median_age,median_income,mean_income
206041126,14398,32,51696,73634
203021046,6701,45,57818,83648
202011020,7944,47,46635,57894
208031187,3158,43,52094,58104
203021047,8152,42,47651,56636


In [6]:
consumerfrauddf = spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
print(consumerfrauddf.count())
consumerfrauddf = consumerfrauddf.withColumnRenamed("fraud_probability","consumer_fraud_%")
consumerfrauddf.limit(5)

34765


user_id,order_datetime,consumer_fraud_%
3753,2022-02-16,48.85325253622543
9646,2021-09-23,47.83931206340956
243,2021-09-02,50.88971939168309
3907,2021-10-07,38.58123424858352
14864,2021-11-29,27.072321329372105


In [7]:
merchantfrauddf = spark.read.parquet("../data/curated/merchantfrauddf.parquet/")
print(merchantfrauddf.count())
merchantfrauddf = merchantfrauddf.withColumnRenamed("fraud_probability","merchant_fraud_%")
merchantfrauddf.limit(5)

114


merchant_abn,order_datetime,merchant_fraud_%
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902


External dataset joins

In [8]:
print(sdf.count())
finaldf = sdf.join(postcodedf, "postcode")
print(finaldf.count())
finaldf = finaldf.join(populationdf, "sa2_code")
print(finaldf.count())
finaldf = finaldf.join(incomedf, "sa2_code")
print(finaldf.count())
finaldf.limit(5)

13613661


                                                                                

23251565


                                                                                

20950595


                                                                                

20773062
22/09/20 19:20:35 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

sa2_code,postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,num_earners,median_age,median_income,mean_income
216011410,3612,94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788
204011058,3612,94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Victoria,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483
216011410,3612,21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788
204011058,3612,21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Victoria,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483
216011410,3612,60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788


Fraud joins

In [9]:
print(finaldf.count())
finaldf = finaldf.join(consumerfrauddf, ["order_datetime", "user_id"], "leftouter")
print(finaldf.count())
finaldf = finaldf.join(merchantfrauddf, ["order_datetime", "merchant_abn"], "leftouter")
print(finaldf.count())
finaldf.limit(5)

                                                                                

20773062


                                                                                

20773062


                                                                                

20773062


                                                                                

order_datetime,merchant_abn,user_id,sa2_code,postcode,consumer_id,dollar_value,order_id,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-20,94472466107,13842,216011410,3612,30,36.09451992152847,0dc80e20-901c-410...,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,,
2021-08-20,94472466107,13842,204011058,3612,30,36.09451992152847,0dc80e20-901c-410...,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Victoria,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,,
2021-08-21,21532935983,13842,216011410,3612,30,71.1148505207073,5fbb2316-39b7-43b...,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,,
2021-08-21,21532935983,13842,204011058,3612,30,71.1148505207073,5fbb2316-39b7-43b...,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Victoria,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,,
2021-08-19,60956456424,13842,216011410,3612,30,56.52469841268393,60bc5068-e775-4c4...,Tamara Stewart,352 Jessica Summit,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,Victoria,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,,


In [10]:
finaldf = finaldf.drop("customer_name","address","State/Terr")
finaldf = finaldf.na.fill(0.1)
finaldf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,sa2_code,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-20,94472466107,13842,216011410,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-20,94472466107,13842,204011058,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-21,21532935983,13842,216011410,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-21,21532935983,13842,204011058,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-19,60956456424,13842,216011410,3612,30,56.52469841268393,60bc5068-e775-4c4...,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1


## Dealing with current fraud data

In [11]:
MERCHANT_THRESH = 20
CONSUMER_THRESH = 20

print(finaldf.count())
testdf = finaldf.filter(finaldf["merchant_fraud_%"] < MERCHANT_THRESH)
testdf = finaldf.filter(finaldf["consumer_fraud_%"] < CONSUMER_THRESH)
print(testdf.count(), finaldf.count()-testdf.count())

                                                                                

20773062




20757549 15513


                                                                                

Keeping the fraud prob below 20% for each give us a good chance that most fraud data has been removed <br>
The removal of 15 thousand records is not alot considering we still have 20 million plus records still left <br>
The fact that most of the fraud transactions are now removed allows us to build a metric for determining future fraud

In [12]:
finaldf = finaldf.filter(finaldf["merchant_fraud_%"] < MERCHANT_THRESH)
finaldf = finaldf.filter(finaldf["consumer_fraud_%"] < CONSUMER_THRESH)

In [13]:
finaldf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,sa2_code,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-20,94472466107,13842,216011410,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-20,94472466107,13842,204011058,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-21,21532935983,13842,216011410,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-21,21532935983,13842,204011058,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-19,60956456424,13842,216011410,3612,30,56.52469841268393,60bc5068-e775-4c4...,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1


## Dealing with future fraud

2 main ways to determine fraud:
- High number of transactions in a given day
- High transaction amounts

### Merchant

In [14]:
# this gets the stats for each merchant for each day
merchanttestdf = finaldf.groupBy("merchant_abn","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    min("dollar_value").alias("min_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,order_datetime,mean_amount,min_amount,max_amount,transactions
24852446429,2021-11-26,30.464173619816343,0.4388815965837197,168.92668885555742,1916
24852446429,2021-11-27,31.63655621518729,1.003369617502435,193.03651325623903,1783
86578477987,2021-11-26,33.86953965973572,0.1615814452824192,163.1881979840336,1779
24852446429,2021-11-29,30.50184795665985,0.7022808404239752,137.595051098878,1744
49891706470,2021-11-26,28.68215799516624,3.001709666041197,54.95671169593744,1701


In [15]:
# this gets the average stats for a merchant on any given day
merchanttestdf = merchanttestdf.groupBy("merchant_abn").agg(
    mean("mean_amount").alias("mean_amount"),
    mean("min_amount").alias("min_amount"),
    mean("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    min("transactions").alias("min_transactions"),
    max("transactions").alias("max_transactions"),
    ).orderBy(desc("max_transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,mean_amount,min_amount,max_amount,mean_transactions,min_transactions,max_transactions
24852446429,29.994179500150423,1.0013482034321513,132.63726075232657,727.2062706270627,442,1916
86578477987,34.97995533972652,1.172081829939173,153.98697166956993,685.6435643564356,412,1779
49891706470,28.94237492419656,3.1556904183096157,54.84419033696536,621.6039603960396,335,1701
64203420245,28.956123060148236,3.14451491936713,54.85378248897557,656.2095709570957,360,1681
46804135891,30.04504181207258,0.1048694392743808,190.9312773826076,588.2376237623762,331,1607


We now have a dataset that gives us all the statistics we need to determine if future merchant transactions are fraud <br>
for the time being if a merchant has > 1.5 x max_transactions in a day it is considered fraud <br>
if a merchant has > 1.5 x max_amount in a day it is considered fraud

### User

In [16]:
# this gets the stats for each user for each day
usertestdf = finaldf.groupBy("user_id","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    min("dollar_value").alias("min_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

usertestdf.limit(5)

                                                                                

user_id,order_datetime,mean_amount,min_amount,max_amount,transactions
20931,2021-11-29,119.06473250477067,28.04893125562283,445.0802574763232,104
17417,2021-11-26,72.40206052677397,4.4464593786411095,157.8166304195111,98
6072,2021-11-22,269.3998632737218,6.712747985289963,622.847977814607,98
20651,2021-11-25,62.674804005227486,13.4184538369854,125.3852608556601,98
17417,2021-11-29,167.94750291512926,1.6711964900188083,719.3382556816264,98


In [17]:
# this gets the average stats for a user on any given day
usertestdf = usertestdf.groupBy("user_id").agg(
    mean("mean_amount").alias("mean_amount"),
    mean("min_amount").alias("min_amount"),
    mean("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    min("transactions").alias("min_transactions"),
    max("transactions").alias("max_transactions"),
    ).orderBy(desc("max_transactions"))

usertestdf.limit(5)



22/09/20 19:24:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/20 19:24:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 287:>                                                        (0 + 4) / 4]

22/09/20 19:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/20 19:25:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,mean_amount,min_amount,max_amount,mean_transactions,min_transactions,max_transactions
20931,164.31360829926368,126.11191593350348,209.56096073865885,20.1267217630854,13,104
6072,159.5675614949777,101.9518777911356,224.52093498921,21.89944134078212,14,98
20651,157.1074921825416,105.06627100387666,219.1010507216253,21.0,14,98
17417,147.88685884709733,104.75892839781037,196.83524356068264,21.51780821917808,14,98
13545,158.13796388277854,121.45901194379616,205.68937045590943,20.06830601092896,13,91


We now have a dataset that gives us all the statistics we need to determine if future user transactions are fraud <br>
for the time being if a user has > 1.5 x max_transactions in a day it is considered fraud <br>
if a user has > 1.5 x max_amount in a day it is considered fraud

Now remains the case where a user or merchant has little to no data to get these metrics from <br>
We could either set our own thresholds based on logical reasoning for fraud detection<br>
Or we could look at the distributions of existing data to determine thresholds for fraud detection