In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

## Combining datasets

In [2]:
sdf = spark.read.parquet("../data/curated/mergedf.parquet/")
print(sdf.count())
sdf.limit(5)

                                                                                

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Sem Ut Institute,bicycle shops - s...,2.86,c
49891706470,226,5162,3.887089224741017,9ba8ebb2-6593-49f...,2022-07-13,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Non Vestibulum In...,tent and awning s...,5.8,a
70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Elit Aliquam PC,tent and awning s...,6.93,a
85276983280,226,5162,250.33729038347653,dcad871d-1b75-4a8...,2022-05-06,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Lacus Varius Corp.,florists supplies...,3.32,b
15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a


In [8]:
pop_sdf = spark.read.parquet("../data/curated/pop_sdf.parquet/")
pop_sdf = pop_sdf.withColumnRenamed("SA2 code","sa2_code")
pop_sdf.limit(5)

State/Terr,sa2_code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,115011556,Castle Hill - South,10436,1256,1454,1704,3718,2304
New South Wales,115011557,Castle Hill - West,5198,568,849,810,1928,1043
New South Wales,115011558,Cherrybrook,19135,1982,3084,2522,6760,4787
New South Wales,115011621,Kellyville - East,17748,2300,3032,2748,6701,2967
New South Wales,115011622,Kellyville - West,11417,1702,1453,2162,4067,2033


In [9]:
income_sdf = spark.read.parquet("../data/curated/incomedf.parquet/")
income_sdf.limit(5)

sa2_code,num_earners,median_age,median_income,mean_income
213011340,11238,46,51181,61177
213021341,7942,43,57585,69301
213021344,11216,41,66161,80219
213021345,3015,41,52078,60249
213021346,9996,47,63892,90668


### External dataset joins

After analysing the merge of the existing data with each of the postocde data (workbooks 3a and 3b), have concluded that we should use the postcode data not containing the ratio field and use the median of the matched records for each postcode to get a single value for income/population data for each postcode. 

The benefits of using weighted averaging provided by the postcode with ratio dataset would not, by our analysis, be worth the loss of 1,498,774 records. The choice of using median was based on the many outliers (see boxplots in workbooks 3a and 3b).

In [7]:
postcode_sdf = spark.read.parquet("../data/curated/postcodedf.parquet/")
print(postcode_sdf.count())
postcode_sdf.limit(5)

5492


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


Join postcode with population data

In [10]:
print(pop_sdf.count())

2450


In [11]:
merged_sdf1 = pop_sdf.join(postcode_sdf, on="sa2_code", how="inner")
merged_sdf1.groupBy("sa2_code").count().count()

2083

Lost 367 records from population data for which there were no sa2 codes

In [12]:
merged_sdf1.orderBy("postcode").limit(5)

sa2_code,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,postcode
801051049,Australian Capita...,Acton,2875,6,1528,1292,47,2,200
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,800
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,801
701011007,Northern Territory,Parap,2980,350,282,801,1047,500,804
701021010,Northern Territory,Alawa,2172,326,227,529,704,386,810


Get a single population value for each postcode and field by calculating the median

In [18]:
postcode_pop_sdf = merged_sdf1.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf1.Total, 0.5).alias("total_pop"),
        F.percentile_approx(merged_sdf1['Under 10'], 0.5).alias("under10_pop"),
        F.percentile_approx(merged_sdf1.Adolescent, 0.5).alias("adolsc_pop"),
        F.percentile_approx(merged_sdf1['Young adult'], 0.5).alias("yng_adult_pop"),
        F.percentile_approx(merged_sdf1['Middle age'], 0.5).alias("mid_age_pop"),
        F.percentile_approx(merged_sdf1.Old, 0.5).alias("old_pop")
    )
    
postcode_pop_sdf.orderBy("postcode").limit(5)

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
200,2875,6,1528,1292,47,2
800,7679,474,325,3322,2652,906
801,7679,474,325,3322,2652,906
804,2980,350,282,801,1047,500
810,2657,326,274,541,845,386


Join postcode with income data

In [21]:
print(income_sdf.count())

2239


In [19]:
merged_sdf2 = income_sdf.join(postcode_sdf, on="sa2_code", how="inner")

In [20]:
merged_sdf2.groupBy("sa2_code").count().count()

2186

Lost 53 records for which there were no sa2 codes

In [22]:
merged_sdf2.orderBy("postcode").limit(5)

sa2_code,num_earners,median_age,median_income,mean_income,postcode
801051049,548,23,9306,16835,200
701011002,5909,33,60937,87791,800
701011002,5909,33,60937,87791,801
701011007,1873,40,75219,98872,804
701021010,1387,40,54188,61411,810


In [25]:
postcode_income_sdf = merged_sdf2.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf2.num_earners, 0.5).alias("num_earners"),
        F.percentile_approx(merged_sdf2.median_age, 0.5).alias("median_age"),
        F.percentile_approx(merged_sdf2.median_income, 0.5).alias("median_income"),
        F.percentile_approx(merged_sdf2.mean_income, 0.5).alias("mean_income")
    )
    
postcode_income_sdf.orderBy("postcode").limit(5)

postcode,num_earners,median_age,median_income,mean_income
200,548,23,9306,16835
800,5909,33,60937,87791
801,5909,33,60937,87791
804,1873,40,75219,98872
810,1479,39,58753,67299


Combine all external data in a single spark dataframe

In [26]:
print(postcode_pop_sdf.count(), postcode_income_sdf.count())

2793 3160


In [27]:
external_data_sdf = postcode_pop_sdf.join(postcode_income_sdf, on="postcode", how="full")
print(external_data_sdf.count())
external_data_sdf.limit(5)

3162


postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875,6,1528,1292,47,2,548,23,9306,16835
800,7679,474,325,3322,2652,906,5909,33,60937,87791
801,7679,474,325,3322,2652,906,5909,33,60937,87791
804,2980,350,282,801,1047,500,1873,40,75219,98872
810,2657,326,274,541,845,386,1479,39,58753,67299


Join external data with existing data

In [28]:
print(sdf.count())
sdf.limit(5)

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Sem Ut Institute,bicycle shops - s...,2.86,c
49891706470,226,5162,3.887089224741017,9ba8ebb2-6593-49f...,2022-07-13,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Non Vestibulum In...,tent and awning s...,5.8,a
70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Elit Aliquam PC,tent and awning s...,6.93,a
85276983280,226,5162,250.33729038347653,dcad871d-1b75-4a8...,2022-05-06,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Lacus Varius Corp.,florists supplies...,3.32,b
15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a


In [35]:
final_sdf = sdf.join(external_data_sdf, on="postcode", how="inner")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

13394287


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
6731,49891706470,58911,22131,25.782245737474312,90498b80-984f-43c...,2022-07-13,Paul Lopez,3908 David Squares,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,96680767841,58911,22131,415.9547984625,829843bc-c571-493...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,43186523025,58911,22131,33.082322196774484,c12d2520-12c8-40f...,2022-07-13,Paul Lopez,3908 David Squares,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,66370248931,58911,22131,82.54046571771035,cc8ac5c5-0dfc-4e1...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,89726005175,58911,22131,58.46693553052902,b1f179f8-2c15-4f5...,2022-05-06,Paul Lopez,3908 David Squares,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306


In [30]:
final_sdf = final_sdf.dropna(how="any")
final_sdf.count()

22/09/22 20:16:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

11818811

Null values occur because for some postcodes, population/income data did not exist. <br>
Removing records containing null values resulted in the loss of 1,575,476 records.

### Fraud joins

In [31]:
consumerfraud_sdf = spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
consumerfraud_sdf = consumerfraud_sdf.withColumnRenamed("fraud_probability","consumer_fraud_%")
consumerfraud_sdf.count()

34765

In [32]:
merchantfraud_sdf = spark.read.parquet("../data/curated/merchantfrauddf.parquet/")
merchantfraud_sdf = merchantfraud_sdf.withColumnRenamed("fraud_probability","merchant_fraud_%")
merchantfraud_sdf.count()

114

In [34]:
print(final_sdf.count())
final_sdf = final_sdf.join(consumerfraud_sdf, ["order_datetime", "user_id"], "leftouter")
print(final_sdf.count())
final_sdf = final_sdf.join(merchantfraud_sdf, ["order_datetime", "merchant_abn"], "leftouter")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

11818811


                                                                                

11818811


                                                                                

11818811


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,consumer_fraud_%.1,merchant_fraud_%
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,Paul Lopez,3908 David Squares,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,,
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,,
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,Paul Lopez,3908 David Squares,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,,
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,,
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,Paul Lopez,3908 David Squares,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,,


In [37]:
final_sdf = final_sdf.drop("customer_name","address")
final_sdf = final_sdf.na.fill(0.1)
final_sdf.limit(5)

                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
6731,49891706470,58911,22131,25.782245737474312,90498b80-984f-43c...,2022-07-13,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,96680767841,58911,22131,415.9547984625,829843bc-c571-493...,2021-08-19,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,43186523025,58911,22131,33.082322196774484,c12d2520-12c8-40f...,2022-07-13,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,66370248931,58911,22131,82.54046571771035,cc8ac5c5-0dfc-4e1...,2021-08-19,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,89726005175,58911,22131,58.46693553052902,b1f179f8-2c15-4f5...,2022-05-06,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306


Total Records after joining:
11818811

## Dealing with current fraud data

In [11]:
MERCHANT_THRESH = 20
CONSUMER_THRESH = 20

print(final_sdf.count())
testdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
testdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)
print(testdf.count(), final_sdf.count()-testdf.count())

                                                                                

20773062




20757549 15513


                                                                                

Keeping the fraud prob below 20% for each give us a good chance that most fraud data has been removed <br>
The removal of 15 thousand records is not alot considering we still have 20 million plus records still left <br>
The fact that most of the fraud transactions are now removed allows us to build a metric for determining future fraud

In [12]:
final_sdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
final_sdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)

In [13]:
final_sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,sa2_code,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-20,94472466107,13842,216011410,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-20,94472466107,13842,204011058,3612,30,36.09451992152847,0dc80e20-901c-410...,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-21,21532935983,13842,216011410,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1
2021-08-21,21532935983,13842,204011058,3612,30,71.1148505207073,5fbb2316-39b7-43b...,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,Nagambie,4507,441,443,570,1371,1682,2360,49,45235,53483,0.1,0.1
2021-08-19,60956456424,13842,216011410,3612,30,56.52469841268393,60bc5068-e775-4c4...,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,Rushworth,4166,416,418,531,1300,1501,2060,50,38359,45788,0.1,0.1


## Dealing with future fraud

2 main ways to determine fraud:
- High number of transactions in a given day
- High transaction amounts

### Merchant

In [14]:
# this gets the stats for each merchant for each day
merchanttestdf = final_sdf.groupBy("merchant_abn","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    min("dollar_value").alias("min_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,order_datetime,mean_amount,min_amount,max_amount,transactions
24852446429,2021-11-26,30.464173619816343,0.4388815965837197,168.92668885555742,1916
24852446429,2021-11-27,31.63655621518729,1.003369617502435,193.03651325623903,1783
86578477987,2021-11-26,33.86953965973572,0.1615814452824192,163.1881979840336,1779
24852446429,2021-11-29,30.50184795665985,0.7022808404239752,137.595051098878,1744
49891706470,2021-11-26,28.68215799516624,3.001709666041197,54.95671169593744,1701


In [15]:
# this gets the average stats for a merchant on any given day
merchanttestdf = merchanttestdf.groupBy("merchant_abn").agg(
    mean("mean_amount").alias("mean_amount"),
    mean("min_amount").alias("min_amount"),
    mean("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    min("transactions").alias("min_transactions"),
    max("transactions").alias("max_transactions"),
    ).orderBy(desc("max_transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,mean_amount,min_amount,max_amount,mean_transactions,min_transactions,max_transactions
24852446429,29.994179500150423,1.0013482034321513,132.63726075232657,727.2062706270627,442,1916
86578477987,34.97995533972652,1.172081829939173,153.98697166956993,685.6435643564356,412,1779
49891706470,28.94237492419656,3.1556904183096157,54.84419033696536,621.6039603960396,335,1701
64203420245,28.956123060148236,3.14451491936713,54.85378248897557,656.2095709570957,360,1681
46804135891,30.04504181207258,0.1048694392743808,190.9312773826076,588.2376237623762,331,1607


We now have a dataset that gives us all the statistics we need to determine if future merchant transactions are fraud <br>
for the time being if a merchant has > 1.5 x max_transactions in a day it is considered fraud <br>
if a merchant has > 1.5 x max_amount in a day it is considered fraud

### User

In [16]:
# this gets the stats for each user for each day
usertestdf = final_sdf.groupBy("user_id","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    min("dollar_value").alias("min_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

usertestdf.limit(5)

                                                                                

user_id,order_datetime,mean_amount,min_amount,max_amount,transactions
20931,2021-11-29,119.06473250477067,28.04893125562283,445.0802574763232,104
17417,2021-11-26,72.40206052677397,4.4464593786411095,157.8166304195111,98
6072,2021-11-22,269.3998632737218,6.712747985289963,622.847977814607,98
20651,2021-11-25,62.674804005227486,13.4184538369854,125.3852608556601,98
17417,2021-11-29,167.94750291512926,1.6711964900188083,719.3382556816264,98


In [17]:
# this gets the average stats for a user on any given day
usertestdf = usertestdf.groupBy("user_id").agg(
    mean("mean_amount").alias("mean_amount"),
    mean("min_amount").alias("min_amount"),
    mean("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    min("transactions").alias("min_transactions"),
    max("transactions").alias("max_transactions"),
    ).orderBy(desc("max_transactions"))

usertestdf.limit(5)



22/09/20 19:24:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/20 19:24:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 287:>                                                        (0 + 4) / 4]

22/09/20 19:25:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/20 19:25:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,mean_amount,min_amount,max_amount,mean_transactions,min_transactions,max_transactions
20931,164.31360829926368,126.11191593350348,209.56096073865885,20.1267217630854,13,104
6072,159.5675614949777,101.9518777911356,224.52093498921,21.89944134078212,14,98
20651,157.1074921825416,105.06627100387666,219.1010507216253,21.0,14,98
17417,147.88685884709733,104.75892839781037,196.83524356068264,21.51780821917808,14,98
13545,158.13796388277854,121.45901194379616,205.68937045590943,20.06830601092896,13,91


We now have a dataset that gives us all the statistics we need to determine if future user transactions are fraud <br>
for the time being if a user has > 1.5 x max_transactions in a day it is considered fraud <br>
if a user has > 1.5 x max_amount in a day it is considered fraud

Now remains the case where a user or merchant has little to no data to get these metrics from <br>
We could either set our own thresholds based on logical reasoning for fraud detection<br>
Or we could look at the distributions of existing data to determine thresholds for fraud detection