In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/10/12 13:59:20 WARN Utils: Your hostname, DESKTOP-LNDD2A2 resolves to a loopback address: 127.0.1.1; using 172.21.140.219 instead (on interface eth0)
22/10/12 13:59:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/12 13:59:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Combining datasets

In [2]:
# All given data

sdf = spark.read.parquet("../data/curated/mergedf.parquet/")
print(sdf.count())
sdf.limit(5)

                                                                                

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a
21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a
60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b
39211701585,30,13842,105.80444352294496,810594a7-c21a-4dd...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Diam Eu Dolor PC,shoe shops,4.76,b
27326652377,30,13842,1179.908032136875,7ef554a5-02a8-435...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Tellus Aenean Cor...,music shops - mus...,6.33,a


In [3]:
# Population data

pop_sdf = spark.read.parquet("../data/curated/pop_sdf.parquet/")
pop_sdf = pop_sdf.withColumnRenamed("SA2 code","sa2_code")
pop_sdf.limit(5)

State/Terr,sa2_code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,101021007,Braidwood,4330,473,403,495,1472,1487
New South Wales,101021008,Karabar,8546,1082,1075,1818,2858,1713
New South Wales,101021009,Queanbeyan,11370,1275,916,3129,3681,2369
New South Wales,101021010,Queanbeyan - East,5093,588,406,1460,1718,921
New South Wales,101021012,Queanbeyan West -...,12743,1796,1910,2266,4933,1838


In [4]:
# Income data

income_sdf = spark.read.parquet("../data/curated/incomedf.parquet/")
income_sdf.limit(5)

sa2_code,num_earners,median_age,median_income,mean_income
206041126,14398,32,51696,73634
203021046,6701,45,57818,83648
202011020,7944,47,46635,57894
208031187,3158,43,52094,58104
203021047,8152,42,47651,56636


### External dataset joins

After analysing the merge of the existing data with each of the postcode datasets (notebooks 3a and 3b), we have concluded that we will use the postcode data not containing the ratio field, and use the median of the matched records for each postcode to get a single value for income/population data for each postcode. 

The benefits of using weighted averaging provided by the postcode with ratio dataset would not, by our analysis, be worth the loss of 1,498,774 records. Additionally, the choice of the median value vs the mean value was based on the many outliers (see boxplots in notebooks 3a and 3b).

In [5]:
# Postcode data

postcode_sdf = spark.read.parquet("../data/curated/postcodedf.parquet/")
print(postcode_sdf.count())
postcode_sdf.limit(5)

5492


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


Join postcode with population data

In [6]:
print(pop_sdf.count())

2450


In [7]:
merged_sdf1 = pop_sdf.join(postcode_sdf, on="sa2_code", how="inner")
merged_sdf1.groupBy("sa2_code").count().count()

2083

Lost 367 records from population data for which there were no sa2 codes

In [8]:
merged_sdf1.orderBy("postcode").limit(5)

sa2_code,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,postcode
801051049,Australian Capita...,Acton,2875,6,1528,1292,47,2,200
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,800
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,801
701011007,Northern Territory,Parap,2980,350,282,801,1047,500,804
701021010,Northern Territory,Alawa,2172,326,227,529,704,386,810


Get a single population value for each postcode and field by calculating the median

In [9]:
postcode_pop_sdf = merged_sdf1.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf1.Total, 0.5).alias("total_pop"),
        F.percentile_approx(merged_sdf1['Under 10'], 0.5).alias("under10_pop"),
        F.percentile_approx(merged_sdf1.Adolescent, 0.5).alias("adolsc_pop"),
        F.percentile_approx(merged_sdf1['Young adult'], 0.5).alias("yng_adult_pop"),
        F.percentile_approx(merged_sdf1['Middle age'], 0.5).alias("mid_age_pop"),
        F.percentile_approx(merged_sdf1.Old, 0.5).alias("old_pop")
    )
    
postcode_pop_sdf.orderBy("postcode").limit(5)

                                                                                

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
200,2875,6,1528,1292,47,2
800,7679,474,325,3322,2652,906
801,7679,474,325,3322,2652,906
804,2980,350,282,801,1047,500
810,2657,326,274,541,845,386


Join postcode with income data

In [10]:
print(income_sdf.count())

2239


In [11]:
merged_sdf2 = income_sdf.join(postcode_sdf, on="sa2_code", how="inner")

In [12]:
merged_sdf2.groupBy("sa2_code").count().count()

2186

Lost 53 records for which there were no sa2 codes

In [13]:
merged_sdf2.orderBy("postcode").limit(5)

sa2_code,num_earners,median_age,median_income,mean_income,postcode
801051049,548,23,9306,16835,200
701011002,5909,33,60937,87791,800
701011002,5909,33,60937,87791,801
701011007,1873,40,75219,98872,804
701021010,1387,40,54188,61411,810


In [14]:
postcode_income_sdf = merged_sdf2.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf2.num_earners, 0.5).alias("num_earners"),
        F.percentile_approx(merged_sdf2.median_age, 0.5).alias("median_age"),
        F.percentile_approx(merged_sdf2.median_income, 0.5).alias("median_income"),
        F.percentile_approx(merged_sdf2.mean_income, 0.5).alias("mean_income")
    )
    
postcode_income_sdf.orderBy("postcode").limit(5)

postcode,num_earners,median_age,median_income,mean_income
200,548,23,9306,16835
800,5909,33,60937,87791
801,5909,33,60937,87791
804,1873,40,75219,98872
810,1479,39,58753,67299


Combine all external data in a single spark dataframe

In [15]:
print(postcode_pop_sdf.count(), postcode_income_sdf.count())

2793 3160


In [16]:
external_data_sdf = postcode_pop_sdf.join(postcode_income_sdf, on="postcode", how="full")
print(external_data_sdf.count())
external_data_sdf.limit(5)

3162


                                                                                

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875,6,1528,1292,47,2,548,23,9306,16835
800,7679,474,325,3322,2652,906,5909,33,60937,87791
801,7679,474,325,3322,2652,906,5909,33,60937,87791
804,2980,350,282,801,1047,500,1873,40,75219,98872
810,2657,326,274,541,845,386,1479,39,58753,67299


In [17]:
external_data_sdf.write.mode("overwrite").parquet("../data/curated/externaldata.parquet")

                                                                                

Join external data with existing data

In [18]:
print(sdf.count())
sdf.limit(5)

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a
21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a
60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b
39211701585,30,13842,105.80444352294496,810594a7-c21a-4dd...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Diam Eu Dolor PC,shoe shops,4.76,b
27326652377,30,13842,1179.908032136875,7ef554a5-02a8-435...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,3612,Female,Tellus Aenean Cor...,music shops - mus...,6.33,a


In [19]:
final_sdf = sdf.join(external_data_sdf, on="postcode", how="inner")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

13394287


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
7252,40559163709,71385,13049,499.2280181835405,753b49dc-b04a-453...,2021-08-19,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Maecenas Iaculis ...,"computers, comput...",1.5,c,4446,504,516,592,1618,1216,2437,48,42860,50350
7252,49891706470,71385,13049,27.59743014174974,19c2c44d-4d9b-4e4...,2021-08-22,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350
7252,45629217853,71385,13049,23.56428590811388,9ff40986-52cc-47d...,2021-08-22,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Lacus Consulting,"gift, card, novel...",6.98,a,4446,504,516,592,1618,1216,2437,48,42860,50350
7252,49891706470,71385,13049,13.272207348909346,233195e0-3482-451...,2021-07-15,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350
7252,80324045558,71385,13049,139.35678856727068,3fd87986-f437-42f...,2021-07-15,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Ipsum Dolor Sit C...,"gift, card, novel...",1.47,c,4446,504,516,592,1618,1216,2437,48,42860,50350


In [20]:
final_sdf = final_sdf.dropna(how="any")
final_sdf.count()

22/10/12 14:01:47 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

11818811

Null values occur because for some postcodes, population/income data did not exist. <br>
Removing records containing null values resulted in the loss of 1,575,476 records.

### Fraud joins

In [21]:
# Read in consumer fraud data

consumerfraud_sdf = spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
consumerfraud_sdf = consumerfraud_sdf.withColumnRenamed("fraud_probability","consumer_fraud_%")
consumerfraud_sdf.count()

34765

In [22]:
# Read in merchant fraud data

merchantfraud_sdf = spark.read.parquet("../data/curated/merchantfrauddf.parquet/")
merchantfraud_sdf = merchantfraud_sdf.withColumnRenamed("fraud_probability","merchant_fraud_%")
merchantfraud_sdf.count()

114

In [23]:
# join fraud data with all other data and check for rows dropped

print(final_sdf.count())
final_sdf = final_sdf.join(consumerfraud_sdf, ["order_datetime", "user_id"], "leftouter")
print(final_sdf.count())
final_sdf = final_sdf.join(merchantfraud_sdf, ["order_datetime", "merchant_abn"], "leftouter")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

11818811


                                                                                

11818811


                                                                                

11818811


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-19,40559163709,13049,7252,71385,499.2280181835405,753b49dc-b04a-453...,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Maecenas Iaculis ...,"computers, comput...",1.5,c,4446,504,516,592,1618,1216,2437,48,42860,50350,,
2021-08-22,49891706470,13049,7252,71385,27.59743014174974,19c2c44d-4d9b-4e4...,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,,
2021-08-22,45629217853,13049,7252,71385,23.56428590811388,9ff40986-52cc-47d...,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Lacus Consulting,"gift, card, novel...",6.98,a,4446,504,516,592,1618,1216,2437,48,42860,50350,,
2021-07-15,49891706470,13049,7252,71385,13.272207348909346,233195e0-3482-451...,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,,
2021-07-15,80324045558,13049,7252,71385,139.35678856727068,3fd87986-f437-42f...,Joshua Hawkins,7313 Edwards Isle...,TAS,Male,Ipsum Dolor Sit C...,"gift, card, novel...",1.47,c,4446,504,516,592,1618,1216,2437,48,42860,50350,,


In [24]:
# Drop useless columns and replace fraud nulls with a small value representing its percentage

NULL_PERCENTAGE = 0.1

final_sdf = final_sdf.drop("customer_name","address")
final_sdf = final_sdf.na.fill(NULL_PERCENTAGE)
final_sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-19,40559163709,13049,7252,71385,499.2280181835405,753b49dc-b04a-453...,TAS,Male,Maecenas Iaculis ...,"computers, comput...",1.5,c,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-08-22,49891706470,13049,7252,71385,27.59743014174974,19c2c44d-4d9b-4e4...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-08-22,45629217853,13049,7252,71385,23.56428590811388,9ff40986-52cc-47d...,TAS,Male,Lacus Consulting,"gift, card, novel...",6.98,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-07-15,49891706470,13049,7252,71385,13.272207348909346,233195e0-3482-451...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-07-15,80324045558,13049,7252,71385,139.35678856727068,3fd87986-f437-42f...,TAS,Male,Ipsum Dolor Sit C...,"gift, card, novel...",1.47,c,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1


Total Records after joining:
11818811

## Dealing with current fraud data

Things we know about the fraud datasets:
- if the transaction isn't in fraud dataset fraud probability is 0.01%
- So if we remove all the fraud data then we can be fairly certain that all the remaining data is accurate and fraud free
- this allows us to predict future fraud

In [25]:
# Check the affects of removing fraud transactions based on threshold 

MERCHANT_THRESH = 1
CONSUMER_THRESH = 1

print(final_sdf.count())
testdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
testdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)
print(testdf.count(), final_sdf.count()-testdf.count())

                                                                                

11818811




11756516 62295


                                                                                

Keeping the fraud prob below 1% for each give us a extremely high chance that almost all fraud data has been removed <br>
The removal of 53 thousand records is not alot considering we still have 11 million plus records still left <br>
The fact that most of the fraud transactions are now removed allows us to build a metric for determining future fraud

In [26]:
# Remove fraud transactions based on threshold

final_sdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
final_sdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)

In [27]:
final_sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-19,40559163709,13049,7252,71385,499.2280181835405,753b49dc-b04a-453...,TAS,Male,Maecenas Iaculis ...,"computers, comput...",1.5,c,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-08-22,49891706470,13049,7252,71385,27.59743014174974,19c2c44d-4d9b-4e4...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-08-22,45629217853,13049,7252,71385,23.56428590811388,9ff40986-52cc-47d...,TAS,Male,Lacus Consulting,"gift, card, novel...",6.98,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-07-15,49891706470,13049,7252,71385,13.272207348909346,233195e0-3482-451...,TAS,Male,Non Vestibulum In...,tent and awning s...,5.8,a,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1
2021-07-15,80324045558,13049,7252,71385,139.35678856727068,3fd87986-f437-42f...,TAS,Male,Ipsum Dolor Sit C...,"gift, card, novel...",1.47,c,4446,504,516,592,1618,1216,2437,48,42860,50350,0.1,0.1


In [28]:
# Save final dataframe used for modelling

final_sdf.write.mode("overwrite").parquet('../data/curated/finaldf.parquet')

                                                                                

## Dealing with future fraud

2 main ways to determine fraud:
- High number of transactions in a given day
- High transaction amounts

### Merchant

In [29]:
# Get the stats for each merchant for each day

final_sdf = spark.read.parquet("../data/curated/finaldf.parquet/")

merchanttestdf = final_sdf.groupBy("merchant_abn","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,order_datetime,mean_amount,max_amount,transactions
24852446429,2021-11-26,30.497711726468896,168.92668885555742,1057
24852446429,2021-11-27,31.122764122744787,193.03651325623903,975
24852446429,2021-11-29,30.79315047835757,137.595051098878,974
64203420245,2021-11-26,29.42978721927976,54.91857916779532,970
86578477987,2021-11-26,33.97228016929187,163.1881979840336,961


In [30]:
# Get the average stats for a merchant on any given day

merchanttestdf2 = merchanttestdf.groupBy("merchant_abn").agg(
    mean("mean_amount").alias("mean_amount"),
    max("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    max("transactions").alias("max_transactions"),
    stddev("transactions").alias("transaction_sd")
    ).orderBy(desc("max_transactions"))

merchanttestdf2.limit(5)

                                                                                

merchant_abn,mean_amount,max_amount,mean_transactions,max_transactions,transaction_sd
24852446429,30.030071405361,266.8120329005647,413.6980198019802,1057,88.64735351564971
64203420245,28.928305828390407,54.99964100649264,372.4422442244224,970,80.53990826941647
86578477987,34.99207784086906,293.2330883770684,389.73267326732673,961,83.21378117219548
49891706470,28.96185062870421,54.99997317014122,353.8019801980198,922,76.98159313772817
46804135891,30.00737334678103,402.34853312614206,334.96534653465346,876,72.60250971548544


In [31]:
# Get standard deviation for merchants transaction amounts

stddevdf = final_sdf.groupBy("merchant_abn").agg(stddev("dollar_value").alias("amount_sd"))
stddevdf.limit(5)

                                                                                

merchant_abn,amount_sd
73256306726,230.4986067967289
48214071373,192.43853288880413
38700038932,713.0931587735073
73841664453,54.02538393317098
83412691377,24.483086120347316


In [32]:
# Join all merchant stat dataframes

print(merchanttestdf2.count())
future_merchant_frauddf = merchanttestdf2.join(stddevdf, "merchant_abn")
print(future_merchant_frauddf.count())
future_merchant_frauddf.limit(5)

                                                                                

4018


                                                                                

4018


                                                                                

merchant_abn,mean_amount,max_amount,mean_transactions,max_transactions,transaction_sd,amount_sd
83412691377,35.0125111179469,201.92111142420248,20.397689768976896,57,6.1317702357686885,24.483086120347316
73256306726,283.65593969249045,2074.313458274829,7.462046204620462,26,3.2774277009648385,230.4986067967289
38700038932,1271.329590829926,5444.634544750903,9.633663366336634,23,3.5588728058858585,713.0931587735073
35344855546,87.57164165103826,393.1052476633007,2.4831460674157304,14,1.443981047548301,66.22265951595399
15613631617,302.952730967887,1393.5644128685904,2.805100182149362,9,1.5031267408536493,196.27471261822632


We now have a dataset that gives us all the statistics we need to determine if future merchant transactions are fraud <br>
for the time being if a merchant has > 1.5 x standard deviation above max_transactions in a day it is considered fraud <br>
if a merchant has > 1.5 x standard deviation above max_amount in a day it is considered fraud

In [33]:
# Saves dataframe for predicitng future merchant fraud

future_merchant_frauddf = future_merchant_frauddf.drop("mean_amount","mean_transactions")

future_merchant_frauddf.write.mode("overwrite").parquet("../data/curated/future_merchant_fraud.parquet")

                                                                                

### User

In [34]:
# Gets the stats for each user for each day

usertestdf = final_sdf.groupBy("user_id","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

usertestdf.limit(5)

                                                                                

user_id,order_datetime,mean_amount,max_amount,transactions
5369,2021-12-26,153.7430761832681,1170.4578846318166,12
11002,2021-11-29,64.75970088494984,265.0702191117789,11
1537,2021-11-29,103.2461320990576,404.2955766727425,10
6671,2021-11-29,86.35832209092791,324.45234932908204,10
7760,2021-11-29,141.4723800333298,665.8989628155817,10


In [35]:
# Gets the average stats for a user on any given day

usertestdf2 = usertestdf.groupBy("user_id").agg(
    mean("mean_amount").alias("mean_amount"),
    max("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    max("transactions").alias("max_transactions"),
    stddev("transactions").alias("transaction_std")
    ).orderBy(desc("max_transactions"))

usertestdf2.limit(5)

                                                                                

user_id,mean_amount,max_amount,mean_transactions,max_transactions,transaction_std
5369,150.90859987007502,4044.791996213037,1.5746478873239436,12,0.9782548611719448
11002,143.30148154439888,3756.619241683773,1.5495750708215297,11,0.9405366520887029
6594,122.23995708508409,1631.7559754340205,1.565459610027855,10,0.9126848158508852
6629,151.57884023838062,4040.9159838767896,1.5501355013550135,10,0.9373333086951994
13985,152.21129536632154,3391.9412364035425,1.568181818181818,10,0.9493792189219764


In [36]:
# Get standard deviation for users transaction amounts

stddevdf = final_sdf.groupBy("user_id").agg(stddev("dollar_value").alias("amount_sd"))
stddevdf.limit(5)

                                                                                

user_id,amount_sd
18147,339.50434940643896
26,318.0838669337013
21899,217.48701636428467
3764,344.04850553279755
5409,288.922033264638


In [37]:
# Join all user stat dataframes

print(usertestdf2.count())
future_user_frauddf = usertestdf2.join(stddevdf, "user_id")
print(future_user_frauddf.count())
future_user_frauddf.limit(5)

                                                                                

20906


                                                                                

20906


                                                                                

user_id,mean_amount,max_amount,mean_transactions,max_transactions,transaction_std,amount_sd
19979,154.28539185829345,8150.115728134807,1.516304347826087,9,0.84476618165645,400.16781038916304
23492,153.55353346463673,4097.470542751887,1.4808743169398908,9,0.8232347161406164,294.7157993790497
12568,127.10156526244836,2042.0161577505255,1.5493333333333332,8,0.8757538133134091,239.9449439981541
15663,143.37670911767736,2697.749698358689,1.541899441340782,8,0.8349365671068217,267.9393245380722
15437,157.63434236717413,4791.825368197756,1.569060773480663,7,0.8132787862153956,377.4357418105274


We now have a dataset that gives us all the statistics we need to determine if future user transactions are fraud <br>
for the time being if a user has > 1.5 x standard deviation above max_transactions in a day it is considered fraud <br>
if a user has > 1.5 x standard deviation above max_amount in a day it is considered fraud

Now remains the case where a user or merchant has little to no data to get these metrics from <br>
We could either set our own thresholds based on logical reasoning for fraud detection<br>
Or we could look at the distributions of existing data to determine thresholds for fraud detection

In [38]:
# Saves dataframe for predicitng future user fraud

future_user_frauddf = future_user_frauddf.drop("mean_amount","mean_transactions")

future_user_frauddf.write.mode("overwrite").parquet("../data/curated/future_user_fraud.parquet")

                                                                                