In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/10/09 17:45:31 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.26.64.252 instead (on interface eth0)
22/10/09 17:45:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/09 17:45:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/09 17:45:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Combining datasets

In [2]:
# All given data

sdf = spark.read.parquet("../data/curated/mergedftemp.parquet/")
print(sdf.count())
sdf.limit(5)

                                                                                

13614675


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Sem Ut Institute,bicycle shops - s...,2.86,c
50321300271,226,5162,594.2915496790856,79f2842d-f8b2-4fd...,2022-06-01,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Augue Industries,bicycle shops - s...,4.24,b
70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Elit Aliquam PC,tent and awning s...,6.93,a
17324645993,226,5162,27.12729568273566,0a44d623-e325-4fc...,2022-05-12,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eget Metus In Cor...,tent and awning s...,5.73,a
15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a


In [3]:
# Population data

pop_sdf = spark.read.parquet("../data/curated/pop_sdf.parquet/")
pop_sdf = pop_sdf.withColumnRenamed("SA2 code","sa2_code")
pop_sdf.limit(5)

State/Terr,sa2_code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,121011686,Lane Cove,15931,2246,1683,3108,5838,3056
New South Wales,121011687,Willoughby,13024,1829,1862,1679,4949,2705
New South Wales,121021403,Asquith - Mount C...,22134,2949,3092,3674,8221,4198
New South Wales,121021404,Berowra - Brookly...,11793,1513,1805,1444,4385,2646
New South Wales,121021406,Normanhurst - Tho...,19199,2395,2868,2589,6962,4385


In [4]:
# Income data

income_sdf = spark.read.parquet("../data/curated/incomedf.parquet/")
income_sdf.limit(5)

sa2_code,num_earners,median_age,median_income,mean_income
213011340,11238,46,51181,61177
213021341,7942,43,57585,69301
213021344,11216,41,66161,80219
213021345,3015,41,52078,60249
213021346,9996,47,63892,90668


### External dataset joins

After analysing the merge of the existing data with each of the postocde data (workbooks 3a and 3b), have concluded that we should use the postcode data not containing the ratio field and use the median of the matched records for each postcode to get a single value for income/population data for each postcode. 

The benefits of using weighted averaging provided by the postcode with ratio dataset would not, by our analysis, be worth the loss of 1,498,774 records. The choice of using median was based on the many outliers (see boxplots in workbooks 3a and 3b).

In [5]:
# Postcode data

postcode_sdf = spark.read.parquet("../data/curated/postcodedf.parquet/")
print(postcode_sdf.count())
postcode_sdf.limit(5)

5492


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


Join postcode with population data

In [6]:
print(pop_sdf.count())

2450


In [7]:
merged_sdf1 = pop_sdf.join(postcode_sdf, on="sa2_code", how="inner")
merged_sdf1.groupBy("sa2_code").count().count()

2083

Lost 367 records from population data for which there were no sa2 codes

In [8]:
merged_sdf1.orderBy("postcode").limit(5)

sa2_code,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,postcode
801051049,Australian Capita...,Acton,2875,6,1528,1292,47,2,200
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,800
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,801
701011007,Northern Territory,Parap,2980,350,282,801,1047,500,804
701021013,Northern Territory,Brinkin - Nakara,3615,372,409,1140,1086,608,810


Get a single population value for each postcode and field by calculating the median

In [9]:
postcode_pop_sdf = merged_sdf1.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf1.Total, 0.5).alias("total_pop"),
        F.percentile_approx(merged_sdf1['Under 10'], 0.5).alias("under10_pop"),
        F.percentile_approx(merged_sdf1.Adolescent, 0.5).alias("adolsc_pop"),
        F.percentile_approx(merged_sdf1['Young adult'], 0.5).alias("yng_adult_pop"),
        F.percentile_approx(merged_sdf1['Middle age'], 0.5).alias("mid_age_pop"),
        F.percentile_approx(merged_sdf1.Old, 0.5).alias("old_pop")
    )
    
postcode_pop_sdf.orderBy("postcode").limit(5)

                                                                                

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
200,2875,6,1528,1292,47,2
800,7679,474,325,3322,2652,906
801,7679,474,325,3322,2652,906
804,2980,350,282,801,1047,500
810,2657,326,274,541,845,386


Join postcode with income data

In [10]:
print(income_sdf.count())

2239


In [11]:
merged_sdf2 = income_sdf.join(postcode_sdf, on="sa2_code", how="inner")

In [12]:
merged_sdf2.groupBy("sa2_code").count().count()

2186

Lost 53 records for which there were no sa2 codes

In [13]:
merged_sdf2.orderBy("postcode").limit(5)

sa2_code,num_earners,median_age,median_income,mean_income,postcode
801051049,548,23,9306,16835,200
701011002,5909,33,60937,87791,800
701011002,5909,33,60937,87791,801
701011007,1873,40,75219,98872,804
701021024,1229,41,58778,65990,810


In [14]:
postcode_income_sdf = merged_sdf2.groupBy("postcode") \
    .agg(
        F.percentile_approx(merged_sdf2.num_earners, 0.5).alias("num_earners"),
        F.percentile_approx(merged_sdf2.median_age, 0.5).alias("median_age"),
        F.percentile_approx(merged_sdf2.median_income, 0.5).alias("median_income"),
        F.percentile_approx(merged_sdf2.mean_income, 0.5).alias("mean_income")
    )
    
postcode_income_sdf.orderBy("postcode").limit(5)

postcode,num_earners,median_age,median_income,mean_income
200,548,23,9306,16835
800,5909,33,60937,87791
801,5909,33,60937,87791
804,1873,40,75219,98872
810,1479,39,58753,67299


Combine all external data in a single spark dataframe

In [15]:
print(postcode_pop_sdf.count(), postcode_income_sdf.count())

2793 3160


In [16]:
external_data_sdf = postcode_pop_sdf.join(postcode_income_sdf, on="postcode", how="full")
print(external_data_sdf.count())
external_data_sdf.limit(5)

3162


postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875,6,1528,1292,47,2,548,23,9306,16835
800,7679,474,325,3322,2652,906,5909,33,60937,87791
801,7679,474,325,3322,2652,906,5909,33,60937,87791
804,2980,350,282,801,1047,500,1873,40,75219,98872
810,2657,326,274,541,845,386,1479,39,58753,67299


In [17]:
external_data_sdf.write.mode("overwrite").parquet("../data/curated/externaldata.parquet")

                                                                                

Join external data with existing data

In [17]:
print(sdf.count())
sdf.limit(5)

13614675


merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Sem Ut Institute,bicycle shops - s...,2.86,c
50321300271,226,5162,594.2915496790856,79f2842d-f8b2-4fd...,2022-06-01,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Augue Industries,bicycle shops - s...,4.24,b
70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Elit Aliquam PC,tent and awning s...,6.93,a
17324645993,226,5162,27.12729568273566,0a44d623-e325-4fc...,2022-05-12,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eget Metus In Cor...,tent and awning s...,5.73,a
15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a


In [18]:
final_sdf = sdf.join(external_data_sdf, on="postcode", how="inner")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

13395285


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
6731,71350572766,58911,22131,38.35886219595664,fd0cfc42-44b3-467...,2021-09-18,Paul Lopez,3908 David Squares,WA,Male,Dictum Placerat A...,"cable, satellite,...",1.57,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,96680767841,58911,22131,415.9547984625,829843bc-c571-493...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,79417999332,58911,22131,48.40729616802216,55069672-41f7-4dc...,2021-10-06,Paul Lopez,3908 David Squares,WA,Male,Phasellus At Company,"gift, card, novel...",4.95,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,66370248931,58911,22131,82.54046571771035,cc8ac5c5-0dfc-4e1...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306
6731,82081024598,58911,22131,43.24936460061825,1632e377-0b6d-447...,2021-10-06,Paul Lopez,3908 David Squares,WA,Male,Placerat Orci Ass...,digital goods: bo...,2.4,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306


In [19]:
final_sdf = final_sdf.dropna(how="any")
final_sdf.count()

22/10/09 16:34:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

11819686

Null values occur because for some postcodes, population/income data did not exist. <br>
Removing records containing null values resulted in the loss of 1,575,476 records.

### Fraud joins

In [20]:
# Read in consumer fraud data

consumerfraud_sdf = spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
consumerfraud_sdf = consumerfraud_sdf.withColumnRenamed("fraud_probability","consumer_fraud_%")
consumerfraud_sdf.count()

34765

In [21]:
# Read in merchant fraud data

merchantfraud_sdf = spark.read.parquet("../data/curated/merchantfrauddf.parquet/")
merchantfraud_sdf = merchantfraud_sdf.withColumnRenamed("fraud_probability","merchant_fraud_%")
merchantfraud_sdf.count()

114

In [22]:
# join fraud data with all other data and check for rows dropped

print(final_sdf.count())
final_sdf = final_sdf.join(consumerfraud_sdf, ["order_datetime", "user_id"], "leftouter")
print(final_sdf.count())
final_sdf = final_sdf.join(merchantfraud_sdf, ["order_datetime", "merchant_abn"], "leftouter")
print(final_sdf.count())
final_sdf.limit(5)

                                                                                

11819686


                                                                                

11819686


                                                                                

11819686


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-09-18,71350572766,22131,6731,58911,38.35886219595664,fd0cfc42-44b3-467...,Paul Lopez,3908 David Squares,WA,Male,Dictum Placerat A...,"cable, satellite,...",1.57,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,
2021-10-06,79417999332,22131,6731,58911,48.40729616802216,55069672-41f7-4dc...,Paul Lopez,3908 David Squares,WA,Male,Phasellus At Company,"gift, card, novel...",4.95,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,
2021-10-06,82081024598,22131,6731,58911,43.24936460061825,1632e377-0b6d-447...,Paul Lopez,3908 David Squares,WA,Male,Placerat Orci Ass...,digital goods: bo...,2.4,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,,


In [23]:
# Drop useless columns and replace fraud nulls with a small value representing its percentage

NULL_PERCENTAGE = 0.1

final_sdf = final_sdf.drop("customer_name","address")
final_sdf = final_sdf.na.fill(NULL_PERCENTAGE)
final_sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-09-18,71350572766,22131,6731,58911,38.35886219595664,fd0cfc42-44b3-467...,WA,Male,Dictum Placerat A...,"cable, satellite,...",1.57,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-10-06,79417999332,22131,6731,58911,48.40729616802216,55069672-41f7-4dc...,WA,Male,Phasellus At Company,"gift, card, novel...",4.95,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-10-06,82081024598,22131,6731,58911,43.24936460061825,1632e377-0b6d-447...,WA,Male,Placerat Orci Ass...,digital goods: bo...,2.4,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1


Total Records after joining:
11818811

## Dealing with current fraud data

Things we know about the fraud datasets:
- if the transaction isn't in fraud dataset fraud probability is 0.01%
- So if we remove all the fraud data then we can be fairly certain that all the remaining data is accurate and fraud free
- this allows us to predict future fraud

In [24]:
# Check the affects of removing fraud transactions based on threshold 

MERCHANT_THRESH = 1
CONSUMER_THRESH = 1

print(final_sdf.count())
testdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
testdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)
print(testdf.count(), final_sdf.count()-testdf.count())

                                                                                

11819686




11757387 62299


                                                                                

Keeping the fraud prob below 1% for each give us a extremely high chance that almost all fraud data has been removed <br>
The removal of 53 thousand records is not alot considering we still have 11 million plus records still left <br>
The fact that most of the fraud transactions are now removed allows us to build a metric for determining future fraud

In [25]:
# Remove fraud transactions based on threshold

final_sdf = final_sdf.filter(final_sdf["merchant_fraud_%"] < MERCHANT_THRESH)
final_sdf = final_sdf.filter(final_sdf["consumer_fraud_%"] < CONSUMER_THRESH)

In [29]:
final_sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-09-18,71350572766,22131,6731,58911,38.35886219595664,fd0cfc42-44b3-467...,WA,Male,Dictum Placerat A...,"cable, satellite,...",1.57,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-10-06,79417999332,22131,6731,58911,48.40729616802216,55069672-41f7-4dc...,WA,Male,Phasellus At Company,"gift, card, novel...",4.95,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-10-06,82081024598,22131,6731,58911,43.24936460061825,1632e377-0b6d-447...,WA,Male,Placerat Orci Ass...,digital goods: bo...,2.4,c,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1


In [30]:
# Save final dataframe used for modelling

final_sdf.write.mode("overwrite").parquet('../data/curated/finaldf.parquet')



22/10/09 16:31:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers
22/10/09 16:31:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 90.08% for 15 writers
22/10/09 16:31:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 84.45% for 16 writers


[Stage 494:>                                                      (0 + 16) / 17]

22/10/09 16:31:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 90.08% for 15 writers
22/10/09 16:31:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers
22/10/09 16:31:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 90.08% for 15 writers
22/10/09 16:31:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers


                                                                                

## Dealing with future fraud

2 main ways to determine fraud:
- High number of transactions in a given day
- High transaction amounts

### Merchant

In [2]:
# Get the stats for each merchant for each day

final_sdf = spark.read.parquet("../data/curated/finaldf.parquet/")

merchanttestdf = final_sdf.groupBy("merchant_abn","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

merchanttestdf.limit(5)

                                                                                

merchant_abn,order_datetime,mean_amount,max_amount,transactions
24852446429,2021-11-26,30.497711726468893,168.92668885555742,1057
24852446429,2021-11-27,31.122764122744787,193.03651325623903,975
24852446429,2021-11-29,30.79315047835758,137.595051098878,974
64203420245,2021-11-26,29.429787219279756,54.91857916779532,970
86578477987,2021-11-26,33.97228016929187,163.1881979840336,961


In [3]:
# Get the average stats for a merchant on any given day

merchanttestdf2 = merchanttestdf.groupBy("merchant_abn").agg(
    mean("mean_amount").alias("mean_amount"),
    max("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    max("transactions").alias("max_transactions"),
    stddev("transactions").alias("transaction_sd")
    ).orderBy(desc("max_transactions"))

merchanttestdf2.limit(5)

                                                                                

merchant_abn,mean_amount,max_amount,mean_transactions,max_transactions,transaction_sd
24852446429,30.030071405361,266.8120329005647,413.6980198019802,1057,88.64735351564971
64203420245,28.928305828390407,54.99964100649264,372.4422442244224,970,80.53990826941647
86578477987,34.99207784086906,293.2330883770684,389.73267326732673,961,83.21378117219548
49891706470,28.96185062870421,54.99997317014122,353.8019801980198,922,76.98159313772817
46804135891,29.99873399521746,402.34853312614206,335.0643564356436,876,72.63330850440376


In [4]:
# Get standard deviation for merchants transaction amounts

stddevdf = final_sdf.groupBy("merchant_abn").agg(stddev("dollar_value").alias("amount_sd"))
stddevdf.limit(5)

merchant_abn,amount_sd
15613631617,196.27471261822637
83412691377,24.48308612034732
48214071373,192.43853288880416
34440496342,51.47665843603984
35344855546,66.222659515954


In [5]:
# Join all merchant stat dataframes

print(merchanttestdf2.count())
future_merchant_frauddf = merchanttestdf2.join(stddevdf, "merchant_abn")
print(future_merchant_frauddf.count())
future_merchant_frauddf.limit(5)

                                                                                

4018


                                                                                

4018


                                                                                

merchant_abn,mean_amount,max_amount,mean_transactions,max_transactions,transaction_sd,amount_sd
83412691377,35.0125111179469,201.92111142420248,20.397689768976896,57,6.1317702357686885,24.48308612034732
86662713230,52.27163302064317,277.15842150670414,30.706270627062707,88,8.428965817137628,30.128423087149255
10648956813,64.57249819775156,102.99670668490096,31.358085808580856,78,8.082763558526128,22.14248423572274
11944993446,73.47109138419792,481.8779170481174,15.795379537953796,39,5.244537033779819,51.82757117534096
90568944804,867.2720789192232,4684.169469909291,14.96351575456053,31,4.66933115121088,592.1492638953868


We now have a dataset that gives us all the statistics we need to determine if future merchant transactions are fraud <br>
for the time being if a merchant has > 1.5 x standard deviation above max_transactions in a day it is considered fraud <br>
if a merchant has > 1.5 x standard deviation above max_amount in a day it is considered fraud

In [6]:
# Saves dataframe for predicitng future merchant fraud

future_merchant_frauddf = future_merchant_frauddf.drop("mean_amount","mean_transactions")

future_merchant_frauddf.write.mode("overwrite").parquet("../data/curated/future_merchant_fraud.parquet")

                                                                                

### User

In [7]:
# Gets the stats for each user for each day

usertestdf = final_sdf.groupBy("user_id","order_datetime").agg(
    mean("dollar_value").alias("mean_amount"),
    max("dollar_value").alias("max_amount"),
    count("dollar_value").alias("transactions"),
    ).orderBy(desc("transactions"))

usertestdf.limit(5)

                                                                                

user_id,order_datetime,mean_amount,max_amount,transactions
5369,2021-12-26,153.7430761832681,1170.4578846318166,12
11002,2021-11-29,64.75970088494984,265.0702191117789,11
6671,2021-11-29,86.35832209092791,324.45234932908204,10
6629,2021-11-26,140.90392507645333,677.5298977903775,10
13985,2021-11-27,95.1525980004112,315.88093383565024,10


In [8]:
# Gets the average stats for a user on any given day

usertestdf2 = usertestdf.groupBy("user_id").agg(
    mean("mean_amount").alias("mean_amount"),
    max("max_amount").alias("max_amount"),
    mean("transactions").alias("mean_transactions"),
    max("transactions").alias("max_transactions"),
    stddev("transactions").alias("transaction_std")
    ).orderBy(desc("max_transactions"))

usertestdf2.limit(5)

                                                                                

user_id,mean_amount,max_amount,mean_transactions,max_transactions,transaction_std
5369,150.90859987007502,4044.791996213037,1.5746478873239436,12,0.9782548611719448
11002,143.30148154439885,3756.619241683773,1.5495750708215297,11,0.9405366520887024
6594,122.2399570850841,1631.7559754340205,1.565459610027855,10,0.9126848158508848
6629,151.57884023838062,4040.9159838767896,1.5501355013550135,10,0.9373333086951994
13985,152.21129536632154,3391.9412364035425,1.568181818181818,10,0.949379218921976


In [9]:
# Get standard deviation for users transaction amounts

stddevdf = final_sdf.groupBy("user_id").agg(stddev("dollar_value").alias("amount_sd"))
stddevdf.limit(5)

user_id,amount_sd
4590,254.4461914237544
19158,306.49392143056724
8075,316.8869028620271
12148,200.3684543568255
9444,387.6909305329309


In [10]:
# Join all user stat dataframes

print(usertestdf2.count())
future_user_frauddf = usertestdf2.join(stddevdf, "user_id")
print(future_user_frauddf.count())
future_user_frauddf.limit(5)

                                                                                

20906


                                                                                

20906


                                                                                

user_id,mean_amount,max_amount,mean_transactions,max_transactions,transaction_std,amount_sd
19979,154.28539185829345,8150.115728134807,1.516304347826087,9,0.84476618165645,400.1678103891637
23492,153.55353346463673,4097.470542751887,1.4808743169398908,9,0.8232347161406164,294.71579937904977
12568,127.10156526244836,2042.0161577505255,1.5493333333333332,8,0.875753813313409,239.9449439981541
15663,143.37670911767736,2697.749698358689,1.541899441340782,8,0.8349365671068216,267.9393245380722
15437,157.63434236717413,4791.825368197756,1.569060773480663,7,0.8132787862153956,377.43574181052736


We now have a dataset that gives us all the statistics we need to determine if future user transactions are fraud <br>
for the time being if a user has > 1.5 x standard deviation above max_transactions in a day it is considered fraud <br>
if a user has > 1.5 x standard deviation above max_amount in a day it is considered fraud

Now remains the case where a user or merchant has little to no data to get these metrics from <br>
We could either set our own thresholds based on logical reasoning for fraud detection<br>
Or we could look at the distributions of existing data to determine thresholds for fraud detection

In [11]:
# Saves dataframe for predicitng future user fraud

future_user_frauddf = future_user_frauddf.drop("mean_amount","mean_transactions")

future_user_frauddf.write.mode("overwrite").parquet("../data/curated/future_user_fraud.parquet")

                                                                                