In [1]:
from pyspark.sql import functions as F, SparkSession

In [2]:
# Create a Spark session for the consumer model notebook.
spark = (
    SparkSession.builder.appName("consumer model")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Use UTC as the session time zone regardless of the host machine.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    # This key was previously misspelled ("spark.execturo.memory"), so the
    # executor memory setting was never actually applied.
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

24/09/08 09:39:07 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.92.15.134 instead (on interface en0)
24/09/08 09:39:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/08 09:39:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [21]:
# Read the curated parquet datasets: consumer details, transaction records
# and consumer fraud probabilities (in that order).
consumer_info, transaction_records, fraudulent_consumer_rate = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        '../data/curated/consumer_info.parquet',
        '../data/curated/transaction_records.parquet',
        '../data/curated/consumer_fraud_rate.parquet',
    )
)

In [58]:
# Attach consumer details to every fraud-probability record; the inner join
# keeps only consumers present in both tables.
fraudulent_consumer_with_info = consumer_info.join(
    fraudulent_consumer_rate,
    on="consumer_id",
    how="inner",
)


In [71]:
# Mean fraud probability per postcode and per state, computed over the
# fraud records joined with consumer details.
fraudulent_consumer_group_by_postcode = (
    fraudulent_consumer_with_info
    .groupBy("postcode")
    .agg(F.avg("fraud_probability").alias("average_fraud_prob_of_postcode"))
)

fraudulent_consumer_group_by_state = (
    fraudulent_consumer_with_info
    .groupBy("state")
    .agg(F.avg("fraud_probability").alias("average_fraud_prob_of_state"))
)

In [22]:
# Collapse the fraud records to one row per consumer holding their mean
# fraud probability.
average_fraudulent_consumer_rate = (
    fraudulent_consumer_rate
    .groupBy("consumer_id")
    .agg(F.avg("fraud_probability").alias("average_fraud_probability"))
)

In [35]:
# Total number of fraud-probability records (rows, not distinct consumers).
fraudulent_consumer_rate.count()

34864

In [30]:
# Number of consumers whose average fraud probability is strictly positive.
has_positive_fraud_probability = F.col("average_fraud_probability") > 0
average_fraudulent_consumer_rate.filter(has_positive_fraud_probability).count()

20128

In [4]:
# Enrich each transaction with the details of the consumer who made it;
# transactions without a matching consumer are dropped by the inner join.
consumer_transaction_records = transaction_records.join(
    consumer_info,
    on="consumer_id",
    how="inner",
)

In [None]:
# Summary statistics for order value; the author observed that dollar_value
# has very high variance and large magnitudes.
order_values = consumer_transaction_records.select("dollar_value")
order_values.summary()

In [48]:
# Per-consumer order-value profile: mean, range and spread of order value,
# plus purchase frequency (count of non-null dollar_value rows).
order_value_aggregates = [
    F.avg("dollar_value").alias("average_dollar_value"),
    F.min("dollar_value").alias("min_dollar_value"),
    F.max("dollar_value").alias("max_dollar_value"),
    F.count("dollar_value").alias("transaction_count"),
    F.stddev("dollar_value").alias("stddev_dollar_value"),
]
consumer_transaction_value_analysis = (
    consumer_transaction_records
    .groupBy("consumer_id", "state", "postcode")
    .agg(*order_value_aggregates)
)

In [78]:
# Assemble the modelling frame: per-consumer fraud average, order-value
# statistics, and the postcode- and state-level fraud averages.
# NOTE(review): state and postcode come from the left-joined transaction
# statistics, so consumers without transactions would carry nulls there and
# be dropped by the inner joins below — confirm that is intended.
fraudulent_consumer_summary = (
    average_fraudulent_consumer_rate
    .join(consumer_transaction_value_analysis, on="consumer_id", how="left")
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner")
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")
)

In [79]:
# Display the assembled modelling frame.
fraudulent_consumer_summary


                                                                                

20128

In [12]:
# Preview the transactions joined with consumer details.
# NOTE(review): the rendered output includes consumer names — confirm the
# data is safe to share before committing the notebook outputs.
consumer_transaction_records

consumer_id,merchant_abn,dollar_value,order_id,order_datetime,name,gender,state,postcode
1059280,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26,Cameron Adams,Male,QLD,4563
1195503,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26,Yolanda Williams,Female,WA,6935
986886,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26,Maria Riley,Female,SA,5157
1195503,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26,Yolanda Williams,Female,WA,6935
986886,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26,Maria Riley,Female,SA,5157
179208,67978471888,691.5028234458998,0380e9ad-b0e8-420...,2021-11-26,Mary Smith,Female,NSW,2782
986886,60956456424,102.13952056640888,5ac3da9c-5147-452...,2021-11-26,Maria Riley,Female,SA,5157
179208,47644196714,644.5220654863093,4e368e44-86f8-4de...,2021-11-26,Mary Smith,Female,NSW,2782
267457,39649557865,209.12780951421405,4d78cd01-4bab-494...,2021-11-26,Jasmine Ford,Undisclosed,NSW,2625
1194530,88402174457,141.0387993699113,c50c957d-ecfc-430...,2021-11-26,Jill Jones MD,Female,NT,862


# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/