In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Create (or reuse) the Spark session that runs every Spark job in this notebook.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Show DataFrames as tables when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # Key corrected from "spark.executer.memory": the misspelled key is not a
    # Spark setting, so the executor memory request was silently ignored.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
# Load the curated transaction table from parquet and list its columns.
sdf = spark.read.format("parquet").load("../data/curated/full_data_with_fraud/")
sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- SA2_code: integer (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- mean_total_income: integer (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- is_fraud: integer (nullable = true)



In [4]:
# Split the data into two half-open date windows [start, end) on order_datetime:
# the 2021 window feeds the features, the 2022 window feeds the targets.
train_sdf = sdf.where(
    (F.col('order_datetime') >= '2021-02-28')
    & (F.col('order_datetime') < '2021-08-28')
)
label_sdf = sdf.where(
    (F.col('order_datetime') >= '2022-02-28')
    & (F.col('order_datetime') < '2022-08-28')
)

In [5]:
# Row counts of the feature window and the target window, in that order.
tuple(window.count() for window in (train_sdf, label_sdf))

(3465784, 4169494)

### Created Features
The following features are created by aggregating, for each merchant, the records dated from 2021-02-28 (inclusive) to 2021-08-28 (exclusive). They are used as input features for the model.
1. Total number of consumers
2. Average transaction dollar value
3. Total number of transactions
4. Mean income of consumers
5. Revenue level
6. BNPL revenue = sum of (take rate × dollar value) over the merchant's transactions
7. Number of distinct postcodes
8. Tag (one-hot encoded)

The following features are created by aggregating, for each merchant, the records dated from 2022-02-28 (inclusive) to 2022-08-28 (exclusive). They are the targets the model predicts, and the predicted values are then used as features in the final ranking system.
1. Total number of consumers
2. BNPL revenue
3. Total number of transactions

In [6]:
# Build one row of features per merchant from the training window.
train_data = (
    train_sdf
    .groupBy('merchant_abn')
    .agg(
        F.countDistinct('consumer_id').alias('total_num_consumer'),
        F.avg('dollar_value').alias('avg_dollar_value'),
        F.countDistinct('order_id').alias('total_num_transaction'),
        # Averaged over transaction rows, not over distinct consumers.
        F.avg('mean_total_income').alias('mean_income'),
        # NOTE(review): first() keeps a single value per merchant; this assumes
        # revenue_level and tags are constant within a merchant — TODO confirm.
        F.first('revenue_level').alias('revenue_level'),
        F.sum(F.col('dollar_value') * F.col('take_rate')).alias('total_revenue'),
        F.countDistinct('postcode').alias('total_num_postcode'),
        F.first('tags').alias('tag'),
    )
)
train_data

merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag
10023283211,810,208.75059444042955,825,62478.72363636363,e,30999.46450621179,720,furniture
10142254217,734,40.9846955092763,753,62668.40903054449,b,130235.42105701164,645,cable
10187291046,88,110.32619101999413,88,60917.59090909091,b,31941.63845375101,88,watch
10192359162,107,451.1432080236007,107,65637.3831775701,a,305563.80254357896,107,music
10206519221,2246,39.18969916140156,2372,62113.16231028668,a,589353.5212290141,1620,gift
10255988167,218,389.5552654520502,218,63146.61926605504,b,366867.58137013006,211,computer
10264435225,1241,114.09004324213888,1275,61996.07843137255,c,347660.89952947287,1019,watch
10279061213,131,312.3494031465132,131,63731.02290076336,a,233640.4786085125,128,computer
10323485998,2481,129.06002950289465,2627,62801.365816520745,a,2241059.0557689457,1731,furniture
10342410215,238,378.0169713940928,238,61579.6512605042,a,570397.3822040077,227,computer


In [7]:
# Build one row of targets per merchant from the label window; the y_ prefix
# keeps these columns distinct from the like-named training features.
label = (
    label_sdf
    .groupBy('merchant_abn')
    .agg(
        F.countDistinct('consumer_id').alias('y_total_num_consumer'),
        F.sum(F.col('dollar_value') * F.col('take_rate')).alias('y_total_revenue'),
        F.countDistinct('order_id').alias('y_total_num_transaction'),
    )
)

In [8]:
# Number of merchants with features and number of merchants with targets.
tuple(table.count() for table in (train_data, label))

(3981, 3997)

In [9]:
# Attach the targets to the features with a left join: every merchant that has
# training-window features is kept, and merchants that only appear in the label
# window are excluded because there is no historical data to predict from.
# Merchants with features but no label-window records get null y_* columns.
#
# The target columns are dropped before joining so the cell can be re-run
# safely. Without this, a second run would join `label` onto the already joined
# frame and create duplicate y_* columns. drop() ignores columns that are not
# present, so the first run is unaffected.
train_data = (
    train_data
    .drop(*[c for c in label.columns if c != "merchant_abn"])
    .join(label, ["merchant_abn"], how="left")
)
train_data

merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,y_total_num_consumer,y_total_revenue,y_total_num_transaction
10023283211,810,208.75059444042955,825,62478.72363636363,e,30999.46450621179,720,furniture,981,39036.25170314089,1003
10142254217,734,40.98469550927631,753,62668.40903054449,b,130235.42105701164,645,cable,918,154467.92091896123,925
10187291046,88,110.32619101999413,88,60917.59090909091,b,31941.63845375101,88,watch,99,41683.21121325837,100
10192359162,107,451.1432080236007,107,65637.3831775701,a,305563.80254357896,107,music,111,330982.8195157424,111
10206519221,2246,39.18969916140156,2372,62113.16231028668,a,589353.5212290141,1620,gift,2662,666766.4067054288,2811
10255988167,218,389.5552654520502,218,63146.61926605504,b,366867.5813701301,211,computer,237,390460.7068389275,238
10264435225,1241,114.09004324213888,1275,61996.07843137255,c,347660.8995294729,1019,watch,1519,435003.6795629895,1566
10279061213,131,312.3494031465132,131,63731.02290076336,a,233640.47860851244,128,computer,161,273146.77071188006,161
10323485998,2481,129.06002950289457,2627,62801.365816520745,a,2241059.0557689453,1731,furniture,2974,2765582.918877613,3191
10342410215,238,378.0169713940928,238,61579.6512605042,a,570397.3822040077,227,computer,275,682110.1735125553,277


In [10]:
# Save train_data as parquet, replacing whatever is already at that path.
train_data.write.mode('overwrite').parquet("../data/curated/train_data")