# Project 2 - Group 13

## 0 - Begin Spark

In [176]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

## 1 - Data cleaning

### 1.1 - Consumer fraud probability

In [177]:
consumer_fraud_probability_df = spark.read.csv('../data/tables/tables/consumer_fraud_probability.csv', header = True)
consumer_fraud_probability_df = consumer_fraud_probability_df.withColumnRenamed('order_datetime', 'consumer_datetime')
print(f'Number of entires = {consumer_fraud_probability_df.count()}')
consumer_fraud_probability_df.limit(5)

Number of entires = 34864


user_id,consumer_datetime,fraud_probability
6228,2021-12-19,97.6298077657765
21419,2021-12-10,99.24738020302328
5606,2021-10-17,84.05825045251777
3101,2021-04-17,91.42192091901347
22239,2021-10-19,94.70342477508036


### 1.2 - Consumer user details

In [178]:
consumer_user_details_df = spark.read.parquet('../data/tables/tables/consumer_user_details.parquet')
print(f'Number of entires = {consumer_user_details_df.count()}')
consumer_user_details_df.limit(5)

Number of entires = 499999


user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


### 1.3 - Merchant fraud probability

In [179]:
merchant_fraud_probability_df = spark.read.csv('../data/tables/tables/merchant_fraud_probability.csv', header = True)
merchant_fraud_probability_df = merchant_fraud_probability_df.withColumnRenamed('order_datetime', 'merchant_datetime')

print(f'Number of entires = {merchant_fraud_probability_df.count()}')
merchant_fraud_probability_df.limit(5)

Number of entires = 114


merchant_abn,merchant_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902


### 1.4 - TBL Consumer

In [180]:
import pandas as pd
tbl_consumer_df = pd.read_csv('../data/tables/tables/tbl_consumer.csv', sep="|")

tbl_consumer_pdf = spark.createDataFrame(tbl_consumer_df)
print(f'Number of entires = {tbl_consumer_pdf.count()}')
tbl_consumer_pdf.limit(5)

24/09/01 14:43:00 WARN TaskSetManager: Stage 596 contains a task of very large size (2276 KiB). The maximum recommended task size is 1000 KiB.


Number of entires = 499999


24/09/01 14:43:01 WARN TaskSetManager: Stage 599 contains a task of very large size (2276 KiB). The maximum recommended task size is 1000 KiB.
24/09/01 14:43:01 WARN TaskSetManager: Stage 600 contains a task of very large size (2276 KiB). The maximum recommended task size is 1000 KiB.


name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


### 1.5 - TBL merchants

In [167]:
tbl_merchants_df = spark.read.parquet('../data/tables/tables/tbl_merchants.parquet')
print(f'Number of entires = {tbl_merchants_df.count()}')
tbl_merchants_df.limit(5)

Number of entires = 4026


name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


### 1.6 - Transactions

#### 1.6.1 - Tables 2

In [168]:
tables_2_df = spark.read.parquet('../data/tables/tables 2/transactions_20210228_20210827_snapshot')
print(f'Number of entires = {tables_2_df.count()}')
tables_2_df.limit(5)

Number of entires = 3643266


user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [169]:
tables_3_df = spark.read.parquet('../data/tables/tables 3/transactions_20210828_20220227_snapshot')
print(f'Number of entires = {tables_3_df.count()}')
tables_3_df.limit(5)

Number of entires = 4508106


user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26
1,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26
14936,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26


In [187]:
tables_4_df = spark.read.parquet('../data/tables/tables 4/transactions_20220228_20220828_snapshot')
print(f'Number of entires = {tables_4_df.count()}')
tables_4_df.limit(5)


Number of entires = 6044133


user_id,merchant_abn,dollar_value,order_id,order_datetime
11139,96152467973,16.213590228273233,785b0080-9e4b-471...,2022-08-20
1,98973094975,86.97955945703498,2560f7b0-ee5d-4b3...,2022-08-20
11139,56762458844,31.513502323509197,0311717b-8b5b-410...,2022-08-20
1,89502033586,124.18468694868491,f8891626-f098-45b...,2022-08-20
11139,96161808980,61.620445567668966,d90a421f-f1da-4bf...,2022-08-20


In [188]:
transactions_df = tables_2_df.union(tables_3_df).union(tables_4_df)
print(f'Number of entires = {transactions_df.count()}')
transactions_df.limit(5)

Number of entires = 14195505


user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [189]:
from pyspark.sql.window import Window

# Join tbl_merchant based on ABN
transactions_df = transactions_df.join(F.broadcast(tbl_merchants_df),
                                       transactions_df.merchant_abn == tbl_merchants_df.merchant_abn).drop(tbl_merchants_df.merchant_abn)

# Join fraud probability based on ABN and date time to ensure correct fraud probability
joined_df = transactions_df.join(F.broadcast(
    merchant_fraud_probability_df),
    (transactions_df['merchant_abn'] == merchant_fraud_probability_df['merchant_abn']) &
    (transactions_df['order_datetime'] >= merchant_fraud_probability_df['merchant_datetime']),
    how='inner'
).drop(merchant_fraud_probability_df.merchant_abn)

# Use Window function to rank based on merchant_date, keeping the most recent one per order
window_spec = Window.partitionBy("user_id", "order_datetime", "merchant_abn").orderBy(F.col("merchant_datetime").desc())

ranked_df = joined_df.withColumn("rank", F.row_number().over(window_spec))

# Filter to keep only the most recent valid merchant_date
transactions_df = ranked_df.filter(F.col("rank") == 1).drop("rank")

# Join fraud probability based on user id and datetime
joined_df = transactions_df.join(F.broadcast(
    consumer_fraud_probability_df),
    (transactions_df['user_id'] == consumer_fraud_probability_df['user_id']) &
    (transactions_df['order_datetime'] >= consumer_fraud_probability_df['consumer_datetime']),
    how='inner'
).drop(consumer_fraud_probability_df.user_id)

# Use Window function to rank based on merchant_date, keeping the most recent one per order
window_spec = Window.partitionBy("user_id", "order_datetime").orderBy(F.col("consumer_datetime").desc())

ranked_df = joined_df.withColumn("rank", F.row_number().over(window_spec))

# Filter to keep only the most recent valid merchant_date
transactions_df = ranked_df.filter(F.col("rank") == 1).drop("rank")

# Join Consumer Details based on user id
transactions_df = transactions_df.join(F.broadcast(consumer_user_details_df),
                                       transactions_df.user_id == consumer_user_details_df.user_id).drop(consumer_user_details_df.user_id)

# Join Consumer Details based on user id
transactions_df = transactions_df.join(F.broadcast(tbl_consumer_pdf),
                                       transactions_df.consumer_id == tbl_consumer_pdf.consumer_id).drop(tbl_consumer_pdf.consumer_id)

transactions_df.limit(5)

24/09/01 14:48:06 WARN TaskSetManager: Stage 652 contains a task of very large size (2276 KiB). The maximum recommended task size is 1000 KiB.
24/09/01 14:48:10 WARN TaskSetManager: Stage 663 contains a task of very large size (2276 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime,name,tags,merchant_datetime,fraud_probability,consumer_datetime,fraud_probability.1,consumer_id,name.1,address,state,postcode,gender
3,94493496784,42.23885411248212,36486127-44c5-4f0...,2022-09-21,Dictum Phasellus ...,"[(gift, card, nov...",2021-11-26,30.57903215900633,2021-11-03,8.300636455314633,1194530,Jill Jones MD,40693 Henry Greens,NT,862,Female
9,21439773999,12.185449928604612,b409ec84-7588-409...,2022-10-12,Mauris Non Institute,"([cable, satellit...",2021-11-26,28.504479048104585,2021-12-13,10.58055311139687,650435,Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female
14,93558142492,182.2576600993008,9df0cbc7-2ef0-499...,2022-06-26,Dolor Quisque Inc.,"((shoe shops), (b...",2021-11-26,31.343502316867784,2021-12-03,16.605682602965032,1343547,Lance Butler,8943 Kenneth Camp,VIC,3332,Male
15,90568944804,940.3586108346468,ac2c4931-c60e-412...,2022-03-08,Diam Eu Dolor LLC,[(tent and awNing...,2021-11-29,33.172494688960434,2021-11-27,16.99959017524405,1463076,Paul Abbott,60495 Ryan Hill,QLD,4512,Male
15,11149063370,7510.903729010571,fc233aa9-caed-421...,2022-07-03,Et Arcu Limited,([art dealers and...,2022-02-25,51.01538421455241,2021-11-27,16.99959017524405,1463076,Paul Abbott,60495 Ryan Hill,QLD,4512,Male
