# Project 2 - Group 13

## 0 - Begin Spark

In [13]:
from pyspark.sql import SparkSession

# Start (or reuse) the SparkSession that runs every Spark job in this notebook.
# NOTE: getOrCreate() hands back an already-running session if there is one; in
# that case Spark warns that only runtime SQL configurations take effect, so the
# memory settings below are only guaranteed to apply on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("MAST30034 Project 2")
    # REPL eager evaluation, so a bare DataFrame expression is evaluated for display.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the SQL session time zone to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # 2 GB each for the executor and the driver.
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

24/08/28 10:19:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## 1 - Data cleaning

### 1.1 - Consumer fraud probability

In [35]:
# Load the consumer fraud probabilities; the CSV has a header row and the
# column types are inferred by Spark.
consumer_fraud_probability_df = spark.read.csv(
    '../data/tables/tables/consumer_fraud_probability.csv',
    header=True,
    inferSchema=True,
)
print(f'Number of entires = {consumer_fraud_probability_df.count()}')

# Preview the first five rows.
consumer_fraud_probability_df.limit(5)

                                                                                

Number of entires = 34864


user_id,order_datetime,fraud_probability
6228,2021-12-19,97.6298077657765
21419,2021-12-10,99.24738020302328
5606,2021-10-17,84.05825045251777
3101,2021-04-17,91.42192091901347
22239,2021-10-19,94.70342477508036


In [36]:
# Print the column names, types and nullability of the consumer fraud table.
consumer_fraud_probability_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)



### 1.2 - Consumer user details

In [37]:
# Load the user_id <-> consumer_id mapping.
# Parquet files carry their own schema, so no `inferSchema` option is passed
# (it is a CSV/JSON reader option and has no effect on a Parquet read).
consumer_user_details_df = spark.read.parquet(
    '../data/tables/tables/consumer_user_details.parquet'
)
print(f'Number of entires = {consumer_user_details_df.count()}')

# Preview the first five rows.
consumer_user_details_df.limit(5)

Number of entires = 499999


user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [38]:
# Print the column names, types and nullability of the user details table.
consumer_user_details_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)



### 1.3 - Merchant fraud probability

In [33]:
# Load the merchant fraud probabilities; the CSV has a header row and the
# column types are inferred by Spark.
merchant_fraud_probability_df = spark.read.csv(
    '../data/tables/tables/merchant_fraud_probability.csv',
    header=True,
    inferSchema=True,
)
print(f'Number of entires = {merchant_fraud_probability_df.count()}')

# Preview the first five rows.
merchant_fraud_probability_df.limit(5)

Number of entires = 114


merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902


In [34]:
# Print the column names, types and nullability of the merchant fraud table.
merchant_fraud_probability_df.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)



### 1.4 - TBL consumer

In [31]:
# Load the consumer table. Unlike the other CSVs in this notebook, this file
# is pipe-delimited, so the delimiter is set explicitly before reading.
tbl_consumer_df = (
    spark.read
    .option("delimiter", '|')
    .csv(
        '../data/tables/tables/tbl_consumer.csv',
        header=True,
        inferSchema=True,
    )
)
print(f'Number of entires = {tbl_consumer_df.count()}')

# Preview the first five rows.
tbl_consumer_df.limit(5)

                                                                                

Number of entires = 499999


name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


In [None]:
from pyspark.sql import types

# Explicit schema for the consumer table, mirroring the columns and types that
# tbl_consumer_df.printSchema() reports. It can be passed to the CSV reader via
# `schema=consumerSchema` in place of `inferSchema=True`.
# NOTE(review): postcode is kept as an integer to match the inferred schema, but
# the preview shows a three-digit NT postcode (862), which suggests a leading
# zero may be lost with an integer type. Consider StringType; TODO confirm
# against the raw file.
consumerSchema = types.StructType([
    types.StructField("name", types.StringType(), True),
    types.StructField("address", types.StringType(), True),
    types.StructField("state", types.StringType(), True),
    types.StructField("postcode", types.IntegerType(), True),
    types.StructField("gender", types.StringType(), True),
    types.StructField("consumer_id", types.IntegerType(), True),
])

In [32]:
# Print the column names, types and nullability of the consumer table.
tbl_consumer_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- consumer_id: integer (nullable = true)



### 1.5 - TBL merchants

In [18]:
# Load the merchant table from its Parquet file.
tbl_merchants_df = spark.read.parquet(
    '../data/tables/tables/tbl_merchants.parquet'
)
print(f'Number of entires = {tbl_merchants_df.count()}')

# Preview the first five rows.
tbl_merchants_df.limit(5)

Number of entires = 4026


name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [30]:
# Print the column names, types and nullability of the merchant table.
tbl_merchants_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



### 1.6 - Transactions

#### 1.6.1 - Tables 2

In [20]:
# Load the transaction snapshot whose folder name spans 2021-02-28 to 2021-08-27.
tables_2_df = spark.read.parquet(
    '../data/tables/tables 2/transactions_20210228_20210827_snapshot'
)
print(f'Number of entires = {tables_2_df.count()}')

# Preview the first five rows.
tables_2_df.limit(5)

                                                                                

Number of entires = 3643266


user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [21]:
# Load the transaction snapshot whose folder name spans 2022-02-28 to 2022-08-28.
tables_3_df = spark.read.parquet(
    '../data/tables/tables 2/transactions_20220228_20220828_snapshot'
)
print(f'Number of entires = {tables_3_df.count()}')

# Preview the first five rows.
tables_3_df.limit(5)

                                                                                

Number of entires = 6044133


user_id,merchant_abn,dollar_value,order_id,order_datetime
11139,96152467973,16.213590228273233,785b0080-9e4b-471...,2022-08-20
1,98973094975,86.97955945703498,2560f7b0-ee5d-4b3...,2022-08-20
11139,56762458844,31.513502323509197,0311717b-8b5b-410...,2022-08-20
1,89502033586,124.18468694868491,f8891626-f098-45b...,2022-08-20
11139,96161808980,61.620445567668966,d90a421f-f1da-4bf...,2022-08-20


In [22]:
# Load the transaction snapshot whose folder name spans 2021-08-28 to 2022-02-27.
tables_4_df = spark.read.parquet(
    '../data/tables/tables 2/transactions_20210828_20220227_snapshot'
)
print(f'Number of entires = {tables_4_df.count()}')

# Preview the first five rows.
tables_4_df.limit(5)


                                                                                

Number of entires = 4508106


user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26
1,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26
14936,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26


In [26]:
# Stack the three transaction snapshots into a single DataFrame.
# unionByName matches columns by name rather than by position, so a snapshot
# whose columns were written in a different order cannot silently misalign
# values (plain union() resolves columns purely by position).
transactions_df = (
    tables_2_df
    .unionByName(tables_3_df)
    .unionByName(tables_4_df)
)
print(f'Number of entires = {transactions_df.count()}')

# Preview the first twenty rows of the combined table.
transactions_df.limit(20)

                                                                                

Number of entires = 14195505


user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20
