Import pyspark

In [82]:
#import spark
from pyspark.sql import SparkSession
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 ass2 BNPL group 28")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [83]:
# Read the merchants and consumer parquet data
merchants = spark.read.parquet('../data/tables/tbl_merchants.parquet')
consumers = spark.read.parquet('../data/tables/consumer_user_details.parquet')
transactions = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
consumers_csv = spark.read.options(header='True', inferSchema='True', delimiter='|').csv('../data/tables/tbl_consumer.csv')


In [84]:
# The merchants data size
print(merchants.count(),len(merchants.columns))

4026 3


In [85]:
merchants.limit(5)

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [86]:
# The mechants data column types
merchants.dtypes

[('name', 'string'), ('tags', 'string'), ('merchant_abn', 'bigint')]

In [87]:
consumers.limit(5)

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [88]:
consumers_csv.groupBy('state','postcode').count().limit(5)

state,postcode,count
VIC,3279,175
SA,5261,154
NSW,2334,181
NSW,1440,150
NSW,1430,149


In [89]:
consumers_csv.limit(5)

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


In [90]:
transactions.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [91]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [92]:
# get data sizes of transactions and consumers
print("The transactions size is")
print(transactions.count(),len(transactions.columns))

print("The consumers size is")
print(consumers.count(),len(consumers.columns))

The transactions size is
3643266 5
The consumers size is
499999 2


In [93]:
## left outer join transaction data with consumers data by user_id.
new_transaction = transactions.join(consumers, transactions.user_id == consumers.user_id, "leftouter").drop(consumers.user_id)
new_transaction.limit(5)


user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,467663
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,1194530
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,467663


In [94]:
new_transaction = new_transaction.join(merchants, new_transaction.merchant_abn == merchants.merchant_abn, "leftouter").drop(merchants.merchant_abn)
new_transaction.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,name,tags
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338,Cursus Non Egesta...,"[(furniture, home..."
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208,Commodo Associates,"[(opticians, optI..."
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,467663,Lobortis Ultrices...,((music shops - m...
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,1194530,Ultricies Digniss...,"([gift, card, Nov..."
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,467663,Dictum Phasellus ...,"[(gift, card, nov..."


In [95]:
new_transaction = new_transaction.join(consumers_csv, new_transaction.consumer_id == consumers_csv.consumer_id, "leftouter").drop(consumers_csv.consumer_id)
new_transaction.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,name,tags,name.1,address,state,postcode,gender
18480,77338620996,516.6146266349506,69b5003f-d2ad-496...,2021-08-26,1398491,Fames Ac Turpis LLC,"[(computers, comp...",John Fischer,4770 Mayer Passag...,QLD,4887,Male
18480,41663117354,57.27189678911508,c9740db1-9cbb-4a5...,2021-06-11,1398491,Litora Torquent I...,"((watch, clock, a...",John Fischer,4770 Mayer Passag...,QLD,4887,Male
18480,29362189014,34.872615881373385,77744155-905d-4e1...,2021-08-09,1398491,Vestibulum Limited,((digital goods: ...,John Fischer,4770 Mayer Passag...,QLD,4887,Male
18480,66571984047,157.0,8004db92-a41b-4b3...,2021-06-22,1398491,Volutpat Company,"((cable, satellit...",John Fischer,4770 Mayer Passag...,QLD,4887,Male
18480,58377425534,718.2841062647379,c6304905-8527-493...,2021-06-22,1398491,At Corp.,((artist supply a...,John Fischer,4770 Mayer Passag...,QLD,4887,Male


In [96]:
new_transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



In [109]:
## try read curated data
curated_csv = spark.read.options(header = True).csv('../data/curated/full_data.csv')
curated_parquet = spark.read.parquet('../data/curated/full_data.parquet')

In [110]:
curated_csv.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,merchant_name,tags,user_name,address,state,postcode,gender
5663,68559320474,127.87619236848516,92d204da-b3e0-4fe...,2021-08-21,739,Aliquam Auctor As...,([antique shops -...,Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,76819856970,26.58889741187781,c4d1da2b-70e9-453...,2021-04-15,739,Egestas Blandit Ltd,((tent and awning...,Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,66571984047,157.0,560e2457-6945-48b...,2021-08-19,739,Volutpat Company,"((cable, satellit...",Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,16256895427,49.18863399914058,d5679cae-7b2d-4a9...,2021-04-15,739,Tempus Non Founda...,[(lawn and garden...,Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,48534842833,125.6566406864822,56577231-956c-403...,2021-08-22,739,Dictum Eu Inc.,"((gift, card, nov...",Cory Hancock,9246 Valerie Lock,NT,880,Male


In [111]:
curated_parquet.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,merchant_name,tags,user_name,address,state,postcode,gender
5663,68559320474,127.87619236848516,92d204da-b3e0-4fe...,2021-08-21,739,Aliquam Auctor As...,([antique shops -...,Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,66571984047,157.0,560e2457-6945-48b...,2021-08-19,739,Volutpat Company,"((cable, satellit...",Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,48534842833,125.6566406864822,56577231-956c-403...,2021-08-22,739,Dictum Eu Inc.,"((gift, card, nov...",Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,72434058183,70.76516555623456,e238aac2-4d9a-499...,2021-08-14,739,Ipsum Primis Corp...,"[[bOoks, periodic...",Cory Hancock,9246 Valerie Lock,NT,880,Male
5663,37304535258,584.1910006855837,d3325232-457e-410...,2021-08-15,739,Tempus Non Institute,"((shoe shops), (b...",Cory Hancock,9246 Valerie Lock,NT,880,Male


In [112]:
curated_csv.count() == curated_parquet.count()

True

In [113]:
curated_csv.count()

3643266