### Pyspark initialization

In [2]:
#import spark
from pyspark.sql import SparkSession
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 ass2 BNPL group 28")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/09/07 11:30:14 WARN Utils: Your hostname, Rudyletsgo resolves to a loopback address: 127.0.1.1; using 172.17.87.180 instead (on interface eth0)
22/09/07 11:30:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/09/07 11:30:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/07 11:30:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Read data

In [3]:
# Read the merchants and consumer parquet data
merchants = spark.read.parquet('../data/tables/tbl_merchants.parquet')
consumers = spark.read.parquet('../data/tables/consumer_user_details.parquet')
transactions = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
consumers_csv = spark.read.options(header='True', inferSchema='True', delimiter='|').csv('../data/tables/tbl_consumer.csv')


### General understanding of each dataset

In [4]:
# The merchants data size
print(merchants.count(),len(merchants.columns))

4026 3


In [5]:
merchants.limit(5)

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [6]:
# The mechants data column types
merchants.dtypes

[('name', 'string'), ('tags', 'string'), ('merchant_abn', 'bigint')]

In [7]:
consumers.limit(5)

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [8]:
consumers_csv.groupBy('state','postcode').count().limit(5)

                                                                                

state,postcode,count
VIC,3279,175
SA,5261,154
NSW,2334,181
NSW,1440,150
NSW,1430,149


In [9]:
consumers_csv.limit(5)

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


In [10]:
transactions.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [11]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [12]:
# get data sizes of transactions and consumers
print("The transactions size is")
print(transactions.count(),len(transactions.columns))

print("The consumers size is")
print(consumers.count(),len(consumers.columns))

The transactions size is
3643266 5
The consumers size is
499999 2


### Join all datasets together

In [13]:
## left outer join transaction data with consumers data by user_id.
new_transaction = transactions.join(consumers, transactions.user_id == consumers.user_id, "leftouter").drop(consumers.user_id)
new_transaction.limit(5)


user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,467663
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,1194530
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,467663


In [14]:
new_transaction = new_transaction.join(merchants, new_transaction.merchant_abn == merchants.merchant_abn, "leftouter").drop(merchants.merchant_abn)
new_transaction.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,name,tags
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338,Cursus Non Egesta...,"[(furniture, home..."
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208,Commodo Associates,"[(opticians, optI..."
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,467663,Lobortis Ultrices...,((music shops - m...
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,1194530,Ultricies Digniss...,"([gift, card, Nov..."
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,467663,Dictum Phasellus ...,"[(gift, card, nov..."


In [15]:
new_transaction = new_transaction.join(consumers_csv, new_transaction.consumer_id == consumers_csv.consumer_id, "leftouter").drop(consumers_csv.consumer_id)
new_transaction.limit(5)

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,name,tags,name.1,address,state,postcode,gender
5630,60956456424,145.26081329000152,1e14adeb-8e13-44f...,2021-08-21,28242,Ultricies Digniss...,"([gift, card, Nov...",Philip Crawford,7487 Serrano Gard...,NT,841,Undisclosed
5630,48534649627,120.25889985200416,08476339-f383-4ab...,2021-08-15,28242,Dignissim Maecena...,"[[opticians, oPti...",Philip Crawford,7487 Serrano Gard...,NT,841,Undisclosed
5630,60956456424,135.5412540082104,aacfd47a-438b-47f...,2021-08-15,28242,Ultricies Digniss...,"([gift, card, Nov...",Philip Crawford,7487 Serrano Gard...,NT,841,Undisclosed
5630,89932674734,95.37693966478514,6d5790c9-0eef-453...,2021-08-16,28242,Nulla Vulputate C...,((aRtist supply a...,Philip Crawford,7487 Serrano Gard...,NT,841,Undisclosed
5630,14089706307,440.1209771148284,43d1361a-1101-41a...,2021-08-16,28242,Donec Institute,[(computer progra...,Philip Crawford,7487 Serrano Gard...,NT,841,Undisclosed


In [16]:
new_transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



### Curated data exploration

In [17]:
## try read curated data
curated_csv = spark.read.options(header = True).csv('../data/curated/full_data.csv')
curated_parquet = spark.read.parquet('../data/curated/full_data.parquet')

In [18]:
curated_csv.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,merchant_name,tags,user_name,address,state,postcode,gender
19087,97861055416,120.23855660098252,5dbcd6ea-38eb-4e6...,2021-08-20,5538,Morbi Neque Inc.,"[[coMputers, comp...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,86578477987,72.28588946469911,e6afee18-ee2b-447...,2021-08-20,5538,Leo In Consulting,"[[watch, clock, a...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,29936037423,29.50053489475463,d7e18a71-5e10-4e0...,2021-08-19,5538,Sodales Incorporated,"((hobby, toy and ...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,43061683632,18.123279553123165,19511b77-2d85-402...,2021-08-19,5538,A Facilisis PC,"([watch, clock, a...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,45629217853,21.107949409694672,1b29ade5-b0cd-47f...,2021-08-19,5538,Lacus Consulting,"[[gift, Card, nov...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female


In [19]:
curated_parquet.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,consumer_id,merchant_name,tags,user_name,address,state,postcode,gender
19087,97861055416,120.23855660098252,5dbcd6ea-38eb-4e6...,2021-08-20,5538,Morbi Neque Inc.,"[[coMputers, comp...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,86578477987,72.28588946469911,e6afee18-ee2b-447...,2021-08-20,5538,Leo In Consulting,"[[watch, clock, a...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,29936037423,29.50053489475463,d7e18a71-5e10-4e0...,2021-08-19,5538,Sodales Incorporated,"((hobby, toy and ...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,43061683632,18.123279553123165,19511b77-2d85-402...,2021-08-19,5538,A Facilisis PC,"([watch, clock, a...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female
19087,45629217853,21.107949409694672,1b29ade5-b0cd-47f...,2021-08-19,5538,Lacus Consulting,"[[gift, Card, nov...",Alexandra Davis,90889 Williams Lo...,QLD,4417,Female


In [20]:
curated_csv.count() == curated_parquet.count()

                                                                                

True

In [21]:
print(curated_csv.count(),len(curated_csv.columns))

3643266 13


In [35]:
curated_parquet.dtypes

[('user_id', 'bigint'),
 ('merchant_abn', 'bigint'),
 ('dollar_value', 'double'),
 ('order_id', 'string'),
 ('order_datetime', 'date'),
 ('consumer_id', 'bigint'),
 ('merchant_name', 'string'),
 ('tags', 'string'),
 ('user_name', 'string'),
 ('address', 'string'),
 ('state', 'string'),
 ('postcode', 'int'),
 ('gender', 'string')]

### Null data filtering

In [37]:
from pyspark.sql.functions import col,isnan, when, count
curated_csv.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in curated_csv.columns]
   ).show()



+-------+------------+------------+--------+--------------+-----------+-------------+------+---------+-------+-----+--------+------+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|consumer_id|merchant_name|  tags|user_name|address|state|postcode|gender|
+-------+------------+------------+--------+--------------+-----------+-------------+------+---------+-------+-----+--------+------+
|      0|           0|           0|       0|             0|          0|       149228|149228|        0|      0|    0|       0|     0|
+-------+------------+------------+--------+--------------+-----------+-------------+------+---------+-------+-----+--------+------+



                                                                                

In [None]:
from pyspark.sql.functions import col
print(curated_parquet.filter(col("name").isNotNull()).count())

In [39]:
merchants

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162
Fusce Company,"[(gift, card, nov...",10206519221
Aliquam Enim Inco...,"[(computers, comP...",10255988167
Ipsum Primis Ltd,"[[watch, clock, a...",10264435225
Pede Ultrices Ind...,([computer progra...,10279061213
Nunc Inc.,"[(furniture, home...",10323485998


22/09/07 12:05:46 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 200508 ms exceeds timeout 120000 ms
22/09/07 12:05:46 WARN SparkContext: Killing executors is not supported by current scheduler.
