In [82]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

#import spark
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the SparkSession that every later cell runs on.
# Settings applied one at a time:
#   - spark.sql.repl.eagerEval.enabled: lets a DataFrame left as the last
#     expression of a cell be rendered as a table instead of its plain repr
#   - spark.sql.parquet.cacheMetadata: Parquet metadata option (the cells
#     shown in this notebook only read CSV)
#   - spark.sql.session.timeZone: session time zone set to UTC
session_builder = SparkSession.builder.appName("MAST30034 ass2 BNPL group 28")
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")
session_builder = session_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = session_builder.getOrCreate()

In [83]:
# Columns that must be numeric for the aggregations below, with their target types.
# No schema is given to the reader, so every CSV column arrives as a string.
numeric_casts = {
    'dollar_value': FloatType(),
    'avg_personal_income_weekly': FloatType(),
    'take_rate': FloatType(),
    'total_population': IntegerType(),
}

# Load the curated transaction table (first CSV line is the header).
curated_csv = spark.read.csv('../data/curated/full_data.csv', header=True)

# Replace each listed column, in place, with its typed version.
for column_name, spark_type in numeric_casts.items():
    curated_csv = curated_csv.withColumn(column_name, col(column_name).cast(spark_type))

# Encode the fraud flag as 0/1 so it can be summed per merchant.
# NOTE(review): only the exact string 'False' maps to 0; every other value,
# including null, falls through to otherwise(1) and is counted as fraud.
curated_csv = curated_csv.withColumn(
    'is_fraud',
    when(col('is_fraud') == 'False', 0).otherwise(1),
)

In [84]:
# Preview five rows of the typed transaction table (rendered as a table by eager evaluation).
curated_csv.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,postcode,field,revenue_level,take_rate,total_population,avg_personal_income_weekly,is_fraud
626,41705715409,37.95,721e73d8-d2e5-4ac...,2021-11-23,Augue Eu LLP,4101,"furniture, home f...",a,6.05,40576,966.8,0
626,35733444320,51.82,69a7f2ba-6622-447...,2021-11-23,Arcu Ac Limited,4101,"watch, clock, and...",b,3.41,40576,966.8,0
15739,52509095251,142.77,6ba8b570-607d-45e...,2021-11-23,Justo Proin Assoc...,1238,artist supply and...,c,1.91,9656,798.0,0
15739,45629217853,8.11,b2830fc4-1f66-4d5...,2021-11-23,Lacus Consulting,1238,"gift, card, novel...",a,6.98,9656,798.0,0
802,33344911835,331.42,da944869-6c0a-46f...,2021-11-23,Aliquam Arcu Ltd,2866,florists supplies...,a,5.58,26562,778.3,0


In [85]:
# Print column names and types to check that the numeric casts and the 0/1 fraud flag took effect.
curated_csv.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: string (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- field: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- avg_personal_income_weekly: float (nullable = true)
 |-- is_fraud: integer (nullable = false)



In [86]:
## Number of transactions per merchant.
# Each row of curated_csv is counted once for its merchant_abn; the result
# column is named transaction_count directly in the aggregation.
merchant_transction_count = (
    curated_csv
    .groupBy('merchant_abn')
    .agg(count('*').alias('transaction_count'))
)

# One output row per merchant_abn, so the row count is the number of distinct merchants.
print("Number of distinct merchants:", merchant_transction_count.count())
merchant_transction_count.show(5)

                                                                                

Number of distinct merchants: 4026




+------------+-----------------+
|merchant_abn|transaction_count|
+------------+-----------------+
| 60978195146|            24012|
| 73792600690|              177|
| 31245723081|              375|
| 33651513345|             2220|
| 36866208934|              918|
+------------+-----------------+
only showing top 5 rows



                                                                                

In [87]:
## Total transaction value per merchant over the whole dataset.
# field, take_rate and revenue_level are part of the grouping key so they are
# carried into the result next to merchant_abn.
# NOTE(review): this yields one row per merchant only if those three values
# never vary within a merchant_abn - confirm against the curated data.
# `round` and `sum` here are the pyspark.sql.functions versions brought in by
# the star import, not the Python builtins.
merchant_revenue = (
    curated_csv
    .groupBy('merchant_abn', 'field', 'take_rate', 'revenue_level')
    .agg(round(sum('dollar_value'), 2).alias('total_revenue'))
)

merchant_revenue.show(5)



+------------+--------------------+---------+-------------+-------------+
|merchant_abn|               field|take_rate|revenue_level|total_revenue|
+------------+--------------------+---------+-------------+-------------+
| 26445720989|opticians, optica...|     4.28|            b|    694446.65|
| 62773208456|watch, clock, and...|     2.62|            c|    829321.63|
| 74309678848|computers, comput...|     5.72|            a|    152179.78|
| 14430838529|cable, satellite,...|     2.03|            c|   1118149.05|
| 18355044772|tent and awning s...|     4.35|            b|     31597.03|
+------------+--------------------+---------+-------------+-------------+
only showing top 5 rows



                                                                                

In [88]:
## Number of fraudulent transactions per merchant.
# is_fraud is a 0/1 integer, so summing it counts the flagged transactions.
# `sum` is the pyspark.sql.functions aggregate from the star import.
merchant_fraud_count = (
    curated_csv
    .groupBy('merchant_abn')
    .agg(sum('is_fraud').alias('fraud_count'))
)

merchant_fraud_count.show(5)




+------------+-----------+
|merchant_abn|fraud_count|
+------------+-----------+
| 60978195146|          6|
| 73792600690|          2|
| 31245723081|          0|
| 33651513345|          0|
| 36866208934|          0|
+------------+-----------+
only showing top 5 rows



                                                                                

In [89]:
## Mean weekly income of each merchant's consumers, weighted by transactions.
# Step 1: for every merchant, count its transactions per
# (postcode, avg_personal_income_weekly) combination.
merchant_postcode_counts = (
    curated_csv
    .groupBy('merchant_abn', 'postcode', 'avg_personal_income_weekly')
    .count()
)

# Step 2: transaction-weighted mean of the income figure per merchant:
#     sum(income * n_transactions) / sum(n_transactions)
# The weight is the number of transaction rows, not the number of distinct
# user_ids, so a consumer with many orders contributes once per order.
# Rows whose income is null add nothing to the numerator but still count in
# the denominator (Spark's sum skips nulls).
#
# The result is named with .alias() on the expression itself. Renaming the
# auto-generated column via its expression string is fragile: if the generated
# name differs even slightly, withColumnRenamed silently does nothing and the
# column keeps its long expression name.
# `sum` is the pyspark.sql.functions aggregate from the star import.
merchant_consumer_income = merchant_postcode_counts.groupBy('merchant_abn').agg(
    (
        sum(col('avg_personal_income_weekly') * col('count')) / sum('count')
    ).alias('mean_consumer_income')
)

merchant_consumer_income.show(5)

[Stage 348:>                                                      (0 + 20) / 23]

22/10/01 17:32:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:12 WARN RowBasedKeyValueBatch: Calling spill() on



22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:18 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 350:>                                                      (0 + 20) / 21]

+------------+--------------------+
|merchant_abn|mean_consumer_income|
+------------+--------------------+
| 11633090957|   794.5365864291839|
| 21552853184|   771.5008728826931|
| 31245723081|     799.88319921875|
| 33026294208|   771.8371071485603|
| 33651513345|   800.5700445226721|
+------------+--------------------+
only showing top 5 rows



                                                                                

In [90]:
# Assemble one per-merchant feature table: start from the transaction counts
# and inner-join each further aggregate on merchant_abn, in this order, so the
# columns come out as count, revenue columns, mean income, fraud count.
merchant_info = merchant_transction_count
for merchant_feature_table in (
    merchant_revenue,
    merchant_consumer_income,
    merchant_fraud_count,
):
    merchant_info = merchant_info.join(
        merchant_feature_table, on='merchant_abn', how='inner'
    )

In [91]:
# Show five rows of the joined per-merchant table; this is the action that runs the joins defined above.
merchant_info.show(5)

[Stage 355:>(20 + 3) / 23][Stage 356:>(0 + 17) / 23][Stage 357:> (0 + 0) / 23]3]

22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 355:>(21 + 2) / 23][Stage 356:>(0 + 18) / 23][Stage 357:> (0 + 0) / 23]

22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 355:>(22 + 1) / 23][Stage 356:>(0 + 19) / 23][Stage 357:> (0 + 0) / 23]

22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:28 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 356:>(0 + 20) / 23][Stage 357:> (0 + 0) / 23][Stage 359:>  (0 + 0) / 2]

22/10/01 17:32:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:30 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 356:>(19 + 4) / 23][Stage 357:>(0 + 16) / 23][Stage 359:>  (0 + 0) / 2]

22/10/01 17:32:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 356:>(21 + 2) / 23][Stage 357:>(0 + 18) / 23][Stage 359:>  (0 + 0) / 2]

22/10/01 17:32:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 366:>                                                      (0 + 20) / 21]

22/10/01 17:32:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+------------+-----------------+--------------------+---------+-------------+-------------+--------------------+-----------+
|merchant_abn|transaction_count|               field|take_rate|revenue_level|total_revenue|mean_consumer_income|fraud_count|
+------------+-----------------+--------------------+---------+-------------+-------------+--------------------+-----------+
| 11633090957|              287|lawn and garden s...|      4.0|            b|     42067.29|   794.5365864291839|          1|
| 21552853184|              229|computer programm...|     3.64|            b|     42540.13|   771.5008728826931|          0|
| 31245723081|              375|          shoe shops|     6.28|            a|     56052.68|     799.88319921875|          0|
| 33026294208|              159|lawn and garden s...|     3.39|            b|      40337.7|   771.8371071485603|          0|
| 33651513345|             2220|florists supplies...|      6.0|            a|    827241.74|   800.5700445226721|          0|


In [97]:
## Number of merchant_info rows (merchants) in each field, largest field first.
(
    merchant_info
    .groupBy('field')
    .count()
    .orderBy('count', ascending=False)
)

                                                                                

field,count
digital goods: bo...,190
artist supply and...,187
computer programm...,182
shoe shops,182
"gift, card, novel...",178
tent and awning s...,171
florists supplies...,171
"computers, comput...",169
"furniture, home f...",166
bicycle shops - s...,163
