In [18]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

#import spark
from pyspark.sql import SparkSession
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 ass2 BNPL group 28")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [6]:
curated_csv = spark.read.options(header = True).csv('../data/curated/full_data.csv')

curated_csv = curated_csv.withColumn('dollar_value', curated_csv.dollar_value.cast(FloatType())) \
    .withColumn('avg_personal_income_weekly', curated_csv.avg_personal_income_weekly.cast(FloatType())) \
        .withColumn('take_rate', curated_csv.take_rate.cast(FloatType())) \
            .withColumn('total_population', curated_csv.total_population.cast(IntegerType())) \
                .withColumn('is_fraud', when(curated_csv.is_fraud == 'False', 0).otherwise(1))

In [7]:
curated_csv.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,postcode,field,revenue_level,take_rate,total_population,avg_personal_income_weekly,is_fraud
1730,60956456424,142.52,4ab37080-8690-428...,2021-08-20,Ultricies Digniss...,1135,"gift, card, novel...",b,4.69,9656,798.0,0
17274,24852446429,22.63,19000be3-b4a2-4cc...,2021-09-14,Erat Vitae LLP,1135,florists supplies...,c,2.94,9656,798.0,0
1730,43127814599,902.82,28698120-ee5a-42d...,2021-08-20,Nam Ligula Elit F...,1135,lawn and garden s...,b,3.58,9656,798.0,0
17454,21439773999,68.94,c9afee3b-1675-45e...,2021-09-14,Mauris Non Institute,1135,"cable, satellite,...",a,6.1,9656,798.0,0
6737,51279178333,19.73,c8fb7b8a-1224-47e...,2021-08-20,Neque LLC,1135,music shops - mus...,c,1.83,9656,798.0,0


In [8]:
curated_csv.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: string (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- field: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- avg_personal_income_weekly: float (nullable = true)
 |-- is_fraud: integer (nullable = false)



In [9]:
## Count the number of transactions made by each merchant.
merchant_transction_count = curated_csv.groupBy('merchant_abn').count()
merchant_transction_count = merchant_transction_count.withColumnRenamed('count', 'transaction_count')

print("Number of distinct merchants:", merchant_transction_count.count())
merchant_transction_count.show(5)

                                                                                

Number of distinct merchants: 4026




+------------+-----------------+
|merchant_abn|transaction_count|
+------------+-----------------+
| 60978195146|            24012|
| 31245723081|              375|
| 67165527084|             1124|
| 36866208934|              918|
| 21552853184|              229|
+------------+-----------------+
only showing top 5 rows



                                                                                

In [10]:
## Total revenue of a merchant in a given time
merchant_revenue = curated_csv.groupBy('merchant_abn', 'field', 'take_rate', 'revenue_level').sum('dollar_value').withColumnRenamed('sum(dollar_value)', "total_revenue")
merchant_revenue = merchant_revenue.withColumn("total_revenue", round(merchant_revenue["total_revenue"], 2))

merchant_revenue.show(5)



+------------+--------------------+---------+-------------+-------------+
|merchant_abn|               field|take_rate|revenue_level|total_revenue|
+------------+--------------------+---------+-------------+-------------+
| 62773208456|watch, clock, and...|     2.62|            c|    829321.63|
| 14430838529|cable, satellite,...|     2.03|            c|   1118149.05|
| 26445720989|opticians, optica...|     4.28|            b|    694446.65|
| 77990903737|bicycle shops - s...|     3.88|            b|    975183.17|
| 44313899037|furniture, home f...|     4.49|            b|    611146.99|
+------------+--------------------+---------+-------------+-------------+
only showing top 5 rows



                                                                                

In [11]:
## Count the number of Fraud transaction of each merchant
merchant_fraud_count = curated_csv.groupBy('merchant_abn').sum('is_fraud').withColumnRenamed('sum(is_fraud)', 'fraud_count')

merchant_fraud_count.show(5)




+------------+-----------+
|merchant_abn|fraud_count|
+------------+-----------+
| 60978195146|          6|
| 31245723081|          0|
| 67165527084|          0|
| 36866208934|          0|
| 21552853184|          0|
+------------+-----------+
only showing top 5 rows



                                                                                

In [12]:
## Calculate mean income of all consumers of each merchant.
merchant_consumer_income = curated_csv.groupBy('merchant_abn', 'postcode','avg_personal_income_weekly').count()

merchant_consumer_income = merchant_consumer_income.groupBy('merchant_abn') \
    .agg(sum(col('avg_personal_income_weekly')*col('count'))/sum('count')) \
        .withColumnRenamed('(sum((avg_personal_income_weekly * count)) / sum(count))', 'mean_consumer_income')

#merchant_consumer_income.show(5)

In [13]:
merchant_info = merchant_transction_count.join(merchant_revenue, ['merchant_abn']) \
    .join(merchant_consumer_income, ['merchant_abn']) \
        .join(merchant_fraud_count, ['merchant_abn'])

In [14]:
merchant_info.show(5)

22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 20:50:50 WARN RowBasedKeyValueBatch: Calling spill() on

+------------+-----------------+--------------------+---------+-------------+-------------+--------------------+-----------+
|merchant_abn|transaction_count|               field|take_rate|revenue_level|total_revenue|mean_consumer_income|fraud_count|
+------------+-----------------+--------------------+---------+-------------+-------------+--------------------+-----------+
| 11633090957|              287|lawn and garden s...|      4.0|            b|     42067.29|   794.5365864291839|          1|
| 21552853184|              229|computer programm...|     3.64|            b|     42540.13|   771.5008728826931|          0|
| 31245723081|              375|          shoe shops|     6.28|            a|     56052.68|     799.88319921875|          0|
| 33026294208|              159|lawn and garden s...|     3.39|            b|      40337.7|   771.8371071485603|          0|
| 33651513345|             2220|florists supplies...|      6.0|            a|    827241.74|   800.5700445226721|          0|


                                                                                

In [15]:
## count number of merchant in each field
merchant_info.select('field').groupBy('field').count().sort(col('count').desc())

                                                                                

field,count
digital goods: bo...,190
artist supply and...,187
computer programm...,182
shoe shops,182
"gift, card, novel...",178
florists supplies...,171
tent and awning s...,171
"computers, comput...",169
"furniture, home f...",166
health and beauty...,163


Select the most popular postcode in each merchants customers

In [44]:
merchant_main_customers = curated_csv.groupBy('merchant_abn','postcode','total_population').count()
merchant_main_customers = merchant_main_customers.withColumnRenamed('count', 'postcode_count')

In [45]:
windowDept = Window.partitionBy("merchant_abn").orderBy(col("postcode_count").desc())
# Rank the popular customer from area for each merchant
merchant_main_customers = merchant_main_customers.withColumn("row",row_number().over(windowDept))
# Filter the top 5 popular area
merchant_main_customers = merchant_main_customers.filter(col("row") <= 5)

In [46]:
merchant_main_customers = merchant_main_customers.groupBy('merchant_abn').sum('total_population') \
    .withColumnRenamed('sum(total_population)', 'main_business_area_popu')

In [47]:
merchant_main_customers 

22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:54:09 WARN RowBasedKeyValueBatch: Calling spill() on

merchant_abn,main_business_area_popu
11633090957,39259
21552853184,156626
31245723081,65159
33026294208,164626
33651513345,86495
36866208934,251283
37629693091,49618
38918664617,50979
53074973777,218418
60963420870,166483


In [48]:
merchant_info = merchant_info.join(merchant_main_customers, ['merchant_abn'])

In [49]:
merchant_info

22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/01 22:55:34 WARN RowBasedKeyValueBatch: Calling spill() on

merchant_abn,transaction_count,field,take_rate,revenue_level,total_revenue,mean_consumer_income,fraud_count,main_business_area_popu
11633090957,287,lawn and garden s...,4.0,b,42067.29,794.5365864291839,1,39259
21552853184,229,computer programm...,3.64,b,42540.13,771.5008728826931,0,156626
31245723081,375,shoe shops,6.28,a,56052.68,799.88319921875,0,65159
33026294208,159,lawn and garden s...,3.39,b,40337.7,771.8371071485603,0,164626
33651513345,2220,florists supplies...,6.0,a,827241.74,800.5700445226721,0,86495
36866208934,918,computer programm...,3.43,b,177024.9,790.7605664288557,0,251283
37629693091,403,"cable, satellite...",3.02,c,37731.39,784.379155994349,0,49618
38918664617,13,"jewelry, watch, c...",3.59,b,264025.01,809.4230769230769,7,50979
53074973777,210,"watch, clock, and...",5.71,a,31071.99,784.0638093494233,0,218418
60963420870,71,"stationery, offic...",4.17,b,34229.5,813.6380279970841,0,166483
