In [42]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Start (or reuse) the Spark session used by the rest of this notebook.
spark = (
    SparkSession.builder
    .appName("BNPL Project")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Columns retained for the merchant ranking analysis.
cols = [
    'merchant_abn',
    'user_id',
    'dollar_value',
    'order_datetime',
    'rate',
    'category',
    'subcategory',
    'estimated_region_population_2021_sum',
    'persons_earners_2018-19_sum',
    'mean_earnings_2018-19_avg',
    'sum_earnings_2018-19_sum',
    'median_earnings_2018-19_avg',
    'med_age_earners_2018-19_avg',
]

# Load the curated data, keep only the columns above, and divide `rate`
# by 100 (presumably converting a percentage to a fraction -- confirm upstream).
sdf = (
    spark.read.parquet("../data/curated/process_data.parquet/")
    .select(cols)
    .withColumn('rate', F.col('rate') / 100)
)

# Preview the first rows.
sdf.limit(10)

merchant_abn,user_id,dollar_value,order_datetime,rate,category,subcategory,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg
27093785141,1,366.23,2021-11-17,0.0273,retail_and_wholes...,others_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0
49125619545,1,113.59,2022-06-28,0.0566,info_media_and_te...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
28510201393,1,587.4,2021-09-24,0.0186,retail_and_wholes...,department_stores,10636,5795.0,62144.0,360125906.0,52618.0,41.0
49167531725,1,51.58,2021-11-26,0.0642,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
43719937438,1,196.41,2022-05-22,0.0501,info_media_and_te...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
49278214714,1,7.84,2022-08-10,0.0599,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
46007761675,1,100.22,2022-10-24,0.0572,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
50315283629,1,115.82,2021-11-13,0.0176,retail_and_wholes...,food_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0
46654841462,1,18.5,2022-09-03,0.0381,rental_hiring_and...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
50315283629,1,63.88,2022-10-13,0.0176,retail_and_wholes...,food_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0


### 1. **Identify** merchants with the greatest gross income, the lowest income volatility, and the strongest evidence of performance

In [91]:
# Per-merchant summary: summed dollar value, its standard deviation, the mean
# rate, and the number of transactions. Both dollar figures are then scaled by
# the merchant's mean rate.
(
    sdf
    .groupBy('merchant_abn')
    .agg(
        F.sum('dollar_value').alias('income_total'),
        F.stddev('dollar_value').alias('income_deviation'),
        F.mean('rate').alias('rate'),
        F.count('merchant_abn').alias('count_merchant'),
    )
    .withColumn('income_total', F.col('income_total') * F.col('rate'))
    .withColumn('income_deviation', F.col('income_deviation') * F.col('rate'))
    # Sort priority: most transactions, then highest scaled income,
    # then lowest scaled deviation.
    .orderBy(
        F.col('count_merchant').desc(),
        F.col('income_total').desc(),
        F.col('income_deviation').asc(),
    )
)

                                                                                

merchant_abn,income_total,income_deviation,rate,count_merchant
24852446429,255744.1228860644,0.6226536011043033,0.0294000000000074,289513
86578477987,613410.4864420284,1.5931132977937756,0.0643000000000028,272674
64203420245,215914.12099394572,0.4297217319521721,0.0285999999999927,260780
49891706470,415952.5372599575,0.870864359218618,0.0579999999999939,247526
46804135891,206349.62331796187,0.8804237642008566,0.0292999999999945,234397
45629217853,585975.3666878993,2.5631389491234278,0.069799999999988,228219
89726005175,535420.1324840862,1.4298403330446383,0.0601000000000097,215963
43186523025,404176.7518500293,1.1599064349194936,0.0447000000000032,200913
80324045558,106133.55576602292,0.5410840739271982,0.0147000000000031,196816
63290521567,285260.531759968,1.5744092874763478,0.0647999999999926,181841


### 2. **Rank** Merchants by Category

In [102]:
def topNmerchants(sdf, categories, N):
    """Return the top ``N`` merchant ABNs for each requested category.

    For every category, transactions are grouped by ``merchant_abn`` and
    summarised as: summed ``dollar_value`` and its standard deviation (both
    multiplied by the merchant's mean ``rate``), plus the transaction count.
    Merchants are ordered by transaction count (descending), then scaled
    income (descending), then scaled deviation (ascending), and the first
    ``N`` ABNs are kept.

    Parameters
    ----------
    sdf : pyspark.sql.DataFrame
        Transactions with at least the columns ``merchant_abn``,
        ``dollar_value``, ``rate`` and ``category``.
    categories : iterable of str
        Values of the ``category`` column to rank within.
    N : int
        Maximum number of merchants to return per category.

    Returns
    -------
    pandas.DataFrame
        One column per category, one row per rank (row 0 is the top
        merchant). If a category has fewer than ``N`` merchants, its column
        is padded with NaN (making that column floating point) instead of
        the whole call failing on unequal column lengths.
    """
    merchants = {}

    for category in categories:
        ranked = (
            sdf.where(F.col('category') == category)
            .groupBy('merchant_abn')
            .agg(
                F.sum('dollar_value').alias('income_total'),
                F.stddev('dollar_value').alias('income_deviation'),
                F.mean('rate').alias('rate'),
                F.count('merchant_abn').alias('count_merchant'),
            )
            .withColumn('income_total', F.col('income_total') * F.col('rate'))
            .withColumn('income_deviation', F.col('income_deviation') * F.col('rate'))
            .orderBy(
                F.col('count_merchant').desc(),
                F.col('income_total').desc(),
                F.col('income_deviation').asc(),
            )
        )

        # Keep the column as a pandas Series rather than a plain list: when
        # the DataFrame is assembled below, Series are aligned on their
        # index, so categories with fewer than N merchants are padded with
        # NaN instead of raising "All arrays must be of the same length".
        merchants[category] = (
            ranked.select('merchant_abn').limit(N).toPandas()['merchant_abn']
        )

    return pd.DataFrame(merchants)

In [105]:
# Categories to rank merchants within.
categories = [
    'retail_and_wholesale_trade',
    'rental_hiring_and_real_estate',
    'arts_and_recreation',
    'info_media_and_telecommunications',
    'others',
]

# Top 100 merchants per category.
topNmerchants(sdf, categories, N=100)

                                                                                

Unnamed: 0,retail_and_wholesale_trade,rental_hiring_and_real_estate,arts_and_recreation,info_media_and_telecommunications,others
0,24852446429,21232022824,98166254020,21439773999,18158387243
1,86578477987,96834893748,86710922099,35223308778,46451548968
2,64203420245,61112604295,48624093501,84703983173,88699453206
3,49891706470,74093231988,34564250941,46298404088,94072762560
4,46804135891,60829135130,17208085375,35909341340,60706491699
...,...,...,...,...,...
95,86662713230,18814588565,99785979138,54278310951,17496459532
96,74019238521,65462135385,58680132589,65204103269,85726387922
97,30122382323,66741987958,31400548982,78798828265,45940422862
98,52065114842,76618399784,52266154043,47374330858,72647107998


In [106]:
# Top 10 merchants per category (same inputs as the N=100 call, smaller N).
topNmerchants(sdf, categories, N=10)

                                                                                

Unnamed: 0,retail_and_wholesale_trade,rental_hiring_and_real_estate,arts_and_recreation,info_media_and_telecommunications,others
0,24852446429,21232022824,98166254020,21439773999,18158387243
1,86578477987,96834893748,86710922099,35223308778,46451548968
2,64203420245,61112604295,48624093501,84703983173,88699453206
3,49891706470,74093231988,34564250941,46298404088,94072762560
4,46804135891,60829135130,17208085375,35909341340,60706491699
5,45629217853,46654841462,41305045812,67400260923,55501929396
6,89726005175,54474048212,57223200264,41944909975,75104340635
7,43186523025,30209373786,27851049264,17488304283,43073797016
8,80324045558,92773401740,77924688488,29521780474,36607451384
9,63290521567,82812059627,37106509177,69703285964,99803748393


22/10/06 15:33:25 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1207400 ms exceeds timeout 120000 ms
22/10/06 15:33:25 WARN SparkContext: Killing executors is not supported by current scheduler.
