In [121]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Build (or reuse) the Spark session used by the rest of the notebook.
# `spark.sql.repl.eagerEval.enabled` makes a DataFrame left as the last
# expression of a cell render as a table.
spark = (
    SparkSession.builder
    .appName("BNPL Project")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Columns kept from the curated dataset for this analysis.
cols = [
    'merchant_name',
    'user_id',
    'dollar_value',
    'order_datetime',
    'rate',
    'category',
    'subcategory',
    'estimated_region_population_2021_sum',
    'persons_earners_2018-19_sum',
    'mean_earnings_2018-19_avg',
    'sum_earnings_2018-19_sum',
    'median_earnings_2018-19_avg',
    'med_age_earners_2018-19_avg',
]

# Load the curated data, keep only `cols`, and divide `rate` by 100
# (presumably stored as a percentage upstream -- confirm against the
# curation step).
sdf = (
    spark.read.parquet("../data/curated/process_data.parquet/")
    .select(cols)
    .withColumn('rate', F.col('rate') / 100)
)

# Preview the first 10 rows.
sdf.limit(10)

merchant_name,user_id,dollar_value,order_datetime,rate,category,subcategory,estimated_region_population_2021_sum,persons_earners_2018-19_sum,mean_earnings_2018-19_avg,sum_earnings_2018-19_sum,median_earnings_2018-19_avg,med_age_earners_2018-19_avg
Placerat Orci Ins...,1,366.23,2021-11-17,0.0273,retail_and_wholes...,others_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Adipiscing Elit C...,1,113.59,2022-06-28,0.0566,info_media_and_te...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Quam Curabitur Co...,1,587.4,2021-09-24,0.0186,retail_and_wholes...,department_stores,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Felis Institute,1,51.58,2021-11-26,0.0642,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Class Aptent Taci...,1,196.41,2022-05-22,0.0501,info_media_and_te...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Augue Malesuada LLP,1,7.84,2022-08-10,0.0599,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Est Nunc Ullamcor...,1,100.22,2022-10-24,0.0572,retail_and_wholes...,household_goods_r...,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Iaculis Aliquet D...,1,115.82,2021-11-13,0.0176,retail_and_wholes...,food_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Iaculis Enim Corp.,1,18.5,2022-09-03,0.0381,rental_hiring_and...,,10636,5795.0,62144.0,360125906.0,52618.0,41.0
Iaculis Aliquet D...,1,63.88,2022-10-13,0.0176,retail_and_wholes...,food_retailing,10636,5795.0,62144.0,360125906.0,52618.0,41.0


### 1. **Identify** merchants with the greatest gross income, the lowest income volatility, and the strongest evidence of performance

In [122]:
# Per-merchant summary over all categories:
#   income_total     -- sum of dollar_value, scaled by the merchant's mean rate
#   income_deviation -- sample std-dev of dollar_value, scaled by the same rate
#   rate             -- mean rate over the merchant's rows
#   count_merchant   -- number of rows for the merchant
# Sorted by most rows first, then highest income_total, then lowest
# income_deviation.
fullSdf = (
    sdf.groupBy('merchant_name')
    .agg(
        F.sum('dollar_value').alias('income_total'),
        F.stddev('dollar_value').alias('income_deviation'),
        F.mean('rate').alias('rate'),
        F.count('merchant_name').alias('count_merchant'),
    )
    .withColumn('income_total', F.col('income_total') * F.col('rate'))
    .withColumn('income_deviation', F.col('income_deviation') * F.col('rate'))
    .orderBy(
        ['count_merchant', 'income_total', 'income_deviation'],
        ascending=[False, False, True],
    )
)

### 2. **Rank** merchants by category

In [123]:
def topNmerchants(sdf, categories, N):
    """Return the top ``N`` merchant names for each requested category.

    For every category the rows of ``sdf`` with that ``category`` value are
    grouped by ``merchant_name`` and ordered by:

    1. ``count_merchant`` (number of rows), descending;
    2. ``income_total`` (sum of ``dollar_value`` times mean ``rate``),
       descending;
    3. ``income_deviation`` (std-dev of ``dollar_value`` times mean ``rate``),
       ascending.

    Parameters
    ----------
    sdf : Spark DataFrame
        Must expose the columns ``category``, ``merchant_name``,
        ``dollar_value`` and ``rate``.
    categories : iterable of str
        Category values to rank; each becomes one column of the result.
    N : int
        Maximum number of merchants to keep per category.

    Returns
    -------
    pandas.DataFrame
        One column per category, rows in rank order (row 0 is the best
        merchant). A category with fewer than ``N`` merchants is padded with
        missing values at the bottom of its column.
    """
    merchants = {}

    for category in categories:
        ranked = (
            sdf.where(F.col('category') == category)
            .groupBy('merchant_name')
            .agg(
                F.sum('dollar_value').alias('income_total'),
                F.stddev('dollar_value').alias('income_deviation'),
                F.mean('rate').alias('rate'),
                F.count('merchant_name').alias('count_merchant'),
            )
            .withColumn('income_total', F.col('income_total') * F.col('rate'))
            .withColumn('income_deviation', F.col('income_deviation') * F.col('rate'))
            .orderBy(
                F.col('count_merchant').desc(),
                F.col('income_total').desc(),
                F.col('income_deviation').asc(),
            )
        )

        # Only the names are needed on the driver, so select and limit
        # before converting to pandas.
        top_names = ranked.select('merchant_name').limit(N).toPandas()
        merchants[category] = top_names['merchant_name'].to_list()

    # Building the frame straight from a dict of lists raises ValueError when
    # the lists differ in length (a category with fewer than N merchants).
    # Wrapping each list in a Series aligns the columns on row position and
    # pads the short ones with NaN instead.
    return pd.DataFrame(
        {
            category: pd.Series(names, dtype=object)
            for category, names in merchants.items()
        }
    )

In [124]:
# Category values to rank; each becomes one column of the results table.
categories = [
    'retail_and_wholesale_trade',
    'rental_hiring_and_real_estate',
    'arts_and_recreation',
    'info_media_and_telecommunications',
    'others',
]

In [125]:
# Top 10 merchants in each category, one column per category, best first.
topNmerchants(sdf, categories, N=10)

                                                                                

Unnamed: 0,retail_and_wholesale_trade,rental_hiring_and_real_estate,arts_and_recreation,info_media_and_telecommunications,others
0,Erat Vitae LLP,Quis Massa Mauris Corporation,Magna Sed Industries,Mauris Non Institute,Nec Tellus Ltd
1,Leo In Consulting,Quam A Felis Incorporated,Ac Urna Consulting,Euismod In Corp.,Tempus Eu Ligula Limited
2,Pede Nonummy Corp.,Vel Lectus Cum LLC,Mi Consulting,Amet Consulting,Sed Nec Inc.
3,Non Vestibulum Industries,Mi Eleifend Company,Lorem LLP,Feugiat Sed Nec Institute,Natoque Consulting
4,Suspendisse Dui Corporation,Tellus Imperdiet Non Inc.,Volutpat Ornare Facilisis Associates,Arcu Sed Eu Incorporated,Gravida Praesent Corp.
5,Lacus Consulting,Iaculis Enim Corp.,Ligula Tortor Incorporated,Eleifend PC,Mauris Sagittis Corp.
6,Est Nunc Consulting,Leo Morbi Limited,Massa Suspendisse Corp.,Et Nunc Consulting,Nisl Elementum Ltd
7,Lorem Ipsum Sodales Industries,Laoreet Inc.,Turpis Nulla Foundation,Posuere Cubilia Curae Corporation,Molestie Arcu Corporation
8,Ipsum Dolor Sit Corporation,Massa Limited,Elit Curabitur Sed PC,At Sem Corp.,Feugiat Lorem Incorporated
9,Vehicula Pellentesque Corporation,Fermentum Institute,Nullam Scelerisque Ltd,Suspendisse Incorporated,Risus Odio Auctor Foundation


In [126]:
# Same ranking extended to the top 100 merchants per category.
topNmerchants(sdf, categories, N=100)

                                                                                

Unnamed: 0,retail_and_wholesale_trade,rental_hiring_and_real_estate,arts_and_recreation,info_media_and_telecommunications,others
0,Erat Vitae LLP,Quis Massa Mauris Corporation,Magna Sed Industries,Mauris Non Institute,Nec Tellus Ltd
1,Leo In Consulting,Quam A Felis Incorporated,Ac Urna Consulting,Euismod In Corp.,Tempus Eu Ligula Limited
2,Pede Nonummy Corp.,Vel Lectus Cum LLC,Mi Consulting,Amet Consulting,Sed Nec Inc.
3,Non Vestibulum Industries,Mi Eleifend Company,Lorem LLP,Feugiat Sed Nec Institute,Natoque Consulting
4,Suspendisse Dui Corporation,Tellus Imperdiet Non Inc.,Volutpat Ornare Facilisis Associates,Arcu Sed Eu Incorporated,Gravida Praesent Corp.
...,...,...,...,...,...
95,Vestibulum Accumsan Associates,Dolor Inc.,Elit Curabitur LLP,Nullam Institute,Nonummy Ultricies Ornare Institute
96,Ac Limited,Vitae Velit LLC,Ut Nec Urna PC,Est Ac Mattis Ltd,Vivamus Sit Associates
97,Ipsum Company,Aliquam Gravida Incorporated,Non Egestas PC,Diam LLC,Interdum Curabitur Institute
98,Gravida Non PC,At Lacus LLP,Lobortis Tellus Justo Foundation,Aliquam Ultrices Iaculis Corporation,Eu Sem Associates
