In [1]:
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create a spark session
# getOrCreate() hands back an already-running session if one exists, otherwise
# it starts a new one named "BNPL Project" with the settings listed below.
spark = (
    SparkSession.builder.appName("BNPL Project")
    # Switch on eager evaluation for the REPL/notebook.
    .config("spark.sql.repl.eagerEval.enabled", True) 
    # Switch on Parquet metadata caching.
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Driver memory is set to 4g.
    .config("spark.driver.memory", "4g")
    # The session time zone is pinned to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/10/03 20:46:00 WARN Utils: Your hostname, DESKTOP-1ML24G5 resolves to a loopback address: 127.0.1.1; using 172.24.125.18 instead (on interface eth0)
22/10/03 20:46:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/03 20:46:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [18]:
# Load the curated Parquet data; the merchant counts and both ranking
# functions in this notebook are run against this DataFrame.
dataset = spark.read.parquet("./../data/curated/process_data.parquet")

### To confirm the number of distinct merchants and whether each merchant has a unique name and ABN.

In [20]:
# Load the unprocessed merchant table so that its row count can be compared
# with the distinct merchant counts found in the curated dataset.

merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")

In [21]:
# Number of rows in the unprocessed merchant table. This is the reference
# figure for the distinct-name and distinct-ABN counts on the curated data.
# (presumably one row per merchant — TODO confirm)

merchants.count()

4026

In [76]:
# Number of distinct merchant names ("merchant_name") in the curated dataset.
# The figure is compared with the merchant table's row count.

noDistinctMerchants = dataset.select(F.countDistinct("merchant_name"))
noDistinctMerchants.show()



+-----------------------------+
|count(DISTINCT merchant_name)|
+-----------------------------+
|                         4026|
+-----------------------------+



                                                                                

In [77]:
# Number of distinct merchant ABNs ("merchant_abn") in the curated dataset.
# The figure is compared with the merchant table's row count.
# NOTE(review): equal distinct counts for names and ABNs do not by themselves
# show that every name pairs with exactly one ABN; a distinct count over the
# ("merchant_name", "merchant_abn") pair would confirm that.

noDistinctMerchantABNs = dataset.select(F.countDistinct("merchant_abn"))
noDistinctMerchantABNs.show()



+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                        4026|
+----------------------------+



                                                                                

### TOP 100 MERCHANT FUNCTION:<br>

INPUT: file without fraudulent transactions (according to the model used).<br>

OUTPUT: top hundred merchants.<br>

This function groups PRIMARILY by "merchant_name" and sums each merchant's sales. A new column ("total_merchant_earning_dollars") is then created by multiplying<br>
"total_sale_dollars" by each merchant's "rate". This gives the amount the BNPL firm earns from each merchant. The top hundred merchants are then simply the merchants<br>
that the BNPL firm earns the most from.

In [78]:
def top100_merchants(file, n=100):
    """Return the merchants the BNPL firm earns the most from.

    Transactions are grouped per merchant, their "dollar_value" is summed into
    "total_sale_dollars", and that total is multiplied by the merchant's "rate"
    to give "total_merchant_earning_dollars". Both amounts are rounded to two
    decimal places.

    Parameters
    ----------
    file : pyspark.sql.DataFrame
        Transactions (fraudulent ones already removed) with the columns
        "merchant_name", "merchant_abn", "category", "subcategory", "revenue",
        "rate" and "dollar_value".
    n : int, default 100
        How many merchants to keep. The default reproduces the original
        top-hundred behaviour.

    Returns
    -------
    pyspark.sql.DataFrame
        At most ``n`` rows, one per merchant, sorted by
        "total_merchant_earning_dollars" in descending order.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    decimal_places = 2

    # Aggregate: one row per merchant with its summed sales, then the amount
    # the BNPL firm receives from that merchant ("rate" * "total_sale_dollars").
    merchants_total_sales = (
        file
        .groupBy("merchant_name", "merchant_abn", "category", "subcategory", "revenue", "rate")
        .agg(F.round(F.sum("dollar_value"), decimal_places).alias("total_sale_dollars"))
        .withColumn(
            "total_merchant_earning_dollars",
            F.round(F.col("rate") * F.col("total_sale_dollars"), decimal_places),
        )
    )

    # Highest earners first. "merchant_abn" breaks ties so that the merchant
    # sitting on the cut-off is the same on every run.
    return (
        merchants_total_sales
        .orderBy(
            F.col("total_merchant_earning_dollars").desc(),
            F.col("merchant_abn").asc(),
        )
        .limit(n)
    )

In [80]:
# Rank the merchants in the curated dataset and print the resulting table
# with truncation of the displayed values switched off.
top100 = top100_merchants(dataset)
top100.show(truncate = False)



+-----------------------------+------------+---------------------------------+-----------------------------------------------+-------+------+------------------+------------------------------+
|merchant_name                |merchant_abn|category                         |subcategory                                    |revenue|rate  |total_sale_dollars|total_merchant_earning_dollars|
+-----------------------------+------------+---------------------------------+-----------------------------------------------+-------+------+------------------+------------------------------+
|Amet Risus Inc.              |79827781481 |retail_and_wholesale_trade       |household_goods_retailing                      |a      |0.0682|9734168.57        |663870.3                      |
|Dignissim Maecenas Foundation|48534649627 |retail_and_wholesale_trade       |others_retailing                               |a      |0.0664|9408958.04        |624754.81                     |
|Orci In Consequat Corporation|323610575

                                                                                

### TOP 10 MERCHANT FROM EACH CATEGORY FUNCTION:<br>

INPUT: file without fraudulent transactions (according to the model used).<br>

OUTPUT: top ten merchants from each category.<br>

This function is very similar to the top 100 merchant function; the only difference is that it partitions the aggregated result by "category". This allows us to find the top ten merchants from each category.

In [81]:
# CODE ATTRIBUTE:
# https://sparkbyexamples.com/pyspark/pyspark-retrieve-top-n-from-each-group-of-dataframe/#:~:text=Conclusion,get%20the%20top%20n%20records.


from pyspark.sql.window import Window

def top10_eachCategory(file, n=10):
    """Return the highest-earning merchants within every category.

    Transactions are grouped per merchant, their "dollar_value" is summed into
    "total_sale_dollars", and that total is multiplied by the merchant's "rate"
    to give "total_merchant_earning_dollars" (both rounded to two decimal
    places). Merchants are then ranked inside each "category" by those
    earnings, and the first ``n`` of every category are kept.

    Parameters
    ----------
    file : pyspark.sql.DataFrame
        Transactions (fraudulent ones already removed) with the columns
        "merchant_name", "merchant_abn", "category", "subcategory", "revenue",
        "rate" and "dollar_value".
    n : int, default 10
        How many merchants to keep per category. The default reproduces the
        original top-ten behaviour.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per retained merchant, with an extra "row" column holding the
        merchant's rank (starting at 1) inside its category.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    decimal_places = 2

    # Aggregate: one row per merchant with its summed sales, then the amount
    # the BNPL firm receives from that merchant ("rate" * "total_sale_dollars").
    merchants_total_sales = (
        file
        .groupBy("merchant_name", "merchant_abn", "category", "subcategory", "revenue", "rate")
        .agg(F.round(F.sum("dollar_value"), decimal_places).alias("total_sale_dollars"))
        .withColumn(
            "total_merchant_earning_dollars",
            F.round(F.col("rate") * F.col("total_sale_dollars"), decimal_places),
        )
    )

    # Rank merchants inside each category, highest earnings first.
    # "merchant_abn" breaks ties so that row numbers are the same on every run.
    category_window = Window.partitionBy("category").orderBy(
        F.col("total_merchant_earning_dollars").desc(),
        F.col("merchant_abn").asc(),
    )

    # Keep the top n merchants of each category.
    return (
        merchants_total_sales
        .withColumn("row", F.row_number().over(category_window))
        .filter(F.col("row") <= n)
    )


In [82]:
# Rank the merchants of the curated dataset within each category and print
# the resulting table with truncation of the displayed values switched off.
top10 = top10_eachCategory(dataset)
top10.show(truncate = False)



+----------------------------+------------+---------------------------------+-----------+-------+------+------------------+------------------------------+---+
|merchant_name               |merchant_abn|category                         |subcategory|revenue|rate  |total_sale_dollars|total_merchant_earning_dollars|row|
+----------------------------+------------+---------------------------------+-----------+-------+------+------------------+------------------------------+---+
|Magna Sed Industries        |98166254020 |arts_and_recreation              |null       |a      |0.0596|7475497.13        |445539.63                     |1  |
|Ac Urna Consulting          |86710922099 |arts_and_recreation              |null       |b      |0.0425|4997242.44        |212382.8                      |2  |
|Nullam Scelerisque Ltd      |37106509177 |arts_and_recreation              |null       |a      |0.0654|866872.15         |56693.44                      |3  |
|Et Arcu Limited             |11149063370 |art

                                                                                